diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 22590043f..09546467e 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -5,97 +5,148 @@ import re
 from collections import namedtuple
 
 from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP
+from .tag_orth_map import TAG_ORTH_MAP
+from .tag_bigram_map import TAG_BIGRAM_MAP
 from ...attrs import LANG
-from ...language import Language
-from ...tokens import Doc
 from ...compat import copy_reg
-from ...util import DummyTokenizer
+from ...language import Language
+from ...symbols import POS
+from ...tokens import Doc
+from ...util import DummyTokenizer, get_words_and_spaces
+
+# Hold the attributes we need with convenient names
+DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"])
 
 # Handling for multiple spaces in a row is somewhat awkward, this simplifies
 # the flow by creating a dummy with the same interface.
-DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
-DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
-DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" "))
+DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"])
+DummySpace = DummyNode(" ", " ", " ")
 
 
-def try_fugashi_import():
-    """Fugashi is required for Japanese support, so check for it.
+def try_sudachi_import():
+    """SudachiPy is required for Japanese support, so check for it.
     It it's not available blow up and explain how to fix it."""
     try:
-        import fugashi
+        from sudachipy import dictionary, tokenizer
 
-        return fugashi
+        tok = dictionary.Dictionary().create(
+            mode=tokenizer.Tokenizer.SplitMode.A
+        )
+        return tok
     except ImportError:
         raise ImportError(
-            "Japanese support requires Fugashi: " "https://github.com/polm/fugashi"
+            "Japanese support requires SudachiPy: " "https://github.com/WorksApplications/SudachiPy"
         )
 
 
-def resolve_pos(token):
+def resolve_pos(token, next_token):
     """If necessary, add a field to the POS tag for UD mapping.
     Under Universal Dependencies, sometimes the same Unidic POS tag can
     be mapped differently depending on the literal token or its context
-    in the sentence. This function adds information to the POS tag to
-    resolve ambiguous mappings.
+    in the sentence. This function returns the resolved POS values for both
+    token and next_token as a tuple.
     """
-    # this is only used for consecutive ascii spaces
-    if token.surface == " ":
-        return "空白"
+    # Some tokens have their UD tag decided based on the POS of the following
+    # token.
 
-    # TODO: This is a first take. The rules here are crude approximations.
-    # For many of these, full dependencies are needed to properly resolve
-    # PoS mappings.
-    if token.pos == "連体詞,*,*,*":
-        if re.match(r"[こそあど此其彼]の", token.surface):
-            return token.pos + ",DET"
-        if re.match(r"[こそあど此其彼]", token.surface):
-            return token.pos + ",PRON"
-        return token.pos + ",ADJ"
-    return token.pos
+    # orth based rules
+    if token.pos[0] in TAG_ORTH_MAP:
+        orth_map = TAG_ORTH_MAP[token.pos[0]]
+        if token.surface in orth_map:
+            return orth_map[token.surface], None
+
+    # tag bi-gram mapping
+    if next_token:
+        tag_bigram = token.pos[0], next_token.pos[0]
+        if tag_bigram in TAG_BIGRAM_MAP:
+            bipos = TAG_BIGRAM_MAP[tag_bigram]
+            if bipos[0] is None:
+                return TAG_MAP[token.pos[0]][POS], bipos[1]
+            else:
+                return bipos
+
+    return TAG_MAP[token.pos[0]][POS], None
 
 
-def get_words_and_spaces(tokenizer, text):
-    """Get the individual tokens that make up the sentence and handle white space.
+# Use a mapping of paired punctuation to avoid splitting quoted sentences. +pairpunct = {'「':'」', '『': '』', '【': '】'} - Japanese doesn't usually use white space, and MeCab's handling of it for - multiple spaces in a row is somewhat awkward. + +def separate_sentences(doc): + """Given a doc, mark tokens that start sentences based on Unidic tags. """ - tokens = tokenizer.parseToNodeList(text) + stack = [] # save paired punctuation + for i, token in enumerate(doc[:-2]): + # Set all tokens after the first to false by default. This is necessary + # for the doc code to be aware we've done sentencization, see + # `is_sentenced`. + token.sent_start = (i == 0) + if token.tag_: + if token.tag_ == "補助記号-括弧開": + ts = str(token) + if ts in pairpunct: + stack.append(pairpunct[ts]) + elif stack and ts == stack[-1]: + stack.pop() + + if token.tag_ == "補助記号-句点": + next_token = doc[i+1] + if next_token.tag_ != token.tag_ and not stack: + next_token.sent_start = True + + +def get_dtokens(tokenizer, text): + tokens = tokenizer.tokenize(text) words = [] - spaces = [] - for token in tokens: - # If there's more than one space, spaces after the first become tokens - for ii in range(len(token.white_space) - 1): - words.append(DummySpace) - spaces.append(False) - - words.append(token) - spaces.append(bool(token.white_space)) - return words, spaces + for ti, token in enumerate(tokens): + tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']) + inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*']) + dtoken = DetailedToken( + token.surface(), + (tag, inf), + token.dictionary_form()) + if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白': + # don't add multiple space tokens in a row + continue + words.append(dtoken) + # remove empty tokens. These can be produced with characters like … that + # Sudachi normalizes internally. 
+ words = [ww for ww in words if len(ww.surface) > 0] + return words class JapaneseTokenizer(DummyTokenizer): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - self.tokenizer = try_fugashi_import().Tagger() - self.tokenizer.parseToNodeList("") # see #2901 + self.tokenizer = try_sudachi_import() def __call__(self, text): - dtokens, spaces = get_words_and_spaces(self.tokenizer, text) + dtokens = get_dtokens(self.tokenizer, text) + words = [x.surface for x in dtokens] + words, spaces = get_words_and_spaces(words, text) + unidic_tags = [",".join(x.pos) for x in dtokens] doc = Doc(self.vocab, words=words, spaces=spaces) - unidic_tags = [] - for token, dtoken in zip(doc, dtokens): - unidic_tags.append(dtoken.pos) - token.tag_ = resolve_pos(dtoken) + next_pos = None + for ii, (token, dtoken) in enumerate(zip(doc, dtokens)): + ntoken = dtokens[ii+1] if ii+1 < len(dtokens) else None + token.tag_ = dtoken.pos[0] + if next_pos: + token.pos = next_pos + next_pos = None + else: + token.pos, next_pos = resolve_pos(dtoken, ntoken) # if there's no lemma info (it's an unk) just use the surface - token.lemma_ = dtoken.feature.lemma or dtoken.surface + token.lemma_ = dtoken.lemma doc.user_data["unidic_tags"] = unidic_tags + + separate_sentences(doc) return doc @@ -104,6 +155,7 @@ class JapaneseDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda _text: "ja" stop_words = STOP_WORDS tag_map = TAG_MAP + syntax_iterators = SYNTAX_ITERATORS writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} @classmethod diff --git a/spacy/lang/ja/bunsetu.py b/spacy/lang/ja/bunsetu.py new file mode 100644 index 000000000..7c3eee336 --- /dev/null +++ b/spacy/lang/ja/bunsetu.py @@ -0,0 +1,144 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS + + +POS_PHRASE_MAP = { + "NOUN": "NP", + "NUM": "NP", + "PRON": "NP", + "PROPN": "NP", + + "VERB": "VP", + + "ADJ": "ADJP", + + "ADV": "ADVP", + + "CCONJ": "CCONJP", +} + + +# return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)] +def yield_bunsetu(doc, debug=False): + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + prev = None + prev_tag = None + prev_dep = None + prev_head = None + for t in doc: + pos = t.pos_ + pos_type = POS_PHRASE_MAP.get(pos, None) + tag = t.tag_ + dep = t.dep_ + head = t.head.i + if debug: + print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu) + + # DET is always an individual bunsetu + if pos == "DET": + if bunsetu: + yield bunsetu, phrase_type, phrase + yield [t], None, None + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + + # PRON or Open PUNCT always splits bunsetu + elif tag == "補助記号-括弧開": + if bunsetu: + yield bunsetu, phrase_type, phrase + bunsetu = [t] + bunsetu_may_end = True + phrase_type = None + phrase = None + + # bunsetu head not appeared + elif phrase_type is None: + if bunsetu and prev_tag == "補助記号-読点": + yield bunsetu, phrase_type, phrase + bunsetu = [] + bunsetu_may_end = False + phrase_type = None + phrase = None + bunsetu.append(t) + if pos_type: # begin phrase + phrase = [t] + phrase_type = pos_type + if pos_type in {"ADVP", "CCONJP"}: + bunsetu_may_end = True + + # entering new bunsetu + elif pos_type and ( + pos_type != phrase_type or # different phrase type arises + bunsetu_may_end # same phrase type but bunsetu already ended + ): + # exceptional case: NOUN to VERB + if 
phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i: + bunsetu.append(t) + phrase_type = "VP" + phrase.append(t) + # exceptional case: VERB to NOUN + elif phrase_type == "VP" and pos_type == "NP" and ( + prev_dep == 'compound' and prev_head == t.i or + dep == 'compound' and prev == head or + prev_dep == 'nmod' and prev_head == t.i + ): + bunsetu.append(t) + phrase_type = "NP" + phrase.append(t) + else: + yield bunsetu, phrase_type, phrase + bunsetu = [t] + bunsetu_may_end = False + phrase_type = pos_type + phrase = [t] + + # NOUN bunsetu + elif phrase_type == "NP": + bunsetu.append(t) + if not bunsetu_may_end and (( + (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'} + ) or ( + pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' + )): + phrase.append(t) + else: + bunsetu_may_end = True + + # VERB bunsetu + elif phrase_type == "VP": + bunsetu.append(t) + if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound': + phrase.append(t) + else: + bunsetu_may_end = True + + # ADJ bunsetu + elif phrase_type == "ADJP" and tag != '連体詞': + bunsetu.append(t) + if not bunsetu_may_end and (( + pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'} + ) or ( + pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' + )): + phrase.append(t) + else: + bunsetu_may_end = True + + # other bunsetu + else: + bunsetu.append(t) + + prev = t.i + prev_tag = t.tag_ + prev_dep = t.dep_ + prev_head = head + + if bunsetu: + yield bunsetu, phrase_type, phrase diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py new file mode 100644 index 000000000..cd1e4fde7 --- /dev/null +++ b/spacy/lang/ja/syntax_iterators.py @@ -0,0 +1,55 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import NOUN, PROPN, PRON, VERB + +# XXX this can probably be pruned a bit +labels = [ + "nsubj", + "nmod", + "dobj", + "nsubjpass", + "pcomp", + "pobj", + "obj", + "obl", + "dative", + "appos", + "attr", + "ROOT", +] + +def noun_chunks(obj): + """ + Detect base noun phrases from a dependency parse. Works on both Doc and Span. + """ + + doc = obj.doc # Ensure works on both Doc and Span. + np_deps = [doc.vocab.strings.add(label) for label in labels] + conj = doc.vocab.strings.add("conj") + np_label = doc.vocab.strings.add("NP") + seen = set() + for i, word in enumerate(obj): + if word.pos not in (NOUN, PROPN, PRON): + continue + # Prevent nested chunks from being produced + if word.i in seen: + continue + if word.dep in np_deps: + unseen = [w.i for w in word.subtree if w.i not in seen] + if not unseen: + continue + + # this takes care of particles etc. 
+ seen.update(j.i for j in word.subtree) + # This avoids duplicating embedded clauses + seen.update(range(word.i + 1)) + + # if the head of this is a verb, mark that and rights seen + # Don't do the subtree as that can hide other phrases + if word.head.pos == VERB: + seen.add(word.head.i) + seen.update(w.i for w in word.head.rights) + yield unseen[0], word.i + 1, np_label + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/ja/tag_bigram_map.py b/spacy/lang/ja/tag_bigram_map.py new file mode 100644 index 000000000..5ed9aec89 --- /dev/null +++ b/spacy/lang/ja/tag_bigram_map.py @@ -0,0 +1,37 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, ADJ, AUX, NOUN, PART, VERB + +# mapping from tag bi-gram to pos of previous token +TAG_BIGRAM_MAP = { + # This covers only small part of AUX. + ("形容詞-非自立可能", "助詞-終助詞"): (AUX, None), + + ("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None), + # ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ), + + # This covers acl, advcl, obl and root, but has side effect for compound. + ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX), + # This covers almost all of the deps + ("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX), + + ("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB), + ("副詞", "動詞-非自立可能"): (None, VERB), + ("形容詞-一般", "動詞-非自立可能"): (None, VERB), + ("形容詞-非自立可能", "動詞-非自立可能"): (None, VERB), + ("接頭辞", "動詞-非自立可能"): (None, VERB), + ("助詞-係助詞", "動詞-非自立可能"): (None, VERB), + ("助詞-副助詞", "動詞-非自立可能"): (None, VERB), + ("助詞-格助詞", "動詞-非自立可能"): (None, VERB), + ("補助記号-読点", "動詞-非自立可能"): (None, VERB), + + ("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART), + + ("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN), + ("連体詞", "形状詞-助動詞語幹"): (None, NOUN), + + ("動詞-一般", "助詞-副助詞"): (None, PART), + ("動詞-非自立可能", "助詞-副助詞"): (None, PART), + ("助動詞", "助詞-副助詞"): (None, PART), +} diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py index 4ff0a35ee..ad416e109 100644 --- a/spacy/lang/ja/tag_map.py +++ b/spacy/lang/ja/tag_map.py @@ -1,82 +1,104 @@ # encoding: utf8 from __future__ import unicode_literals -from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN +from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, CCONJ, SCONJ, NOUN from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE TAG_MAP = { # Explanation of Unidic tags: # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf - # Universal Dependencies Mapping: + # Universal Dependencies Mapping: (Some of the entries in this mapping are updated to v2.6 in the list below) # http://universaldependencies.org/ja/overview/morphology.html # http://universaldependencies.org/ja/pos/all.html - "記号,一般,*,*": { - POS: PUNCT + "記号-一般": { + POS: NOUN }, # this includes characters used to represent sounds like ドレミ - "記号,文字,*,*": { - POS: PUNCT - }, # this is for Greek and Latin characters used as sumbols, as in math - "感動詞,フィラー,*,*": {POS: INTJ}, - "感動詞,一般,*,*": {POS: INTJ}, - # this is specifically for unicode full-width space - "空白,*,*,*": {POS: X}, - # This is used when sequential half-width spaces are present + "記号-文字": { + POS: NOUN + }, # this is for Greek and Latin characters having some meanings, or used as symbols, as in math + "感動詞-フィラー": {POS: INTJ}, + "感動詞-一般": {POS: INTJ}, + "空白": {POS: SPACE}, - "形状詞,一般,*,*": {POS: ADJ}, - "形状詞,タリ,*,*": {POS: ADJ}, - "形状詞,助動詞語幹,*,*": {POS: ADJ}, - "形容詞,一般,*,*": {POS: ADJ}, - "形容詞,非自立可能,*,*": {POS: AUX}, # XXX ADJ if alone, AUX otherwise - "助詞,格助詞,*,*": {POS: ADP}, - "助詞,係助詞,*,*": {POS: ADP}, - "助詞,終助詞,*,*": {POS: PART}, - "助詞,準体助詞,*,*": {POS: 
SCONJ}, # の as in 走るのが速い - "助詞,接続助詞,*,*": {POS: SCONJ}, # verb ending て - "助詞,副助詞,*,*": {POS: PART}, # ばかり, つつ after a verb - "助動詞,*,*,*": {POS: AUX}, - "接続詞,*,*,*": {POS: SCONJ}, # XXX: might need refinement - "接頭辞,*,*,*": {POS: NOUN}, - "接尾辞,形状詞的,*,*": {POS: ADJ}, # がち, チック - "接尾辞,形容詞的,*,*": {POS: ADJ}, # -らしい - "接尾辞,動詞的,*,*": {POS: NOUN}, # -じみ - "接尾辞,名詞的,サ変可能,*": {POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,* - "接尾辞,名詞的,一般,*": {POS: NOUN}, - "接尾辞,名詞的,助数詞,*": {POS: NOUN}, - "接尾辞,名詞的,副詞可能,*": {POS: NOUN}, # -後, -過ぎ - "代名詞,*,*,*": {POS: PRON}, - "動詞,一般,*,*": {POS: VERB}, - "動詞,非自立可能,*,*": {POS: VERB}, # XXX VERB if alone, AUX otherwise - "動詞,非自立可能,*,*,AUX": {POS: AUX}, - "動詞,非自立可能,*,*,VERB": {POS: VERB}, - "副詞,*,*,*": {POS: ADV}, - "補助記号,AA,一般,*": {POS: SYM}, # text art - "補助記号,AA,顔文字,*": {POS: SYM}, # kaomoji - "補助記号,一般,*,*": {POS: SYM}, - "補助記号,括弧開,*,*": {POS: PUNCT}, # open bracket - "補助記号,括弧閉,*,*": {POS: PUNCT}, # close bracket - "補助記号,句点,*,*": {POS: PUNCT}, # period or other EOS marker - "補助記号,読点,*,*": {POS: PUNCT}, # comma - "名詞,固有名詞,一般,*": {POS: PROPN}, # general proper noun - "名詞,固有名詞,人名,一般": {POS: PROPN}, # person's name - "名詞,固有名詞,人名,姓": {POS: PROPN}, # surname - "名詞,固有名詞,人名,名": {POS: PROPN}, # first name - "名詞,固有名詞,地名,一般": {POS: PROPN}, # place name - "名詞,固有名詞,地名,国": {POS: PROPN}, # country name - "名詞,助動詞語幹,*,*": {POS: AUX}, - "名詞,数詞,*,*": {POS: NUM}, # includes Chinese numerals - "名詞,普通名詞,サ変可能,*": {POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun - "名詞,普通名詞,サ変可能,*,NOUN": {POS: NOUN}, - "名詞,普通名詞,サ変可能,*,VERB": {POS: VERB}, - "名詞,普通名詞,サ変形状詞可能,*": {POS: NOUN}, # ex: 下手 - "名詞,普通名詞,一般,*": {POS: NOUN}, - "名詞,普通名詞,形状詞可能,*": {POS: NOUN}, # XXX: sometimes ADJ in UDv2 - "名詞,普通名詞,形状詞可能,*,NOUN": {POS: NOUN}, - "名詞,普通名詞,形状詞可能,*,ADJ": {POS: ADJ}, - "名詞,普通名詞,助数詞可能,*": {POS: NOUN}, # counter / unit - "名詞,普通名詞,副詞可能,*": {POS: NOUN}, - "連体詞,*,*,*": {POS: ADJ}, # XXX this has exceptions based on literal token - "連体詞,*,*,*,ADJ": {POS: ADJ}, - "連体詞,*,*,*,PRON": {POS: PRON}, - "連体詞,*,*,*,DET": {POS: DET}, + + "形状詞-一般": {POS: ADJ}, + "形状詞-タリ": {POS: ADJ}, + "形状詞-助動詞語幹": {POS: AUX}, + + "形容詞-一般": {POS: ADJ}, + + "形容詞-非自立可能": {POS: ADJ}, # XXX ADJ if alone, AUX otherwise + + "助詞-格助詞": {POS: ADP}, + + "助詞-係助詞": {POS: ADP}, + + "助詞-終助詞": {POS: PART}, + "助詞-準体助詞": {POS: SCONJ}, # の as in 走るのが速い + "助詞-接続助詞": {POS: SCONJ}, # verb ending て0 + + "助詞-副助詞": {POS: ADP}, # ばかり, つつ after a verb + + "助動詞": {POS: AUX}, + + "接続詞": {POS: CCONJ}, # XXX: might need refinement + "接頭辞": {POS: NOUN}, + "接尾辞-形状詞的": {POS: PART}, # がち, チック + + "接尾辞-形容詞的": {POS: AUX}, # -らしい + + "接尾辞-動詞的": {POS: PART}, # -じみ + "接尾辞-名詞的-サ変可能": {POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,* + "接尾辞-名詞的-一般": {POS: NOUN}, + "接尾辞-名詞的-助数詞": {POS: NOUN}, + "接尾辞-名詞的-副詞可能": {POS: NOUN}, # -後, -過ぎ + + "代名詞": {POS: PRON}, + + "動詞-一般": {POS: VERB}, + + "動詞-非自立可能": {POS: AUX}, # XXX VERB if alone, AUX otherwise + + "副詞": {POS: ADV}, + + "補助記号-AA-一般": {POS: SYM}, # text art + "補助記号-AA-顔文字": {POS: PUNCT}, # kaomoji + + "補助記号-一般": {POS: SYM}, + + "補助記号-括弧開": {POS: PUNCT}, # open bracket + "補助記号-括弧閉": {POS: PUNCT}, # close bracket + "補助記号-句点": {POS: PUNCT}, # period or other EOS marker + "補助記号-読点": {POS: PUNCT}, # comma + + "名詞-固有名詞-一般": {POS: PROPN}, # general proper noun + "名詞-固有名詞-人名-一般": {POS: PROPN}, # person's name + "名詞-固有名詞-人名-姓": {POS: PROPN}, # surname + "名詞-固有名詞-人名-名": {POS: PROPN}, # first name + "名詞-固有名詞-地名-一般": {POS: PROPN}, # place name + "名詞-固有名詞-地名-国": {POS: PROPN}, # country name + + "名詞-助動詞語幹": {POS: AUX}, + "名詞-数詞": {POS: NUM}, # includes Chinese 
numerals + + "名詞-普通名詞-サ変可能": {POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun + + "名詞-普通名詞-サ変形状詞可能": {POS: NOUN}, + + "名詞-普通名詞-一般": {POS: NOUN}, + + "名詞-普通名詞-形状詞可能": {POS: NOUN}, # XXX: sometimes ADJ in UDv2 + + "名詞-普通名詞-助数詞可能": {POS: NOUN}, # counter / unit + + "名詞-普通名詞-副詞可能": {POS: NOUN}, + + "連体詞": {POS: DET}, # XXX this has exceptions based on literal token + + # GSD tags. These aren't in Unidic, but we need them for the GSD data. + "外国語": {POS: PROPN}, # Foreign words + + "絵文字・記号等": {POS: SYM}, # emoji / kaomoji ^^; + } diff --git a/spacy/lang/ja/tag_orth_map.py b/spacy/lang/ja/tag_orth_map.py new file mode 100644 index 000000000..355cc655b --- /dev/null +++ b/spacy/lang/ja/tag_orth_map.py @@ -0,0 +1,30 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, ADJ, AUX, DET, PART, PRON, SPACE ,X + +# mapping from tag bi-gram to pos of previous token +TAG_ORTH_MAP = { + "空白": { + " ": SPACE, + " ": X, + }, + "助詞-副助詞": { + "たり": PART, + }, + "連体詞": { + "あの": DET, + "かの": DET, + "この": DET, + "その": DET, + "どの": DET, + "彼の": DET, + "此の": DET, + "其の": DET, + "ある": PRON, + "こんな": PRON, + "そんな": PRON, + "どんな": PRON, + "あらゆる": PRON, + }, +} diff --git a/spacy/tests/lang/ja/test_lemmatization.py b/spacy/tests/lang/ja/test_lemmatization.py index cfff0fcfe..58cd3f3bf 100644 --- a/spacy/tests/lang/ja/test_lemmatization.py +++ b/spacy/tests/lang/ja/test_lemmatization.py @@ -6,7 +6,7 @@ import pytest @pytest.mark.parametrize( "word,lemma", - [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "凄い"), ("いただきました", "頂く"), ("なった", "成る")], + [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "すごい"), ("いただきました", "いただく"), ("なった", "なる")], ) def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma): test_lemma = ja_tokenizer(word)[0].lemma_ diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index ad8bfaa00..5213aed58 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -14,20 +14,26 @@ TOKENIZER_TESTS = [ ] TAG_TESTS = [ - ("日本語だよ", ['名詞,固有名詞,地名,国', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '助詞,終助詞,*,*']), - ("東京タワーの近くに住んでいます。", ['名詞,固有名詞,地名,一般', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '動詞,非自立可能,*,*', '助動詞,*,*,*', '補助記号,句点,*,*']), - ("吾輩は猫である。", ['代名詞,*,*,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '動詞,非自立可能,*,*', '補助記号,句点,*,*']), - ("月に代わって、お仕置きよ!", ['名詞,普通名詞,助数詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '補助記号,読点,*,*', '接頭辞,*,*,*', '名詞,普通名詞,一般,*', '助詞,終助詞,*,*', '補助記号,句点,*,*']), - ("すもももももももものうち", ['名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*']) + ("日本語だよ", ['名詞-固有名詞-地名-国', '名詞-普通名詞-一般', '助動詞', '助詞-終助詞']), + ("東京タワーの近くに住んでいます。", ['名詞-固有名詞-地名-一般', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '動詞-非自立可能', '助動詞', '補助記号-句点']), + ("吾輩は猫である。", ['代名詞', '助詞-係助詞', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '補助記号-句点']), + ("月に代わって、お仕置きよ!", ['名詞-普通名詞-助数詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '補助記号-読点', '接頭辞', '名詞-普通名詞-一般', '助詞-終助詞', '補助記号-句点']), + ("すもももももももものうち", ['名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能']) ] POS_TESTS = [ - ('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']), + ('日本語だよ', ['fish', 'NOUN', 'AUX', 'PART']), ('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'AUX', 'PUNCT']), ('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'VERB', 'PUNCT']), ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 
'PART', 'PUNCT']),
     ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'])
 ]
+
+SENTENCE_TESTS = [
+    ('あれ。これ。', ['あれ。', 'これ。']),
+    ('「伝染るんです。」という漫画があります。',
+     ['「伝染るんです。」という漫画があります。']),
+    ]
 # fmt: on
@@ -43,14 +49,27 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
     assert tags == expected_tags
 
 
+# XXX: The expected values in POS_TESTS still need checking against the new
+# tag maps (note the "fish" placeholder in the first entry).
 @pytest.mark.parametrize("text,expected_pos", POS_TESTS)
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos
 
 
+@pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
+def test_ja_tokenizer_sents(ja_tokenizer, text, expected_sents):
+    sents = [str(sent) for sent in ja_tokenizer(text).sents]
+    assert sents == expected_sents
+
 
 def test_extra_spaces(ja_tokenizer):
     # note: three spaces after "I"
     tokens = ja_tokenizer("I   like cheese.")
-    assert tokens[1].orth_ == " "
-    assert tokens[2].orth_ == " "
+    assert tokens[1].orth_ == "  "
+
+from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
+
+@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
+def test_ja_tokenizer_naughty_strings(ja_tokenizer, text):
+    tokens = ja_tokenizer(text)
+    assert tokens.text_with_ws == text
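
For reference, a minimal usage sketch of the pipeline this patch wires up (not part of the diff): it assumes SudachiPy and a Sudachi dictionary are installed and uses the existing `Japanese` language class from `spacy.lang.ja`; the example sentence and printed values are illustrative only.

    # Minimal sketch, assuming SudachiPy + a Sudachi dictionary are installed.
    from spacy.lang.ja import Japanese

    nlp = Japanese()
    doc = nlp("日本語の文章です。これは二文目です。")
    for token in doc:
        # tag_ holds the hyphen-joined Unidic tag; pos_ is the UD tag resolved
        # via TAG_MAP / TAG_ORTH_MAP / TAG_BIGRAM_MAP; lemma_ is Sudachi's
        # dictionary form
        print(token.text, token.tag_, token.pos_, token.lemma_)
    # separate_sentences() marks sentence starts at 句点 boundaries, so
    # doc.sents is available without a dependency parse
    print([sent.text for sent in doc.sents])
    # the raw Unidic tag/inflection pairs are preserved for downstream use
    print(doc.user_data["unidic_tags"])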