Add Japanese Model (#5544)

* Add more rules to deal with Japanese UD mappings

Japanese UD rules sometimes give different UD tags to tokens with the
same underlying POS tag. The UD spec indicates these cases should be
disambiguated using the output of a tool called "comainu", but rules are
enough to get the right result.

These rules are taken from Ginza at time of writing, see #3756.
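
As a rough illustration of what these rules look like, consider 連体詞 (adnominals): they keep a single Unidic tag but map to DET, PRON, or ADJ in UD depending on the literal token. A hypothetical standalone sketch (the real logic is resolve_pos() and the tag maps in the diff below):

    import re

    def rentaishi_ud_pos(surface):
        # Sketch only; see resolve_pos()/TAG_ORTH_MAP in the diff for the real rules.
        if re.match(r"[こそあど此其彼]の", surface):
            return "DET"   # この, その, あの, ...
        if re.match(r"[こそあど此其彼]", surface):
            return "PRON"  # こんな, そんな, ...
        return "ADJ"       # 大きな, いわゆる, ...

    print(rentaishi_ud_pos("その"), rentaishi_ud_pos("そんな"), rentaishi_ud_pos("大きな"))
    # DET PRON ADJ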

* Add new tags from GSD

These are a few rare tags that aren't in Unidic but do appear in the GSD data.

* Add basic Japanese sentencization

This code is taken from Ginza again.

* Add sentencizer quote handling

Could probably add more paired characters but this will do for now. Also
includes some tests.
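
Roughly, the splitter keeps a stack of the closing characters it is still waiting for, and a sentence-final mark (Unidic tag 補助記号-句点) only starts a new sentence while that stack is empty. A simplified, self-contained sketch (the actual implementation is separate_sentences() in the diff; the pair set and helper name here are illustrative):

    PAIR_PUNCT = {"「": "」", "『": "』", "【": "】"}  # more pairs could be added

    def sentence_start_flags(tokens):
        """tokens: list of (surface, unidic_tag); return a sent_start flag per token."""
        flags = [i == 0 for i in range(len(tokens))]
        stack = []  # closing characters we are still waiting for
        for i, (surface, tag) in enumerate(tokens):
            if surface in PAIR_PUNCT:
                stack.append(PAIR_PUNCT[surface])
            elif stack and surface == stack[-1]:
                stack.pop()
            # 補助記号-句点 covers 。 and other sentence-final marks
            elif tag == "補助記号-句点" and not stack and i + 1 < len(tokens):
                flags[i + 1] = True
        return flags

So 「伝染るんです。」という漫画があります。 stays a single sentence, because the 。 inside 「…」 is seen while the quote is still open.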

* Replace fugashi with SudachiPy
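
For reference, this is roughly how SudachiPy gets wired up in this commit (split mode A is the most fine-grained segmentation); it needs SudachiPy plus a Sudachi dictionary package such as SudachiDict-core installed:

    from sudachipy import dictionary, tokenizer

    tok = dictionary.Dictionary().create(mode=tokenizer.Tokenizer.SplitMode.A)
    for m in tok.tokenize("日本語だよ"):
        print(m.surface(), m.part_of_speech()[:4], m.dictionary_form())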

* Modify tag format to match GSD annotations

Some of the tests still need to be updated, but I want to get this up so training can be tested.

* Deal with the case of closing punct without an opening counterpart

* refactor resolve_pos()

* change tag field separator from "," to "-"
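
The new tag strings are built by joining the first four Unidic POS fields with "-" and dropping "*" placeholders, which is how the GSD annotations write their tags; a tiny illustration of the construction used in get_dtokens() below:

    fields = ("名詞", "固有名詞", "地名", "国", "*", "*")
    tag = "-".join(f for f in fields[:4] if f != "*")
    inf = "-".join(f for f in fields[4:] if f != "*")
    print(tag)  # 名詞-固有名詞-地名-国
    print(inf)  # empty: no inflection fields for this token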

* add TAG_ORTH_MAP

* add TAG_BIGRAM_MAP
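
Both maps feed the refactored resolve_pos(): orth-based rules fire first for tokens whose UD POS depends on the literal word, then tag-bigram rules for tokens whose POS depends on the next token's tag, and finally the plain TAG_MAP. A condensed sketch of that lookup order (names and arguments simplified; the full version is in the diff):

    def resolve_pos_sketch(tag, surface, next_tag, tag_map, orth_map, bigram_map):
        # 1. literal-token rules, e.g. 連体詞 "この" -> DET, "そんな" -> PRON
        if tag in orth_map and surface in orth_map[tag]:
            return orth_map[tag][surface], None
        # 2. tag-bigram rules; these may also pin down the next token's POS
        if next_tag is not None and (tag, next_tag) in bigram_map:
            pos, next_pos = bigram_map[(tag, next_tag)]
            return (pos if pos is not None else tag_map[tag]), next_pos
        # 3. fall back to the plain tag map
        return tag_map[tag], None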

* revise rules for 連体詞

* revise rules for 連体詞

* improve POS accuracy by about 2%

* add syntax_iterators.py (not mature yet)

* improve syntax_iterators.py

* improve syntax_iterators.py

* add phrases including nouns and drop NPs consisting of STOP_WORDS

* First take at noun chunks

This works in many situations but still has issues in others.

If the start of a subtree has no noun, then nested phrases can be
generated.

    また行きたい、そんな気持ちにさせてくれるお店です。
    [そんな気持ち, また行きたい、そんな気持ちにさせてくれるお店]

For reasons that aren't clear yet, て sometimes gets included.

    ゲンに連れ添って円盤生物を調査するパートナーとなる。
    [て円盤生物, ...]

Some phrases that look like they should be split are grouped together;
not entirely sure that's wrong. This whole thing becomes one chunk:

    道の駅遠山郷北側からかぐら大橋南詰現道交点までの1.060kmのみ開通済み
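
Once a Japanese pipeline with a tagger and parser is loaded, the iterator is reachable through the usual doc.noun_chunks API; a usage sketch (the model name is only a placeholder, any parsed Japanese Doc works):

    import spacy

    nlp = spacy.load("ja_core_news_sm")  # placeholder model name
    doc = nlp("また行きたい、そんな気持ちにさせてくれるお店です。")
    print([chunk.text for chunk in doc.noun_chunks])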

* Use new generic get_words_and_spaces

The new get_words_and_spaces function is simpler than the whitespace
handling previously used for Japanese, so it's good to be able to switch
to it. However, there was an issue: the new function works on the text
alone, so POS info could get out of sync. Fixing this required a small
change to the way dtokens (tokens with POS and lemma info) are generated.

Specifically, a run of extraneous spaces now becomes a single token, so
multiple space tokens should not be created in a row when generating
dtokens.
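
To illustrate how the generic helper (spacy.util.get_words_and_spaces) aligns the tokenizer's surfaces with the raw text, a small sketch; the printed results reflect my reading of the helper's behavior, not output from this PR:

    from spacy.util import get_words_and_spaces

    text = "I   like cheese."                 # three spaces after "I"
    surfaces = ["I", "like", "cheese", "."]   # words as the tokenizer returns them
    words, spaces = get_words_and_spaces(surfaces, text)
    print(words)   # ['I', '  ', 'like', 'cheese', '.'] -- extra spaces collapse into one token
    print(spaces)  # [True, False, True, False, False]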

* Fix noun_chunks, should be working now

* Fix some tests, add naughty strings tests

Some of the existing tests changed because the tokenization mode of
Sudachi changed to the more fine-grained A mode.
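
The split mode controls segmentation granularity, which is why some expected tokenizations had to change; a sketch using the same SudachiPy calls as the diff (the example word and splits are the standard ones from the Sudachi documentation, not taken from this diff):

    from sudachipy import dictionary, tokenizer

    SplitMode = tokenizer.Tokenizer.SplitMode
    tok_a = dictionary.Dictionary().create(mode=SplitMode.A)
    tok_c = dictionary.Dictionary().create(mode=SplitMode.C)
    text = "外国人参政権"
    print([m.surface() for m in tok_a.tokenize(text)])  # ['外国', '人', '参政', '権']
    print([m.surface() for m in tok_c.tokenize(text)])  # ['外国人参政権']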

Sudachi also has issues with some strings, so this adds a test against
the naughty strings.

* Remove empty Sudachi tokens

Not doing this creates zero-length tokens, which cause errors in spaCy's
internal processing.
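
The guard is just a filter on the surface string before the Doc is built (see get_dtokens() in the diff); schematically, with made-up example tokens:

    from collections import namedtuple

    DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"])

    dtokens = [
        DetailedToken("テスト", ("名詞-普通名詞-サ変可能", ""), "テスト"),
        DetailedToken("", ("補助記号-一般", ""), ""),  # empty surface after normalization
    ]
    dtokens = [t for t in dtokens if len(t.surface) > 0]
    print([t.surface for t in dtokens])  # ['テスト']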

* Add yield_bunsetu back in as a separate piece of code
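
A usage sketch for the re-added helper (it lives in spacy/lang/ja/bunsetu.py per the diff; the pipeline name is a placeholder, and a tagger and parser are required since it reads pos_, tag_, dep_ and head):

    import spacy
    from spacy.lang.ja.bunsetu import yield_bunsetu

    nlp = spacy.load("ja_core_news_sm")  # placeholder pipeline
    doc = nlp("東京タワーの近くに住んでいます。")
    for bunsetu, phrase_type, phrase in yield_bunsetu(doc):
        print([t.orth_ for t in bunsetu], phrase_type)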

Co-authored-by: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com>
Co-authored-by: hiroshi <hiroshi_matsuda@megagon.ai>
Paul O'Leary McCann 2020-06-05 02:15:43 +09:00 committed by GitHub
parent d79964bcb1
commit 410fb7ee43
8 changed files with 486 additions and 127 deletions


@@ -5,97 +5,148 @@ import re
from collections import namedtuple
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tag_map import TAG_MAP
from .tag_orth_map import TAG_ORTH_MAP
from .tag_bigram_map import TAG_BIGRAM_MAP
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
from ...compat import copy_reg
from ...util import DummyTokenizer
from ...language import Language
from ...symbols import POS
from ...tokens import Doc
from ...util import DummyTokenizer, get_words_and_spaces
# Hold the attributes we need with convenient names
DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"])
# Handling for multiple spaces in a row is somewhat awkward, this simplifies
# the flow by creating a dummy with the same interface.
DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" "))
DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"])
DummySpace = DummyNode(" ", " ", " ")
def try_fugashi_import():
"""Fugashi is required for Japanese support, so check for it.
def try_sudachi_import():
"""SudachiPy is required for Japanese support, so check for it.
If it's not available, blow up and explain how to fix it."""
try:
import fugashi
from sudachipy import dictionary, tokenizer
return fugashi
tok = dictionary.Dictionary().create(
mode=tokenizer.Tokenizer.SplitMode.A
)
return tok
except ImportError:
raise ImportError(
"Japanese support requires Fugashi: " "https://github.com/polm/fugashi"
"Japanese support requires SudachiPy: " "https://github.com/WorksApplications/SudachiPy"
)
def resolve_pos(token):
def resolve_pos(token, next_token):
"""If necessary, add a field to the POS tag for UD mapping.
Under Universal Dependencies, sometimes the same Unidic POS tag can
be mapped differently depending on the literal token or its context
in the sentence. This function adds information to the POS tag to
resolve ambiguous mappings.
in the sentence. This function returns the resolved POS for both token
and next_token as a tuple.
"""
# this is only used for consecutive ascii spaces
if token.surface == " ":
return "空白"
# Some tokens have their UD tag decided based on the POS of the following
# token.
# TODO: This is a first take. The rules here are crude approximations.
# For many of these, full dependencies are needed to properly resolve
# PoS mappings.
if token.pos == "連体詞,*,*,*":
if re.match(r"[こそあど此其彼]の", token.surface):
return token.pos + ",DET"
if re.match(r"[こそあど此其彼]", token.surface):
return token.pos + ",PRON"
return token.pos + ",ADJ"
return token.pos
# orth based rules
if token.pos[0] in TAG_ORTH_MAP:
orth_map = TAG_ORTH_MAP[token.pos[0]]
if token.surface in orth_map:
return orth_map[token.surface], None
# tag bi-gram mapping
if next_token:
tag_bigram = token.pos[0], next_token.pos[0]
if tag_bigram in TAG_BIGRAM_MAP:
bipos = TAG_BIGRAM_MAP[tag_bigram]
if bipos[0] is None:
return TAG_MAP[token.pos[0]][POS], bipos[1]
else:
return bipos
return TAG_MAP[token.pos[0]][POS], None
def get_words_and_spaces(tokenizer, text):
"""Get the individual tokens that make up the sentence and handle white space.
# Use a mapping of paired punctuation to avoid splitting quoted sentences.
pairpunct = {'「': '」', '『': '』', '【': '】'}
Japanese doesn't usually use white space, and MeCab's handling of it for
multiple spaces in a row is somewhat awkward.
def separate_sentences(doc):
"""Given a doc, mark tokens that start sentences based on Unidic tags.
"""
tokens = tokenizer.parseToNodeList(text)
stack = [] # save paired punctuation
for i, token in enumerate(doc[:-2]):
# Set all tokens after the first to false by default. This is necessary
# for the doc code to be aware we've done sentencization, see
# `is_sentenced`.
token.sent_start = (i == 0)
if token.tag_:
if token.tag_ == "補助記号-括弧開":
ts = str(token)
if ts in pairpunct:
stack.append(pairpunct[ts])
elif stack and ts == stack[-1]:
stack.pop()
if token.tag_ == "補助記号-句点":
next_token = doc[i+1]
if next_token.tag_ != token.tag_ and not stack:
next_token.sent_start = True
def get_dtokens(tokenizer, text):
tokens = tokenizer.tokenize(text)
words = []
spaces = []
for token in tokens:
# If there's more than one space, spaces after the first become tokens
for ii in range(len(token.white_space) - 1):
words.append(DummySpace)
spaces.append(False)
words.append(token)
spaces.append(bool(token.white_space))
return words, spaces
for ti, token in enumerate(tokens):
tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*'])
inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*'])
dtoken = DetailedToken(
token.surface(),
(tag, inf),
token.dictionary_form())
if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白':
# don't add multiple space tokens in a row
continue
words.append(dtoken)
# remove empty tokens. These can be produced with characters like … that
# Sudachi normalizes internally.
words = [ww for ww in words if len(ww.surface) > 0]
return words
class JapaneseTokenizer(DummyTokenizer):
def __init__(self, cls, nlp=None):
self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
self.tokenizer = try_fugashi_import().Tagger()
self.tokenizer.parseToNodeList("") # see #2901
self.tokenizer = try_sudachi_import()
def __call__(self, text):
dtokens, spaces = get_words_and_spaces(self.tokenizer, text)
dtokens = get_dtokens(self.tokenizer, text)
words = [x.surface for x in dtokens]
words, spaces = get_words_and_spaces(words, text)
unidic_tags = [",".join(x.pos) for x in dtokens]
doc = Doc(self.vocab, words=words, spaces=spaces)
unidic_tags = []
for token, dtoken in zip(doc, dtokens):
unidic_tags.append(dtoken.pos)
token.tag_ = resolve_pos(dtoken)
next_pos = None
for ii, (token, dtoken) in enumerate(zip(doc, dtokens)):
ntoken = dtokens[ii+1] if ii+1 < len(dtokens) else None
token.tag_ = dtoken.pos[0]
if next_pos:
token.pos = next_pos
next_pos = None
else:
token.pos, next_pos = resolve_pos(dtoken, ntoken)
# if there's no lemma info (it's an unk) just use the surface
token.lemma_ = dtoken.feature.lemma or dtoken.surface
token.lemma_ = dtoken.lemma
doc.user_data["unidic_tags"] = unidic_tags
separate_sentences(doc)
return doc
@@ -104,6 +155,7 @@ class JapaneseDefaults(Language.Defaults):
lex_attr_getters[LANG] = lambda _text: "ja"
stop_words = STOP_WORDS
tag_map = TAG_MAP
syntax_iterators = SYNTAX_ITERATORS
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@classmethod

spacy/lang/ja/bunsetu.py Normal file

@@ -0,0 +1,144 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
POS_PHRASE_MAP = {
"NOUN": "NP",
"NUM": "NP",
"PRON": "NP",
"PROPN": "NP",
"VERB": "VP",
"ADJ": "ADJP",
"ADV": "ADVP",
"CCONJ": "CCONJP",
}
# return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)]
def yield_bunsetu(doc, debug=False):
bunsetu = []
bunsetu_may_end = False
phrase_type = None
phrase = None
prev = None
prev_tag = None
prev_dep = None
prev_head = None
for t in doc:
pos = t.pos_
pos_type = POS_PHRASE_MAP.get(pos, None)
tag = t.tag_
dep = t.dep_
head = t.head.i
if debug:
print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu)
# DET is always an individual bunsetu
if pos == "DET":
if bunsetu:
yield bunsetu, phrase_type, phrase
yield [t], None, None
bunsetu = []
bunsetu_may_end = False
phrase_type = None
phrase = None
# PRON or Open PUNCT always splits bunsetu
elif tag == "補助記号-括弧開":
if bunsetu:
yield bunsetu, phrase_type, phrase
bunsetu = [t]
bunsetu_may_end = True
phrase_type = None
phrase = None
# bunsetu head has not appeared yet
elif phrase_type is None:
if bunsetu and prev_tag == "補助記号-読点":
yield bunsetu, phrase_type, phrase
bunsetu = []
bunsetu_may_end = False
phrase_type = None
phrase = None
bunsetu.append(t)
if pos_type: # begin phrase
phrase = [t]
phrase_type = pos_type
if pos_type in {"ADVP", "CCONJP"}:
bunsetu_may_end = True
# entering new bunsetu
elif pos_type and (
pos_type != phrase_type or # different phrase type arises
bunsetu_may_end # same phrase type but bunsetu already ended
):
# exceptional case: NOUN to VERB
if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i:
bunsetu.append(t)
phrase_type = "VP"
phrase.append(t)
# exceptional case: VERB to NOUN
elif phrase_type == "VP" and pos_type == "NP" and (
prev_dep == 'compound' and prev_head == t.i or
dep == 'compound' and prev == head or
prev_dep == 'nmod' and prev_head == t.i
):
bunsetu.append(t)
phrase_type = "NP"
phrase.append(t)
else:
yield bunsetu, phrase_type, phrase
bunsetu = [t]
bunsetu_may_end = False
phrase_type = pos_type
phrase = [t]
# NOUN bunsetu
elif phrase_type == "NP":
bunsetu.append(t)
if not bunsetu_may_end and ((
(pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'}
) or (
pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
)):
phrase.append(t)
else:
bunsetu_may_end = True
# VERB bunsetu
elif phrase_type == "VP":
bunsetu.append(t)
if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound':
phrase.append(t)
else:
bunsetu_may_end = True
# ADJ bunsetu
elif phrase_type == "ADJP" and tag != '連体詞':
bunsetu.append(t)
if not bunsetu_may_end and ((
pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'}
) or (
pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
)):
phrase.append(t)
else:
bunsetu_may_end = True
# other bunsetu
else:
bunsetu.append(t)
prev = t.i
prev_tag = t.tag_
prev_dep = t.dep_
prev_head = head
if bunsetu:
yield bunsetu, phrase_type, phrase


@@ -0,0 +1,55 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON, VERB
# XXX this can probably be pruned a bit
labels = [
"nsubj",
"nmod",
"dobj",
"nsubjpass",
"pcomp",
"pobj",
"obj",
"obl",
"dative",
"appos",
"attr",
"ROOT",
]
def noun_chunks(obj):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
continue
if word.dep in np_deps:
unseen = [w.i for w in word.subtree if w.i not in seen]
if not unseen:
continue
# this takes care of particles etc.
seen.update(j.i for j in word.subtree)
# This avoids duplicating embedded clauses
seen.update(range(word.i + 1))
# if the head of this is a verb, mark it and its rights as seen
# Don't do the subtree as that can hide other phrases
if word.head.pos == VERB:
seen.add(word.head.i)
seen.update(w.i for w in word.head.rights)
yield unseen[0], word.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}


@@ -0,0 +1,37 @@
# encoding: utf8
from __future__ import unicode_literals
from ...symbols import POS, ADJ, AUX, NOUN, PART, VERB
# mapping from a tag bi-gram to the POS of the first token (and, if given, the second)
TAG_BIGRAM_MAP = {
# This covers only a small part of AUX.
("形容詞-非自立可能", "助詞-終助詞"): (AUX, None),
("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None),
# ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ),
# This covers acl, advcl, obl and root, but has side effect for compound.
("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX),
# This covers almost all of the deps
("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX),
("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB),
("副詞", "動詞-非自立可能"): (None, VERB),
("形容詞-一般", "動詞-非自立可能"): (None, VERB),
("形容詞-非自立可能", "動詞-非自立可能"): (None, VERB),
("接頭辞", "動詞-非自立可能"): (None, VERB),
("助詞-係助詞", "動詞-非自立可能"): (None, VERB),
("助詞-副助詞", "動詞-非自立可能"): (None, VERB),
("助詞-格助詞", "動詞-非自立可能"): (None, VERB),
("補助記号-読点", "動詞-非自立可能"): (None, VERB),
("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART),
("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN),
("連体詞", "形状詞-助動詞語幹"): (None, NOUN),
("動詞-一般", "助詞-副助詞"): (None, PART),
("動詞-非自立可能", "助詞-副助詞"): (None, PART),
("助動詞", "助詞-副助詞"): (None, PART),
}


@@ -1,82 +1,104 @@
# encoding: utf8
from __future__ import unicode_literals
from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN
from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, CCONJ, SCONJ, NOUN
from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE
TAG_MAP = {
# Explanation of Unidic tags:
# https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf
# Universal Dependencies Mapping:
# Universal Dependencies Mapping: (Some of the entries in this mapping are updated to v2.6 in the list below)
# http://universaldependencies.org/ja/overview/morphology.html
# http://universaldependencies.org/ja/pos/all.html
"記号,一般,*,*": {
POS: PUNCT
"記号-一般": {
POS: NOUN
}, # this includes characters used to represent sounds like ドレミ
"記号,文字,*,*": {
POS: PUNCT
}, # this is for Greek and Latin characters used as symbols, as in math
"感動詞,フィラー,*,*": {POS: INTJ},
"感動詞,一般,*,*": {POS: INTJ},
# this is specifically for unicode full-width space
"空白,*,*,*": {POS: X},
# This is used when sequential half-width spaces are present
"記号-文字": {
POS: NOUN
}, # this is for Greek and Latin characters that carry meaning or are used as symbols, as in math
"感動詞-フィラー": {POS: INTJ},
"感動詞-一般": {POS: INTJ},
"空白": {POS: SPACE},
"形状詞,一般,*,*": {POS: ADJ},
"形状詞,タリ,*,*": {POS: ADJ},
"形状詞,助動詞語幹,*,*": {POS: ADJ},
"形容詞,一般,*,*": {POS: ADJ},
"形容詞,非自立可能,*,*": {POS: AUX}, # XXX ADJ if alone, AUX otherwise
"助詞,格助詞,*,*": {POS: ADP},
"助詞,係助詞,*,*": {POS: ADP},
"助詞,終助詞,*,*": {POS: PART},
"助詞,準体助詞,*,*": {POS: SCONJ}, # の as in 走るのが速い
"助詞,接続助詞,*,*": {POS: SCONJ}, # verb ending て
"助詞,副助詞,*,*": {POS: PART}, # ばかり, つつ after a verb
"助動詞,*,*,*": {POS: AUX},
"接続詞,*,*,*": {POS: SCONJ}, # XXX: might need refinement
"接頭辞,*,*,*": {POS: NOUN},
"接尾辞,形状詞的,*,*": {POS: ADJ}, # がち, チック
"接尾辞,形容詞的,*,*": {POS: ADJ}, # -らしい
"接尾辞,動詞的,*,*": {POS: NOUN}, # -じみ
"接尾辞,名詞的,サ変可能,*": {POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,*
"接尾辞,名詞的,一般,*": {POS: NOUN},
"接尾辞,名詞的,助数詞,*": {POS: NOUN},
"接尾辞,名詞的,副詞可能,*": {POS: NOUN}, # -後, -過ぎ
"代名詞,*,*,*": {POS: PRON},
"動詞,一般,*,*": {POS: VERB},
"動詞,非自立可能,*,*": {POS: VERB}, # XXX VERB if alone, AUX otherwise
"動詞,非自立可能,*,*,AUX": {POS: AUX},
"動詞,非自立可能,*,*,VERB": {POS: VERB},
"副詞,*,*,*": {POS: ADV},
"補助記号,,一般,*": {POS: SYM}, # text art
"補助記号,,顔文字,*": {POS: SYM}, # kaomoji
"補助記号,一般,*,*": {POS: SYM},
"補助記号,括弧開,*,*": {POS: PUNCT}, # open bracket
"補助記号,括弧閉,*,*": {POS: PUNCT}, # close bracket
"補助記号,句点,*,*": {POS: PUNCT}, # period or other EOS marker
"補助記号,読点,*,*": {POS: PUNCT}, # comma
"名詞,固有名詞,一般,*": {POS: PROPN}, # general proper noun
"名詞,固有名詞,人名,一般": {POS: PROPN}, # person's name
"名詞,固有名詞,人名,姓": {POS: PROPN}, # surname
"名詞,固有名詞,人名,名": {POS: PROPN}, # first name
"名詞,固有名詞,地名,一般": {POS: PROPN}, # place name
"名詞,固有名詞,地名,国": {POS: PROPN}, # country name
"名詞,助動詞語幹,*,*": {POS: AUX},
"名詞,数詞,*,*": {POS: NUM}, # includes Chinese numerals
"名詞,普通名詞,サ変可能,*": {POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun
"名詞,普通名詞,サ変可能,*,NOUN": {POS: NOUN},
"名詞,普通名詞,サ変可能,*,VERB": {POS: VERB},
"名詞,普通名詞,サ変形状詞可能,*": {POS: NOUN}, # ex: 下手
"名詞,普通名詞,一般,*": {POS: NOUN},
"名詞,普通名詞,形状詞可能,*": {POS: NOUN}, # XXX: sometimes ADJ in UDv2
"名詞,普通名詞,形状詞可能,*,NOUN": {POS: NOUN},
"名詞,普通名詞,形状詞可能,*,ADJ": {POS: ADJ},
"名詞,普通名詞,助数詞可能,*": {POS: NOUN}, # counter / unit
"名詞,普通名詞,副詞可能,*": {POS: NOUN},
"連体詞,*,*,*": {POS: ADJ}, # XXX this has exceptions based on literal token
"連体詞,*,*,*,ADJ": {POS: ADJ},
"連体詞,*,*,*,PRON": {POS: PRON},
"連体詞,*,*,*,DET": {POS: DET},
"形状詞-一般": {POS: ADJ},
"形状詞-タリ": {POS: ADJ},
"形状詞-助動詞語幹": {POS: AUX},
"形容詞-一般": {POS: ADJ},
"形容詞-非自立可能": {POS: ADJ}, # XXX ADJ if alone, AUX otherwise
"助詞-格助詞": {POS: ADP},
"助詞-係助詞": {POS: ADP},
"助詞-終助詞": {POS: PART},
"助詞-準体助詞": {POS: SCONJ}, # の as in 走るのが速い
"助詞-接続助詞": {POS: SCONJ}, # verb ending て0
"助詞-副助詞": {POS: ADP}, # ばかり, つつ after a verb
"助動詞": {POS: AUX},
"接続詞": {POS: CCONJ}, # XXX: might need refinement
"接頭辞": {POS: NOUN},
"接尾辞-形状詞的": {POS: PART}, # がち, チック
"接尾辞-形容詞的": {POS: AUX}, # -らしい
"接尾辞-動詞的": {POS: PART}, # -じみ
"接尾辞-名詞的-サ変可能": {POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,*
"接尾辞-名詞的-一般": {POS: NOUN},
"接尾辞-名詞的-助数詞": {POS: NOUN},
"接尾辞-名詞的-副詞可能": {POS: NOUN}, # -後, -過ぎ
"代名詞": {POS: PRON},
"動詞-一般": {POS: VERB},
"動詞-非自立可能": {POS: AUX}, # XXX VERB if alone, AUX otherwise
"副詞": {POS: ADV},
"補助記号--一般": {POS: SYM}, # text art
"補助記号--顔文字": {POS: PUNCT}, # kaomoji
"補助記号-一般": {POS: SYM},
"補助記号-括弧開": {POS: PUNCT}, # open bracket
"補助記号-括弧閉": {POS: PUNCT}, # close bracket
"補助記号-句点": {POS: PUNCT}, # period or other EOS marker
"補助記号-読点": {POS: PUNCT}, # comma
"名詞-固有名詞-一般": {POS: PROPN}, # general proper noun
"名詞-固有名詞-人名-一般": {POS: PROPN}, # person's name
"名詞-固有名詞-人名-姓": {POS: PROPN}, # surname
"名詞-固有名詞-人名-名": {POS: PROPN}, # first name
"名詞-固有名詞-地名-一般": {POS: PROPN}, # place name
"名詞-固有名詞-地名-国": {POS: PROPN}, # country name
"名詞-助動詞語幹": {POS: AUX},
"名詞-数詞": {POS: NUM}, # includes Chinese numerals
"名詞-普通名詞-サ変可能": {POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun
"名詞-普通名詞-サ変形状詞可能": {POS: NOUN},
"名詞-普通名詞-一般": {POS: NOUN},
"名詞-普通名詞-形状詞可能": {POS: NOUN}, # XXX: sometimes ADJ in UDv2
"名詞-普通名詞-助数詞可能": {POS: NOUN}, # counter / unit
"名詞-普通名詞-副詞可能": {POS: NOUN},
"連体詞": {POS: DET}, # XXX this has exceptions based on literal token
# GSD tags. These aren't in Unidic, but we need them for the GSD data.
"外国語": {POS: PROPN}, # Foreign words
"絵文字・記号等": {POS: SYM}, # emoji / kaomoji ^^;
}


@@ -0,0 +1,30 @@
# encoding: utf8
from __future__ import unicode_literals
from ...symbols import POS, ADJ, AUX, DET, PART, PRON, SPACE, X
# mapping from tag and the literal token (orth) to UD POS
TAG_ORTH_MAP = {
"空白": {
" ": SPACE,
" ": X,
},
"助詞-副助詞": {
"たり": PART,
},
"連体詞": {
"あの": DET,
"かの": DET,
"この": DET,
"その": DET,
"どの": DET,
"彼の": DET,
"此の": DET,
"其の": DET,
"ある": PRON,
"こんな": PRON,
"そんな": PRON,
"どんな": PRON,
"あらゆる": PRON,
},
}


@@ -6,7 +6,7 @@ import pytest
@pytest.mark.parametrize(
"word,lemma",
[("新しく", "新しい"), ("赤く", "赤い"), ("すごく", ""), ("いただきました", ""), ("なった", "")],
[("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "すご"), ("いただきました", "いただ"), ("なった", "")],
)
def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
test_lemma = ja_tokenizer(word)[0].lemma_


@@ -14,20 +14,26 @@ TOKENIZER_TESTS = [
]
TAG_TESTS = [
("日本語だよ", ['名詞,固有名詞,地名,国', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '助詞,終助詞,*,*']),
("東京タワーの近くに住んでいます。", ['名詞,固有名詞,地名,一般', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '動詞,非自立可能,*,*', '助動詞,*,*,*', '補助記号,句点,*,*']),
("吾輩は猫である。", ['代名詞,*,*,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '動詞,非自立可能,*,*', '補助記号,句点,*,*']),
("月に代わって、お仕置きよ!", ['名詞,普通名詞,助数詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '補助記号,読点,*,*', '接頭辞,*,*,*', '名詞,普通名詞,一般,*', '助詞,終助詞,*,*', '補助記号,句点,*,*']),
("すもももももももものうち", ['名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*'])
("日本語だよ", ['名詞-固有名詞-地名-国', '名詞-普通名詞-一般', '助動詞', '助詞-終助詞']),
("東京タワーの近くに住んでいます。", ['名詞-固有名詞-地名-一般', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '動詞-非自立可能', '助動詞', '補助記号-句点']),
("吾輩は猫である。", ['代名詞', '助詞-係助詞', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '補助記号-句点']),
("月に代わって、お仕置きよ!", ['名詞-普通名詞-助数詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '補助記号-読点', '接頭辞', '名詞-普通名詞-一般', '助詞-終助詞', '補助記号-句点']),
("すもももももももものうち", ['名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能'])
]
POS_TESTS = [
('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']),
('日本語だよ', ['fish', 'NOUN', 'AUX', 'PART']),
('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'AUX', 'PUNCT']),
('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'VERB', 'PUNCT']),
('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']),
('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'])
]
SENTENCE_TESTS = [
('あれ。これ。', ['あれ。', 'これ。']),
('「伝染るんです。」という漫画があります。',
['「伝染るんです。」という漫画があります。']),
]
# fmt: on
@@ -43,14 +49,27 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
assert tags == expected_tags
#XXX This isn't working? Always passes
@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
pos = [token.pos_ for token in ja_tokenizer(text)]
assert pos == expected_pos
@pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents):
sents = [str(sent) for sent in ja_tokenizer(text).sents]
assert sents == expected_sents
def test_extra_spaces(ja_tokenizer):
# note: three spaces after "I"
tokens = ja_tokenizer("I   like cheese.")
assert tokens[1].orth_ == " "
assert tokens[2].orth_ == " "
assert tokens[1].orth_ == "  "
from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
def test_tokenizer_naughty_strings(ja_tokenizer, text):
tokens = ja_tokenizer(text)
assert tokens.text_with_ws == text