spaCy/spacy/ja/__init__.py

# encoding: utf8
from __future__ import unicode_literals, print_function

from os import path

from ..language import Language, BaseDefaults
from ..tokenizer import Tokenizer
from ..tagger import Tagger
from ..attrs import LANG
from ..tokens import Doc

from .language_data import *

import re
from collections import namedtuple

# Per-token record built from MeCab output: the surface string, its base
# form, and the (possibly extended) part-of-speech string.
ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech'])

# Key under which the tokenizer stashes its detailed tokens in
# ``Doc.user_data`` so that the tagger can read them back.
DETAILS_KEY = 'mecab_details'

def try_mecab_import():
    """Import the ``MeCab`` module and hand it back to the caller.

    MeCab is required for Japanese support. If it is not available, fail
    loudly and explain how to fix it.

    Returns:
        The imported ``MeCab`` module.

    Raises:
        ImportError: if ``MeCab`` cannot be imported; the message points to
            the Python bindings.
    """
    try:
        import MeCab
    except ImportError:
        raise ImportError("Japanese support requires MeCab: "
                          "https://github.com/SamuraiT/mecab-python3")
    else:
        return MeCab

class JapaneseTokenizer(object):
    """Callable that turns raw text into a ``Doc`` using a MeCab tagger."""

    def __init__(self, cls, nlp=None):
        """Pick a vocabulary and create the MeCab tagger.

        cls: object providing ``create_vocab``; consulted only when ``nlp``
            is None.
        nlp: optional object whose ``vocab`` attribute is reused.

        Raises ImportError (via ``try_mecab_import``) when MeCab is missing.
        """
        if nlp is None:
            self.vocab = cls.create_vocab(nlp)
        else:
            self.vocab = nlp.vocab
        self.tokenizer = try_mecab_import().Tagger()

    def __call__(self, text):
        """Tokenize ``text`` and return a ``Doc`` with no inter-token spaces.

        The detailed tokens are also stored in ``doc.user_data`` under
        ``DETAILS_KEY`` for the tagger to use.
        """
        analysed = detailed_tokens(self.tokenizer, text)
        surfaces = [word.surface for word in analysed]
        no_spaces = [False] * len(surfaces)
        doc = Doc(self.vocab, words=surfaces, spaces=no_spaces)
        # Stash the detailed tokens so tagging does not have to re-run MeCab.
        doc.user_data[DETAILS_KEY] = analysed
        return doc

def resolve_pos(token):
    """Return the token's POS tag, extended with a UD hint when ambiguous.

    Under Universal Dependencies the same Unidic POS tag can map to
    different tags depending on the literal token or its context. For the
    tag '連体詞,*,*,*' this appends one extra comma-separated field chosen
    from the token's surface form:

    - ``,DET``  when the surface starts with a demonstrative stem plus の
    - ``,PRON`` when it starts with a demonstrative stem only
    - ``,ADJ``  otherwise

    Every other tag is returned unchanged.

    token: object exposing ``part_of_speech`` and ``surface`` attributes.
    """
    # NOTE: This is a first take. The rules here are crude approximations;
    # many cases need full dependencies to resolve the PoS mapping properly.
    tag = token.part_of_speech
    if tag != '連体詞,*,*,*':
        return tag

    surface = token.surface
    if re.match('^[こそあど此其彼]の', surface):
        suffix = ',DET'
    elif re.match('^[こそあど此其彼]', surface):
        suffix = ',PRON'
    else:
        suffix = ',ADJ'
    return tag + suffix

def detailed_tokens(tokenizer, text):
    """Format MeCab output into a list of ``ShortUnitWord`` tuples.

    tokenizer: object with a ``parseToNode(text)`` method returning the first
        node of a linked list; each node exposes ``surface``, ``feature``
        (a comma-separated string), ``posid`` and ``next``.
    text: the string to analyse.

    Returns a list with one ``ShortUnitWord(surface, base_form,
    part_of_speech)`` per node, where ``part_of_speech`` is the first four
    feature fields joined by commas. ``base_form`` is feature field 7 when
    the feature string has that many fields, and otherwise falls back to
    the surface form.
    """
    node = tokenizer.parseToNode(text)
    node = node.next  # first node is beginning of sentence and empty, skip it
    words = []
    # Walk the list until the first node whose posid is 0.
    while node.posid != 0:
        surface = node.surface
        parts = node.feature.split(',')
        pos = ','.join(parts[0:4])

        # The base form is only available for words in the tokenizer
        # dictionary. Require index 7 to actually exist: a feature string
        # with exactly seven fields would otherwise raise IndexError.
        base = parts[7] if len(parts) > 7 else surface

        words.append(ShortUnitWord(surface, base, pos))
        node = node.next
    return words

class JapaneseTagger(object):
    """Tag a ``Doc`` using the MeCab details stashed by the tokenizer."""

    def __init__(self, vocab):
        """Create the underlying ``Tagger`` and a MeCab tagger.

        Raises ImportError (via ``try_mecab_import``) when MeCab is missing.
        """
        mecab_module = try_mecab_import()
        self.tagger = Tagger(vocab)
        # NOTE(review): nothing in this class reads ``self.tokenizer``;
        # confirm whether outside callers rely on it.
        self.tokenizer = mecab_module.Tagger()

    def __call__(self, tokens):
        """Assign tags to ``tokens`` in place.

        Reads the detailed tokens from ``tokens.user_data[DETAILS_KEY]``
        (KeyError if absent), extends ambiguous tags for UD via
        ``resolve_pos``, and passes the resulting strings to the tagger.
        """
        # Step 1: raw Japanese tags come from the stashed tokenizer output.
        details = tokens.user_data[DETAILS_KEY]
        # Step 2: add features to the tags where UD mapping needs them.
        raw_tags = [resolve_pos(word) for word in details]
        self.tagger.tag_from_strings(tokens, raw_tags)

class JapaneseDefaults(BaseDefaults):
    """Defaults for Japanese: the tag map plus tokenizer/tagger factories."""

    tag_map = TAG_MAP

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Return a ``JapaneseTokenizer`` built from this class and ``nlp``."""
        return JapaneseTokenizer(cls, nlp=nlp)

    @classmethod
    def create_tagger(cls, tokenizer):
        """Return a ``JapaneseTagger`` sharing the tokenizer's vocabulary."""
        shared_vocab = tokenizer.vocab
        return JapaneseTagger(shared_vocab)

class Japanese(Language):
    """Language subclass for Japanese ('ja') using ``JapaneseDefaults``."""

    lang = 'ja'

    Defaults = JapaneseDefaults

    def make_doc(self, text):
        """Tokenize ``text`` and tag the resulting doc before returning it.

        The tagger is created on first use and then reused, instead of being
        rebuilt (together with a fresh MeCab tagger) on every call. It is
        rebuilt only if the tokenizer's vocab object changes.
        """
        jdoc = self.tokenizer(text)
        vocab = self.tokenizer.vocab
        # Cached as a (vocab, tagger) pair so a swapped-in tokenizer with a
        # different vocab does not keep using a stale tagger.
        cached = getattr(self, '_ja_tagger_cache', None)
        if cached is None or cached[0] is not vocab:
            cached = (vocab, JapaneseDefaults.create_tagger(self.tokenizer))
            self._ja_tagger_cache = cached
        cached[1](jdoc)
        return jdoc
Add basic japanese support 2017-05-03 07:56:21 +03:00			`# encoding: utf8`
			`from __future__ import unicode_literals, print_function`

			`from os import path`

Make create_tokenizer work with Japanese 2017-06-27 19:18:05 +03:00			`from ..language import Language, BaseDefaults`
			`from ..tokenizer import Tokenizer`
Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM 2017-08-07 19:27:15 +03:00			`from ..tagger import Tagger`
Add basic japanese support 2017-05-03 07:56:21 +03:00			`from ..attrs import LANG`
			`from ..tokens import Doc`

			`from .language_data import *`

Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM 2017-08-07 19:27:15 +03:00			`import re`
			`from collections import namedtuple`

			`ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech'])`

[ja] Use user_details instead of a wrapper class Instead of using a JapaneseDoc wrapper class to store Mecab output, stash it in `user_data`. -POLM 2017-10-15 18:24:34 +03:00			`DETAILS_KEY = 'mecab_details'`
[ja] Stash tokenizer output for speed Before this commit, the Mecab tokenizer had to be called twice when creating a Doc- once during tokenization and once during tagging. This creates a JapaneseDoc wrapper class for Doc that stashes the parsed tokenizer output to remove redundant processing. -POLM 2017-10-15 17:33:25 +03:00
Put Mecab import in utility function 2017-08-21 18:01:28 +03:00			`def try_mecab_import():`
			`"""Mecab is required for Japanese support, so check for it.`

			`If it's not available blow up and explain how to fix it."""`
			`try:`
			`import MeCab`
			`return MeCab`
			`except ImportError:`
			`raise ImportError("Japanese support requires MeCab: "`
			`"https://github.com/SamuraiT/mecab-python3")`

Make create_tokenizer work with Japanese 2017-06-27 19:18:05 +03:00			`class JapaneseTokenizer(object):`
			`def __init__(self, cls, nlp=None):`
			`self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)`
Put Mecab import in utility function 2017-08-21 18:01:28 +03:00			`MeCab = try_mecab_import()`
Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM 2017-08-07 19:27:15 +03:00			`self.tokenizer = MeCab.Tagger()`
Make create_tokenizer work with Japanese 2017-06-27 19:18:05 +03:00
			`def __call__(self, text):`
[ja] Stash tokenizer output for speed Before this commit, the Mecab tokenizer had to be called twice when creating a Doc- once during tokenization and once during tagging. This creates a JapaneseDoc wrapper class for Doc that stashes the parsed tokenizer output to remove redundant processing. -POLM 2017-10-15 17:33:25 +03:00			`dtokens = detailed_tokens(self.tokenizer, text)`
			`words = [x.surface for x in dtokens]`
[ja] Use user_details instead of a wrapper class Instead of using a JapaneseDoc wrapper class to store Mecab output, stash it in `user_data`. -POLM 2017-10-15 18:24:34 +03:00			`doc = Doc(self.vocab, words=words, spaces=[False]*len(words))`
			`# stash details tokens for tagger to use`
			`doc.user_data[DETAILS_KEY] = dtokens`
			`return doc`
Make create_tokenizer work with Japanese 2017-06-27 19:18:05 +03:00
Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM 2017-08-07 19:27:15 +03:00			`def resolve_pos(token):`
			`"""If necessary, add a field to the POS tag for UD mapping.`

			`Under Universal Dependencies, sometimes the same Unidic POS tag can`
			`be mapped differently depending on the literal token or its context`
			`in the sentence. This function adds information to the POS tag to`
			`resolve ambiguous mappings.`
			`"""`

			`# NOTE: This is a first take. The rules here are crude approximations.`
			`# For many of these, full dependencies are needed to properly resolve`
			`# PoS mappings.`

			`if token.part_of_speech == '連体詞,*,*,*':`
			`if re.match('^[こそあど此其彼]の', token.surface):`
			`return token.part_of_speech + ',DET'`
Fix pronoun handling Missed this case earlier. 連体詞 have three classes for UD purposes: - その -> DET - それ -> PRON - 同じ -> ADJ -POLM 2017-08-21 18:01:49 +03:00			`if re.match('^[こそあど此其彼]', token.surface):`
			`return token.part_of_speech + ',PRON'`
Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM 2017-08-07 19:27:15 +03:00			`else:`
			`return token.part_of_speech + ',ADJ'`
			`return token.part_of_speech`

			`def detailed_tokens(tokenizer, text):`
			`"""Format Mecab output into a nice data structure, based on Janome."""`

			`node = tokenizer.parseToNode(text)`
			`node = node.next # first node is beginning of sentence and empty, skip it`
			`words = []`
			`while node.posid != 0:`
Handle out-of-vocab words Wasn't handling words out of the tokenizer dictionary vocabulary properly. This adds a fix and test for that. -POLM 2017-08-29 17:58:42 +03:00			`surface = node.surface`
			`base = surface`
Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM 2017-08-07 19:27:15 +03:00			`parts = node.feature.split(',')`
			`pos = ','.join(parts[0:4])`
Handle out-of-vocab words Wasn't handling words out of the tokenizer dictionary vocabulary properly. This adds a fix and test for that. -POLM 2017-08-29 17:58:42 +03:00
			`if len(parts) > 6:`
			`# this information is only available for words in the tokenizer dictionary`
			`reading = parts[6]`
			`base = parts[7]`
Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM 2017-08-07 19:27:15 +03:00
			`words.append( ShortUnitWord(surface, base, pos) )`
			`node = node.next`
			`return words`

			`class JapaneseTagger(object):`
			`def __init__(self, vocab):`
Put Mecab import in utility function 2017-08-21 18:01:28 +03:00			`MeCab = try_mecab_import()`
Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM 2017-08-07 19:27:15 +03:00			`self.tagger = Tagger(vocab)`
			`self.tokenizer = MeCab.Tagger()`

			`def __call__(self, tokens):`
			`# two parts to this:`
			`# 1. get raw JP tags`
			`# 2. add features to tags as necessary for UD`

[ja] Use user_details instead of a wrapper class Instead of using a JapaneseDoc wrapper class to store Mecab output, stash it in `user_data`. -POLM 2017-10-15 18:24:34 +03:00			`dtokens = tokens.user_data[DETAILS_KEY]`
Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM 2017-08-07 19:27:15 +03:00			`rawtags = list(map(resolve_pos, dtokens))`
			`self.tagger.tag_from_strings(tokens, rawtags)`

Make create_tokenizer work with Japanese 2017-06-27 19:18:05 +03:00			`class JapaneseDefaults(BaseDefaults):`
Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM 2017-08-07 19:27:15 +03:00			`tag_map = TAG_MAP`

Make create_tokenizer work with Japanese 2017-06-27 19:18:05 +03:00			`@classmethod`
			`def create_tokenizer(cls, nlp=None):`
			`return JapaneseTokenizer(cls, nlp)`

Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM 2017-08-07 19:27:15 +03:00			`@classmethod`
			`def create_tagger(cls, tokenizer):`
			`return JapaneseTagger(tokenizer.vocab)`

Make create_tokenizer work with Japanese 2017-06-27 19:18:05 +03:00			`class Japanese(Language):`
			`lang = 'ja'`

			`Defaults = JapaneseDefaults`

			`def make_doc(self, text):`
[ja] Stash tokenizer output for speed Before this commit, the Mecab tokenizer had to be called twice when creating a Doc- once during tokenization and once during tagging. This creates a JapaneseDoc wrapper class for Doc that stashes the parsed tokenizer output to remove redundant processing. -POLM 2017-10-15 17:33:25 +03:00			`jdoc = self.tokenizer(text)`
Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This switches out Mecab for Janome to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM 2017-08-07 19:27:15 +03:00			`tagger = JapaneseDefaults.create_tagger(self.tokenizer)`
[ja] Stash tokenizer output for speed Before this commit, the Mecab tokenizer had to be called twice when creating a Doc- once during tokenization and once during tagging. This creates a JapaneseDoc wrapper class for Doc that stashes the parsed tokenizer output to remove redundant processing. -POLM 2017-10-15 17:33:25 +03:00			`tagger(jdoc)`
			`return jdoc`