Merge pull request #1413 from explosion/feature/lemmatizer

💫  Integrate lookup lemmatization (9+ languages)
commit 40dbc85ffa
Matthew Honnibal, 2017-10-11 17:54:36 +02:00 (committed by GitHub)
27 changed files with 106 additions and 159 deletions

View File

@@ -16,15 +16,13 @@ from ...util import update_exc

 class BengaliDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'bn'
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
     lemma_rules = LEMMA_RULES
-    prefixes = tuple(TOKENIZER_PREFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
-    infixes = tuple(TOKENIZER_INFIXES)
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES

 class Bengali(Language):

View File

@@ -15,9 +15,8 @@ class DanishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'da'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS

 class Danish(Language):

View File

@@ -12,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -22,16 +21,12 @@ class GermanDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'de'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                          NORM_EXCEPTIONS, BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    infixes = tuple(TOKENIZER_INFIXES)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
-    syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    infixes = TOKENIZER_INFIXES
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    syntax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP

 class German(Language):
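
This is the core pattern of the PR, repeated for each converted language: instead of overriding create_lemmatizer() and wrapping the table in the separate lookup lemmatizer class, a language now exposes its table as the lemma_lookup class attribute and lets the shared defaults build the lemmatizer. A minimal sketch of the new convention with a hypothetical language (class names and table contents are illustrative, assuming the spaCy 2.x API of this PR):

    from spacy.language import Language
    from spacy.attrs import LANG

    # Normally imported from the language package's lemmatizer.py as LOOKUP;
    # the entries here are made up for illustration.
    LOOKUP = {'Katzen': 'Katze', 'ging': 'gehen'}

    class ExampleDefaults(Language.Defaults):
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'xx'
        lemma_lookup = LOOKUP   # replaces the create_lemmatizer() override

    class Example(Language):
        lang = 'xx'
        Defaults = ExampleDefaults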

View File

@@ -7,7 +7,7 @@ from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .morph_rules import MORPH_RULES
-from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
+from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS

@@ -23,15 +23,15 @@ class EnglishDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'en'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                          BASE_NORMS, NORM_EXCEPTIONS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
-    morph_rules = dict(MORPH_RULES)
-    lemma_rules = dict(LEMMA_RULES)
-    lemma_index = dict(LEMMA_INDEX)
-    lemma_exc = dict(LEMMA_EXC)
-    syntax_iterators = dict(SYNTAX_ITERATORS)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    morph_rules = MORPH_RULES
+    lemma_rules = LEMMA_RULES
+    lemma_index = LEMMA_INDEX
+    lemma_exc = LEMMA_EXC
+    lemma_lookup = LOOKUP
+    syntax_iterators = SYNTAX_ITERATORS

 class English(Language):

View File

@@ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -19,15 +18,11 @@ class SpanishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'es'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
-    sytax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    sytax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP

 class Spanish(Language):

View File

@@ -15,9 +15,8 @@ class FinnishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'fi'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS

 class Finnish(Language):

View File

@@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -21,17 +20,13 @@ class FrenchDefaults(Language.Defaults):
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: 'fr'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-    infixes = tuple(TOKENIZER_INFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
+    stop_words = STOP_WORDS
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
     token_match = TOKEN_MATCH
-    syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    syntax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP

 class French(Language):

View File

@@ -12,9 +12,8 @@ from ...util import update_exc

 class HebrewDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'he'
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS

 class Hebrew(Language):

View File

@@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -18,17 +17,13 @@ class HungarianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'hu'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-    prefixes = tuple(TOKENIZER_PREFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
-    infixes = tuple(TOKENIZER_INFIXES)
+    stop_words = STOP_WORDS
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES
     token_match = TOKEN_MATCH
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = LOOKUP

 class Hungarian(Language):

View File

@@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG
 from ...util import update_exc

@@ -19,19 +18,14 @@ from ...util import update_exc

 class IndonesianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'id'
     lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-    prefixes = tuple(TOKENIZER_PREFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
-    infixes = tuple(TOKENIZER_INFIXES)
-    syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    stop_words = STOP_WORDS
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES
+    syntax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP

 class Indonesian(Language):

View File

@@ -16,8 +16,7 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
               'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
               'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
               'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
-              'noniliun', 'desiliun',
-              ]
+              'noniliun', 'desiliun']

 def like_num(text):

View File

@@ -7,7 +7,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -16,13 +15,9 @@ class ItalianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'it'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    stop_words = STOP_WORDS
+    lemma_lookup = LOOKUP

 class Italian(Language):

View File

@@ -16,9 +16,8 @@ class NorwegianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'nb'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS

 class Norwegian(Language):

View File

@@ -16,9 +16,8 @@ class DutchDefaults(Language.Defaults):
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: 'nl'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS

 class Dutch(Language):

View File

@@ -15,9 +15,8 @@ class PolishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'pl'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS

 class Polish(Language):

View File

@@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -19,13 +18,9 @@ class PortugueseDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'pt'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     lex_attr_getters.update(LEX_ATTRS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    stop_words = STOP_WORDS
+    lemma_lookup = LOOKUP

 class Portuguese(Language):

View File

@@ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -18,13 +17,10 @@ class SwedishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'sv'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS
     lemma_rules = LEMMA_RULES
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = LOOKUP

 class Swedish(Language):

View File

@@ -12,24 +12,27 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups


 class ThaiDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'th'
-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
+    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS


 class Thai(Language):
     lang = 'th'
     Defaults = ThaiDefaults
+
     def make_doc(self, text):
         try:
             from pythainlp.tokenize import word_tokenize
         except ImportError:
             raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
                 "https://github.com/wannaphongcom/pythainlp/")
         words = [x for x in list(word_tokenize(text,"newmm"))]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))

 __all__ = ['Thai']

View File

@@ -13,7 +13,6 @@ class MultiLanguageDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'xx'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)

View File

@@ -33,7 +33,8 @@ from . import about
 class BaseDefaults(object):
     @classmethod
     def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules)
+        return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules,
+                          cls.lemma_lookup)

     @classmethod
     def create_vocab(cls, nlp=None):

@@ -77,6 +78,7 @@ class BaseDefaults(object):
     lemma_rules = {}
     lemma_exc = {}
     lemma_index = {}
+    lemma_lookup = {}
     morph_rules = {}
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = {}
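
With lemma_lookup defaulting to an empty dict on BaseDefaults, a language only has to override the table and the shared create_lemmatizer() classmethod forwards it into the Lemmatizer along with the index, exceptions and rules. A rough illustration of that wiring (not part of the diff; assumes the spaCy 2.x API introduced here, with a made-up table):

    from spacy.language import Language
    from spacy.lemmatizer import Lemmatizer

    class MyDefaults(Language.Defaults):
        # index, exceptions and rules stay at their empty defaults;
        # only the lookup table is provided.
        lemma_lookup = {'mice': 'mouse'}

    lemmatizer = MyDefaults.create_lemmatizer()
    assert isinstance(lemmatizer, Lemmatizer)
    assert lemmatizer.lookup('mice') == 'mouse'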

View File

@@ -10,10 +10,11 @@ class Lemmatizer(object):
     def load(cls, path, index=None, exc=None, rules=None):
         return cls(index or {}, exc or {}, rules or {})

-    def __init__(self, index, exceptions, rules):
-        self.index = index
-        self.exc = exceptions
-        self.rules = rules
+    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
+        self.index = index if index is not None else {}
+        self.exc = exceptions if exceptions is not None else {}
+        self.rules = rules if rules is not None else {}
+        self.lookup_table = lookup if lookup is not None else {}

     def __call__(self, string, univ_pos, morphology=None):
         if univ_pos == NOUN:

@@ -79,6 +80,11 @@ class Lemmatizer(object):
     def punct(self, string, morphology=None):
         return self(string, 'punct', morphology)

+    def lookup(self, string):
+        if string in self.lookup_table:
+            return self.lookup_table[string]
+        return string

 def lemmatize(string, index, exceptions, rules):
     string = string.lower()
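
A minimal usage sketch of the extended Lemmatizer, mirroring the updated test fixture further down (the table entries are illustrative; assumes spaCy 2.x as in this PR):

    from spacy.lemmatizer import Lemmatizer

    lemmatizer = Lemmatizer(lookup={'dogs': 'dog', 'mice': 'mouse'})
    print(lemmatizer.lookup('mice'))    # 'mouse'  (found in the lookup table)
    print(lemmatizer.lookup('cats'))    # 'cats'   (unknown strings come back unchanged)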

View File

@@ -1,19 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .lemmatizer import Lemmatizer
-
-
-class Lemmatizer(Lemmatizer):
-    @classmethod
-    def load(cls, path, lookup):
-        return cls(lookup or {})
-
-    def __init__(self, lookup):
-        self.lookup = lookup
-
-    def __call__(self, string, univ_pos, morphology=None):
-        try:
-            return set([self.lookup[string]])
-        except:
-            return set([string])

View File

@@ -67,9 +67,13 @@ cdef class Morphology:
                                    self.exc), None, None)

     cdef int assign_untagged(self, TokenC* token) except -1:
-        '''Set morphological attributes on a token without a POS tag.'''
+        """Set morphological attributes on a token without a POS tag. Uses
+        the lemmatizer's lookup() method, which looks up the string in the
+        table provided by the language data as lemma_lookup (if available)."""
         if token.lemma == 0:
-            token.lemma = self.lemmatize(0, token.lex.orth, {})
+            orth_str = self.strings[token.lex.orth]
+            lemma = self.lemmatizer.lookup(orth_str)
+            token.lemma = self.strings.add(lemma)

     cdef int assign_tag(self, TokenC* token, tag) except -1:
         if isinstance(tag, basestring):
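
In plain Python, the effect of the new assign_untagged branch is roughly the following (an approximation of the Cython code above, not the actual implementation; `strings` stands in for the StringStore):

    def assign_untagged_lemma(token_orth_id, strings, lemmatizer):
        orth_str = strings[token_orth_id]       # resolve the hash to the surface string
        lemma = lemmatizer.lookup(orth_str)     # lookup table entry, or the string itself
        return strings.add(lemma)               # intern the lemma and return its hash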

View File

@@ -4,12 +4,12 @@ import pytest
 from ...vocab import Vocab
 from ...tokens.doc import Doc
-from ...lemmatizerlookup import Lemmatizer
+from ...lemmatizer import Lemmatizer


 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer({'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})
+    return Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})


 @pytest.fixture
View File

@@ -0,0 +1,13 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('string,lemma', [('Abgehängten', 'Abgehängte'),
+                                          ('engagierte', 'engagieren'),
+                                          ('schließt', 'schließen'),
+                                          ('vorgebenden', 'vorgebend')])
+def test_lemmatizer_lookup_assigns(de_tokenizer, string, lemma):
+    tokens = de_tokenizer(string)
+    assert tokens[0].lemma_ == lemma
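
Because the lookup is now part of the shared language defaults, the same behaviour should also be visible from the user-facing API. An end-to-end sketch (illustrative, assuming a spaCy 2.x install with this PR merged; the expected output in the comment comes from the German LOOKUP table, as in the test above):

    import spacy

    nlp = spacy.blank('de')     # blank German pipeline: tokenizer only, no tagger
    doc = nlp('schließt')
    print(doc[0].lemma_)        # expected: 'schließen' (untagged token, lookup table)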

View File

@@ -7,6 +7,7 @@ from ..util import get_doc
 import pytest


+@pytest.mark.xfail
 def test_issue589():
     vocab = Vocab()
     vocab.strings.set_frozen(True)

View File

@@ -456,24 +456,11 @@ p
 }

 p
-    | To add a lookup lemmatizer to your language, import the #[code LOOKUP]
-    | table and #[code Lemmatizer], and create a new classmethod:
+    | To provide a lookup lemmatizer for your language, import the lookup table
+    | and add it to the #[code Language] class as #[code lemma_lookup]:

-+code("__init__py (excerpt)").
-    # other imports here, plus lookup table and lookup lemmatizer
-    from .lemmatizer import LOOKUP
-    from ...lemmatizerlookup import Lemmatizer
-
-    class Xxxxx(Language):
-        lang = 'xx'
-
-        class Defaults(Language.Defaults):
-            # other language defaults here
-
-            @classmethod
-            def create_lemmatizer(cls, nlp=None):
-                return Lemmatizer(LOOKUP)
++code.
+    lemma_lookup = dict(LOOKUP)

 +h(3, "tag-map") Tag map