Merge pull request #1413 from explosion/feature/lemmatizer
💫 Integrate lookup lemmatization (9+ languages)
Commit 40dbc85ffa

spacy/lang/bn/__init__.py
@@ -16,15 +16,13 @@ from ...util import update_exc
 class BengaliDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'bn'

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
     lemma_rules = LEMMA_RULES
-    prefixes = tuple(TOKENIZER_PREFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
-    infixes = tuple(TOKENIZER_INFIXES)
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES


 class Bengali(Language):

spacy/lang/da/__init__.py
@@ -15,9 +15,8 @@ class DanishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'da'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Danish(Language):

spacy/lang/de/__init__.py
@@ -12,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -22,16 +21,12 @@ class GermanDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'de'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                          NORM_EXCEPTIONS, BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    infixes = tuple(TOKENIZER_INFIXES)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
-    syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    infixes = TOKENIZER_INFIXES
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    syntax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP


 class German(Language):

spacy/lang/en/__init__.py
@@ -7,7 +7,7 @@ from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .morph_rules import MORPH_RULES
-from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
+from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
@@ -23,15 +23,15 @@ class EnglishDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'en'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                          BASE_NORMS, NORM_EXCEPTIONS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
-    morph_rules = dict(MORPH_RULES)
-    lemma_rules = dict(LEMMA_RULES)
-    lemma_index = dict(LEMMA_INDEX)
-    lemma_exc = dict(LEMMA_EXC)
-    syntax_iterators = dict(SYNTAX_ITERATORS)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    morph_rules = MORPH_RULES
+    lemma_rules = LEMMA_RULES
+    lemma_index = LEMMA_INDEX
+    lemma_exc = LEMMA_EXC
+    lemma_lookup = LOOKUP
+    syntax_iterators = SYNTAX_ITERATORS


 class English(Language):

spacy/lang/es/__init__.py
@@ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -19,15 +18,11 @@ class SpanishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'es'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
-    sytax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    sytax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP


 class Spanish(Language):

spacy/lang/fi/__init__.py
@@ -15,9 +15,8 @@ class FinnishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'fi'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Finnish(Language):

spacy/lang/fr/__init__.py
@@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -21,17 +20,13 @@ class FrenchDefaults(Language.Defaults):
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: 'fr'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-    infixes = tuple(TOKENIZER_INFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
+    stop_words = STOP_WORDS
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
     token_match = TOKEN_MATCH
-    syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    syntax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP


 class French(Language):

spacy/lang/he/__init__.py
@@ -12,9 +12,8 @@ from ...util import update_exc
 class HebrewDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'he'

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Hebrew(Language):

spacy/lang/hu/__init__.py
@@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -18,17 +17,13 @@ class HungarianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'hu'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-    prefixes = tuple(TOKENIZER_PREFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
-    infixes = tuple(TOKENIZER_INFIXES)
+    stop_words = STOP_WORDS
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES
     token_match = TOKEN_MATCH
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    lemma_lookup = LOOKUP


 class Hungarian(Language):

spacy/lang/id/__init__.py
@@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG
 from ...util import update_exc

@@ -19,19 +18,14 @@ from ...util import update_exc
 class IndonesianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'id'

     lex_attr_getters.update(LEX_ATTRS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-    prefixes = tuple(TOKENIZER_PREFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
-    infixes = tuple(TOKENIZER_INFIXES)
-    syntax_iterators = dict(SYNTAX_ITERATORS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    stop_words = STOP_WORDS
+    prefixes = TOKENIZER_PREFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES
+    syntax_iterators = SYNTAX_ITERATORS
+    lemma_lookup = LOOKUP


 class Indonesian(Language):

spacy/lang/id/lex_attrs.py
@@ -16,8 +16,7 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
               'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
               'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
               'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
-              'noniliun', 'desiliun',
-              ]
+              'noniliun', 'desiliun']


 def like_num(text):

spacy/lang/it/__init__.py
@@ -7,7 +7,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -16,13 +15,9 @@ class ItalianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'it'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    stop_words = STOP_WORDS
+    lemma_lookup = LOOKUP


 class Italian(Language):

spacy/lang/nb/__init__.py
@@ -16,9 +16,8 @@ class NorwegianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'nb'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Norwegian(Language):

spacy/lang/nl/__init__.py
@@ -16,9 +16,8 @@ class DutchDefaults(Language.Defaults):
     lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: 'nl'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Dutch(Language):

spacy/lang/pl/__init__.py
@@ -15,9 +15,8 @@ class PolishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'pl'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
+    stop_words = STOP_WORDS


 class Polish(Language):

spacy/lang/pt/__init__.py
@@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -19,13 +18,9 @@ class PortugueseDefaults(Language.Defaults):
     lex_attr_getters[LANG] = lambda text: 'pt'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     lex_attr_getters.update(LEX_ATTRS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    stop_words = STOP_WORDS
+    lemma_lookup = LOOKUP


 class Portuguese(Language):

spacy/lang/sv/__init__.py
@@ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
-from ...lemmatizerlookup import Lemmatizer
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

@@ -18,13 +17,10 @@ class SwedishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'sv'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-    stop_words = set(STOP_WORDS)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOKUP)
+    stop_words = STOP_WORDS
+    lemma_rules = LEMMA_RULES
+    lemma_lookup = LOOKUP


 class Swedish(Language):

spacy/lang/th/__init__.py
@@ -12,24 +12,27 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups


 class ThaiDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'th'
-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-    tag_map = dict(TAG_MAP)
-    stop_words = set(STOP_WORDS)
+    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS


 class Thai(Language):
     lang = 'th'
     Defaults = ThaiDefaults

     def make_doc(self, text):
         try:
             from pythainlp.tokenize import word_tokenize
         except ImportError:
             raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
                               "https://github.com/wannaphongcom/pythainlp/")
         words = [x for x in list(word_tokenize(text,"newmm"))]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))


 __all__ = ['Thai']

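A minimal usage sketch of the Thai entry point above, assuming PyThaiNLP is installed; the sample string and output are illustrative, not part of the diff:

    # Requires: pip install pythainlp
    from spacy.lang.th import Thai

    nlp = Thai()
    doc = nlp.make_doc(u'ตัวอย่างข้อความ')   # word boundaries come from pythainlp's "newmm" tokenizer
    print([t.text for t in doc])
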
spacy/lang/xx/__init__.py
@@ -13,7 +13,6 @@ class MultiLanguageDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'xx'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-

spacy/language.py
@@ -33,7 +33,8 @@ from . import about
 class BaseDefaults(object):
     @classmethod
     def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules)
+        return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules,
+                          cls.lemma_lookup)

     @classmethod
     def create_vocab(cls, nlp=None):
@@ -77,6 +78,7 @@ class BaseDefaults(object):
     lemma_rules = {}
     lemma_exc = {}
     lemma_index = {}
+    lemma_lookup = {}
     morph_rules = {}
     lex_attr_getters = LEX_ATTRS
     syntax_iterators = {}

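With lemma_lookup wired through BaseDefaults.create_lemmatizer(), a lookup-based language no longer needs its own lemmatizer class. A hedged sketch, assuming the German lookup table on this branch contains the pairs exercised by the new test further down:

    from spacy.lang.de import German

    # GermanDefaults only declares lemma_lookup = LOOKUP; the construction itself
    # is inherited from BaseDefaults.create_lemmatizer().
    lemmatizer = German.Defaults.create_lemmatizer()
    print(lemmatizer.lookup('schließt'))   # 'schließen'
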
spacy/lemmatizer.py
@@ -10,10 +10,11 @@ class Lemmatizer(object):
     def load(cls, path, index=None, exc=None, rules=None):
         return cls(index or {}, exc or {}, rules or {})

-    def __init__(self, index, exceptions, rules):
-        self.index = index
-        self.exc = exceptions
-        self.rules = rules
+    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
+        self.index = index if index is not None else {}
+        self.exc = exceptions if exceptions is not None else {}
+        self.rules = rules if rules is not None else {}
+        self.lookup_table = lookup if lookup is not None else {}

     def __call__(self, string, univ_pos, morphology=None):
         if univ_pos == NOUN:
@@ -79,6 +80,11 @@ class Lemmatizer(object):
     def punct(self, string, morphology=None):
         return self(string, 'punct', morphology)

+    def lookup(self, string):
+        if string in self.lookup_table:
+            return self.lookup_table[string]
+        return string
+

 def lemmatize(string, index, exceptions, rules):
     string = string.lower()

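A short sketch of the new constructor and lookup() behaviour, mirroring the updated test fixture below; the example table is the one used there:

    from spacy.lemmatizer import Lemmatizer

    lemmatizer = Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})
    assert lemmatizer.lookup('dogs') == 'dog'    # found in lookup_table
    assert lemmatizer.lookup('cats') == 'cats'   # unknown strings are returned unchanged
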
spacy/lemmatizerlookup.py (deleted)
@@ -1,19 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .lemmatizer import Lemmatizer
-
-
-class Lemmatizer(Lemmatizer):
-    @classmethod
-    def load(cls, path, lookup):
-        return cls(lookup or {})
-
-    def __init__(self, lookup):
-        self.lookup = lookup
-
-    def __call__(self, string, univ_pos, morphology=None):
-        try:
-            return set([self.lookup[string]])
-        except:
-            return set([string])

spacy/morphology.pyx
@@ -67,9 +67,13 @@ cdef class Morphology:
                                   self.exc), None, None)

     cdef int assign_untagged(self, TokenC* token) except -1:
-        '''Set morphological attributes on a token without a POS tag.'''
+        """Set morphological attributes on a token without a POS tag. Uses
+        the lemmatizer's lookup() method, which looks up the string in the
+        table provided by the language data as lemma_lookup (if available)."""
         if token.lemma == 0:
-            token.lemma = self.lemmatize(0, token.lex.orth, {})
+            orth_str = self.strings[token.lex.orth]
+            lemma = self.lemmatizer.lookup(orth_str)
+            token.lemma = self.strings.add(lemma)

     cdef int assign_tag(self, TokenC* token, tag) except -1:
         if isinstance(tag, basestring):

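A pure-Python sketch of the fallback that assign_untagged() now implements; the helper name is made up and the real code runs in Cython against TokenC:

    from spacy.strings import StringStore
    from spacy.lemmatizer import Lemmatizer

    def assign_untagged_lemma(token_text, strings, lemmatizer):
        # No POS tag yet: fall back to the lookup table; unknown strings
        # lemmatize to themselves.
        lemma = lemmatizer.lookup(token_text)
        return strings.add(lemma)          # hash stored as token.lemma

    strings = StringStore()
    lemmatizer = Lemmatizer(lookup={'gingen': 'gehen'})
    print(strings[assign_untagged_lemma('gingen', strings, lemmatizer)])   # 'gehen'
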
spacy/tests (lookup lemmatizer fixture)
@@ -4,12 +4,12 @@ import pytest

 from ...vocab import Vocab
 from ...tokens.doc import Doc
-from ...lemmatizerlookup import Lemmatizer
+from ...lemmatizer import Lemmatizer


 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer({'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})
+    return Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})


 @pytest.fixture

spacy/tests/lang/de/test_lemma.py (new file)
@@ -0,0 +1,13 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('string,lemma', [('Abgehängten', 'Abgehängte'),
+                                          ('engagierte', 'engagieren'),
+                                          ('schließt', 'schließen'),
+                                          ('vorgebenden', 'vorgebend')])
+def test_lemmatizer_lookup_assigns(de_tokenizer, string, lemma):
+    tokens = de_tokenizer(string)
+    assert tokens[0].lemma_ == lemma

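Assuming a development install of spaCy with the German language data, the new test can be run on its own:

    python -m pytest spacy/tests/lang/de/test_lemma.py
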
spacy/tests/regression/test_issue589.py
@@ -7,6 +7,7 @@ from ..util import get_doc
 import pytest


+@pytest.mark.xfail
 def test_issue589():
     vocab = Vocab()
     vocab.strings.set_frozen(True)

website/docs/usage/adding-languages.jade (excerpt)
@@ -456,24 +456,11 @@ p
     }

 p
-    | To add a lookup lemmatizer to your language, import the #[code LOOKUP]
-    | table and #[code Lemmatizer], and create a new classmethod:
+    | To provide a lookup lemmatizer for your language, import the lookup table
+    | and add it to the #[code Language] class as #[code lemma_lookup]:

-+code("__init__py (excerpt)").
-    # other imports here, plus lookup table and lookup lemmatizer
-    from .lemmatizer import LOOKUP
-    from ...lemmatizerlookup import Lemmatizer
-
-    class Xxxxx(Language):
-        lang = 'xx'
-
-        class Defaults(Language.Defaults):
-            # other language defaults here
-
-            @classmethod
-            def create_lemmatizer(cls, nlp=None):
-                return Lemmatizer(LOOKUP)
++code.
+    lemma_lookup = dict(LOOKUP)

 +h(3, "tag-map") Tag map

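A slightly fuller sketch of the documented pattern, as it would look in a hypothetical language package; 'Xxxxx' and 'xx' are placeholders, and only lemma_lookup is specific to this change:

    # __init__.py (excerpt) of a hypothetical language package
    from .lemmatizer import LOOKUP
    from ...attrs import LANG
    from ...language import Language


    class XxxxxDefaults(Language.Defaults):
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'xx'
        lemma_lookup = dict(LOOKUP)


    class Xxxxx(Language):
        lang = 'xx'
        Defaults = XxxxxDefaults
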