mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Merge remote-tracking branch 'origin/organize-language-data' into organize-language-data
This commit is contained in:
commit
b11d8cd3db
|
@ -21,3 +21,4 @@ class English(Language):
|
|||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
tag_map = TAG_MAP
|
||||
stop_words = STOP_WORDS
|
||||
lemma_rules = LEMMA_RULES
|
||||
|
|
|
@ -39,7 +39,7 @@ class BaseDefaults(object):
|
|||
if nlp is None or nlp.path is None:
|
||||
return Lemmatizer({}, {}, {})
|
||||
else:
|
||||
return Lemmatizer.load(nlp.path)
|
||||
return Lemmatizer.load(nlp.path, rules=self.lemma_rules)
|
||||
|
||||
@classmethod
|
||||
def create_vocab(cls, nlp=None):
|
||||
|
@ -160,6 +160,8 @@ class BaseDefaults(object):
|
|||
|
||||
stop_words = set()
|
||||
|
||||
lemma_rules = {}
|
||||
|
||||
lex_attr_getters = {
|
||||
attrs.LOWER: lambda string: string.lower(),
|
||||
attrs.NORM: lambda string: string,
|
||||
|
|
|
@ -9,7 +9,7 @@ from .symbols import POS, NOUN, VERB, ADJ, PUNCT
|
|||
|
||||
class Lemmatizer(object):
|
||||
@classmethod
|
||||
def load(cls, path):
|
||||
def load(cls, path, rules=None):
|
||||
index = {}
|
||||
exc = {}
|
||||
for pos in ['adj', 'noun', 'verb']:
|
||||
|
@ -25,8 +25,11 @@ class Lemmatizer(object):
|
|||
exc[pos] = read_exc(file_)
|
||||
else:
|
||||
exc[pos] = {}
|
||||
if rules is None and (path / 'vocab' / 'lemma_rules.json').exists():
|
||||
with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
|
||||
rules = json.load(file_)
|
||||
elif rules is None:
|
||||
rules = {}
|
||||
return cls(index, exc, rules)
|
||||
|
||||
def __init__(self, index, exceptions, rules):
|
||||
|
|
|
@ -140,6 +140,7 @@ cdef class Morphology:
|
|||
lemma = self.strings[lemma_string]
|
||||
return lemma
|
||||
|
||||
|
||||
IDS = {
|
||||
"Animacy_anim": Animacy_anim,
|
||||
"Animacy_inam": Animacy_inam,
|
||||
|
|
Loading…
Reference in New Issue
Block a user