mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-04 21:50:35 +03:00
Wire up lemmatizer rules for English
This commit is contained in:
parent
e6fc4afb04
commit
44f4f008bd
|
@ -19,6 +19,8 @@ from ..language_data import EMOTICONS
|
||||||
from .language_data import ORTH_ONLY
|
from .language_data import ORTH_ONLY
|
||||||
from .language_data import get_time_exc
|
from .language_data import get_time_exc
|
||||||
|
|
||||||
|
from .lemma_rules import LEMMA_RULES
|
||||||
|
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
|
TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
|
||||||
TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
|
TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
|
||||||
|
@ -47,3 +49,4 @@ class English(Language):
|
||||||
infixes = TOKENIZER_INFIXES
|
infixes = TOKENIZER_INFIXES
|
||||||
tag_map = TAG_MAP
|
tag_map = TAG_MAP
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
lemma_rules = LEMMA_RULES
|
||||||
|
|
|
@ -38,7 +38,7 @@ class BaseDefaults(object):
|
||||||
if nlp is None or nlp.path is None:
|
if nlp is None or nlp.path is None:
|
||||||
return Lemmatizer({}, {}, {})
|
return Lemmatizer({}, {}, {})
|
||||||
else:
|
else:
|
||||||
return Lemmatizer.load(nlp.path)
|
return Lemmatizer.load(nlp.path, rules=self.lemma_rules)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create_vocab(cls, nlp=None):
|
def create_vocab(cls, nlp=None):
|
||||||
|
@ -159,6 +159,8 @@ class BaseDefaults(object):
|
||||||
|
|
||||||
stop_words = set()
|
stop_words = set()
|
||||||
|
|
||||||
|
lemma_rules = {}
|
||||||
|
|
||||||
lex_attr_getters = {
|
lex_attr_getters = {
|
||||||
attrs.LOWER: lambda string: string.lower(),
|
attrs.LOWER: lambda string: string.lower(),
|
||||||
attrs.NORM: lambda string: string,
|
attrs.NORM: lambda string: string,
|
||||||
|
|
|
@ -9,7 +9,7 @@ from .symbols import POS, NOUN, VERB, ADJ, PUNCT
|
||||||
|
|
||||||
class Lemmatizer(object):
|
class Lemmatizer(object):
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, path):
|
def load(cls, path, rules=None):
|
||||||
index = {}
|
index = {}
|
||||||
exc = {}
|
exc = {}
|
||||||
for pos in ['adj', 'noun', 'verb']:
|
for pos in ['adj', 'noun', 'verb']:
|
||||||
|
@ -25,8 +25,11 @@ class Lemmatizer(object):
|
||||||
exc[pos] = read_exc(file_)
|
exc[pos] = read_exc(file_)
|
||||||
else:
|
else:
|
||||||
exc[pos] = {}
|
exc[pos] = {}
|
||||||
with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
|
if rules is None and (path / 'vocab' / 'lemma_rules.json').exists():
|
||||||
rules = json.load(file_)
|
with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
|
||||||
|
rules = json.load(file_)
|
||||||
|
elif rules is None:
|
||||||
|
rules = {}
|
||||||
return cls(index, exc, rules)
|
return cls(index, exc, rules)
|
||||||
|
|
||||||
def __init__(self, index, exceptions, rules):
|
def __init__(self, index, exceptions, rules):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user