mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Merge branch 'master' of https://github.com/explosion/spaCy
This commit is contained in:
commit
d013aba7b5
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -105,3 +105,7 @@ website/package.json
|
|||
website/announcement.jade
|
||||
website/www/
|
||||
website/.gitignore
|
||||
|
||||
# Python virtualenv
|
||||
venv
|
||||
venv/*
|
||||
|
|
|
@ -14,7 +14,7 @@ from spacy.language import Language
|
|||
from spacy.gold import GoldParse
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.tagger import Tagger
|
||||
from spacy.pipeline import DependencyParser
|
||||
from spacy.pipeline import DependencyParser, BeamDependencyParser
|
||||
from spacy.syntax.parser import get_templates
|
||||
from spacy.syntax.arc_eager import ArcEager
|
||||
from spacy.scorer import Scorer
|
||||
|
@ -35,8 +35,8 @@ def read_conllx(loc, n=0):
|
|||
lines.pop(0)
|
||||
tokens = []
|
||||
for line in lines:
|
||||
id_, word, lemma, tag, pos, morph, head, dep, _1, _2 = line.split()
|
||||
if '-' in id_:
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
|
||||
if '-' in id_ or '.' in id_:
|
||||
continue
|
||||
try:
|
||||
id_ = int(id_) - 1
|
||||
|
@ -66,12 +66,8 @@ def score_model(vocab, tagger, parser, gold_docs, verbose=False):
|
|||
return scorer
|
||||
|
||||
|
||||
def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
|
||||
if tag_map_loc:
|
||||
with open(tag_map_loc) as file_:
|
||||
tag_map = json.loads(file_.read())
|
||||
else:
|
||||
tag_map = DEFAULT_TAG_MAP
|
||||
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
|
||||
LangClass = spacy.util.get_lang_class(lang_name)
|
||||
train_sents = list(read_conllx(train_loc))
|
||||
train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
|
||||
|
||||
|
@ -79,13 +75,37 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
|
|||
features = get_templates('basic')
|
||||
|
||||
model_dir = pathlib.Path(model_dir)
|
||||
if not model_dir.exists():
|
||||
model_dir.mkdir()
|
||||
if not (model_dir / 'deps').exists():
|
||||
(model_dir / 'deps').mkdir()
|
||||
if not (model_dir / 'pos').exists():
|
||||
(model_dir / 'pos').mkdir()
|
||||
with (model_dir / 'deps' / 'config.json').open('wb') as file_:
|
||||
file_.write(
|
||||
json.dumps(
|
||||
{'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))
|
||||
vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
|
||||
|
||||
vocab = LangClass.Defaults.create_vocab()
|
||||
if not (model_dir / 'vocab').exists():
|
||||
(model_dir / 'vocab').mkdir()
|
||||
else:
|
||||
if (model_dir / 'vocab' / 'strings.json').exists():
|
||||
with (model_dir / 'vocab' / 'strings.json').open() as file_:
|
||||
vocab.strings.load(file_)
|
||||
if (model_dir / 'vocab' / 'lexemes.bin').exists():
|
||||
vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
|
||||
|
||||
if clusters_loc is not None:
|
||||
clusters_loc = pathlib.Path(clusters_loc)
|
||||
with clusters_loc.open() as file_:
|
||||
for line in file_:
|
||||
try:
|
||||
cluster, word, freq = line.split()
|
||||
except ValueError:
|
||||
continue
|
||||
lex = vocab[word]
|
||||
lex.cluster = int(cluster[::-1], 2)
|
||||
# Populate vocab
|
||||
for _, doc_sents in train_sents:
|
||||
for (ids, words, tags, heads, deps, ner), _ in doc_sents:
|
||||
|
@ -95,13 +115,13 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
|
|||
_ = vocab[dep]
|
||||
for tag in tags:
|
||||
_ = vocab[tag]
|
||||
if tag_map:
|
||||
if vocab.morphology.tag_map:
|
||||
for tag in tags:
|
||||
assert tag in tag_map, repr(tag)
|
||||
tagger = Tagger(vocab, tag_map=tag_map)
|
||||
assert tag in vocab.morphology.tag_map, repr(tag)
|
||||
tagger = Tagger(vocab)
|
||||
parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)
|
||||
|
||||
for itn in range(15):
|
||||
for itn in range(30):
|
||||
loss = 0.
|
||||
for _, doc_sents in train_sents:
|
||||
for (ids, words, tags, heads, deps, ner), _ in doc_sents:
|
||||
|
|
|
@ -35,14 +35,16 @@ set_lang_class(bn.Bengali.lang, bn.Bengali)
|
|||
|
||||
def load(name, **overrides):
|
||||
data_path = overrides.get('path', util.get_data_path())
|
||||
meta = parse_package_meta(data_path, name)
|
||||
lang = meta['lang'] if meta and 'lang' in meta else 'en'
|
||||
meta = parse_package_meta(data_path, name, require=False)
|
||||
lang = meta['lang'] if meta and 'lang' in meta else name
|
||||
cls = get_lang_class(lang)
|
||||
overrides['meta'] = meta
|
||||
overrides['path'] = Path(data_path / name)
|
||||
model_path = Path(data_path) / name
|
||||
if model_path.exists():
|
||||
overrides['path'] = model_path
|
||||
return cls(**overrides)
|
||||
|
||||
|
||||
def info(name):
|
||||
meta = parse_package_meta(util.get_data_path(), name)
|
||||
meta = parse_package_meta(util.get_data_path(), name, require=True)
|
||||
print(json.dumps(meta, indent=2))
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
||||
|
||||
__title__ = 'spacy'
|
||||
__version__ = '1.6.0'
|
||||
__version__ = '1.7.0'
|
||||
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
|
||||
__uri__ = 'https://spacy.io'
|
||||
__author__ = 'Matthew Honnibal'
|
||||
|
|
738
spacy/es/tag_map.py
Normal file
738
spacy/es/tag_map.py
Normal file
|
@ -0,0 +1,738 @@
|
|||
{
|
||||
"AUX__Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin": {
|
||||
"freq": 865,
|
||||
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"PUNCT__PunctSide=Fin|PunctType=Brck": {
|
||||
"freq": 1476,
|
||||
"morph": "PunctSide=Fin|PunctType=Brck",
|
||||
"pos": "PUNCT"
|
||||
},
|
||||
"VERB__Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin": {
|
||||
"freq": 7033,
|
||||
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"PRON__Number=Sing|Person=2|PronType=Prs": {
|
||||
"freq": 132,
|
||||
"morph": "Number=Sing|Person=2|PronType=Prs",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"PRON": {
|
||||
"pos": "PRON"
|
||||
},
|
||||
"VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
|
||||
"freq": 525,
|
||||
"morph": "Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"SYM__NumForm=Digit|NumType=Frac": {
|
||||
"freq": 236,
|
||||
"morph": "NumForm=Digit|NumType=Frac",
|
||||
"pos": "SYM"
|
||||
},
|
||||
"ADJ___": {
|
||||
"freq": 515,
|
||||
"morph": "_",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"PRON__Person=3": {
|
||||
"freq": 3185,
|
||||
"morph": "Person=3",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"PRON__Case=Acc|Gender=Masc|Number=Plur|Person=3|PronType=Prs": {
|
||||
"freq": 104,
|
||||
"morph": "Case=Acc|Gender=Masc|Number=Plur|Person=3|PronType=Prs",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"DET__Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs": {
|
||||
"freq": 148,
|
||||
"morph": "Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs",
|
||||
"pos": "DET"
|
||||
},
|
||||
"CONJ": {
|
||||
"pos": "CONJ"
|
||||
},
|
||||
"PUNCT__PunctType=Comm": {
|
||||
"freq": 24475,
|
||||
"morph": "PunctType=Comm",
|
||||
"pos": "PUNCT"
|
||||
},
|
||||
"ADV": {
|
||||
"pos": "ADV"
|
||||
},
|
||||
"ADV__AdpType=Prep": {
|
||||
"freq": 161,
|
||||
"morph": "AdpType=Prep",
|
||||
"pos": "ADV"
|
||||
},
|
||||
"ADJ__Number=Plur": {
|
||||
"freq": 2617,
|
||||
"morph": "Number=Plur",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"AUX__Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": {
|
||||
"freq": 149,
|
||||
"morph": "Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"ADJ__Gender=Masc|Number=Sing|NumType=Ord": {
|
||||
"freq": 654,
|
||||
"morph": "Gender=Masc|Number=Sing|NumType=Ord",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"AUX__Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin": {
|
||||
"freq": 272,
|
||||
"morph": "Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"AUX__Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin": {
|
||||
"freq": 388,
|
||||
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"ADJ__Gender=Masc|Number=Plur": {
|
||||
"freq": 1995,
|
||||
"morph": "Gender=Masc|Number=Plur",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"DET": {
|
||||
"pos": "DET"
|
||||
},
|
||||
"VERB__VerbForm=Inf": {
|
||||
"freq": 8204,
|
||||
"morph": "VerbForm=Inf",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"DET__Definite=Def|Gender=Fem|Number=Plur|PronType=Art": {
|
||||
"freq": 4275,
|
||||
"morph": "Definite=Def|Gender=Fem|Number=Plur|PronType=Art",
|
||||
"pos": "DET"
|
||||
},
|
||||
"VERB__Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin": {
|
||||
"freq": 495,
|
||||
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"DET__Definite=Def|Gender=Masc|Number=Plur|PronType=Art": {
|
||||
"freq": 6951,
|
||||
"morph": "Definite=Def|Gender=Masc|Number=Plur|PronType=Art",
|
||||
"pos": "DET"
|
||||
},
|
||||
"PRON___": {
|
||||
"freq": 1871,
|
||||
"morph": "_",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"DET__Definite=Ind|Gender=Masc|Number=Plur|PronType=Art": {
|
||||
"freq": 113,
|
||||
"morph": "Definite=Ind|Gender=Masc|Number=Plur|PronType=Art",
|
||||
"pos": "DET"
|
||||
},
|
||||
"NOUN__Number=Sing": {
|
||||
"freq": 1977,
|
||||
"morph": "Number=Sing",
|
||||
"pos": "NOUN"
|
||||
},
|
||||
"ADJ__Gender=Fem|Number=Sing|NumType=Ord": {
|
||||
"freq": 568,
|
||||
"morph": "Gender=Fem|Number=Sing|NumType=Ord",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"NOUN__Gender=Masc|Number=Sing": {
|
||||
"freq": 25557,
|
||||
"morph": "Gender=Masc|Number=Sing",
|
||||
"pos": "NOUN"
|
||||
},
|
||||
"PART": {
|
||||
"pos": "PART"
|
||||
},
|
||||
"ADJ__Number=Sing": {
|
||||
"freq": 6619,
|
||||
"morph": "Number=Sing",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"NUM": {
|
||||
"pos": "NUM"
|
||||
},
|
||||
"DET__Number=Sing|PronType=Ind": {
|
||||
"freq": 309,
|
||||
"morph": "Number=Sing|PronType=Ind",
|
||||
"pos": "DET"
|
||||
},
|
||||
"ADJ__Gender=Fem|Number=Sing|VerbForm=Part": {
|
||||
"freq": 1387,
|
||||
"morph": "Gender=Fem|Number=Sing|VerbForm=Part",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"VERB__Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": {
|
||||
"freq": 272,
|
||||
"morph": "Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"VERB__Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin": {
|
||||
"freq": 1574,
|
||||
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"PRON__Gender=Masc|Number=Sing|PronType=Dem": {
|
||||
"freq": 115,
|
||||
"morph": "Gender=Masc|Number=Sing|PronType=Dem",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"ADP": {
|
||||
"pos": "ADP"
|
||||
},
|
||||
"NOUN__AdvType=Tim": {
|
||||
"freq": 1504,
|
||||
"morph": "AdvType=Tim",
|
||||
"pos": "NOUN"
|
||||
},
|
||||
"AUX__Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin": {
|
||||
"freq": 130,
|
||||
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"PRON__Case=Nom|Number=Sing|Person=1|PronType=Prs": {
|
||||
"freq": 115,
|
||||
"morph": "Case=Nom|Number=Sing|Person=1|PronType=Prs",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"PUNCT__PunctType=Semi": {
|
||||
"freq": 259,
|
||||
"morph": "PunctType=Semi",
|
||||
"pos": "PUNCT"
|
||||
},
|
||||
"PUNCT__PunctSide=Ini|PunctType=Qest": {
|
||||
"freq": 206,
|
||||
"morph": "PunctSide=Ini|PunctType=Qest",
|
||||
"pos": "PUNCT"
|
||||
},
|
||||
"PRON__Case=Dat|Number=Sing|Person=3|PronType=Prs": {
|
||||
"freq": 754,
|
||||
"morph": "Case=Dat|Number=Sing|Person=3|PronType=Prs",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"PRON__Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": {
|
||||
"freq": 624,
|
||||
"morph": "Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"NUM__NumForm=Digit": {
|
||||
"freq": 2979,
|
||||
"morph": "NumForm=Digit",
|
||||
"pos": "NUM"
|
||||
},
|
||||
"PUNCT__PunctType=Colo": {
|
||||
"freq": 638,
|
||||
"morph": "PunctType=Colo",
|
||||
"pos": "PUNCT"
|
||||
},
|
||||
"PROPN": {
|
||||
"pos": "PROPN"
|
||||
},
|
||||
"X": {
|
||||
"pos": "X"
|
||||
},
|
||||
"NOUN__NumForm=Digit": {
|
||||
"freq": 555,
|
||||
"morph": "NumForm=Digit",
|
||||
"pos": "NOUN"
|
||||
},
|
||||
"VERB__Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part": {
|
||||
"freq": 3297,
|
||||
"morph": "Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"ADJ__Gender=Masc|Number=Plur|NumType=Ord": {
|
||||
"freq": 227,
|
||||
"morph": "Gender=Masc|Number=Plur|NumType=Ord",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"PRON__Gender=Masc|Number=Sing|Person=3|PronType=Prs": {
|
||||
"freq": 205,
|
||||
"morph": "Gender=Masc|Number=Sing|Person=3|PronType=Prs",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"NOUN__Number=Plur": {
|
||||
"freq": 1463,
|
||||
"morph": "Number=Plur",
|
||||
"pos": "NOUN"
|
||||
},
|
||||
"DET__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {
|
||||
"freq": 2909,
|
||||
"morph": "Number=Sing|Person=3|Poss=Yes|PronType=Prs",
|
||||
"pos": "DET"
|
||||
},
|
||||
"VERB__VerbForm=Ger": {
|
||||
"freq": 994,
|
||||
"morph": "VerbForm=Ger",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"INTJ": {
|
||||
"pos": "INTJ"
|
||||
},
|
||||
"VERB__Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin": {
|
||||
"freq": 398,
|
||||
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"AUX__Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": {
|
||||
"freq": 1403,
|
||||
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"PRON__Number=Plur|Person=1|PronType=Prs": {
|
||||
"freq": 264,
|
||||
"morph": "Number=Plur|Person=1|PronType=Prs",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"ADV__Negative=Neg": {
|
||||
"freq": 2960,
|
||||
"morph": "Negative=Neg",
|
||||
"pos": "ADV"
|
||||
},
|
||||
"VERB__Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": {
|
||||
"freq": 2488,
|
||||
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"DET__Gender=Masc|Number=Sing|PronType=Ind": {
|
||||
"freq": 855,
|
||||
"morph": "Gender=Masc|Number=Sing|PronType=Ind",
|
||||
"pos": "DET"
|
||||
},
|
||||
"VERB__Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": {
|
||||
"freq": 408,
|
||||
"morph": "Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"PRON__Gender=Fem|Number=Sing|PronType=Ind": {
|
||||
"freq": 237,
|
||||
"morph": "Gender=Fem|Number=Sing|PronType=Ind",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"DET__Gender=Fem|Number=Plur|PronType=Ind": {
|
||||
"freq": 592,
|
||||
"morph": "Gender=Fem|Number=Plur|PronType=Ind",
|
||||
"pos": "DET"
|
||||
},
|
||||
"ADJ__Gender=Fem|Number=Plur|VerbForm=Part": {
|
||||
"freq": 614,
|
||||
"morph": "Gender=Fem|Number=Plur|VerbForm=Part",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"DET__Gender=Fem|Number=Sing|PronType=Dem": {
|
||||
"freq": 808,
|
||||
"morph": "Gender=Fem|Number=Sing|PronType=Dem",
|
||||
"pos": "DET"
|
||||
},
|
||||
"DET__Gender=Fem|Number=Sing|PronType=Ind": {
|
||||
"freq": 613,
|
||||
"morph": "Gender=Fem|Number=Sing|PronType=Ind",
|
||||
"pos": "DET"
|
||||
},
|
||||
"DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Art": {
|
||||
"freq": 4277,
|
||||
"morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Art",
|
||||
"pos": "DET"
|
||||
},
|
||||
"VERB__Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
|
||||
"freq": 788,
|
||||
"morph": "Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"NOUN__Gender=Fem": {
|
||||
"freq": 145,
|
||||
"morph": "Gender=Fem",
|
||||
"pos": "NOUN"
|
||||
},
|
||||
"PRON__Gender=Fem|Number=Plur|PronType=Ind": {
|
||||
"freq": 127,
|
||||
"morph": "Gender=Fem|Number=Plur|PronType=Ind",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"AUX__Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": {
|
||||
"freq": 729,
|
||||
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"VERB__Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": {
|
||||
"freq": 1223,
|
||||
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"AUX__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
|
||||
"freq": 164,
|
||||
"morph": "Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"PRON__PronType=Rel": {
|
||||
"freq": 7301,
|
||||
"morph": "PronType=Rel",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"DET__Definite=Def|Number=Sing|PronType=Art": {
|
||||
"freq": 928,
|
||||
"morph": "Definite=Def|Number=Sing|PronType=Art",
|
||||
"pos": "DET"
|
||||
},
|
||||
"ADV___": {
|
||||
"freq": 11334,
|
||||
"morph": "_",
|
||||
"pos": "ADV"
|
||||
},
|
||||
"ADJ": {
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"AUX__VerbForm=Ger": {
|
||||
"freq": 154,
|
||||
"morph": "VerbForm=Ger",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"PRON__Number=Sing|PronType=Int": {
|
||||
"freq": 201,
|
||||
"morph": "Number=Sing|PronType=Int",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"VERB__Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin": {
|
||||
"freq": 1236,
|
||||
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"NOUN__Gender=Masc|Number=Plur": {
|
||||
"freq": 12310,
|
||||
"morph": "Gender=Masc|Number=Plur",
|
||||
"pos": "NOUN"
|
||||
},
|
||||
"NOUN__Gender=Fem|Number=Plur": {
|
||||
"freq": 8612,
|
||||
"morph": "Gender=Fem|Number=Plur",
|
||||
"pos": "NOUN"
|
||||
},
|
||||
"VERB__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
|
||||
"freq": 6343,
|
||||
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"PRON__Gender=Masc|Number=Plur|PronType=Ind": {
|
||||
"freq": 460,
|
||||
"morph": "Gender=Masc|Number=Plur|PronType=Ind",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"VERB__Mood=Sub|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin": {
|
||||
"freq": 100,
|
||||
"morph": "Mood=Sub|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"PUNCT__PunctSide=Ini|PunctType=Brck": {
|
||||
"freq": 1482,
|
||||
"morph": "PunctSide=Ini|PunctType=Brck",
|
||||
"pos": "PUNCT"
|
||||
},
|
||||
"PRON__Gender=Masc|Number=Sing|PronType=Tot": {
|
||||
"freq": 111,
|
||||
"morph": "Gender=Masc|Number=Sing|PronType=Tot",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"SCONJ": {
|
||||
"pos": "SCONJ"
|
||||
},
|
||||
"AUX__VerbForm=Inf": {
|
||||
"freq": 1495,
|
||||
"morph": "VerbForm=Inf",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"AUX__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
|
||||
"freq": 5227,
|
||||
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"ADJ__AdpType=Prep": {
|
||||
"freq": 124,
|
||||
"morph": "AdpType=Prep",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"PRON__Gender=Masc|Number=Sing|PronType=Ind": {
|
||||
"freq": 624,
|
||||
"morph": "Gender=Masc|Number=Sing|PronType=Ind",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"DET__Gender=Masc|Number=Plur|PronType=Dem": {
|
||||
"freq": 269,
|
||||
"morph": "Gender=Masc|Number=Plur|PronType=Dem",
|
||||
"pos": "DET"
|
||||
},
|
||||
"ADJ__Gender=Fem|Number=Plur": {
|
||||
"freq": 1612,
|
||||
"morph": "Gender=Fem|Number=Plur",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"NUM__Gender=Masc|Number=Plur|NumType=Card": {
|
||||
"freq": 104,
|
||||
"morph": "Gender=Masc|Number=Plur|NumType=Card",
|
||||
"pos": "NUM"
|
||||
},
|
||||
"NUM__NumType=Card": {
|
||||
"freq": 533,
|
||||
"morph": "NumType=Card",
|
||||
"pos": "NUM"
|
||||
},
|
||||
"SCONJ___": {
|
||||
"freq": 10129,
|
||||
"morph": "_",
|
||||
"pos": "SCONJ"
|
||||
},
|
||||
"PRON__Number=Sing|PronType=Rel": {
|
||||
"freq": 318,
|
||||
"morph": "Number=Sing|PronType=Rel",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"VERB__Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin": {
|
||||
"freq": 253,
|
||||
"morph": "Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"NOUN": {
|
||||
"pos": "NOUN"
|
||||
},
|
||||
"NOUN__Gender=Masc": {
|
||||
"freq": 153,
|
||||
"morph": "Gender=Masc",
|
||||
"pos": "NOUN"
|
||||
},
|
||||
"DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art": {
|
||||
"freq": 3087,
|
||||
"morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Art",
|
||||
"pos": "DET"
|
||||
},
|
||||
"ADJ__Gender=Masc|Number=Plur|VerbForm=Part": {
|
||||
"freq": 997,
|
||||
"morph": "Gender=Masc|Number=Plur|VerbForm=Part",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"PRON__Number=Sing|PronType=Dem": {
|
||||
"freq": 302,
|
||||
"morph": "Number=Sing|PronType=Dem",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"PRON__Number=Sing|Person=3|PronType=Prs": {
|
||||
"freq": 116,
|
||||
"morph": "Number=Sing|Person=3|PronType=Prs",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"PRON__Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": {
|
||||
"freq": 173,
|
||||
"morph": "Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"PUNCT": {
|
||||
"pos": "PUNCT"
|
||||
},
|
||||
"DET__Gender=Masc|Number=Sing|PronType=Dem": {
|
||||
"freq": 962,
|
||||
"morph": "Gender=Masc|Number=Sing|PronType=Dem",
|
||||
"pos": "DET"
|
||||
},
|
||||
"PRON__Number=Plur|PronType=Rel": {
|
||||
"freq": 102,
|
||||
"morph": "Number=Plur|PronType=Rel",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"ADJ__Gender=Masc|Number=Sing": {
|
||||
"freq": 5136,
|
||||
"morph": "Gender=Masc|Number=Sing",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art": {
|
||||
"freq": 22962,
|
||||
"morph": "Definite=Def|Gender=Masc|Number=Sing|PronType=Art",
|
||||
"pos": "DET"
|
||||
},
|
||||
"AUX__Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": {
|
||||
"freq": 107,
|
||||
"morph": "Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"PRON__Case=Dat|Number=Plur|Person=3|PronType=Prs": {
|
||||
"freq": 220,
|
||||
"morph": "Case=Dat|Number=Plur|Person=3|PronType=Prs",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"VERB__Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part": {
|
||||
"freq": 206,
|
||||
"morph": "Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"DET__Number=Plur|Person=3|Poss=Yes|PronType=Prs": {
|
||||
"freq": 1021,
|
||||
"morph": "Number=Plur|Person=3|Poss=Yes|PronType=Prs",
|
||||
"pos": "DET"
|
||||
},
|
||||
"ADJ__Gender=Fem|Number=Plur|NumType=Ord": {
|
||||
"freq": 101,
|
||||
"morph": "Gender=Fem|Number=Plur|NumType=Ord",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"PRON__PronType=Int": {
|
||||
"freq": 137,
|
||||
"morph": "PronType=Int",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"ADP__AdpType=Prep": {
|
||||
"freq": 71133,
|
||||
"morph": "AdpType=Prep",
|
||||
"pos": "ADP"
|
||||
},
|
||||
"DET__Gender=Masc|Number=Plur|PronType=Ind": {
|
||||
"freq": 904,
|
||||
"morph": "Gender=Masc|Number=Plur|PronType=Ind",
|
||||
"pos": "DET"
|
||||
},
|
||||
"AUX__Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
|
||||
"freq": 299,
|
||||
"morph": "Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"DET__Gender=Fem|Number=Plur|PronType=Dem": {
|
||||
"freq": 188,
|
||||
"morph": "Gender=Fem|Number=Plur|PronType=Dem",
|
||||
"pos": "DET"
|
||||
},
|
||||
"NUM__NumForm=Digit|NumType=Card": {
|
||||
"freq": 1108,
|
||||
"morph": "NumForm=Digit|NumType=Card",
|
||||
"pos": "NUM"
|
||||
},
|
||||
"PUNCT__PunctType=Quot": {
|
||||
"freq": 7380,
|
||||
"morph": "PunctType=Quot",
|
||||
"pos": "PUNCT"
|
||||
},
|
||||
"VERB__Gender=Masc|Number=Plur|Tense=Past|VerbForm=Part": {
|
||||
"freq": 184,
|
||||
"morph": "Gender=Masc|Number=Plur|Tense=Past|VerbForm=Part",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"PUNCT__PunctType=Dash": {
|
||||
"freq": 2345,
|
||||
"morph": "PunctType=Dash",
|
||||
"pos": "PUNCT"
|
||||
},
|
||||
"ADJ__Gender=Fem|Number=Sing": {
|
||||
"freq": 3935,
|
||||
"morph": "Gender=Fem|Number=Sing",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"AUX__Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin": {
|
||||
"freq": 215,
|
||||
"morph": "Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"AUX__Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin": {
|
||||
"freq": 218,
|
||||
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"PROPN___": {
|
||||
"freq": 34454,
|
||||
"morph": "_",
|
||||
"pos": "PROPN"
|
||||
},
|
||||
"PRON__Number=Sing|PronType=Ind": {
|
||||
"freq": 421,
|
||||
"morph": "Number=Sing|PronType=Ind",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"VERB__Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin": {
|
||||
"freq": 359,
|
||||
"morph": "Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"PUNCT__PunctSide=Fin|PunctType=Qest": {
|
||||
"freq": 312,
|
||||
"morph": "PunctSide=Fin|PunctType=Qest",
|
||||
"pos": "PUNCT"
|
||||
},
|
||||
"PRON__Number=Sing|Person=1|PronType=Prs": {
|
||||
"freq": 298,
|
||||
"morph": "Number=Sing|Person=1|PronType=Prs",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"PART__Negative=Neg": {
|
||||
"freq": 122,
|
||||
"morph": "Negative=Neg",
|
||||
"pos": "PART"
|
||||
},
|
||||
"PRON__Gender=Masc|Number=Plur|Person=3|PronType=Prs": {
|
||||
"freq": 176,
|
||||
"morph": "Gender=Masc|Number=Plur|Person=3|PronType=Prs",
|
||||
"pos": "PRON"
|
||||
},
|
||||
"NOUN__Gender=Fem|Number=Sing": {
|
||||
"freq": 24416,
|
||||
"morph": "Gender=Fem|Number=Sing",
|
||||
"pos": "NOUN"
|
||||
},
|
||||
"ADJ__Gender=Masc|Number=Sing|VerbForm=Part": {
|
||||
"freq": 2297,
|
||||
"morph": "Gender=Masc|Number=Sing|VerbForm=Part",
|
||||
"pos": "ADJ"
|
||||
},
|
||||
"CONJ___": {
|
||||
"freq": 12225,
|
||||
"morph": "_",
|
||||
"pos": "CONJ"
|
||||
},
|
||||
"NUM__Number=Plur|NumType=Card": {
|
||||
"freq": 2057,
|
||||
"morph": "Number=Plur|NumType=Card",
|
||||
"pos": "NUM"
|
||||
},
|
||||
"NOUN___": {
|
||||
"freq": 4829,
|
||||
"morph": "_",
|
||||
"pos": "NOUN"
|
||||
},
|
||||
"VERB": {
|
||||
"pos": "VERB"
|
||||
},
|
||||
"DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {
|
||||
"freq": 16487,
|
||||
"morph": "Definite=Def|Gender=Fem|Number=Sing|PronType=Art",
|
||||
"pos": "DET"
|
||||
},
|
||||
"SYM": {
|
||||
"pos": "SYM"
|
||||
},
|
||||
"VERB__Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin": {
|
||||
"freq": 130,
|
||||
"morph": "Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"AUX": {
|
||||
"pos": "AUX"
|
||||
},
|
||||
"AUX__Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part": {
|
||||
"freq": 494,
|
||||
"morph": "Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"AUX__Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin": {
|
||||
"freq": 199,
|
||||
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin",
|
||||
"pos": "AUX"
|
||||
},
|
||||
"VERB__Mood=Imp|Number=Sing|Person=2|VerbForm=Fin": {
|
||||
"freq": 100,
|
||||
"morph": "Mood=Imp|Number=Sing|Person=2|VerbForm=Fin",
|
||||
"pos": "VERB"
|
||||
},
|
||||
"PUNCT__PunctType=Peri": {
|
||||
"freq": 14170,
|
||||
"morph": "PunctType=Peri",
|
||||
"pos": "PUNCT"
|
||||
}
|
||||
}
|
|
@ -6,6 +6,7 @@ import ujson as json
|
|||
|
||||
from .en.lemmatizer import INDEX, EXC, RULES
|
||||
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
|
||||
from .symbols import VerbForm_inf, VerbForm_none
|
||||
|
||||
|
||||
class Lemmatizer(object):
|
||||
|
@ -43,10 +44,13 @@ class Lemmatizer(object):
|
|||
avoid lemmatization entirely.'''
|
||||
morphology = {} if morphology is None else morphology
|
||||
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
|
||||
true_morph_key = morphology.get('morph', 0)
|
||||
if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
|
||||
return True
|
||||
elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
|
||||
return True
|
||||
elif true_morph_key in (VerbForm_inf, VerbForm_none):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
@ -70,11 +74,16 @@ def lemmatize(string, index, exceptions, rules):
|
|||
#if string in index:
|
||||
# forms.append(string)
|
||||
forms.extend(exceptions.get(string, []))
|
||||
oov_forms = []
|
||||
for old, new in rules:
|
||||
if string.endswith(old):
|
||||
form = string[:len(string) - len(old)] + new
|
||||
if form in index or not form.isalpha():
|
||||
forms.append(form)
|
||||
else:
|
||||
oov_forms.append(form)
|
||||
if not forms:
|
||||
forms.extend(oov_forms)
|
||||
if not forms:
|
||||
forms.append(string)
|
||||
return set(forms)
|
||||
|
|
|
@ -310,78 +310,78 @@ IDS = {
|
|||
"Number_grpa": Number_grpa, # U20
|
||||
"Number_grpl": Number_grpl, # U20
|
||||
"Number_inv": Number_inv, # U20
|
||||
"NumForm_digit ": NumForm_digit, # cz, sl, U,
|
||||
"NumForm_roman ": NumForm_roman, # cz, sl, U,
|
||||
"NumForm_word ": NumForm_word, # cz, sl, U,
|
||||
"NumValue_one ": NumValue_one, # cz, U,
|
||||
"NumValue_two ": NumValue_two, # cz, U,
|
||||
"NumValue_three ": NumValue_three, # cz, U,
|
||||
"PartForm_pres ": PartForm_pres, # fi,
|
||||
"PartForm_past ": PartForm_past, # fi,
|
||||
"PartForm_agt ": PartForm_agt, # fi,
|
||||
"PartForm_neg ": PartForm_neg, # fi,
|
||||
"PartType_mod ": PartType_mod, # U,
|
||||
"PartType_emp ": PartType_emp, # U,
|
||||
"PartType_res ": PartType_res, # U,
|
||||
"PartType_inf ": PartType_inf, # U,
|
||||
"PartType_vbp ": PartType_vbp, # U,
|
||||
"Person_abs_one ": Person_abs_one, # bq, U,
|
||||
"Person_abs_two ": Person_abs_two, # bq, U,
|
||||
"Person_abs_three ": Person_abs_three, # bq, U,
|
||||
"Person_dat_one ": Person_dat_one, # bq, U,
|
||||
"Person_dat_two ": Person_dat_two, # bq, U,
|
||||
"Person_dat_three ": Person_dat_three, # bq, U,
|
||||
"Person_erg_one ": Person_erg_one, # bq, U,
|
||||
"Person_erg_two ": Person_erg_two, # bq, U,
|
||||
"Person_erg_three ": Person_erg_three, # bq, U,
|
||||
"Person_psor_one ": Person_psor_one, # fi, U,
|
||||
"Person_psor_two ": Person_psor_two, # fi, U,
|
||||
"Person_psor_three ": Person_psor_three, # fi, U,
|
||||
"Person_zero ": Person_zero, # U20
|
||||
"Person_four ": Person_four, # U20
|
||||
"Polite_inf ": Polite_inf, # bq, U,
|
||||
"Polite_pol ": Polite_pol, # bq, U,
|
||||
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
|
||||
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
|
||||
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
|
||||
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
|
||||
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
|
||||
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
|
||||
"Polite_infm ": Polite_infm, # U20
|
||||
"Polite_form ": Polite_form, # U20
|
||||
"Polite_form_elev ": Polite_form_elev, # U20
|
||||
"NumForm_digit": NumForm_digit, # cz, sl, U,
|
||||
"NumForm_roman": NumForm_roman, # cz, sl, U,
|
||||
"NumForm_word": NumForm_word, # cz, sl, U,
|
||||
"NumValue_one": NumValue_one, # cz, U,
|
||||
"NumValue_two": NumValue_two, # cz, U,
|
||||
"NumValue_three": NumValue_three, # cz, U,
|
||||
"PartForm_pres": PartForm_pres, # fi,
|
||||
"PartForm_past": PartForm_past, # fi,
|
||||
"PartForm_agt": PartForm_agt, # fi,
|
||||
"PartForm_neg": PartForm_neg, # fi,
|
||||
"PartType_mod": PartType_mod, # U,
|
||||
"PartType_emp": PartType_emp, # U,
|
||||
"PartType_res": PartType_res, # U,
|
||||
"PartType_inf": PartType_inf, # U,
|
||||
"PartType_vbp": PartType_vbp, # U,
|
||||
"Person_abs_one": Person_abs_one, # bq, U,
|
||||
"Person_abs_two": Person_abs_two, # bq, U,
|
||||
"Person_abs_three": Person_abs_three, # bq, U,
|
||||
"Person_dat_one": Person_dat_one, # bq, U,
|
||||
"Person_dat_two": Person_dat_two, # bq, U,
|
||||
"Person_dat_three": Person_dat_three, # bq, U,
|
||||
"Person_erg_one": Person_erg_one, # bq, U,
|
||||
"Person_erg_two": Person_erg_two, # bq, U,
|
||||
"Person_erg_three": Person_erg_three, # bq, U,
|
||||
"Person_psor_one": Person_psor_one, # fi, U,
|
||||
"Person_psor_two": Person_psor_two, # fi, U,
|
||||
"Person_psor_three": Person_psor_three, # fi, U,
|
||||
"Person_zero": Person_zero, # U20
|
||||
"Person_four": Person_four, # U20
|
||||
"Polite_inf": Polite_inf, # bq, U,
|
||||
"Polite_pol": Polite_pol, # bq, U,
|
||||
"Polite_abs_inf": Polite_abs_inf, # bq, U,
|
||||
"Polite_abs_pol": Polite_abs_pol, # bq, U,
|
||||
"Polite_erg_inf": Polite_erg_inf, # bq, U,
|
||||
"Polite_erg_pol": Polite_erg_pol, # bq, U,
|
||||
"Polite_dat_inf": Polite_dat_inf, # bq, U,
|
||||
"Polite_dat_pol": Polite_dat_pol, # bq, U,
|
||||
"Polite_infm": Polite_infm, # U20
|
||||
"Polite_form": Polite_form, # U20
|
||||
"Polite_form_elev": Polite_form_elev, # U20
|
||||
"Polite_form_humb ": Polite_form_humb, # U20
|
||||
"Prefix_yes ": Prefix_yes, # U,
|
||||
"PrepCase_npr ": PrepCase_npr, # cz,
|
||||
"PrepCase_pre ": PrepCase_pre, # U,
|
||||
"PunctSide_ini ": PunctSide_ini, # U,
|
||||
"PunctSide_fin ": PunctSide_fin, # U,
|
||||
"PunctType_peri ": PunctType_peri, # U,
|
||||
"PunctType_qest ": PunctType_qest, # U,
|
||||
"PunctType_excl ": PunctType_excl, # U,
|
||||
"PunctType_quot ": PunctType_quot, # U,
|
||||
"PunctType_brck ": PunctType_brck, # U,
|
||||
"PunctType_comm ": PunctType_comm, # U,
|
||||
"PunctType_colo ": PunctType_colo, # U,
|
||||
"PunctType_semi ": PunctType_semi, # U,
|
||||
"PunctType_dash ": PunctType_dash, # U,
|
||||
"Style_arch ": Style_arch, # cz, fi, U,
|
||||
"Style_rare ": Style_rare, # cz, fi, U,
|
||||
"Style_poet ": Style_poet, # cz, U,
|
||||
"Style_norm ": Style_norm, # cz, U,
|
||||
"Style_coll ": Style_coll, # cz, U,
|
||||
"Style_vrnc ": Style_vrnc, # cz, U,
|
||||
"Style_sing ": Style_sing, # cz, U,
|
||||
"Style_expr ": Style_expr, # cz, U,
|
||||
"Style_derg ": Style_derg, # cz, U,
|
||||
"Style_vulg ": Style_vulg, # cz, U,
|
||||
"Style_yes ": Style_yes, # fi, U,
|
||||
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
|
||||
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
|
||||
"VerbType_aux ": VerbType_aux, # U,
|
||||
"VerbType_cop ": VerbType_cop, # U,
|
||||
"VerbType_mod ": VerbType_mod, # U,
|
||||
"VerbType_light ": VerbType_light, # U,
|
||||
"Prefix_yes": Prefix_yes, # U,
|
||||
"PrepCase_npr": PrepCase_npr, # cz,
|
||||
"PrepCase_pre": PrepCase_pre, # U,
|
||||
"PunctSide_ini": PunctSide_ini, # U,
|
||||
"PunctSide_fin": PunctSide_fin, # U,
|
||||
"PunctType_peri": PunctType_peri, # U,
|
||||
"PunctType_qest": PunctType_qest, # U,
|
||||
"PunctType_excl": PunctType_excl, # U,
|
||||
"PunctType_quot": PunctType_quot, # U,
|
||||
"PunctType_brck": PunctType_brck, # U,
|
||||
"PunctType_comm": PunctType_comm, # U,
|
||||
"PunctType_colo": PunctType_colo, # U,
|
||||
"PunctType_semi": PunctType_semi, # U,
|
||||
"PunctType_dash": PunctType_dash, # U,
|
||||
"Style_arch": Style_arch, # cz, fi, U,
|
||||
"Style_rare": Style_rare, # cz, fi, U,
|
||||
"Style_poet": Style_poet, # cz, U,
|
||||
"Style_norm": Style_norm, # cz, U,
|
||||
"Style_coll": Style_coll, # cz, U,
|
||||
"Style_vrnc": Style_vrnc, # cz, U,
|
||||
"Style_sing": Style_sing, # cz, U,
|
||||
"Style_expr": Style_expr, # cz, U,
|
||||
"Style_derg": Style_derg, # cz, U,
|
||||
"Style_vulg": Style_vulg, # cz, U,
|
||||
"Style_yes": Style_yes, # fi, U,
|
||||
"StyleVariant_styleShort": StyleVariant_styleShort, # cz,
|
||||
"StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl,
|
||||
"VerbType_aux": VerbType_aux, # U,
|
||||
"VerbType_cop": VerbType_cop, # U,
|
||||
"VerbType_mod": VerbType_mod, # U,
|
||||
"VerbType_light": VerbType_light, # U,
|
||||
|
||||
"PERSON": PERSON,
|
||||
"NORP": NORP,
|
||||
|
|
|
@ -16,7 +16,6 @@ from ..tokens import Doc
|
|||
from ..strings import StringStore
|
||||
from ..lemmatizer import Lemmatizer
|
||||
from ..attrs import ORTH, TAG, HEAD, DEP
|
||||
from ..util import match_best_version, get_data_path
|
||||
|
||||
from io import StringIO, BytesIO
|
||||
from pathlib import Path
|
||||
|
@ -90,11 +89,8 @@ def en_entityrecognizer():
|
|||
|
||||
|
||||
@pytest.fixture
|
||||
def lemmatizer(path):
|
||||
if path is not None:
|
||||
return Lemmatizer.load(path)
|
||||
else:
|
||||
return None
|
||||
def lemmatizer():
|
||||
return English.Defaults.create_lemmatizer()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -106,14 +102,6 @@ def text_file_b():
|
|||
return BytesIO()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def path():
|
||||
if 'SPACY_DATA' in os.environ:
|
||||
return Path(os.environ['SPACY_DATA'])
|
||||
else:
|
||||
return match_best_version('en', None, get_data_path())
|
||||
|
||||
|
||||
# only used for tests that require loading the models
|
||||
# in all other cases, use specific instances
|
||||
@pytest.fixture(scope="session")
|
||||
|
|
10
spacy/tests/regression/test_issue781.py
Normal file
10
spacy/tests/regression/test_issue781.py
Normal file
|
@ -0,0 +1,10 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Note: "chromosomes" already worked before the bug fix
|
||||
@pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])])
|
||||
def test_issue781(lemmatizer, word, lemmas):
|
||||
assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas)
|
|
@ -31,6 +31,12 @@ def test_spans_root(doc):
|
|||
assert span.root.text == 'sentence'
|
||||
assert span.root.head.text == 'is'
|
||||
|
||||
def test_spans_string_fn(doc):
|
||||
span = doc[0:4]
|
||||
assert len(span) == 4
|
||||
assert span.text == 'This is a sentence'
|
||||
assert span.upper_ == 'THIS IS A SENTENCE'
|
||||
assert span.lower_ == 'this is a sentence'
|
||||
|
||||
def test_spans_root2(en_tokenizer):
|
||||
text = "through North and South Carolina"
|
||||
|
|
|
@ -365,6 +365,14 @@ cdef class Span:
|
|||
def __get__(self):
|
||||
return ' '.join([t.lemma_ for t in self]).strip()
|
||||
|
||||
property upper_:
|
||||
def __get__(self):
|
||||
return ''.join([t.string.upper() for t in self]).strip()
|
||||
|
||||
property lower_:
|
||||
def __get__(self):
|
||||
return ''.join([t.string.lower() for t in self]).strip()
|
||||
|
||||
property string:
|
||||
def __get__(self):
|
||||
return ''.join([t.string for t in self])
|
||||
|
|
|
@ -149,15 +149,16 @@ def check_renamed_kwargs(renamed, kwargs):
|
|||
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
|
||||
|
||||
|
||||
def parse_package_meta(package_path, package, on_error=False):
|
||||
def parse_package_meta(package_path, package, require=True):
|
||||
location = os.path.join(str(package_path), package, 'meta.json')
|
||||
if not os.path.isfile(location) and on_error:
|
||||
on_error()
|
||||
else:
|
||||
if os.path.isfile(location):
|
||||
with io.open(location, encoding='utf8') as f:
|
||||
meta = json.load(f)
|
||||
return meta
|
||||
return False
|
||||
elif require:
|
||||
raise IOError("Could not read meta.json from %s" % location)
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def print_msg(*text, **kwargs):
|
||||
|
|
|
@ -596,6 +596,8 @@ cdef class Vocab:
|
|||
vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
|
||||
|
||||
string_id = self.strings[chars[:word_len]]
|
||||
# Insert words into vocab to add vector.
|
||||
self.get_by_orth(self.mem, string_id)
|
||||
while string_id >= vectors.size():
|
||||
vectors.push_back(EMPTY_VEC)
|
||||
assert vec != NULL
|
||||
|
|
Loading…
Reference in New Issue
Block a user