This commit is contained in:
Matthew Honnibal 2017-03-17 18:30:53 +01:00
commit d013aba7b5
13 changed files with 897 additions and 109 deletions

4
.gitignore vendored
View File

@ -105,3 +105,7 @@ website/package.json
website/announcement.jade
website/www/
website/.gitignore
# Python virtualenv
venv
venv/*

View File

@ -14,7 +14,7 @@ from spacy.language import Language
from spacy.gold import GoldParse
from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.pipeline import DependencyParser
from spacy.pipeline import DependencyParser, BeamDependencyParser
from spacy.syntax.parser import get_templates
from spacy.syntax.arc_eager import ArcEager
from spacy.scorer import Scorer
@ -35,8 +35,8 @@ def read_conllx(loc, n=0):
lines.pop(0)
tokens = []
for line in lines:
id_, word, lemma, tag, pos, morph, head, dep, _1, _2 = line.split()
if '-' in id_:
id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
if '-' in id_ or '.' in id_:
continue
try:
id_ = int(id_) - 1
@ -66,12 +66,8 @@ def score_model(vocab, tagger, parser, gold_docs, verbose=False):
return scorer
def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
if tag_map_loc:
with open(tag_map_loc) as file_:
tag_map = json.loads(file_.read())
else:
tag_map = DEFAULT_TAG_MAP
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
LangClass = spacy.util.get_lang_class(lang_name)
train_sents = list(read_conllx(train_loc))
train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
@ -79,13 +75,37 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
features = get_templates('basic')
model_dir = pathlib.Path(model_dir)
if not model_dir.exists():
model_dir.mkdir()
if not (model_dir / 'deps').exists():
(model_dir / 'deps').mkdir()
if not (model_dir / 'pos').exists():
(model_dir / 'pos').mkdir()
with (model_dir / 'deps' / 'config.json').open('wb') as file_:
file_.write(
json.dumps(
{'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))
vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
vocab = LangClass.Defaults.create_vocab()
if not (model_dir / 'vocab').exists():
(model_dir / 'vocab').mkdir()
else:
if (model_dir / 'vocab' / 'strings.json').exists():
with (model_dir / 'vocab' / 'strings.json').open() as file_:
vocab.strings.load(file_)
if (model_dir / 'vocab' / 'lexemes.bin').exists():
vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
if clusters_loc is not None:
clusters_loc = pathlib.Path(clusters_loc)
with clusters_loc.open() as file_:
for line in file_:
try:
cluster, word, freq = line.split()
except ValueError:
continue
lex = vocab[word]
lex.cluster = int(cluster[::-1], 2)
# Populate vocab
for _, doc_sents in train_sents:
for (ids, words, tags, heads, deps, ner), _ in doc_sents:
@ -95,13 +115,13 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
_ = vocab[dep]
for tag in tags:
_ = vocab[tag]
if tag_map:
if vocab.morphology.tag_map:
for tag in tags:
assert tag in tag_map, repr(tag)
tagger = Tagger(vocab, tag_map=tag_map)
assert tag in vocab.morphology.tag_map, repr(tag)
tagger = Tagger(vocab)
parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)
for itn in range(15):
for itn in range(30):
loss = 0.
for _, doc_sents in train_sents:
for (ids, words, tags, heads, deps, ner), _ in doc_sents:

View File

@ -35,14 +35,16 @@ set_lang_class(bn.Bengali.lang, bn.Bengali)
def load(name, **overrides):
    """Load a model package by name and return a Language instance.

    Diff artifact repaired: the scraped hunk interleaved the pre- and
    post-commit versions of this function; this keeps the post-commit
    behavior (fall back to ``name`` as the language code, and only set
    ``overrides['path']`` when the model directory actually exists).
    """
    data_path = overrides.get('path', util.get_data_path())
    # Missing meta.json is tolerated here; `name` then doubles as the lang code.
    meta = parse_package_meta(data_path, name, require=False)
    lang = meta['lang'] if meta and 'lang' in meta else name
    cls = get_lang_class(lang)
    overrides['meta'] = meta
    model_path = Path(data_path) / name
    if model_path.exists():
        overrides['path'] = model_path
    return cls(**overrides)
def info(name):
    """Print the meta.json of an installed model package as indented JSON.

    Diff artifact repaired: the scrape showed both the old and new call
    lines; the post-commit version passes ``require=True`` so a missing
    meta.json raises instead of printing ``false``.
    """
    meta = parse_package_meta(util.get_data_path(), name, require=True)
    print(json.dumps(meta, indent=2))

View File

@ -3,7 +3,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy'
__version__ = '1.6.0'
__version__ = '1.7.0'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'

738
spacy/es/tag_map.py Normal file
View File

@ -0,0 +1,738 @@
{
"AUX__Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin": {
"freq": 865,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
"pos": "AUX"
},
"PUNCT__PunctSide=Fin|PunctType=Brck": {
"freq": 1476,
"morph": "PunctSide=Fin|PunctType=Brck",
"pos": "PUNCT"
},
"VERB__Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin": {
"freq": 7033,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
"pos": "VERB"
},
"PRON__Number=Sing|Person=2|PronType=Prs": {
"freq": 132,
"morph": "Number=Sing|Person=2|PronType=Prs",
"pos": "PRON"
},
"PRON": {
"pos": "PRON"
},
"VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
"freq": 525,
"morph": "Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin",
"pos": "VERB"
},
"SYM__NumForm=Digit|NumType=Frac": {
"freq": 236,
"morph": "NumForm=Digit|NumType=Frac",
"pos": "SYM"
},
"ADJ___": {
"freq": 515,
"morph": "_",
"pos": "ADJ"
},
"PRON__Person=3": {
"freq": 3185,
"morph": "Person=3",
"pos": "PRON"
},
"PRON__Case=Acc|Gender=Masc|Number=Plur|Person=3|PronType=Prs": {
"freq": 104,
"morph": "Case=Acc|Gender=Masc|Number=Plur|Person=3|PronType=Prs",
"pos": "PRON"
},
"DET__Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs": {
"freq": 148,
"morph": "Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs",
"pos": "DET"
},
"CONJ": {
"pos": "CONJ"
},
"PUNCT__PunctType=Comm": {
"freq": 24475,
"morph": "PunctType=Comm",
"pos": "PUNCT"
},
"ADV": {
"pos": "ADV"
},
"ADV__AdpType=Prep": {
"freq": 161,
"morph": "AdpType=Prep",
"pos": "ADV"
},
"ADJ__Number=Plur": {
"freq": 2617,
"morph": "Number=Plur",
"pos": "ADJ"
},
"AUX__Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": {
"freq": 149,
"morph": "Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
"pos": "AUX"
},
"ADJ__Gender=Masc|Number=Sing|NumType=Ord": {
"freq": 654,
"morph": "Gender=Masc|Number=Sing|NumType=Ord",
"pos": "ADJ"
},
"AUX__Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin": {
"freq": 272,
"morph": "Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin",
"pos": "AUX"
},
"AUX__Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin": {
"freq": 388,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin",
"pos": "AUX"
},
"ADJ__Gender=Masc|Number=Plur": {
"freq": 1995,
"morph": "Gender=Masc|Number=Plur",
"pos": "ADJ"
},
"DET": {
"pos": "DET"
},
"VERB__VerbForm=Inf": {
"freq": 8204,
"morph": "VerbForm=Inf",
"pos": "VERB"
},
"DET__Definite=Def|Gender=Fem|Number=Plur|PronType=Art": {
"freq": 4275,
"morph": "Definite=Def|Gender=Fem|Number=Plur|PronType=Art",
"pos": "DET"
},
"VERB__Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin": {
"freq": 495,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin",
"pos": "VERB"
},
"DET__Definite=Def|Gender=Masc|Number=Plur|PronType=Art": {
"freq": 6951,
"morph": "Definite=Def|Gender=Masc|Number=Plur|PronType=Art",
"pos": "DET"
},
"PRON___": {
"freq": 1871,
"morph": "_",
"pos": "PRON"
},
"DET__Definite=Ind|Gender=Masc|Number=Plur|PronType=Art": {
"freq": 113,
"morph": "Definite=Ind|Gender=Masc|Number=Plur|PronType=Art",
"pos": "DET"
},
"NOUN__Number=Sing": {
"freq": 1977,
"morph": "Number=Sing",
"pos": "NOUN"
},
"ADJ__Gender=Fem|Number=Sing|NumType=Ord": {
"freq": 568,
"morph": "Gender=Fem|Number=Sing|NumType=Ord",
"pos": "ADJ"
},
"NOUN__Gender=Masc|Number=Sing": {
"freq": 25557,
"morph": "Gender=Masc|Number=Sing",
"pos": "NOUN"
},
"PART": {
"pos": "PART"
},
"ADJ__Number=Sing": {
"freq": 6619,
"morph": "Number=Sing",
"pos": "ADJ"
},
"NUM": {
"pos": "NUM"
},
"DET__Number=Sing|PronType=Ind": {
"freq": 309,
"morph": "Number=Sing|PronType=Ind",
"pos": "DET"
},
"ADJ__Gender=Fem|Number=Sing|VerbForm=Part": {
"freq": 1387,
"morph": "Gender=Fem|Number=Sing|VerbForm=Part",
"pos": "ADJ"
},
"VERB__Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": {
"freq": 272,
"morph": "Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
"pos": "VERB"
},
"VERB__Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin": {
"freq": 1574,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin",
"pos": "VERB"
},
"PRON__Gender=Masc|Number=Sing|PronType=Dem": {
"freq": 115,
"morph": "Gender=Masc|Number=Sing|PronType=Dem",
"pos": "PRON"
},
"ADP": {
"pos": "ADP"
},
"NOUN__AdvType=Tim": {
"freq": 1504,
"morph": "AdvType=Tim",
"pos": "NOUN"
},
"AUX__Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin": {
"freq": 130,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin",
"pos": "AUX"
},
"PRON__Case=Nom|Number=Sing|Person=1|PronType=Prs": {
"freq": 115,
"morph": "Case=Nom|Number=Sing|Person=1|PronType=Prs",
"pos": "PRON"
},
"PUNCT__PunctType=Semi": {
"freq": 259,
"morph": "PunctType=Semi",
"pos": "PUNCT"
},
"PUNCT__PunctSide=Ini|PunctType=Qest": {
"freq": 206,
"morph": "PunctSide=Ini|PunctType=Qest",
"pos": "PUNCT"
},
"PRON__Case=Dat|Number=Sing|Person=3|PronType=Prs": {
"freq": 754,
"morph": "Case=Dat|Number=Sing|Person=3|PronType=Prs",
"pos": "PRON"
},
"PRON__Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": {
"freq": 624,
"morph": "Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs",
"pos": "PRON"
},
"NUM__NumForm=Digit": {
"freq": 2979,
"morph": "NumForm=Digit",
"pos": "NUM"
},
"PUNCT__PunctType=Colo": {
"freq": 638,
"morph": "PunctType=Colo",
"pos": "PUNCT"
},
"PROPN": {
"pos": "PROPN"
},
"X": {
"pos": "X"
},
"NOUN__NumForm=Digit": {
"freq": 555,
"morph": "NumForm=Digit",
"pos": "NOUN"
},
"VERB__Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part": {
"freq": 3297,
"morph": "Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part",
"pos": "VERB"
},
"ADJ__Gender=Masc|Number=Plur|NumType=Ord": {
"freq": 227,
"morph": "Gender=Masc|Number=Plur|NumType=Ord",
"pos": "ADJ"
},
"PRON__Gender=Masc|Number=Sing|Person=3|PronType=Prs": {
"freq": 205,
"morph": "Gender=Masc|Number=Sing|Person=3|PronType=Prs",
"pos": "PRON"
},
"NOUN__Number=Plur": {
"freq": 1463,
"morph": "Number=Plur",
"pos": "NOUN"
},
"DET__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {
"freq": 2909,
"morph": "Number=Sing|Person=3|Poss=Yes|PronType=Prs",
"pos": "DET"
},
"VERB__VerbForm=Ger": {
"freq": 994,
"morph": "VerbForm=Ger",
"pos": "VERB"
},
"INTJ": {
"pos": "INTJ"
},
"VERB__Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin": {
"freq": 398,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin",
"pos": "VERB"
},
"AUX__Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 1403,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "AUX"
},
"PRON__Number=Plur|Person=1|PronType=Prs": {
"freq": 264,
"morph": "Number=Plur|Person=1|PronType=Prs",
"pos": "PRON"
},
"ADV__Negative=Neg": {
"freq": 2960,
"morph": "Negative=Neg",
"pos": "ADV"
},
"VERB__Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 2488,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "VERB"
},
"DET__Gender=Masc|Number=Sing|PronType=Ind": {
"freq": 855,
"morph": "Gender=Masc|Number=Sing|PronType=Ind",
"pos": "DET"
},
"VERB__Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 408,
"morph": "Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "VERB"
},
"PRON__Gender=Fem|Number=Sing|PronType=Ind": {
"freq": 237,
"morph": "Gender=Fem|Number=Sing|PronType=Ind",
"pos": "PRON"
},
"DET__Gender=Fem|Number=Plur|PronType=Ind": {
"freq": 592,
"morph": "Gender=Fem|Number=Plur|PronType=Ind",
"pos": "DET"
},
"ADJ__Gender=Fem|Number=Plur|VerbForm=Part": {
"freq": 614,
"morph": "Gender=Fem|Number=Plur|VerbForm=Part",
"pos": "ADJ"
},
"DET__Gender=Fem|Number=Sing|PronType=Dem": {
"freq": 808,
"morph": "Gender=Fem|Number=Sing|PronType=Dem",
"pos": "DET"
},
"DET__Gender=Fem|Number=Sing|PronType=Ind": {
"freq": 613,
"morph": "Gender=Fem|Number=Sing|PronType=Ind",
"pos": "DET"
},
"DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Art": {
"freq": 4277,
"morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Art",
"pos": "DET"
},
"VERB__Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 788,
"morph": "Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "VERB"
},
"NOUN__Gender=Fem": {
"freq": 145,
"morph": "Gender=Fem",
"pos": "NOUN"
},
"PRON__Gender=Fem|Number=Plur|PronType=Ind": {
"freq": 127,
"morph": "Gender=Fem|Number=Plur|PronType=Ind",
"pos": "PRON"
},
"AUX__Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": {
"freq": 729,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
"pos": "AUX"
},
"VERB__Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": {
"freq": 1223,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
"pos": "VERB"
},
"AUX__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
"freq": 164,
"morph": "Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin",
"pos": "AUX"
},
"PRON__PronType=Rel": {
"freq": 7301,
"morph": "PronType=Rel",
"pos": "PRON"
},
"DET__Definite=Def|Number=Sing|PronType=Art": {
"freq": 928,
"morph": "Definite=Def|Number=Sing|PronType=Art",
"pos": "DET"
},
"ADV___": {
"freq": 11334,
"morph": "_",
"pos": "ADV"
},
"ADJ": {
"pos": "ADJ"
},
"AUX__VerbForm=Ger": {
"freq": 154,
"morph": "VerbForm=Ger",
"pos": "AUX"
},
"PRON__Number=Sing|PronType=Int": {
"freq": 201,
"morph": "Number=Sing|PronType=Int",
"pos": "PRON"
},
"VERB__Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin": {
"freq": 1236,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin",
"pos": "VERB"
},
"NOUN__Gender=Masc|Number=Plur": {
"freq": 12310,
"morph": "Gender=Masc|Number=Plur",
"pos": "NOUN"
},
"NOUN__Gender=Fem|Number=Plur": {
"freq": 8612,
"morph": "Gender=Fem|Number=Plur",
"pos": "NOUN"
},
"VERB__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 6343,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "VERB"
},
"PRON__Gender=Masc|Number=Plur|PronType=Ind": {
"freq": 460,
"morph": "Gender=Masc|Number=Plur|PronType=Ind",
"pos": "PRON"
},
"VERB__Mood=Sub|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin": {
"freq": 100,
"morph": "Mood=Sub|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin",
"pos": "VERB"
},
"PUNCT__PunctSide=Ini|PunctType=Brck": {
"freq": 1482,
"morph": "PunctSide=Ini|PunctType=Brck",
"pos": "PUNCT"
},
"PRON__Gender=Masc|Number=Sing|PronType=Tot": {
"freq": 111,
"morph": "Gender=Masc|Number=Sing|PronType=Tot",
"pos": "PRON"
},
"SCONJ": {
"pos": "SCONJ"
},
"AUX__VerbForm=Inf": {
"freq": 1495,
"morph": "VerbForm=Inf",
"pos": "AUX"
},
"AUX__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 5227,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "AUX"
},
"ADJ__AdpType=Prep": {
"freq": 124,
"morph": "AdpType=Prep",
"pos": "ADJ"
},
"PRON__Gender=Masc|Number=Sing|PronType=Ind": {
"freq": 624,
"morph": "Gender=Masc|Number=Sing|PronType=Ind",
"pos": "PRON"
},
"DET__Gender=Masc|Number=Plur|PronType=Dem": {
"freq": 269,
"morph": "Gender=Masc|Number=Plur|PronType=Dem",
"pos": "DET"
},
"ADJ__Gender=Fem|Number=Plur": {
"freq": 1612,
"morph": "Gender=Fem|Number=Plur",
"pos": "ADJ"
},
"NUM__Gender=Masc|Number=Plur|NumType=Card": {
"freq": 104,
"morph": "Gender=Masc|Number=Plur|NumType=Card",
"pos": "NUM"
},
"NUM__NumType=Card": {
"freq": 533,
"morph": "NumType=Card",
"pos": "NUM"
},
"SCONJ___": {
"freq": 10129,
"morph": "_",
"pos": "SCONJ"
},
"PRON__Number=Sing|PronType=Rel": {
"freq": 318,
"morph": "Number=Sing|PronType=Rel",
"pos": "PRON"
},
"VERB__Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin": {
"freq": 253,
"morph": "Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin",
"pos": "VERB"
},
"NOUN": {
"pos": "NOUN"
},
"NOUN__Gender=Masc": {
"freq": 153,
"morph": "Gender=Masc",
"pos": "NOUN"
},
"DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art": {
"freq": 3087,
"morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Art",
"pos": "DET"
},
"ADJ__Gender=Masc|Number=Plur|VerbForm=Part": {
"freq": 997,
"morph": "Gender=Masc|Number=Plur|VerbForm=Part",
"pos": "ADJ"
},
"PRON__Number=Sing|PronType=Dem": {
"freq": 302,
"morph": "Number=Sing|PronType=Dem",
"pos": "PRON"
},
"PRON__Number=Sing|Person=3|PronType=Prs": {
"freq": 116,
"morph": "Number=Sing|Person=3|PronType=Prs",
"pos": "PRON"
},
"PRON__Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": {
"freq": 173,
"morph": "Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs",
"pos": "PRON"
},
"PUNCT": {
"pos": "PUNCT"
},
"DET__Gender=Masc|Number=Sing|PronType=Dem": {
"freq": 962,
"morph": "Gender=Masc|Number=Sing|PronType=Dem",
"pos": "DET"
},
"PRON__Number=Plur|PronType=Rel": {
"freq": 102,
"morph": "Number=Plur|PronType=Rel",
"pos": "PRON"
},
"ADJ__Gender=Masc|Number=Sing": {
"freq": 5136,
"morph": "Gender=Masc|Number=Sing",
"pos": "ADJ"
},
"DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art": {
"freq": 22962,
"morph": "Definite=Def|Gender=Masc|Number=Sing|PronType=Art",
"pos": "DET"
},
"AUX__Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 107,
"morph": "Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "AUX"
},
"PRON__Case=Dat|Number=Plur|Person=3|PronType=Prs": {
"freq": 220,
"morph": "Case=Dat|Number=Plur|Person=3|PronType=Prs",
"pos": "PRON"
},
"VERB__Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part": {
"freq": 206,
"morph": "Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part",
"pos": "VERB"
},
"DET__Number=Plur|Person=3|Poss=Yes|PronType=Prs": {
"freq": 1021,
"morph": "Number=Plur|Person=3|Poss=Yes|PronType=Prs",
"pos": "DET"
},
"ADJ__Gender=Fem|Number=Plur|NumType=Ord": {
"freq": 101,
"morph": "Gender=Fem|Number=Plur|NumType=Ord",
"pos": "ADJ"
},
"PRON__PronType=Int": {
"freq": 137,
"morph": "PronType=Int",
"pos": "PRON"
},
"ADP__AdpType=Prep": {
"freq": 71133,
"morph": "AdpType=Prep",
"pos": "ADP"
},
"DET__Gender=Masc|Number=Plur|PronType=Ind": {
"freq": 904,
"morph": "Gender=Masc|Number=Plur|PronType=Ind",
"pos": "DET"
},
"AUX__Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 299,
"morph": "Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "AUX"
},
"DET__Gender=Fem|Number=Plur|PronType=Dem": {
"freq": 188,
"morph": "Gender=Fem|Number=Plur|PronType=Dem",
"pos": "DET"
},
"NUM__NumForm=Digit|NumType=Card": {
"freq": 1108,
"morph": "NumForm=Digit|NumType=Card",
"pos": "NUM"
},
"PUNCT__PunctType=Quot": {
"freq": 7380,
"morph": "PunctType=Quot",
"pos": "PUNCT"
},
"VERB__Gender=Masc|Number=Plur|Tense=Past|VerbForm=Part": {
"freq": 184,
"morph": "Gender=Masc|Number=Plur|Tense=Past|VerbForm=Part",
"pos": "VERB"
},
"PUNCT__PunctType=Dash": {
"freq": 2345,
"morph": "PunctType=Dash",
"pos": "PUNCT"
},
"ADJ__Gender=Fem|Number=Sing": {
"freq": 3935,
"morph": "Gender=Fem|Number=Sing",
"pos": "ADJ"
},
"AUX__Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin": {
"freq": 215,
"morph": "Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin",
"pos": "AUX"
},
"AUX__Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin": {
"freq": 218,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin",
"pos": "AUX"
},
"PROPN___": {
"freq": 34454,
"morph": "_",
"pos": "PROPN"
},
"PRON__Number=Sing|PronType=Ind": {
"freq": 421,
"morph": "Number=Sing|PronType=Ind",
"pos": "PRON"
},
"VERB__Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin": {
"freq": 359,
"morph": "Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin",
"pos": "VERB"
},
"PUNCT__PunctSide=Fin|PunctType=Qest": {
"freq": 312,
"morph": "PunctSide=Fin|PunctType=Qest",
"pos": "PUNCT"
},
"PRON__Number=Sing|Person=1|PronType=Prs": {
"freq": 298,
"morph": "Number=Sing|Person=1|PronType=Prs",
"pos": "PRON"
},
"PART__Negative=Neg": {
"freq": 122,
"morph": "Negative=Neg",
"pos": "PART"
},
"PRON__Gender=Masc|Number=Plur|Person=3|PronType=Prs": {
"freq": 176,
"morph": "Gender=Masc|Number=Plur|Person=3|PronType=Prs",
"pos": "PRON"
},
"NOUN__Gender=Fem|Number=Sing": {
"freq": 24416,
"morph": "Gender=Fem|Number=Sing",
"pos": "NOUN"
},
"ADJ__Gender=Masc|Number=Sing|VerbForm=Part": {
"freq": 2297,
"morph": "Gender=Masc|Number=Sing|VerbForm=Part",
"pos": "ADJ"
},
"CONJ___": {
"freq": 12225,
"morph": "_",
"pos": "CONJ"
},
"NUM__Number=Plur|NumType=Card": {
"freq": 2057,
"morph": "Number=Plur|NumType=Card",
"pos": "NUM"
},
"NOUN___": {
"freq": 4829,
"morph": "_",
"pos": "NOUN"
},
"VERB": {
"pos": "VERB"
},
"DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {
"freq": 16487,
"morph": "Definite=Def|Gender=Fem|Number=Sing|PronType=Art",
"pos": "DET"
},
"SYM": {
"pos": "SYM"
},
"VERB__Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin": {
"freq": 130,
"morph": "Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin",
"pos": "VERB"
},
"AUX": {
"pos": "AUX"
},
"AUX__Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part": {
"freq": 494,
"morph": "Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part",
"pos": "AUX"
},
"AUX__Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin": {
"freq": 199,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin",
"pos": "AUX"
},
"VERB__Mood=Imp|Number=Sing|Person=2|VerbForm=Fin": {
"freq": 100,
"morph": "Mood=Imp|Number=Sing|Person=2|VerbForm=Fin",
"pos": "VERB"
},
"PUNCT__PunctType=Peri": {
"freq": 14170,
"morph": "PunctType=Peri",
"pos": "PUNCT"
}
}

View File

@ -6,6 +6,7 @@ import ujson as json
from .en.lemmatizer import INDEX, EXC, RULES
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
from .symbols import VerbForm_inf, VerbForm_none
class Lemmatizer(object):
@ -43,10 +44,13 @@ class Lemmatizer(object):
avoid lemmatization entirely.'''
morphology = {} if morphology is None else morphology
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
true_morph_key = morphology.get('morph', 0)
if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
return True
elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
return True
elif true_morph_key in (VerbForm_inf, VerbForm_none):
return True
else:
return False
@ -70,11 +74,16 @@ def lemmatize(string, index, exceptions, rules):
#if string in index:
# forms.append(string)
forms.extend(exceptions.get(string, []))
oov_forms = []
for old, new in rules:
if string.endswith(old):
form = string[:len(string) - len(old)] + new
if form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms:
forms.append(string)
return set(forms)

View File

@ -310,78 +310,78 @@ IDS = {
"Number_grpa": Number_grpa, # U20
"Number_grpl": Number_grpl, # U20
"Number_inv": Number_inv, # U20
"NumForm_digit ": NumForm_digit, # cz, sl, U,
"NumForm_roman ": NumForm_roman, # cz, sl, U,
"NumForm_word ": NumForm_word, # cz, sl, U,
"NumValue_one ": NumValue_one, # cz, U,
"NumValue_two ": NumValue_two, # cz, U,
"NumValue_three ": NumValue_three, # cz, U,
"PartForm_pres ": PartForm_pres, # fi,
"PartForm_past ": PartForm_past, # fi,
"PartForm_agt ": PartForm_agt, # fi,
"PartForm_neg ": PartForm_neg, # fi,
"PartType_mod ": PartType_mod, # U,
"PartType_emp ": PartType_emp, # U,
"PartType_res ": PartType_res, # U,
"PartType_inf ": PartType_inf, # U,
"PartType_vbp ": PartType_vbp, # U,
"Person_abs_one ": Person_abs_one, # bq, U,
"Person_abs_two ": Person_abs_two, # bq, U,
"Person_abs_three ": Person_abs_three, # bq, U,
"Person_dat_one ": Person_dat_one, # bq, U,
"Person_dat_two ": Person_dat_two, # bq, U,
"Person_dat_three ": Person_dat_three, # bq, U,
"Person_erg_one ": Person_erg_one, # bq, U,
"Person_erg_two ": Person_erg_two, # bq, U,
"Person_erg_three ": Person_erg_three, # bq, U,
"Person_psor_one ": Person_psor_one, # fi, U,
"Person_psor_two ": Person_psor_two, # fi, U,
"Person_psor_three ": Person_psor_three, # fi, U,
"Person_zero ": Person_zero, # U20
"Person_four ": Person_four, # U20
"Polite_inf ": Polite_inf, # bq, U,
"Polite_pol ": Polite_pol, # bq, U,
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
"Polite_infm ": Polite_infm, # U20
"Polite_form ": Polite_form, # U20
"Polite_form_elev ": Polite_form_elev, # U20
"NumForm_digit": NumForm_digit, # cz, sl, U,
"NumForm_roman": NumForm_roman, # cz, sl, U,
"NumForm_word": NumForm_word, # cz, sl, U,
"NumValue_one": NumValue_one, # cz, U,
"NumValue_two": NumValue_two, # cz, U,
"NumValue_three": NumValue_three, # cz, U,
"PartForm_pres": PartForm_pres, # fi,
"PartForm_past": PartForm_past, # fi,
"PartForm_agt": PartForm_agt, # fi,
"PartForm_neg": PartForm_neg, # fi,
"PartType_mod": PartType_mod, # U,
"PartType_emp": PartType_emp, # U,
"PartType_res": PartType_res, # U,
"PartType_inf": PartType_inf, # U,
"PartType_vbp": PartType_vbp, # U,
"Person_abs_one": Person_abs_one, # bq, U,
"Person_abs_two": Person_abs_two, # bq, U,
"Person_abs_three": Person_abs_three, # bq, U,
"Person_dat_one": Person_dat_one, # bq, U,
"Person_dat_two": Person_dat_two, # bq, U,
"Person_dat_three": Person_dat_three, # bq, U,
"Person_erg_one": Person_erg_one, # bq, U,
"Person_erg_two": Person_erg_two, # bq, U,
"Person_erg_three": Person_erg_three, # bq, U,
"Person_psor_one": Person_psor_one, # fi, U,
"Person_psor_two": Person_psor_two, # fi, U,
"Person_psor_three": Person_psor_three, # fi, U,
"Person_zero": Person_zero, # U20
"Person_four": Person_four, # U20
"Polite_inf": Polite_inf, # bq, U,
"Polite_pol": Polite_pol, # bq, U,
"Polite_abs_inf": Polite_abs_inf, # bq, U,
"Polite_abs_pol": Polite_abs_pol, # bq, U,
"Polite_erg_inf": Polite_erg_inf, # bq, U,
"Polite_erg_pol": Polite_erg_pol, # bq, U,
"Polite_dat_inf": Polite_dat_inf, # bq, U,
"Polite_dat_pol": Polite_dat_pol, # bq, U,
"Polite_infm": Polite_infm, # U20
"Polite_form": Polite_form, # U20
"Polite_form_elev": Polite_form_elev, # U20
"Polite_form_humb ": Polite_form_humb, # U20
"Prefix_yes ": Prefix_yes, # U,
"PrepCase_npr ": PrepCase_npr, # cz,
"PrepCase_pre ": PrepCase_pre, # U,
"PunctSide_ini ": PunctSide_ini, # U,
"PunctSide_fin ": PunctSide_fin, # U,
"PunctType_peri ": PunctType_peri, # U,
"PunctType_qest ": PunctType_qest, # U,
"PunctType_excl ": PunctType_excl, # U,
"PunctType_quot ": PunctType_quot, # U,
"PunctType_brck ": PunctType_brck, # U,
"PunctType_comm ": PunctType_comm, # U,
"PunctType_colo ": PunctType_colo, # U,
"PunctType_semi ": PunctType_semi, # U,
"PunctType_dash ": PunctType_dash, # U,
"Style_arch ": Style_arch, # cz, fi, U,
"Style_rare ": Style_rare, # cz, fi, U,
"Style_poet ": Style_poet, # cz, U,
"Style_norm ": Style_norm, # cz, U,
"Style_coll ": Style_coll, # cz, U,
"Style_vrnc ": Style_vrnc, # cz, U,
"Style_sing ": Style_sing, # cz, U,
"Style_expr ": Style_expr, # cz, U,
"Style_derg ": Style_derg, # cz, U,
"Style_vulg ": Style_vulg, # cz, U,
"Style_yes ": Style_yes, # fi, U,
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
"VerbType_aux ": VerbType_aux, # U,
"VerbType_cop ": VerbType_cop, # U,
"VerbType_mod ": VerbType_mod, # U,
"VerbType_light ": VerbType_light, # U,
"Prefix_yes": Prefix_yes, # U,
"PrepCase_npr": PrepCase_npr, # cz,
"PrepCase_pre": PrepCase_pre, # U,
"PunctSide_ini": PunctSide_ini, # U,
"PunctSide_fin": PunctSide_fin, # U,
"PunctType_peri": PunctType_peri, # U,
"PunctType_qest": PunctType_qest, # U,
"PunctType_excl": PunctType_excl, # U,
"PunctType_quot": PunctType_quot, # U,
"PunctType_brck": PunctType_brck, # U,
"PunctType_comm": PunctType_comm, # U,
"PunctType_colo": PunctType_colo, # U,
"PunctType_semi": PunctType_semi, # U,
"PunctType_dash": PunctType_dash, # U,
"Style_arch": Style_arch, # cz, fi, U,
"Style_rare": Style_rare, # cz, fi, U,
"Style_poet": Style_poet, # cz, U,
"Style_norm": Style_norm, # cz, U,
"Style_coll": Style_coll, # cz, U,
"Style_vrnc": Style_vrnc, # cz, U,
"Style_sing": Style_sing, # cz, U,
"Style_expr": Style_expr, # cz, U,
"Style_derg": Style_derg, # cz, U,
"Style_vulg": Style_vulg, # cz, U,
"Style_yes": Style_yes, # fi, U,
"StyleVariant_styleShort": StyleVariant_styleShort, # cz,
"StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl,
"VerbType_aux": VerbType_aux, # U,
"VerbType_cop": VerbType_cop, # U,
"VerbType_mod": VerbType_mod, # U,
"VerbType_light": VerbType_light, # U,
"PERSON": PERSON,
"NORP": NORP,

View File

@ -16,7 +16,6 @@ from ..tokens import Doc
from ..strings import StringStore
from ..lemmatizer import Lemmatizer
from ..attrs import ORTH, TAG, HEAD, DEP
from ..util import match_best_version, get_data_path
from io import StringIO, BytesIO
from pathlib import Path
@ -90,11 +89,8 @@ def en_entityrecognizer():
@pytest.fixture
def lemmatizer():
    """Provide the default English lemmatizer built from language defaults.

    Diff artifact repaired: the scraped hunk interleaved the removed
    path-based fixture with this replacement; only the post-commit no-arg
    fixture is kept.
    """
    return English.Defaults.create_lemmatizer()
@pytest.fixture
@ -106,14 +102,6 @@ def text_file_b():
return BytesIO()
@pytest.fixture
def path():
if 'SPACY_DATA' in os.environ:
return Path(os.environ['SPACY_DATA'])
else:
return match_best_version('en', None, get_data_path())
# only used for tests that require loading the models
# in all other cases, use specific instances
@pytest.fixture(scope="session")

View File

@ -0,0 +1,10 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
# NOTE: "chromosomes" lemmatized correctly even before the bug fix.
@pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])])
def test_issue781(lemmatizer, word, lemmas):
    # Plural-noun morphology should yield exactly the expected lemma set.
    result = lemmatizer(word, 'noun', morphology={'number': 'plur'})
    assert result == set(lemmas)

View File

@ -31,6 +31,12 @@ def test_spans_root(doc):
assert span.root.text == 'sentence'
assert span.root.head.text == 'is'
def test_spans_string_fn(doc):
    """Check length, text, and cased-text helpers on a four-token span."""
    span = doc[0:4]
    expected = 'This is a sentence'
    assert len(span) == 4
    assert span.text == expected
    assert span.upper_ == expected.upper()
    assert span.lower_ == expected.lower()
def test_spans_root2(en_tokenizer):
text = "through North and South Carolina"

View File

@ -365,6 +365,14 @@ cdef class Span:
def __get__(self):
return ' '.join([t.lemma_ for t in self]).strip()
property upper_:
    def __get__(self):
        # Upper-cased span text; t.string presumably includes trailing
        # whitespace, so the final strip drops the trailing space — confirm.
        return ''.join(tok.string.upper() for tok in self).strip()
property lower_:
    def __get__(self):
        # Lower-cased span text, mirroring upper_; trailing whitespace
        # from the last token is stripped.
        return ''.join(tok.string.lower() for tok in self).strip()
property string:
    def __get__(self):
        # Raw concatenation of token strings, whitespace preserved.
        return ''.join(tok.string for tok in self)

View File

@ -149,15 +149,16 @@ def check_renamed_kwargs(renamed, kwargs):
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
def parse_package_meta(package_path, package, require=True):
    """Read and return the parsed meta.json for a model package.

    Diff artifact repaired: the scraped hunk interleaved the old
    ``on_error``-callback version with the new ``require`` version; this is
    the post-commit implementation.

    Args:
        package_path: Directory containing installed packages.
        package: Package (model) name.
        require: If True, raise IOError when meta.json is missing;
            otherwise return None.

    Returns:
        The decoded meta dict, or None when missing and not required.

    Raises:
        IOError: meta.json does not exist and ``require`` is True.
    """
    location = os.path.join(str(package_path), package, 'meta.json')
    if os.path.isfile(location):
        with io.open(location, encoding='utf8') as f:
            meta = json.load(f)
        return meta
    elif require:
        raise IOError("Could not read meta.json from %s" % location)
    else:
        return None
def print_msg(*text, **kwargs):

View File

@ -596,6 +596,8 @@ cdef class Vocab:
vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
string_id = self.strings[chars[:word_len]]
# Insert words into vocab to add vector.
self.get_by_orth(self.mem, string_id)
while string_id >= vectors.size():
vectors.push_back(EMPTY_VEC)
assert vec != NULL