Matthew Honnibal 2017-03-17 18:30:53 +01:00
commit d013aba7b5
13 changed files with 897 additions and 109 deletions

.gitignore

@@ -105,3 +105,7 @@ website/package.json
 website/announcement.jade
 website/www/
 website/.gitignore
+
+# Python virtualenv
+venv
+venv/*

bin/parser/train_ud.py

@@ -14,7 +14,7 @@ from spacy.language import Language
 from spacy.gold import GoldParse
 from spacy.vocab import Vocab
 from spacy.tagger import Tagger
-from spacy.pipeline import DependencyParser
+from spacy.pipeline import DependencyParser, BeamDependencyParser
 from spacy.syntax.parser import get_templates
 from spacy.syntax.arc_eager import ArcEager
 from spacy.scorer import Scorer
@@ -35,8 +35,8 @@ def read_conllx(loc, n=0):
                 lines.pop(0)
             tokens = []
             for line in lines:
-                id_, word, lemma, tag, pos, morph, head, dep, _1, _2 = line.split()
-                if '-' in id_:
+                id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
+                if '-' in id_ or '.' in id_:
                     continue
                 try:
                     id_ = int(id_) - 1
@@ -66,12 +66,8 @@ def score_model(vocab, tagger, parser, gold_docs, verbose=False):
     return scorer


-def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
-    if tag_map_loc:
-        with open(tag_map_loc) as file_:
-            tag_map = json.loads(file_.read())
-    else:
-        tag_map = DEFAULT_TAG_MAP
+def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
+    LangClass = spacy.util.get_lang_class(lang_name)
     train_sents = list(read_conllx(train_loc))
     train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
@@ -79,13 +75,37 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
     features = get_templates('basic')
     model_dir = pathlib.Path(model_dir)
+    if not model_dir.exists():
+        model_dir.mkdir()
     if not (model_dir / 'deps').exists():
         (model_dir / 'deps').mkdir()
+    if not (model_dir / 'pos').exists():
+        (model_dir / 'pos').mkdir()
     with (model_dir / 'deps' / 'config.json').open('wb') as file_:
         file_.write(
             json.dumps(
                 {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))
-    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
+    vocab = LangClass.Defaults.create_vocab()
+    if not (model_dir / 'vocab').exists():
+        (model_dir / 'vocab').mkdir()
+    else:
+        if (model_dir / 'vocab' / 'strings.json').exists():
+            with (model_dir / 'vocab' / 'strings.json').open() as file_:
+                vocab.strings.load(file_)
+        if (model_dir / 'vocab' / 'lexemes.bin').exists():
+            vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
+    if clusters_loc is not None:
+        clusters_loc = pathlib.Path(clusters_loc)
+        with clusters_loc.open() as file_:
+            for line in file_:
+                try:
+                    cluster, word, freq = line.split()
+                except ValueError:
+                    continue
+                lex = vocab[word]
+                lex.cluster = int(cluster[::-1], 2)
     # Populate vocab
     for _, doc_sents in train_sents:
         for (ids, words, tags, heads, deps, ner), _ in doc_sents:
@@ -95,13 +115,13 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
                 _ = vocab[dep]
             for tag in tags:
                 _ = vocab[tag]
-    if tag_map:
+    if vocab.morphology.tag_map:
         for tag in tags:
-            assert tag in tag_map, repr(tag)
+            assert tag in vocab.morphology.tag_map, repr(tag)
-    tagger = Tagger(vocab, tag_map=tag_map)
+    tagger = Tagger(vocab)
     parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)
-    for itn in range(15):
+    for itn in range(30):
         loss = 0.
         for _, doc_sents in train_sents:
             for (ids, words, tags, heads, deps, ner), _ in doc_sents:
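
One detail in the new cluster loading code: Brown cluster paths are bit strings in which leading zeros are significant, so the path is reversed before int(..., 2) packs it into an integer. A minimal sketch of the idea (pack_cluster is a hypothetical helper, not part of the commit):

    def pack_cluster(path):
        # Reversing the bit string keeps leading zeros significant:
        # '0010' and '10' would otherwise collapse to the same integer.
        return int(path[::-1], 2)

    assert pack_cluster('0010') == 4  # parsed as '0100'
    assert pack_cluster('10') == 1    # parsed as '01'
    assert pack_cluster('01') == 2    # parsed as '10'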

spacy/__init__.py

@@ -35,14 +35,16 @@ set_lang_class(bn.Bengali.lang, bn.Bengali)
 def load(name, **overrides):
     data_path = overrides.get('path', util.get_data_path())
-    meta = parse_package_meta(data_path, name)
-    lang = meta['lang'] if meta and 'lang' in meta else 'en'
+    meta = parse_package_meta(data_path, name, require=False)
+    lang = meta['lang'] if meta and 'lang' in meta else name
     cls = get_lang_class(lang)
     overrides['meta'] = meta
-    overrides['path'] = Path(data_path / name)
+    model_path = Path(data_path) / name
+    if model_path.exists():
+        overrides['path'] = model_path
     return cls(**overrides)


 def info(name):
-    meta = parse_package_meta(util.get_data_path(), name)
+    meta = parse_package_meta(util.get_data_path(), name, require=True)
     print(json.dumps(meta, indent=2))
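
The load() change makes the model directory optional: parse_package_meta no longer fails on a missing package, the name doubles as the language code, and the path override is only set when the directory actually exists. A usage sketch (hedged; behavior as implied by the diff):

    import spacy

    # With the 'en' model installed this loads it as before; without it,
    # 'en' now resolves to the English language class and a pipeline
    # without model data is returned instead of raising on a missing path.
    nlp = spacy.load('en')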

spacy/about.py

@@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy'
-__version__ = '1.6.0'
+__version__ = '1.7.0'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'

spacy/es/tag_map.py (new file)

@@ -0,0 +1,738 @@
{
"AUX__Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin": {
"freq": 865,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
"pos": "AUX"
},
"PUNCT__PunctSide=Fin|PunctType=Brck": {
"freq": 1476,
"morph": "PunctSide=Fin|PunctType=Brck",
"pos": "PUNCT"
},
"VERB__Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin": {
"freq": 7033,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
"pos": "VERB"
},
"PRON__Number=Sing|Person=2|PronType=Prs": {
"freq": 132,
"morph": "Number=Sing|Person=2|PronType=Prs",
"pos": "PRON"
},
"PRON": {
"pos": "PRON"
},
"VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
"freq": 525,
"morph": "Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin",
"pos": "VERB"
},
"SYM__NumForm=Digit|NumType=Frac": {
"freq": 236,
"morph": "NumForm=Digit|NumType=Frac",
"pos": "SYM"
},
"ADJ___": {
"freq": 515,
"morph": "_",
"pos": "ADJ"
},
"PRON__Person=3": {
"freq": 3185,
"morph": "Person=3",
"pos": "PRON"
},
"PRON__Case=Acc|Gender=Masc|Number=Plur|Person=3|PronType=Prs": {
"freq": 104,
"morph": "Case=Acc|Gender=Masc|Number=Plur|Person=3|PronType=Prs",
"pos": "PRON"
},
"DET__Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs": {
"freq": 148,
"morph": "Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs",
"pos": "DET"
},
"CONJ": {
"pos": "CONJ"
},
"PUNCT__PunctType=Comm": {
"freq": 24475,
"morph": "PunctType=Comm",
"pos": "PUNCT"
},
"ADV": {
"pos": "ADV"
},
"ADV__AdpType=Prep": {
"freq": 161,
"morph": "AdpType=Prep",
"pos": "ADV"
},
"ADJ__Number=Plur": {
"freq": 2617,
"morph": "Number=Plur",
"pos": "ADJ"
},
"AUX__Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": {
"freq": 149,
"morph": "Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
"pos": "AUX"
},
"ADJ__Gender=Masc|Number=Sing|NumType=Ord": {
"freq": 654,
"morph": "Gender=Masc|Number=Sing|NumType=Ord",
"pos": "ADJ"
},
"AUX__Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin": {
"freq": 272,
"morph": "Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin",
"pos": "AUX"
},
"AUX__Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin": {
"freq": 388,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin",
"pos": "AUX"
},
"ADJ__Gender=Masc|Number=Plur": {
"freq": 1995,
"morph": "Gender=Masc|Number=Plur",
"pos": "ADJ"
},
"DET": {
"pos": "DET"
},
"VERB__VerbForm=Inf": {
"freq": 8204,
"morph": "VerbForm=Inf",
"pos": "VERB"
},
"DET__Definite=Def|Gender=Fem|Number=Plur|PronType=Art": {
"freq": 4275,
"morph": "Definite=Def|Gender=Fem|Number=Plur|PronType=Art",
"pos": "DET"
},
"VERB__Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin": {
"freq": 495,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin",
"pos": "VERB"
},
"DET__Definite=Def|Gender=Masc|Number=Plur|PronType=Art": {
"freq": 6951,
"morph": "Definite=Def|Gender=Masc|Number=Plur|PronType=Art",
"pos": "DET"
},
"PRON___": {
"freq": 1871,
"morph": "_",
"pos": "PRON"
},
"DET__Definite=Ind|Gender=Masc|Number=Plur|PronType=Art": {
"freq": 113,
"morph": "Definite=Ind|Gender=Masc|Number=Plur|PronType=Art",
"pos": "DET"
},
"NOUN__Number=Sing": {
"freq": 1977,
"morph": "Number=Sing",
"pos": "NOUN"
},
"ADJ__Gender=Fem|Number=Sing|NumType=Ord": {
"freq": 568,
"morph": "Gender=Fem|Number=Sing|NumType=Ord",
"pos": "ADJ"
},
"NOUN__Gender=Masc|Number=Sing": {
"freq": 25557,
"morph": "Gender=Masc|Number=Sing",
"pos": "NOUN"
},
"PART": {
"pos": "PART"
},
"ADJ__Number=Sing": {
"freq": 6619,
"morph": "Number=Sing",
"pos": "ADJ"
},
"NUM": {
"pos": "NUM"
},
"DET__Number=Sing|PronType=Ind": {
"freq": 309,
"morph": "Number=Sing|PronType=Ind",
"pos": "DET"
},
"ADJ__Gender=Fem|Number=Sing|VerbForm=Part": {
"freq": 1387,
"morph": "Gender=Fem|Number=Sing|VerbForm=Part",
"pos": "ADJ"
},
"VERB__Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": {
"freq": 272,
"morph": "Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
"pos": "VERB"
},
"VERB__Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin": {
"freq": 1574,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin",
"pos": "VERB"
},
"PRON__Gender=Masc|Number=Sing|PronType=Dem": {
"freq": 115,
"morph": "Gender=Masc|Number=Sing|PronType=Dem",
"pos": "PRON"
},
"ADP": {
"pos": "ADP"
},
"NOUN__AdvType=Tim": {
"freq": 1504,
"morph": "AdvType=Tim",
"pos": "NOUN"
},
"AUX__Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin": {
"freq": 130,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin",
"pos": "AUX"
},
"PRON__Case=Nom|Number=Sing|Person=1|PronType=Prs": {
"freq": 115,
"morph": "Case=Nom|Number=Sing|Person=1|PronType=Prs",
"pos": "PRON"
},
"PUNCT__PunctType=Semi": {
"freq": 259,
"morph": "PunctType=Semi",
"pos": "PUNCT"
},
"PUNCT__PunctSide=Ini|PunctType=Qest": {
"freq": 206,
"morph": "PunctSide=Ini|PunctType=Qest",
"pos": "PUNCT"
},
"PRON__Case=Dat|Number=Sing|Person=3|PronType=Prs": {
"freq": 754,
"morph": "Case=Dat|Number=Sing|Person=3|PronType=Prs",
"pos": "PRON"
},
"PRON__Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": {
"freq": 624,
"morph": "Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs",
"pos": "PRON"
},
"NUM__NumForm=Digit": {
"freq": 2979,
"morph": "NumForm=Digit",
"pos": "NUM"
},
"PUNCT__PunctType=Colo": {
"freq": 638,
"morph": "PunctType=Colo",
"pos": "PUNCT"
},
"PROPN": {
"pos": "PROPN"
},
"X": {
"pos": "X"
},
"NOUN__NumForm=Digit": {
"freq": 555,
"morph": "NumForm=Digit",
"pos": "NOUN"
},
"VERB__Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part": {
"freq": 3297,
"morph": "Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part",
"pos": "VERB"
},
"ADJ__Gender=Masc|Number=Plur|NumType=Ord": {
"freq": 227,
"morph": "Gender=Masc|Number=Plur|NumType=Ord",
"pos": "ADJ"
},
"PRON__Gender=Masc|Number=Sing|Person=3|PronType=Prs": {
"freq": 205,
"morph": "Gender=Masc|Number=Sing|Person=3|PronType=Prs",
"pos": "PRON"
},
"NOUN__Number=Plur": {
"freq": 1463,
"morph": "Number=Plur",
"pos": "NOUN"
},
"DET__Number=Sing|Person=3|Poss=Yes|PronType=Prs": {
"freq": 2909,
"morph": "Number=Sing|Person=3|Poss=Yes|PronType=Prs",
"pos": "DET"
},
"VERB__VerbForm=Ger": {
"freq": 994,
"morph": "VerbForm=Ger",
"pos": "VERB"
},
"INTJ": {
"pos": "INTJ"
},
"VERB__Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin": {
"freq": 398,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin",
"pos": "VERB"
},
"AUX__Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 1403,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "AUX"
},
"PRON__Number=Plur|Person=1|PronType=Prs": {
"freq": 264,
"morph": "Number=Plur|Person=1|PronType=Prs",
"pos": "PRON"
},
"ADV__Negative=Neg": {
"freq": 2960,
"morph": "Negative=Neg",
"pos": "ADV"
},
"VERB__Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 2488,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "VERB"
},
"DET__Gender=Masc|Number=Sing|PronType=Ind": {
"freq": 855,
"morph": "Gender=Masc|Number=Sing|PronType=Ind",
"pos": "DET"
},
"VERB__Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 408,
"morph": "Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "VERB"
},
"PRON__Gender=Fem|Number=Sing|PronType=Ind": {
"freq": 237,
"morph": "Gender=Fem|Number=Sing|PronType=Ind",
"pos": "PRON"
},
"DET__Gender=Fem|Number=Plur|PronType=Ind": {
"freq": 592,
"morph": "Gender=Fem|Number=Plur|PronType=Ind",
"pos": "DET"
},
"ADJ__Gender=Fem|Number=Plur|VerbForm=Part": {
"freq": 614,
"morph": "Gender=Fem|Number=Plur|VerbForm=Part",
"pos": "ADJ"
},
"DET__Gender=Fem|Number=Sing|PronType=Dem": {
"freq": 808,
"morph": "Gender=Fem|Number=Sing|PronType=Dem",
"pos": "DET"
},
"DET__Gender=Fem|Number=Sing|PronType=Ind": {
"freq": 613,
"morph": "Gender=Fem|Number=Sing|PronType=Ind",
"pos": "DET"
},
"DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Art": {
"freq": 4277,
"morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Art",
"pos": "DET"
},
"VERB__Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 788,
"morph": "Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "VERB"
},
"NOUN__Gender=Fem": {
"freq": 145,
"morph": "Gender=Fem",
"pos": "NOUN"
},
"PRON__Gender=Fem|Number=Plur|PronType=Ind": {
"freq": 127,
"morph": "Gender=Fem|Number=Plur|PronType=Ind",
"pos": "PRON"
},
"AUX__Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": {
"freq": 729,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
"pos": "AUX"
},
"VERB__Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": {
"freq": 1223,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin",
"pos": "VERB"
},
"AUX__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": {
"freq": 164,
"morph": "Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin",
"pos": "AUX"
},
"PRON__PronType=Rel": {
"freq": 7301,
"morph": "PronType=Rel",
"pos": "PRON"
},
"DET__Definite=Def|Number=Sing|PronType=Art": {
"freq": 928,
"morph": "Definite=Def|Number=Sing|PronType=Art",
"pos": "DET"
},
"ADV___": {
"freq": 11334,
"morph": "_",
"pos": "ADV"
},
"ADJ": {
"pos": "ADJ"
},
"AUX__VerbForm=Ger": {
"freq": 154,
"morph": "VerbForm=Ger",
"pos": "AUX"
},
"PRON__Number=Sing|PronType=Int": {
"freq": 201,
"morph": "Number=Sing|PronType=Int",
"pos": "PRON"
},
"VERB__Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin": {
"freq": 1236,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin",
"pos": "VERB"
},
"NOUN__Gender=Masc|Number=Plur": {
"freq": 12310,
"morph": "Gender=Masc|Number=Plur",
"pos": "NOUN"
},
"NOUN__Gender=Fem|Number=Plur": {
"freq": 8612,
"morph": "Gender=Fem|Number=Plur",
"pos": "NOUN"
},
"VERB__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 6343,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "VERB"
},
"PRON__Gender=Masc|Number=Plur|PronType=Ind": {
"freq": 460,
"morph": "Gender=Masc|Number=Plur|PronType=Ind",
"pos": "PRON"
},
"VERB__Mood=Sub|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin": {
"freq": 100,
"morph": "Mood=Sub|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin",
"pos": "VERB"
},
"PUNCT__PunctSide=Ini|PunctType=Brck": {
"freq": 1482,
"morph": "PunctSide=Ini|PunctType=Brck",
"pos": "PUNCT"
},
"PRON__Gender=Masc|Number=Sing|PronType=Tot": {
"freq": 111,
"morph": "Gender=Masc|Number=Sing|PronType=Tot",
"pos": "PRON"
},
"SCONJ": {
"pos": "SCONJ"
},
"AUX__VerbForm=Inf": {
"freq": 1495,
"morph": "VerbForm=Inf",
"pos": "AUX"
},
"AUX__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 5227,
"morph": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "AUX"
},
"ADJ__AdpType=Prep": {
"freq": 124,
"morph": "AdpType=Prep",
"pos": "ADJ"
},
"PRON__Gender=Masc|Number=Sing|PronType=Ind": {
"freq": 624,
"morph": "Gender=Masc|Number=Sing|PronType=Ind",
"pos": "PRON"
},
"DET__Gender=Masc|Number=Plur|PronType=Dem": {
"freq": 269,
"morph": "Gender=Masc|Number=Plur|PronType=Dem",
"pos": "DET"
},
"ADJ__Gender=Fem|Number=Plur": {
"freq": 1612,
"morph": "Gender=Fem|Number=Plur",
"pos": "ADJ"
},
"NUM__Gender=Masc|Number=Plur|NumType=Card": {
"freq": 104,
"morph": "Gender=Masc|Number=Plur|NumType=Card",
"pos": "NUM"
},
"NUM__NumType=Card": {
"freq": 533,
"morph": "NumType=Card",
"pos": "NUM"
},
"SCONJ___": {
"freq": 10129,
"morph": "_",
"pos": "SCONJ"
},
"PRON__Number=Sing|PronType=Rel": {
"freq": 318,
"morph": "Number=Sing|PronType=Rel",
"pos": "PRON"
},
"VERB__Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin": {
"freq": 253,
"morph": "Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin",
"pos": "VERB"
},
"NOUN": {
"pos": "NOUN"
},
"NOUN__Gender=Masc": {
"freq": 153,
"morph": "Gender=Masc",
"pos": "NOUN"
},
"DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art": {
"freq": 3087,
"morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Art",
"pos": "DET"
},
"ADJ__Gender=Masc|Number=Plur|VerbForm=Part": {
"freq": 997,
"morph": "Gender=Masc|Number=Plur|VerbForm=Part",
"pos": "ADJ"
},
"PRON__Number=Sing|PronType=Dem": {
"freq": 302,
"morph": "Number=Sing|PronType=Dem",
"pos": "PRON"
},
"PRON__Number=Sing|Person=3|PronType=Prs": {
"freq": 116,
"morph": "Number=Sing|Person=3|PronType=Prs",
"pos": "PRON"
},
"PRON__Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": {
"freq": 173,
"morph": "Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs",
"pos": "PRON"
},
"PUNCT": {
"pos": "PUNCT"
},
"DET__Gender=Masc|Number=Sing|PronType=Dem": {
"freq": 962,
"morph": "Gender=Masc|Number=Sing|PronType=Dem",
"pos": "DET"
},
"PRON__Number=Plur|PronType=Rel": {
"freq": 102,
"morph": "Number=Plur|PronType=Rel",
"pos": "PRON"
},
"ADJ__Gender=Masc|Number=Sing": {
"freq": 5136,
"morph": "Gender=Masc|Number=Sing",
"pos": "ADJ"
},
"DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art": {
"freq": 22962,
"morph": "Definite=Def|Gender=Masc|Number=Sing|PronType=Art",
"pos": "DET"
},
"AUX__Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 107,
"morph": "Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "AUX"
},
"PRON__Case=Dat|Number=Plur|Person=3|PronType=Prs": {
"freq": 220,
"morph": "Case=Dat|Number=Plur|Person=3|PronType=Prs",
"pos": "PRON"
},
"VERB__Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part": {
"freq": 206,
"morph": "Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part",
"pos": "VERB"
},
"DET__Number=Plur|Person=3|Poss=Yes|PronType=Prs": {
"freq": 1021,
"morph": "Number=Plur|Person=3|Poss=Yes|PronType=Prs",
"pos": "DET"
},
"ADJ__Gender=Fem|Number=Plur|NumType=Ord": {
"freq": 101,
"morph": "Gender=Fem|Number=Plur|NumType=Ord",
"pos": "ADJ"
},
"PRON__PronType=Int": {
"freq": 137,
"morph": "PronType=Int",
"pos": "PRON"
},
"ADP__AdpType=Prep": {
"freq": 71133,
"morph": "AdpType=Prep",
"pos": "ADP"
},
"DET__Gender=Masc|Number=Plur|PronType=Ind": {
"freq": 904,
"morph": "Gender=Masc|Number=Plur|PronType=Ind",
"pos": "DET"
},
"AUX__Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": {
"freq": 299,
"morph": "Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
"pos": "AUX"
},
"DET__Gender=Fem|Number=Plur|PronType=Dem": {
"freq": 188,
"morph": "Gender=Fem|Number=Plur|PronType=Dem",
"pos": "DET"
},
"NUM__NumForm=Digit|NumType=Card": {
"freq": 1108,
"morph": "NumForm=Digit|NumType=Card",
"pos": "NUM"
},
"PUNCT__PunctType=Quot": {
"freq": 7380,
"morph": "PunctType=Quot",
"pos": "PUNCT"
},
"VERB__Gender=Masc|Number=Plur|Tense=Past|VerbForm=Part": {
"freq": 184,
"morph": "Gender=Masc|Number=Plur|Tense=Past|VerbForm=Part",
"pos": "VERB"
},
"PUNCT__PunctType=Dash": {
"freq": 2345,
"morph": "PunctType=Dash",
"pos": "PUNCT"
},
"ADJ__Gender=Fem|Number=Sing": {
"freq": 3935,
"morph": "Gender=Fem|Number=Sing",
"pos": "ADJ"
},
"AUX__Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin": {
"freq": 215,
"morph": "Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin",
"pos": "AUX"
},
"AUX__Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin": {
"freq": 218,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin",
"pos": "AUX"
},
"PROPN___": {
"freq": 34454,
"morph": "_",
"pos": "PROPN"
},
"PRON__Number=Sing|PronType=Ind": {
"freq": 421,
"morph": "Number=Sing|PronType=Ind",
"pos": "PRON"
},
"VERB__Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin": {
"freq": 359,
"morph": "Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin",
"pos": "VERB"
},
"PUNCT__PunctSide=Fin|PunctType=Qest": {
"freq": 312,
"morph": "PunctSide=Fin|PunctType=Qest",
"pos": "PUNCT"
},
"PRON__Number=Sing|Person=1|PronType=Prs": {
"freq": 298,
"morph": "Number=Sing|Person=1|PronType=Prs",
"pos": "PRON"
},
"PART__Negative=Neg": {
"freq": 122,
"morph": "Negative=Neg",
"pos": "PART"
},
"PRON__Gender=Masc|Number=Plur|Person=3|PronType=Prs": {
"freq": 176,
"morph": "Gender=Masc|Number=Plur|Person=3|PronType=Prs",
"pos": "PRON"
},
"NOUN__Gender=Fem|Number=Sing": {
"freq": 24416,
"morph": "Gender=Fem|Number=Sing",
"pos": "NOUN"
},
"ADJ__Gender=Masc|Number=Sing|VerbForm=Part": {
"freq": 2297,
"morph": "Gender=Masc|Number=Sing|VerbForm=Part",
"pos": "ADJ"
},
"CONJ___": {
"freq": 12225,
"morph": "_",
"pos": "CONJ"
},
"NUM__Number=Plur|NumType=Card": {
"freq": 2057,
"morph": "Number=Plur|NumType=Card",
"pos": "NUM"
},
"NOUN___": {
"freq": 4829,
"morph": "_",
"pos": "NOUN"
},
"VERB": {
"pos": "VERB"
},
"DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art": {
"freq": 16487,
"morph": "Definite=Def|Gender=Fem|Number=Sing|PronType=Art",
"pos": "DET"
},
"SYM": {
"pos": "SYM"
},
"VERB__Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin": {
"freq": 130,
"morph": "Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin",
"pos": "VERB"
},
"AUX": {
"pos": "AUX"
},
"AUX__Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part": {
"freq": 494,
"morph": "Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part",
"pos": "AUX"
},
"AUX__Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin": {
"freq": 199,
"morph": "Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin",
"pos": "AUX"
},
"VERB__Mood=Imp|Number=Sing|Person=2|VerbForm=Fin": {
"freq": 100,
"morph": "Mood=Imp|Number=Sing|Person=2|VerbForm=Fin",
"pos": "VERB"
},
"PUNCT__PunctType=Peri": {
"freq": 14170,
"morph": "PunctType=Peri",
"pos": "PUNCT"
}
}
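
The keys of the new tag map follow spaCy's POS__Feat=Val|Feat=Val naming convention: the coarse universal POS tag, a double underscore, then the pipe-delimited morphological feature string ('_' for no features). A quick consistency check (a sketch; assumes the file body is the raw JSON object shown above):

    import json

    with open('spacy/es/tag_map.py') as file_:
        tag_map = json.load(file_)

    for name, attrs in tag_map.items():
        # The text before the double underscore must match the 'pos' value.
        assert attrs['pos'] == name.split('__')[0], name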

spacy/lemmatizer.py

@@ -6,6 +6,7 @@ import ujson as json
 from .en.lemmatizer import INDEX, EXC, RULES
 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
+from .symbols import VerbForm_inf, VerbForm_none


 class Lemmatizer(object):
@@ -43,10 +44,13 @@ class Lemmatizer(object):
         avoid lemmatization entirely.'''
         morphology = {} if morphology is None else morphology
         others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
+        true_morph_key = morphology.get('morph', 0)
         if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
             return True
         elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
             return True
+        elif true_morph_key in (VerbForm_inf, VerbForm_none):
+            return True
         else:
             return False
@@ -70,11 +74,16 @@ def lemmatize(string, index, exceptions, rules):
     #if string in index:
     #    forms.append(string)
     forms.extend(exceptions.get(string, []))
+    oov_forms = []
     for old, new in rules:
         if string.endswith(old):
             form = string[:len(string) - len(old)] + new
             if form in index or not form.isalpha():
                 forms.append(form)
+            else:
+                oov_forms.append(form)
+    if not forms:
+        forms.extend(oov_forms)
     if not forms:
         forms.append(string)
     return set(forms)
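
The oov_forms addition changes behavior for out-of-vocabulary words: a suffix rule's output is now kept as a fallback even when the stripped form is missing from the index, instead of returning the surface string unchanged. A sketch with toy rules (the real index and rules come from spacy.en.lemmatizer):

    from spacy.lemmatizer import lemmatize

    # 'colocalizes' is out of vocabulary; before this change the call
    # returned {'colocalizes'}, now the rule-derived candidates survive
    # (compare test_issue781 below):
    forms = lemmatize('colocalizes', set(), {}, [('s', ''), ('es', '')])
    assert forms == {'colocalize', 'colocaliz'}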

spacy/symbols.pyx

@@ -310,78 +310,78 @@ IDS = {
     "Number_grpa": Number_grpa, # U20
     "Number_grpl": Number_grpl, # U20
     "Number_inv": Number_inv, # U20
-    "NumForm_digit ": NumForm_digit, # cz, sl, U,
+    "NumForm_digit": NumForm_digit, # cz, sl, U,
-    "NumForm_roman ": NumForm_roman, # cz, sl, U,
+    "NumForm_roman": NumForm_roman, # cz, sl, U,
-    "NumForm_word ": NumForm_word, # cz, sl, U,
+    "NumForm_word": NumForm_word, # cz, sl, U,
-    "NumValue_one ": NumValue_one, # cz, U,
+    "NumValue_one": NumValue_one, # cz, U,
-    "NumValue_two ": NumValue_two, # cz, U,
+    "NumValue_two": NumValue_two, # cz, U,
-    "NumValue_three ": NumValue_three, # cz, U,
+    "NumValue_three": NumValue_three, # cz, U,
-    "PartForm_pres ": PartForm_pres, # fi,
+    "PartForm_pres": PartForm_pres, # fi,
-    "PartForm_past ": PartForm_past, # fi,
+    "PartForm_past": PartForm_past, # fi,
-    "PartForm_agt ": PartForm_agt, # fi,
+    "PartForm_agt": PartForm_agt, # fi,
-    "PartForm_neg ": PartForm_neg, # fi,
+    "PartForm_neg": PartForm_neg, # fi,
-    "PartType_mod ": PartType_mod, # U,
+    "PartType_mod": PartType_mod, # U,
-    "PartType_emp ": PartType_emp, # U,
+    "PartType_emp": PartType_emp, # U,
-    "PartType_res ": PartType_res, # U,
+    "PartType_res": PartType_res, # U,
-    "PartType_inf ": PartType_inf, # U,
+    "PartType_inf": PartType_inf, # U,
-    "PartType_vbp ": PartType_vbp, # U,
+    "PartType_vbp": PartType_vbp, # U,
-    "Person_abs_one ": Person_abs_one, # bq, U,
+    "Person_abs_one": Person_abs_one, # bq, U,
-    "Person_abs_two ": Person_abs_two, # bq, U,
+    "Person_abs_two": Person_abs_two, # bq, U,
-    "Person_abs_three ": Person_abs_three, # bq, U,
+    "Person_abs_three": Person_abs_three, # bq, U,
-    "Person_dat_one ": Person_dat_one, # bq, U,
+    "Person_dat_one": Person_dat_one, # bq, U,
-    "Person_dat_two ": Person_dat_two, # bq, U,
+    "Person_dat_two": Person_dat_two, # bq, U,
-    "Person_dat_three ": Person_dat_three, # bq, U,
+    "Person_dat_three": Person_dat_three, # bq, U,
-    "Person_erg_one ": Person_erg_one, # bq, U,
+    "Person_erg_one": Person_erg_one, # bq, U,
-    "Person_erg_two ": Person_erg_two, # bq, U,
+    "Person_erg_two": Person_erg_two, # bq, U,
-    "Person_erg_three ": Person_erg_three, # bq, U,
+    "Person_erg_three": Person_erg_three, # bq, U,
-    "Person_psor_one ": Person_psor_one, # fi, U,
+    "Person_psor_one": Person_psor_one, # fi, U,
-    "Person_psor_two ": Person_psor_two, # fi, U,
+    "Person_psor_two": Person_psor_two, # fi, U,
-    "Person_psor_three ": Person_psor_three, # fi, U,
+    "Person_psor_three": Person_psor_three, # fi, U,
-    "Person_zero ": Person_zero, # U20
+    "Person_zero": Person_zero, # U20
-    "Person_four ": Person_four, # U20
+    "Person_four": Person_four, # U20
-    "Polite_inf ": Polite_inf, # bq, U,
+    "Polite_inf": Polite_inf, # bq, U,
-    "Polite_pol ": Polite_pol, # bq, U,
+    "Polite_pol": Polite_pol, # bq, U,
-    "Polite_abs_inf ": Polite_abs_inf, # bq, U,
+    "Polite_abs_inf": Polite_abs_inf, # bq, U,
-    "Polite_abs_pol ": Polite_abs_pol, # bq, U,
+    "Polite_abs_pol": Polite_abs_pol, # bq, U,
-    "Polite_erg_inf ": Polite_erg_inf, # bq, U,
+    "Polite_erg_inf": Polite_erg_inf, # bq, U,
-    "Polite_erg_pol ": Polite_erg_pol, # bq, U,
+    "Polite_erg_pol": Polite_erg_pol, # bq, U,
-    "Polite_dat_inf ": Polite_dat_inf, # bq, U,
+    "Polite_dat_inf": Polite_dat_inf, # bq, U,
-    "Polite_dat_pol ": Polite_dat_pol, # bq, U,
+    "Polite_dat_pol": Polite_dat_pol, # bq, U,
-    "Polite_infm ": Polite_infm, # U20
+    "Polite_infm": Polite_infm, # U20
-    "Polite_form ": Polite_form, # U20
+    "Polite_form": Polite_form, # U20
-    "Polite_form_elev ": Polite_form_elev, # U20
+    "Polite_form_elev": Polite_form_elev, # U20
     "Polite_form_humb ": Polite_form_humb, # U20
-    "Prefix_yes ": Prefix_yes, # U,
+    "Prefix_yes": Prefix_yes, # U,
-    "PrepCase_npr ": PrepCase_npr, # cz,
+    "PrepCase_npr": PrepCase_npr, # cz,
-    "PrepCase_pre ": PrepCase_pre, # U,
+    "PrepCase_pre": PrepCase_pre, # U,
-    "PunctSide_ini ": PunctSide_ini, # U,
+    "PunctSide_ini": PunctSide_ini, # U,
-    "PunctSide_fin ": PunctSide_fin, # U,
+    "PunctSide_fin": PunctSide_fin, # U,
-    "PunctType_peri ": PunctType_peri, # U,
+    "PunctType_peri": PunctType_peri, # U,
-    "PunctType_qest ": PunctType_qest, # U,
+    "PunctType_qest": PunctType_qest, # U,
-    "PunctType_excl ": PunctType_excl, # U,
+    "PunctType_excl": PunctType_excl, # U,
-    "PunctType_quot ": PunctType_quot, # U,
+    "PunctType_quot": PunctType_quot, # U,
-    "PunctType_brck ": PunctType_brck, # U,
+    "PunctType_brck": PunctType_brck, # U,
-    "PunctType_comm ": PunctType_comm, # U,
+    "PunctType_comm": PunctType_comm, # U,
-    "PunctType_colo ": PunctType_colo, # U,
+    "PunctType_colo": PunctType_colo, # U,
-    "PunctType_semi ": PunctType_semi, # U,
+    "PunctType_semi": PunctType_semi, # U,
-    "PunctType_dash ": PunctType_dash, # U,
+    "PunctType_dash": PunctType_dash, # U,
-    "Style_arch ": Style_arch, # cz, fi, U,
+    "Style_arch": Style_arch, # cz, fi, U,
-    "Style_rare ": Style_rare, # cz, fi, U,
+    "Style_rare": Style_rare, # cz, fi, U,
-    "Style_poet ": Style_poet, # cz, U,
+    "Style_poet": Style_poet, # cz, U,
-    "Style_norm ": Style_norm, # cz, U,
+    "Style_norm": Style_norm, # cz, U,
-    "Style_coll ": Style_coll, # cz, U,
+    "Style_coll": Style_coll, # cz, U,
-    "Style_vrnc ": Style_vrnc, # cz, U,
+    "Style_vrnc": Style_vrnc, # cz, U,
-    "Style_sing ": Style_sing, # cz, U,
+    "Style_sing": Style_sing, # cz, U,
-    "Style_expr ": Style_expr, # cz, U,
+    "Style_expr": Style_expr, # cz, U,
-    "Style_derg ": Style_derg, # cz, U,
+    "Style_derg": Style_derg, # cz, U,
-    "Style_vulg ": Style_vulg, # cz, U,
+    "Style_vulg": Style_vulg, # cz, U,
-    "Style_yes ": Style_yes, # fi, U,
+    "Style_yes": Style_yes, # fi, U,
-    "StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
+    "StyleVariant_styleShort": StyleVariant_styleShort, # cz,
-    "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
+    "StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl,
-    "VerbType_aux ": VerbType_aux, # U,
+    "VerbType_aux": VerbType_aux, # U,
-    "VerbType_cop ": VerbType_cop, # U,
+    "VerbType_cop": VerbType_cop, # U,
-    "VerbType_mod ": VerbType_mod, # U,
+    "VerbType_mod": VerbType_mod, # U,
-    "VerbType_light ": VerbType_light, # U,
+    "VerbType_light": VerbType_light, # U,
     "PERSON": PERSON,
     "NORP": NORP,

spacy/tests/conftest.py

@@ -16,7 +16,6 @@ from ..tokens import Doc
 from ..strings import StringStore
 from ..lemmatizer import Lemmatizer
 from ..attrs import ORTH, TAG, HEAD, DEP
-from ..util import match_best_version, get_data_path

 from io import StringIO, BytesIO
 from pathlib import Path
@@ -90,11 +89,8 @@ def en_entityrecognizer():
 @pytest.fixture
-def lemmatizer(path):
-    if path is not None:
-        return Lemmatizer.load(path)
-    else:
-        return None
+def lemmatizer():
+    return English.Defaults.create_lemmatizer()


 @pytest.fixture
@@ -106,14 +102,6 @@ def text_file_b():
     return BytesIO()


-@pytest.fixture
-def path():
-    if 'SPACY_DATA' in os.environ:
-        return Path(os.environ['SPACY_DATA'])
-    else:
-        return match_best_version('en', None, get_data_path())
-
-
 # only used for tests that require loading the models
 # in all other cases, use specific instances
 @pytest.fixture(scope="session")

spacy/tests/regression/test_issue781.py (new file)

@@ -0,0 +1,10 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest


# Note: "chromosomes" worked prior to the bug fix
@pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])])
def test_issue781(lemmatizer, word, lemmas):
    assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas)

spacy/tests/spans/test_span.py

@@ -31,6 +31,12 @@ def test_spans_root(doc):
     assert span.root.text == 'sentence'
     assert span.root.head.text == 'is'


+def test_spans_string_fn(doc):
+    span = doc[0:4]
+    assert len(span) == 4
+    assert span.text == 'This is a sentence'
+    assert span.upper_ == 'THIS IS A SENTENCE'
+    assert span.lower_ == 'this is a sentence'

 def test_spans_root2(en_tokenizer):
     text = "through North and South Carolina"
spacy/tokens/span.pyx

@@ -365,6 +365,14 @@ cdef class Span:
         def __get__(self):
            return ' '.join([t.lemma_ for t in self]).strip()

+    property upper_:
+        def __get__(self):
+            return ''.join([t.string.upper() for t in self]).strip()
+
+    property lower_:
+        def __get__(self):
+            return ''.join([t.string.lower() for t in self]).strip()
+
     property string:
         def __get__(self):
             return ''.join([t.string for t in self])
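
Since the new properties build on Token.string, the whitespace between tokens is preserved and only the span's trailing whitespace is stripped, matching the existing string semantics. Usage (a sketch; assumes English model data is installed):

    from spacy.en import English

    nlp = English()
    doc = nlp(u'This is a sentence.')
    span = doc[0:4]
    assert span.upper_ == u'THIS IS A SENTENCE'
    assert span.lower_ == u'this is a sentence'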

spacy/util.py

@@ -149,15 +149,16 @@ def check_renamed_kwargs(renamed, kwargs):
             raise TypeError("Keyword argument %s now renamed to %s" % (old, new))


-def parse_package_meta(package_path, package, on_error=False):
+def parse_package_meta(package_path, package, require=True):
     location = os.path.join(str(package_path), package, 'meta.json')
-    if not os.path.isfile(location) and on_error:
-        on_error()
-    else:
+    if os.path.isfile(location):
         with io.open(location, encoding='utf8') as f:
             meta = json.load(f)
         return meta
-    return False
+    elif require:
+        raise IOError("Could not read meta.json from %s" % location)
+    else:
+        return None


 def print_msg(*text, **kwargs):
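
The new require flag separates the two call sites seen earlier: spacy.load() passes require=False and tolerates a missing meta.json (receiving None), while spacy.info() passes require=True and surfaces the IOError. A sketch of the tolerant path:

    from spacy import util

    # As used by spacy.load(): returns None instead of raising when the
    # package directory has no meta.json.
    meta = util.parse_package_meta(util.get_data_path(), 'en', require=False)
    if meta is None:
        print('No model package found; falling back to the language class.')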

spacy/vocab.pyx

@@ -596,6 +596,8 @@ cdef class Vocab:
             vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
             string_id = self.strings[chars[:word_len]]
+            # Insert words into vocab to add vector.
+            self.get_by_orth(self.mem, string_id)
             while string_id >= vectors.size():
                 vectors.push_back(EMPTY_VEC)
             assert vec != NULL