diff --git a/.gitignore b/.gitignore index 64f24a487..8716a8ef0 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,7 @@ website/package.json website/announcement.jade website/www/ website/.gitignore + +# Python virtualenv +venv +venv/* diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py index 4b3080ce5..c87f40680 100644 --- a/bin/parser/train_ud.py +++ b/bin/parser/train_ud.py @@ -14,7 +14,7 @@ from spacy.language import Language from spacy.gold import GoldParse from spacy.vocab import Vocab from spacy.tagger import Tagger -from spacy.pipeline import DependencyParser +from spacy.pipeline import DependencyParser, BeamDependencyParser from spacy.syntax.parser import get_templates from spacy.syntax.arc_eager import ArcEager from spacy.scorer import Scorer @@ -35,8 +35,8 @@ def read_conllx(loc, n=0): lines.pop(0) tokens = [] for line in lines: - id_, word, lemma, tag, pos, morph, head, dep, _1, _2 = line.split() - if '-' in id_: + id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split() + if '-' in id_ or '.' 
in id_: continue try: id_ = int(id_) - 1 @@ -66,12 +66,8 @@ def score_model(vocab, tagger, parser, gold_docs, verbose=False): return scorer -def main(train_loc, dev_loc, model_dir, tag_map_loc=None): - if tag_map_loc: - with open(tag_map_loc) as file_: - tag_map = json.loads(file_.read()) - else: - tag_map = DEFAULT_TAG_MAP +def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None): + LangClass = spacy.util.get_lang_class(lang_name) train_sents = list(read_conllx(train_loc)) train_sents = PseudoProjectivity.preprocess_training_data(train_sents) @@ -79,13 +75,37 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc=None): features = get_templates('basic') model_dir = pathlib.Path(model_dir) + if not model_dir.exists(): + model_dir.mkdir() if not (model_dir / 'deps').exists(): (model_dir / 'deps').mkdir() + if not (model_dir / 'pos').exists(): + (model_dir / 'pos').mkdir() with (model_dir / 'deps' / 'config.json').open('wb') as file_: file_.write( json.dumps( {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8')) - vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map) + + vocab = LangClass.Defaults.create_vocab() + if not (model_dir / 'vocab').exists(): + (model_dir / 'vocab').mkdir() + else: + if (model_dir / 'vocab' / 'strings.json').exists(): + with (model_dir / 'vocab' / 'strings.json').open() as file_: + vocab.strings.load(file_) + if (model_dir / 'vocab' / 'lexemes.bin').exists(): + vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin') + + if clusters_loc is not None: + clusters_loc = pathlib.Path(clusters_loc) + with clusters_loc.open() as file_: + for line in file_: + try: + cluster, word, freq = line.split() + except ValueError: + continue + lex = vocab[word] + lex.cluster = int(cluster[::-1], 2) # Populate vocab for _, doc_sents in train_sents: for (ids, words, tags, heads, deps, ner), _ in doc_sents: @@ -95,13 +115,13 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc=None): _ = 
vocab[dep] for tag in tags: _ = vocab[tag] - if tag_map: + if vocab.morphology.tag_map: for tag in tags: - assert tag in tag_map, repr(tag) - tagger = Tagger(vocab, tag_map=tag_map) + assert tag in vocab.morphology.tag_map, repr(tag) + tagger = Tagger(vocab) parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0) - for itn in range(15): + for itn in range(30): loss = 0. for _, doc_sents in train_sents: for (ids, words, tags, heads, deps, ner), _ in doc_sents: diff --git a/spacy/__init__.py b/spacy/__init__.py index ca5a39f05..0b76f1f9e 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -35,14 +35,16 @@ set_lang_class(bn.Bengali.lang, bn.Bengali) def load(name, **overrides): data_path = overrides.get('path', util.get_data_path()) - meta = parse_package_meta(data_path, name) - lang = meta['lang'] if meta and 'lang' in meta else 'en' + meta = parse_package_meta(data_path, name, require=False) + lang = meta['lang'] if meta and 'lang' in meta else name cls = get_lang_class(lang) overrides['meta'] = meta - overrides['path'] = Path(data_path / name) + model_path = Path(data_path) / name + if model_path.exists(): + overrides['path'] = model_path return cls(**overrides) def info(name): - meta = parse_package_meta(util.get_data_path(), name) + meta = parse_package_meta(util.get_data_path(), name, require=True) print(json.dumps(meta, indent=2)) diff --git a/spacy/about.py b/spacy/about.py index 8e21ab316..b2ceacf54 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy' -__version__ = '1.6.0' +__version__ = '1.7.0' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' diff --git a/spacy/es/tag_map.py b/spacy/es/tag_map.py new file mode 100644 index 000000000..6054d147d --- /dev/null +++ b/spacy/es/tag_map.py @@ -0,0 +1,738 @@ +{ + 
"AUX__Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin": { + "freq": 865, + "morph": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin", + "pos": "AUX" + }, + "PUNCT__PunctSide=Fin|PunctType=Brck": { + "freq": 1476, + "morph": "PunctSide=Fin|PunctType=Brck", + "pos": "PUNCT" + }, + "VERB__Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin": { + "freq": 7033, + "morph": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin", + "pos": "VERB" + }, + "PRON__Number=Sing|Person=2|PronType=Prs": { + "freq": 132, + "morph": "Number=Sing|Person=2|PronType=Prs", + "pos": "PRON" + }, + "PRON": { + "pos": "PRON" + }, + "VERB__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": { + "freq": 525, + "morph": "Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin", + "pos": "VERB" + }, + "SYM__NumForm=Digit|NumType=Frac": { + "freq": 236, + "morph": "NumForm=Digit|NumType=Frac", + "pos": "SYM" + }, + "ADJ___": { + "freq": 515, + "morph": "_", + "pos": "ADJ" + }, + "PRON__Person=3": { + "freq": 3185, + "morph": "Person=3", + "pos": "PRON" + }, + "PRON__Case=Acc|Gender=Masc|Number=Plur|Person=3|PronType=Prs": { + "freq": 104, + "morph": "Case=Acc|Gender=Masc|Number=Plur|Person=3|PronType=Prs", + "pos": "PRON" + }, + "DET__Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs": { + "freq": 148, + "morph": "Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs", + "pos": "DET" + }, + "CONJ": { + "pos": "CONJ" + }, + "PUNCT__PunctType=Comm": { + "freq": 24475, + "morph": "PunctType=Comm", + "pos": "PUNCT" + }, + "ADV": { + "pos": "ADV" + }, + "ADV__AdpType=Prep": { + "freq": 161, + "morph": "AdpType=Prep", + "pos": "ADV" + }, + "ADJ__Number=Plur": { + "freq": 2617, + "morph": "Number=Plur", + "pos": "ADJ" + }, + "AUX__Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": { + "freq": 149, + "morph": "Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin", + "pos": "AUX" + }, + "ADJ__Gender=Masc|Number=Sing|NumType=Ord": { + "freq": 654, + "morph": 
"Gender=Masc|Number=Sing|NumType=Ord", + "pos": "ADJ" + }, + "AUX__Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin": { + "freq": 272, + "morph": "Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin", + "pos": "AUX" + }, + "AUX__Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin": { + "freq": 388, + "morph": "Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin", + "pos": "AUX" + }, + "ADJ__Gender=Masc|Number=Plur": { + "freq": 1995, + "morph": "Gender=Masc|Number=Plur", + "pos": "ADJ" + }, + "DET": { + "pos": "DET" + }, + "VERB__VerbForm=Inf": { + "freq": 8204, + "morph": "VerbForm=Inf", + "pos": "VERB" + }, + "DET__Definite=Def|Gender=Fem|Number=Plur|PronType=Art": { + "freq": 4275, + "morph": "Definite=Def|Gender=Fem|Number=Plur|PronType=Art", + "pos": "DET" + }, + "VERB__Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin": { + "freq": 495, + "morph": "Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin", + "pos": "VERB" + }, + "DET__Definite=Def|Gender=Masc|Number=Plur|PronType=Art": { + "freq": 6951, + "morph": "Definite=Def|Gender=Masc|Number=Plur|PronType=Art", + "pos": "DET" + }, + "PRON___": { + "freq": 1871, + "morph": "_", + "pos": "PRON" + }, + "DET__Definite=Ind|Gender=Masc|Number=Plur|PronType=Art": { + "freq": 113, + "morph": "Definite=Ind|Gender=Masc|Number=Plur|PronType=Art", + "pos": "DET" + }, + "NOUN__Number=Sing": { + "freq": 1977, + "morph": "Number=Sing", + "pos": "NOUN" + }, + "ADJ__Gender=Fem|Number=Sing|NumType=Ord": { + "freq": 568, + "morph": "Gender=Fem|Number=Sing|NumType=Ord", + "pos": "ADJ" + }, + "NOUN__Gender=Masc|Number=Sing": { + "freq": 25557, + "morph": "Gender=Masc|Number=Sing", + "pos": "NOUN" + }, + "PART": { + "pos": "PART" + }, + "ADJ__Number=Sing": { + "freq": 6619, + "morph": "Number=Sing", + "pos": "ADJ" + }, + "NUM": { + "pos": "NUM" + }, + "DET__Number=Sing|PronType=Ind": { + "freq": 309, + "morph": "Number=Sing|PronType=Ind", + "pos": "DET" + }, + "ADJ__Gender=Fem|Number=Sing|VerbForm=Part": { + "freq": 1387, + "morph": 
"Gender=Fem|Number=Sing|VerbForm=Part", + "pos": "ADJ" + }, + "VERB__Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": { + "freq": 272, + "morph": "Mood=Sub|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin", + "pos": "VERB" + }, + "VERB__Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin": { + "freq": 1574, + "morph": "Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin", + "pos": "VERB" + }, + "PRON__Gender=Masc|Number=Sing|PronType=Dem": { + "freq": 115, + "morph": "Gender=Masc|Number=Sing|PronType=Dem", + "pos": "PRON" + }, + "ADP": { + "pos": "ADP" + }, + "NOUN__AdvType=Tim": { + "freq": 1504, + "morph": "AdvType=Tim", + "pos": "NOUN" + }, + "AUX__Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin": { + "freq": 130, + "morph": "Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin", + "pos": "AUX" + }, + "PRON__Case=Nom|Number=Sing|Person=1|PronType=Prs": { + "freq": 115, + "morph": "Case=Nom|Number=Sing|Person=1|PronType=Prs", + "pos": "PRON" + }, + "PUNCT__PunctType=Semi": { + "freq": 259, + "morph": "PunctType=Semi", + "pos": "PUNCT" + }, + "PUNCT__PunctSide=Ini|PunctType=Qest": { + "freq": 206, + "morph": "PunctSide=Ini|PunctType=Qest", + "pos": "PUNCT" + }, + "PRON__Case=Dat|Number=Sing|Person=3|PronType=Prs": { + "freq": 754, + "morph": "Case=Dat|Number=Sing|Person=3|PronType=Prs", + "pos": "PRON" + }, + "PRON__Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs": { + "freq": 624, + "morph": "Case=Acc|Gender=Masc|Number=Sing|Person=3|PronType=Prs", + "pos": "PRON" + }, + "NUM__NumForm=Digit": { + "freq": 2979, + "morph": "NumForm=Digit", + "pos": "NUM" + }, + "PUNCT__PunctType=Colo": { + "freq": 638, + "morph": "PunctType=Colo", + "pos": "PUNCT" + }, + "PROPN": { + "pos": "PROPN" + }, + "X": { + "pos": "X" + }, + "NOUN__NumForm=Digit": { + "freq": 555, + "morph": "NumForm=Digit", + "pos": "NOUN" + }, + "VERB__Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part": { + "freq": 3297, + "morph": "Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part", + 
"pos": "VERB" + }, + "ADJ__Gender=Masc|Number=Plur|NumType=Ord": { + "freq": 227, + "morph": "Gender=Masc|Number=Plur|NumType=Ord", + "pos": "ADJ" + }, + "PRON__Gender=Masc|Number=Sing|Person=3|PronType=Prs": { + "freq": 205, + "morph": "Gender=Masc|Number=Sing|Person=3|PronType=Prs", + "pos": "PRON" + }, + "NOUN__Number=Plur": { + "freq": 1463, + "morph": "Number=Plur", + "pos": "NOUN" + }, + "DET__Number=Sing|Person=3|Poss=Yes|PronType=Prs": { + "freq": 2909, + "morph": "Number=Sing|Person=3|Poss=Yes|PronType=Prs", + "pos": "DET" + }, + "VERB__VerbForm=Ger": { + "freq": 994, + "morph": "VerbForm=Ger", + "pos": "VERB" + }, + "INTJ": { + "pos": "INTJ" + }, + "VERB__Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin": { + "freq": 398, + "morph": "Mood=Ind|Number=Plur|Person=3|Tense=Fut|VerbForm=Fin", + "pos": "VERB" + }, + "AUX__Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": { + "freq": 1403, + "morph": "Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin", + "pos": "AUX" + }, + "PRON__Number=Plur|Person=1|PronType=Prs": { + "freq": 264, + "morph": "Number=Plur|Person=1|PronType=Prs", + "pos": "PRON" + }, + "ADV__Negative=Neg": { + "freq": 2960, + "morph": "Negative=Neg", + "pos": "ADV" + }, + "VERB__Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": { + "freq": 2488, + "morph": "Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin", + "pos": "VERB" + }, + "DET__Gender=Masc|Number=Sing|PronType=Ind": { + "freq": 855, + "morph": "Gender=Masc|Number=Sing|PronType=Ind", + "pos": "DET" + }, + "VERB__Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": { + "freq": 408, + "morph": "Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin", + "pos": "VERB" + }, + "PRON__Gender=Fem|Number=Sing|PronType=Ind": { + "freq": 237, + "morph": "Gender=Fem|Number=Sing|PronType=Ind", + "pos": "PRON" + }, + "DET__Gender=Fem|Number=Plur|PronType=Ind": { + "freq": 592, + "morph": "Gender=Fem|Number=Plur|PronType=Ind", + "pos": "DET" + }, + 
"ADJ__Gender=Fem|Number=Plur|VerbForm=Part": { + "freq": 614, + "morph": "Gender=Fem|Number=Plur|VerbForm=Part", + "pos": "ADJ" + }, + "DET__Gender=Fem|Number=Sing|PronType=Dem": { + "freq": 808, + "morph": "Gender=Fem|Number=Sing|PronType=Dem", + "pos": "DET" + }, + "DET__Gender=Fem|Number=Sing|PronType=Ind": { + "freq": 613, + "morph": "Gender=Fem|Number=Sing|PronType=Ind", + "pos": "DET" + }, + "DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Art": { + "freq": 4277, + "morph": "Definite=Ind|Gender=Masc|Number=Sing|PronType=Art", + "pos": "DET" + }, + "VERB__Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": { + "freq": 788, + "morph": "Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin", + "pos": "VERB" + }, + "NOUN__Gender=Fem": { + "freq": 145, + "morph": "Gender=Fem", + "pos": "NOUN" + }, + "PRON__Gender=Fem|Number=Plur|PronType=Ind": { + "freq": 127, + "morph": "Gender=Fem|Number=Plur|PronType=Ind", + "pos": "PRON" + }, + "AUX__Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": { + "freq": 729, + "morph": "Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin", + "pos": "AUX" + }, + "VERB__Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin": { + "freq": 1223, + "morph": "Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin", + "pos": "VERB" + }, + "AUX__Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin": { + "freq": 164, + "morph": "Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin", + "pos": "AUX" + }, + "PRON__PronType=Rel": { + "freq": 7301, + "morph": "PronType=Rel", + "pos": "PRON" + }, + "DET__Definite=Def|Number=Sing|PronType=Art": { + "freq": 928, + "morph": "Definite=Def|Number=Sing|PronType=Art", + "pos": "DET" + }, + "ADV___": { + "freq": 11334, + "morph": "_", + "pos": "ADV" + }, + "ADJ": { + "pos": "ADJ" + }, + "AUX__VerbForm=Ger": { + "freq": 154, + "morph": "VerbForm=Ger", + "pos": "AUX" + }, + "PRON__Number=Sing|PronType=Int": { + "freq": 201, + "morph": "Number=Sing|PronType=Int", + "pos": "PRON" + }, + 
"VERB__Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin": { + "freq": 1236, + "morph": "Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin", + "pos": "VERB" + }, + "NOUN__Gender=Masc|Number=Plur": { + "freq": 12310, + "morph": "Gender=Masc|Number=Plur", + "pos": "NOUN" + }, + "NOUN__Gender=Fem|Number=Plur": { + "freq": 8612, + "morph": "Gender=Fem|Number=Plur", + "pos": "NOUN" + }, + "VERB__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": { + "freq": 6343, + "morph": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin", + "pos": "VERB" + }, + "PRON__Gender=Masc|Number=Plur|PronType=Ind": { + "freq": 460, + "morph": "Gender=Masc|Number=Plur|PronType=Ind", + "pos": "PRON" + }, + "VERB__Mood=Sub|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin": { + "freq": 100, + "morph": "Mood=Sub|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin", + "pos": "VERB" + }, + "PUNCT__PunctSide=Ini|PunctType=Brck": { + "freq": 1482, + "morph": "PunctSide=Ini|PunctType=Brck", + "pos": "PUNCT" + }, + "PRON__Gender=Masc|Number=Sing|PronType=Tot": { + "freq": 111, + "morph": "Gender=Masc|Number=Sing|PronType=Tot", + "pos": "PRON" + }, + "SCONJ": { + "pos": "SCONJ" + }, + "AUX__VerbForm=Inf": { + "freq": 1495, + "morph": "VerbForm=Inf", + "pos": "AUX" + }, + "AUX__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": { + "freq": 5227, + "morph": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin", + "pos": "AUX" + }, + "ADJ__AdpType=Prep": { + "freq": 124, + "morph": "AdpType=Prep", + "pos": "ADJ" + }, + "PRON__Gender=Masc|Number=Sing|PronType=Ind": { + "freq": 624, + "morph": "Gender=Masc|Number=Sing|PronType=Ind", + "pos": "PRON" + }, + "DET__Gender=Masc|Number=Plur|PronType=Dem": { + "freq": 269, + "morph": "Gender=Masc|Number=Plur|PronType=Dem", + "pos": "DET" + }, + "ADJ__Gender=Fem|Number=Plur": { + "freq": 1612, + "morph": "Gender=Fem|Number=Plur", + "pos": "ADJ" + }, + "NUM__Gender=Masc|Number=Plur|NumType=Card": { + "freq": 104, + "morph": 
"Gender=Masc|Number=Plur|NumType=Card", + "pos": "NUM" + }, + "NUM__NumType=Card": { + "freq": 533, + "morph": "NumType=Card", + "pos": "NUM" + }, + "SCONJ___": { + "freq": 10129, + "morph": "_", + "pos": "SCONJ" + }, + "PRON__Number=Sing|PronType=Rel": { + "freq": 318, + "morph": "Number=Sing|PronType=Rel", + "pos": "PRON" + }, + "VERB__Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin": { + "freq": 253, + "morph": "Mood=Cnd|Number=Sing|Person=3|VerbForm=Fin", + "pos": "VERB" + }, + "NOUN": { + "pos": "NOUN" + }, + "NOUN__Gender=Masc": { + "freq": 153, + "morph": "Gender=Masc", + "pos": "NOUN" + }, + "DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art": { + "freq": 3087, + "morph": "Definite=Ind|Gender=Fem|Number=Sing|PronType=Art", + "pos": "DET" + }, + "ADJ__Gender=Masc|Number=Plur|VerbForm=Part": { + "freq": 997, + "morph": "Gender=Masc|Number=Plur|VerbForm=Part", + "pos": "ADJ" + }, + "PRON__Number=Sing|PronType=Dem": { + "freq": 302, + "morph": "Number=Sing|PronType=Dem", + "pos": "PRON" + }, + "PRON__Number=Sing|Person=3|PronType=Prs": { + "freq": 116, + "morph": "Number=Sing|Person=3|PronType=Prs", + "pos": "PRON" + }, + "PRON__Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs": { + "freq": 173, + "morph": "Case=Acc|Gender=Fem|Number=Sing|Person=3|PronType=Prs", + "pos": "PRON" + }, + "PUNCT": { + "pos": "PUNCT" + }, + "DET__Gender=Masc|Number=Sing|PronType=Dem": { + "freq": 962, + "morph": "Gender=Masc|Number=Sing|PronType=Dem", + "pos": "DET" + }, + "PRON__Number=Plur|PronType=Rel": { + "freq": 102, + "morph": "Number=Plur|PronType=Rel", + "pos": "PRON" + }, + "ADJ__Gender=Masc|Number=Sing": { + "freq": 5136, + "morph": "Gender=Masc|Number=Sing", + "pos": "ADJ" + }, + "DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art": { + "freq": 22962, + "morph": "Definite=Def|Gender=Masc|Number=Sing|PronType=Art", + "pos": "DET" + }, + "AUX__Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin": { + "freq": 107, + "morph": 
"Mood=Sub|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin", + "pos": "AUX" + }, + "PRON__Case=Dat|Number=Plur|Person=3|PronType=Prs": { + "freq": 220, + "morph": "Case=Dat|Number=Plur|Person=3|PronType=Prs", + "pos": "PRON" + }, + "VERB__Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part": { + "freq": 206, + "morph": "Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part", + "pos": "VERB" + }, + "DET__Number=Plur|Person=3|Poss=Yes|PronType=Prs": { + "freq": 1021, + "morph": "Number=Plur|Person=3|Poss=Yes|PronType=Prs", + "pos": "DET" + }, + "ADJ__Gender=Fem|Number=Plur|NumType=Ord": { + "freq": 101, + "morph": "Gender=Fem|Number=Plur|NumType=Ord", + "pos": "ADJ" + }, + "PRON__PronType=Int": { + "freq": 137, + "morph": "PronType=Int", + "pos": "PRON" + }, + "ADP__AdpType=Prep": { + "freq": 71133, + "morph": "AdpType=Prep", + "pos": "ADP" + }, + "DET__Gender=Masc|Number=Plur|PronType=Ind": { + "freq": 904, + "morph": "Gender=Masc|Number=Plur|PronType=Ind", + "pos": "DET" + }, + "AUX__Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin": { + "freq": 299, + "morph": "Mood=Sub|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin", + "pos": "AUX" + }, + "DET__Gender=Fem|Number=Plur|PronType=Dem": { + "freq": 188, + "morph": "Gender=Fem|Number=Plur|PronType=Dem", + "pos": "DET" + }, + "NUM__NumForm=Digit|NumType=Card": { + "freq": 1108, + "morph": "NumForm=Digit|NumType=Card", + "pos": "NUM" + }, + "PUNCT__PunctType=Quot": { + "freq": 7380, + "morph": "PunctType=Quot", + "pos": "PUNCT" + }, + "VERB__Gender=Masc|Number=Plur|Tense=Past|VerbForm=Part": { + "freq": 184, + "morph": "Gender=Masc|Number=Plur|Tense=Past|VerbForm=Part", + "pos": "VERB" + }, + "PUNCT__PunctType=Dash": { + "freq": 2345, + "morph": "PunctType=Dash", + "pos": "PUNCT" + }, + "ADJ__Gender=Fem|Number=Sing": { + "freq": 3935, + "morph": "Gender=Fem|Number=Sing", + "pos": "ADJ" + }, + "AUX__Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin": { + "freq": 215, + "morph": 
"Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin", + "pos": "AUX" + }, + "AUX__Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin": { + "freq": 218, + "morph": "Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin", + "pos": "AUX" + }, + "PROPN___": { + "freq": 34454, + "morph": "_", + "pos": "PROPN" + }, + "PRON__Number=Sing|PronType=Ind": { + "freq": 421, + "morph": "Number=Sing|PronType=Ind", + "pos": "PRON" + }, + "VERB__Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin": { + "freq": 359, + "morph": "Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin", + "pos": "VERB" + }, + "PUNCT__PunctSide=Fin|PunctType=Qest": { + "freq": 312, + "morph": "PunctSide=Fin|PunctType=Qest", + "pos": "PUNCT" + }, + "PRON__Number=Sing|Person=1|PronType=Prs": { + "freq": 298, + "morph": "Number=Sing|Person=1|PronType=Prs", + "pos": "PRON" + }, + "PART__Negative=Neg": { + "freq": 122, + "morph": "Negative=Neg", + "pos": "PART" + }, + "PRON__Gender=Masc|Number=Plur|Person=3|PronType=Prs": { + "freq": 176, + "morph": "Gender=Masc|Number=Plur|Person=3|PronType=Prs", + "pos": "PRON" + }, + "NOUN__Gender=Fem|Number=Sing": { + "freq": 24416, + "morph": "Gender=Fem|Number=Sing", + "pos": "NOUN" + }, + "ADJ__Gender=Masc|Number=Sing|VerbForm=Part": { + "freq": 2297, + "morph": "Gender=Masc|Number=Sing|VerbForm=Part", + "pos": "ADJ" + }, + "CONJ___": { + "freq": 12225, + "morph": "_", + "pos": "CONJ" + }, + "NUM__Number=Plur|NumType=Card": { + "freq": 2057, + "morph": "Number=Plur|NumType=Card", + "pos": "NUM" + }, + "NOUN___": { + "freq": 4829, + "morph": "_", + "pos": "NOUN" + }, + "VERB": { + "pos": "VERB" + }, + "DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art": { + "freq": 16487, + "morph": "Definite=Def|Gender=Fem|Number=Sing|PronType=Art", + "pos": "DET" + }, + "SYM": { + "pos": "SYM" + }, + "VERB__Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin": { + "freq": 130, + "morph": "Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin", + "pos": "VERB" + }, + 
"AUX": { + "pos": "AUX" + }, + "AUX__Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part": { + "freq": 494, + "morph": "Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part", + "pos": "AUX" + }, + "AUX__Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin": { + "freq": 199, + "morph": "Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin", + "pos": "AUX" + }, + "VERB__Mood=Imp|Number=Sing|Person=2|VerbForm=Fin": { + "freq": 100, + "morph": "Mood=Imp|Number=Sing|Person=2|VerbForm=Fin", + "pos": "VERB" + }, + "PUNCT__PunctType=Peri": { + "freq": 14170, + "morph": "PunctType=Peri", + "pos": "PUNCT" + } +} \ No newline at end of file diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 323dddd3a..85feb8127 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -6,6 +6,7 @@ import ujson as json from .en.lemmatizer import INDEX, EXC, RULES from .symbols import POS, NOUN, VERB, ADJ, PUNCT +from .symbols import VerbForm_inf, VerbForm_none class Lemmatizer(object): @@ -43,10 +44,13 @@ class Lemmatizer(object): avoid lemmatization entirely.''' morphology = {} if morphology is None else morphology others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] + true_morph_key = morphology.get('morph', 0) if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others: return True elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others: return True + elif true_morph_key in (VerbForm_inf, VerbForm_none): + return True else: return False @@ -70,11 +74,16 @@ def lemmatize(string, index, exceptions, rules): #if string in index: # forms.append(string) forms.extend(exceptions.get(string, [])) + oov_forms = [] for old, new in rules: if string.endswith(old): form = string[:len(string) - len(old)] + new if form in index or not form.isalpha(): forms.append(form) + else: + oov_forms.append(form) + if not forms: + forms.extend(oov_forms) if not forms: forms.append(string) return set(forms) diff --git a/spacy/symbols.pyx 
b/spacy/symbols.pyx index 56b27512e..d9102037a 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -310,78 +310,78 @@ IDS = { "Number_grpa": Number_grpa, # U20 "Number_grpl": Number_grpl, # U20 "Number_inv": Number_inv, # U20 - "NumForm_digit ": NumForm_digit, # cz, sl, U, - "NumForm_roman ": NumForm_roman, # cz, sl, U, - "NumForm_word ": NumForm_word, # cz, sl, U, - "NumValue_one ": NumValue_one, # cz, U, - "NumValue_two ": NumValue_two, # cz, U, - "NumValue_three ": NumValue_three, # cz, U, - "PartForm_pres ": PartForm_pres, # fi, - "PartForm_past ": PartForm_past, # fi, - "PartForm_agt ": PartForm_agt, # fi, - "PartForm_neg ": PartForm_neg, # fi, - "PartType_mod ": PartType_mod, # U, - "PartType_emp ": PartType_emp, # U, - "PartType_res ": PartType_res, # U, - "PartType_inf ": PartType_inf, # U, - "PartType_vbp ": PartType_vbp, # U, - "Person_abs_one ": Person_abs_one, # bq, U, - "Person_abs_two ": Person_abs_two, # bq, U, - "Person_abs_three ": Person_abs_three, # bq, U, - "Person_dat_one ": Person_dat_one, # bq, U, - "Person_dat_two ": Person_dat_two, # bq, U, - "Person_dat_three ": Person_dat_three, # bq, U, - "Person_erg_one ": Person_erg_one, # bq, U, - "Person_erg_two ": Person_erg_two, # bq, U, - "Person_erg_three ": Person_erg_three, # bq, U, - "Person_psor_one ": Person_psor_one, # fi, U, - "Person_psor_two ": Person_psor_two, # fi, U, - "Person_psor_three ": Person_psor_three, # fi, U, - "Person_zero ": Person_zero, # U20 - "Person_four ": Person_four, # U20 - "Polite_inf ": Polite_inf, # bq, U, - "Polite_pol ": Polite_pol, # bq, U, - "Polite_abs_inf ": Polite_abs_inf, # bq, U, - "Polite_abs_pol ": Polite_abs_pol, # bq, U, - "Polite_erg_inf ": Polite_erg_inf, # bq, U, - "Polite_erg_pol ": Polite_erg_pol, # bq, U, - "Polite_dat_inf ": Polite_dat_inf, # bq, U, - "Polite_dat_pol ": Polite_dat_pol, # bq, U, - "Polite_infm ": Polite_infm, # U20 - "Polite_form ": Polite_form, # U20 - "Polite_form_elev ": Polite_form_elev, # U20 + "NumForm_digit": 
NumForm_digit, # cz, sl, U, + "NumForm_roman": NumForm_roman, # cz, sl, U, + "NumForm_word": NumForm_word, # cz, sl, U, + "NumValue_one": NumValue_one, # cz, U, + "NumValue_two": NumValue_two, # cz, U, + "NumValue_three": NumValue_three, # cz, U, + "PartForm_pres": PartForm_pres, # fi, + "PartForm_past": PartForm_past, # fi, + "PartForm_agt": PartForm_agt, # fi, + "PartForm_neg": PartForm_neg, # fi, + "PartType_mod": PartType_mod, # U, + "PartType_emp": PartType_emp, # U, + "PartType_res": PartType_res, # U, + "PartType_inf": PartType_inf, # U, + "PartType_vbp": PartType_vbp, # U, + "Person_abs_one": Person_abs_one, # bq, U, + "Person_abs_two": Person_abs_two, # bq, U, + "Person_abs_three": Person_abs_three, # bq, U, + "Person_dat_one": Person_dat_one, # bq, U, + "Person_dat_two": Person_dat_two, # bq, U, + "Person_dat_three": Person_dat_three, # bq, U, + "Person_erg_one": Person_erg_one, # bq, U, + "Person_erg_two": Person_erg_two, # bq, U, + "Person_erg_three": Person_erg_three, # bq, U, + "Person_psor_one": Person_psor_one, # fi, U, + "Person_psor_two": Person_psor_two, # fi, U, + "Person_psor_three": Person_psor_three, # fi, U, + "Person_zero": Person_zero, # U20 + "Person_four": Person_four, # U20 + "Polite_inf": Polite_inf, # bq, U, + "Polite_pol": Polite_pol, # bq, U, + "Polite_abs_inf": Polite_abs_inf, # bq, U, + "Polite_abs_pol": Polite_abs_pol, # bq, U, + "Polite_erg_inf": Polite_erg_inf, # bq, U, + "Polite_erg_pol": Polite_erg_pol, # bq, U, + "Polite_dat_inf": Polite_dat_inf, # bq, U, + "Polite_dat_pol": Polite_dat_pol, # bq, U, + "Polite_infm": Polite_infm, # U20 + "Polite_form": Polite_form, # U20 + "Polite_form_elev": Polite_form_elev, # U20 "Polite_form_humb ": Polite_form_humb, # U20 - "Prefix_yes ": Prefix_yes, # U, - "PrepCase_npr ": PrepCase_npr, # cz, - "PrepCase_pre ": PrepCase_pre, # U, - "PunctSide_ini ": PunctSide_ini, # U, - "PunctSide_fin ": PunctSide_fin, # U, - "PunctType_peri ": PunctType_peri, # U, - "PunctType_qest ": PunctType_qest, 
# U, - "PunctType_excl ": PunctType_excl, # U, - "PunctType_quot ": PunctType_quot, # U, - "PunctType_brck ": PunctType_brck, # U, - "PunctType_comm ": PunctType_comm, # U, - "PunctType_colo ": PunctType_colo, # U, - "PunctType_semi ": PunctType_semi, # U, - "PunctType_dash ": PunctType_dash, # U, - "Style_arch ": Style_arch, # cz, fi, U, - "Style_rare ": Style_rare, # cz, fi, U, - "Style_poet ": Style_poet, # cz, U, - "Style_norm ": Style_norm, # cz, U, - "Style_coll ": Style_coll, # cz, U, - "Style_vrnc ": Style_vrnc, # cz, U, - "Style_sing ": Style_sing, # cz, U, - "Style_expr ": Style_expr, # cz, U, - "Style_derg ": Style_derg, # cz, U, - "Style_vulg ": Style_vulg, # cz, U, - "Style_yes ": Style_yes, # fi, U, - "StyleVariant_styleShort ": StyleVariant_styleShort, # cz, - "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl, - "VerbType_aux ": VerbType_aux, # U, - "VerbType_cop ": VerbType_cop, # U, - "VerbType_mod ": VerbType_mod, # U, - "VerbType_light ": VerbType_light, # U, + "Prefix_yes": Prefix_yes, # U, + "PrepCase_npr": PrepCase_npr, # cz, + "PrepCase_pre": PrepCase_pre, # U, + "PunctSide_ini": PunctSide_ini, # U, + "PunctSide_fin": PunctSide_fin, # U, + "PunctType_peri": PunctType_peri, # U, + "PunctType_qest": PunctType_qest, # U, + "PunctType_excl": PunctType_excl, # U, + "PunctType_quot": PunctType_quot, # U, + "PunctType_brck": PunctType_brck, # U, + "PunctType_comm": PunctType_comm, # U, + "PunctType_colo": PunctType_colo, # U, + "PunctType_semi": PunctType_semi, # U, + "PunctType_dash": PunctType_dash, # U, + "Style_arch": Style_arch, # cz, fi, U, + "Style_rare": Style_rare, # cz, fi, U, + "Style_poet": Style_poet, # cz, U, + "Style_norm": Style_norm, # cz, U, + "Style_coll": Style_coll, # cz, U, + "Style_vrnc": Style_vrnc, # cz, U, + "Style_sing": Style_sing, # cz, U, + "Style_expr": Style_expr, # cz, U, + "Style_derg": Style_derg, # cz, U, + "Style_vulg": Style_vulg, # cz, U, + "Style_yes": Style_yes, # fi, U, + 
"StyleVariant_styleShort": StyleVariant_styleShort, # cz, + "StyleVariant_styleBound": StyleVariant_styleBound, # cz, sl, + "VerbType_aux": VerbType_aux, # U, + "VerbType_cop": VerbType_cop, # U, + "VerbType_mod": VerbType_mod, # U, + "VerbType_light": VerbType_light, # U, "PERSON": PERSON, "NORP": NORP, diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 7c6dcda1b..850eaa4c2 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -16,7 +16,6 @@ from ..tokens import Doc from ..strings import StringStore from ..lemmatizer import Lemmatizer from ..attrs import ORTH, TAG, HEAD, DEP -from ..util import match_best_version, get_data_path from io import StringIO, BytesIO from pathlib import Path @@ -90,11 +89,8 @@ def en_entityrecognizer(): @pytest.fixture -def lemmatizer(path): - if path is not None: - return Lemmatizer.load(path) - else: - return None +def lemmatizer(): + return English.Defaults.create_lemmatizer() @pytest.fixture @@ -106,14 +102,6 @@ def text_file_b(): return BytesIO() -@pytest.fixture -def path(): - if 'SPACY_DATA' in os.environ: - return Path(os.environ['SPACY_DATA']) - else: - return match_best_version('en', None, get_data_path()) - - # only used for tests that require loading the models # in all other cases, use specific instances @pytest.fixture(scope="session") diff --git a/spacy/tests/regression/test_issue781.py b/spacy/tests/regression/test_issue781.py new file mode 100644 index 000000000..1c48d1534 --- /dev/null +++ b/spacy/tests/regression/test_issue781.py @@ -0,0 +1,10 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +# Note: "chromosomes" worked previous to the bug fix +@pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])]) +def test_issue781(lemmatizer, word, lemmas): +    assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas) diff --git
a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index 79505f1cb..14c176edc 100644 --- a/spacy/tests/spans/test_span.py +++ b/spacy/tests/spans/test_span.py @@ -31,6 +31,12 @@ def test_spans_root(doc): assert span.root.text == 'sentence' assert span.root.head.text == 'is' +def test_spans_string_fn(doc): + span = doc[0:4] + assert len(span) == 4 + assert span.text == 'This is a sentence' + assert span.upper_ == 'THIS IS A SENTENCE' + assert span.lower_ == 'this is a sentence' def test_spans_root2(en_tokenizer): text = "through North and South Carolina" diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 903ef26d1..fc5d26174 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -365,6 +365,14 @@ cdef class Span: def __get__(self): return ' '.join([t.lemma_ for t in self]).strip() + property upper_: + def __get__(self): + return ''.join([t.string.upper() for t in self]).strip() + + property lower_: + def __get__(self): + return ''.join([t.string.lower() for t in self]).strip() + property string: def __get__(self): return ''.join([t.string for t in self]) diff --git a/spacy/util.py b/spacy/util.py index ef6d9884d..63c57bf69 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -149,15 +149,16 @@ def check_renamed_kwargs(renamed, kwargs): raise TypeError("Keyword argument %s now renamed to %s" % (old, new)) -def parse_package_meta(package_path, package, on_error=False): +def parse_package_meta(package_path, package, require=True): location = os.path.join(str(package_path), package, 'meta.json') - if not os.path.isfile(location) and on_error: - on_error() - else: + if os.path.isfile(location): with io.open(location, encoding='utf8') as f: meta = json.load(f) return meta - return False + elif require: + raise IOError("Could not read meta.json from %s" % location) + else: + return None def print_msg(*text, **kwargs): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ddd3134cb..55dbe7ba0 100644 --- a/spacy/vocab.pyx +++ 
b/spacy/vocab.pyx @@ -596,6 +596,8 @@ cdef class Vocab: vec = file_.alloc_read(self.mem, vec_len, sizeof(float)) string_id = self.strings[chars[:word_len]] + # Insert words into vocab to add vector. + self.get_by_orth(self.mem, string_id) while string_id >= vectors.size(): vectors.push_back(EMPTY_VEC) assert vec != NULL