diff --git a/bin/init_model.py b/bin/init_model.py index 3307bffa8..cffd9df96 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -20,6 +20,7 @@ from __future__ import unicode_literals from ast import literal_eval import math import gzip +import json import plac from pathlib import Path @@ -29,8 +30,6 @@ from shutil import copytree import codecs from collections import defaultdict -from spacy.en import get_lex_props -from spacy.en.lemmatizer import Lemmatizer from spacy.vocab import Vocab from spacy.vocab import write_binary_vectors from spacy.strings import hash_string @@ -38,6 +37,13 @@ from preshed.counter import PreshCounter from spacy.parts_of_speech import NOUN, VERB, ADJ +import spacy.en +import spacy.de +import spacy.fi +import spacy.it + + + def setup_tokenizer(lang_data_dir, tok_dir): if not tok_dir.exists(): @@ -139,7 +145,7 @@ def _read_senses(loc): return lexicon -def setup_vocab(src_dir, dst_dir): +def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir): if not dst_dir.exists(): dst_dir.mkdir() @@ -148,13 +154,13 @@ def setup_vocab(src_dir, dst_dir): write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin')) else: print("Warning: Word vectors file not found") - vocab = Vocab(data_dir=None, get_lex_props=get_lex_props) + vocab = Vocab(get_lex_attr=get_lex_attr, tag_map=tag_map) clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: probs, oov_prob = _read_freqs(src_dir / 'freqs.txt') if not probs: - oov_prob = 0.0 + oov_prob = -20 else: oov_prob = min(probs.values()) for word in clusters: @@ -163,23 +169,32 @@ def setup_vocab(src_dir, dst_dir): lexicon = [] for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): - entry = get_lex_props(word) - entry['prob'] = float(prob) - cluster = clusters.get(word, '0') + lexeme = vocab[word] + lexeme.prob = prob + lexeme.is_oov = False # Decode as a little-endian string, so that we can do & 15 to get # the first 4 bits. 
See _parse_features.pyx - entry['cluster'] = int(cluster[::-1], 2) - vocab[word] = entry + if word in clusters: + lexeme.cluster = int(clusters[word][::-1], 2) + else: + lexeme.cluster = 0 vocab.dump(str(dst_dir / 'lexemes.bin')) vocab.strings.dump(str(dst_dir / 'strings.txt')) with (dst_dir / 'oov_prob').open('w') as file_: file_.write('%f' % oov_prob) -def main(lang_data_dir, corpora_dir, model_dir): +def main(lang_id, lang_data_dir, corpora_dir, model_dir): + languages = { + 'en': spacy.en.English.default_lex_attrs(), + 'de': spacy.de.Deutsch.default_lex_attrs(), + 'fi': spacy.fi.Finnish.default_lex_attrs(), + 'it': spacy.it.Italian.default_lex_attrs(), + } + model_dir = Path(model_dir) - lang_data_dir = Path(lang_data_dir) - corpora_dir = Path(corpora_dir) + lang_data_dir = Path(lang_data_dir) / lang_id + corpora_dir = Path(corpora_dir) / lang_id assert corpora_dir.exists() assert lang_data_dir.exists() @@ -187,13 +202,19 @@ def main(lang_data_dir, corpora_dir, model_dir): if not model_dir.exists(): model_dir.mkdir() + tag_map = json.load((lang_data_dir / 'tag_map.json').open()) setup_tokenizer(lang_data_dir, model_dir / 'tokenizer') - setup_vocab(corpora_dir, model_dir / 'vocab') + setup_vocab(languages[lang_id], tag_map, corpora_dir, model_dir / 'vocab') if (lang_data_dir / 'gazetteer.json').exists(): copyfile(str(lang_data_dir / 'gazetteer.json'), str(model_dir / 'vocab' / 'gazetteer.json')) - if not (model_dir / 'wordnet').exists(): + + if (lang_data_dir / 'lemma_rules.json').exists(): + copyfile(str(lang_data_dir / 'lemma_rules.json'), + str(model_dir / 'vocab' / 'lemma_rules.json')) + + if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists(): copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet')) diff --git a/bin/parser/train.py b/bin/parser/train.py index 68217fcb3..abd5eb16e 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -14,7 +14,6 @@ import re import spacy.util from spacy.en import English -from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir from spacy.syntax.util import Config from spacy.gold import read_json_file @@ -22,6 +21,11 @@ from spacy.gold import GoldParse from spacy.scorer import Scorer +from spacy.syntax.arc_eager import ArcEager +from spacy.syntax.ner import BiluoPushDown +from spacy.tagger import Tagger +from spacy.syntax.parser import Parser + def _corrupt(c, noise_level): if random.random() >= noise_level: @@ -80,32 +84,28 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', beam_width=1, verbose=False, use_orig_arc_eager=False): dep_model_dir = path.join(model_dir, 'deps') - pos_model_dir = path.join(model_dir, 'pos') ner_model_dir = path.join(model_dir, 'ner') if path.exists(dep_model_dir): shutil.rmtree(dep_model_dir) - if path.exists(pos_model_dir): - shutil.rmtree(pos_model_dir) if path.exists(ner_model_dir): shutil.rmtree(ner_model_dir) os.mkdir(dep_model_dir) - os.mkdir(pos_model_dir) os.mkdir(ner_model_dir) - setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) - Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, - labels=Language.ParserTransitionSystem.get_labels(gold_tuples), + labels=ArcEager.get_labels(gold_tuples), beam_width=beam_width) Config.write(ner_model_dir, 'config', features='ner', seed=seed, - labels=Language.EntityTransitionSystem.get_labels(gold_tuples), + labels=BiluoPushDown.get_labels(gold_tuples), beam_width=0) if n_sents > 0: gold_tuples = gold_tuples[:n_sents] - nlp = 
Language(data_dir=model_dir) - + nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) + nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates()) + nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager) + nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown) print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") for itn in range(n_iter): scorer = Scorer() @@ -140,7 +140,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, scorer.tags_acc, scorer.token_acc)) - nlp.end_training() + nlp.end_training(model_dir) def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, beam_width=None): diff --git a/bin/tagger/train.py b/bin/tagger/train.py new file mode 100755 index 000000000..9cd8cc011 --- /dev/null +++ b/bin/tagger/train.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python +from __future__ import division +from __future__ import unicode_literals +from __future__ import print_function + +import os +from os import path +import shutil +import codecs +import random + +import plac +import re + +import spacy.util +from spacy.en import English + +from spacy.tagger import Tagger + +from spacy.syntax.util import Config +from spacy.gold import read_json_file +from spacy.gold import GoldParse + +from spacy.scorer import Scorer + + +def score_model(scorer, nlp, raw_text, annot_tuples): + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + else: + tokens = nlp.tokenizer(raw_text) + nlp.tagger(tokens) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold) + + +def _merge_sents(sents): + m_deps = [[], [], [], [], [], []] + m_brackets = [] + i = 0 + for (ids, words, tags, heads, labels, ner), brackets in sents: + m_deps[0].extend(id_ + i for id_ in ids) + m_deps[1].extend(words) + m_deps[2].extend(tags) + m_deps[3].extend(head + i for head in heads) + m_deps[4].extend(labels) + m_deps[5].extend(ner) + m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) + i += len(ids) + return [(m_deps, m_brackets)] + + +def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', + seed=0, gold_preproc=False, n_sents=0, corruption_level=0, + beam_width=1, verbose=False, + use_orig_arc_eager=False): + if n_sents > 0: + gold_tuples = gold_tuples[:n_sents] + + templates = Tagger.default_templates() + nlp = Language(data_dir=model_dir, tagger=False) + nlp.tagger = Tagger.blank(nlp.vocab, templates) + + print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") + for itn in range(n_iter): + scorer = Scorer() + loss = 0 + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, ctnt in sents: + words = annot_tuples[1] + gold_tags = annot_tuples[2] + score_model(scorer, nlp, raw_text, annot_tuples) + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(words) + else: + tokens = nlp.tokenizer(raw_text) + loss += nlp.tagger.train(tokens, gold_tags) + random.shuffle(gold_tuples) + print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, + scorer.tags_acc, + scorer.token_acc)) + nlp.end_training(model_dir) + +def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, + beam_width=None): + nlp = Language(data_dir=model_dir) + if beam_width is not None: + nlp.parser.cfg.beam_width = beam_width + scorer = Scorer() + for raw_text, sents in 
gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=verbose) + return scorer + + +def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): + nlp = Language(data_dir=model_dir) + if beam_width is not None: + nlp.parser.cfg.beam_width = beam_width + gold_tuples = read_json_file(dev_loc) + scorer = Scorer() + out_file = codecs.open(out_loc, 'w', 'utf8') + for raw_text, sents in gold_tuples: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=False) + for t in tokens: + out_file.write( + '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_) + ) + return scorer + + +@plac.annotations( + train_loc=("Location of training file or directory"), + dev_loc=("Location of development file or directory"), + model_dir=("Location of output model directory",), + eval_only=("Skip training, and only evaluate", "flag", "e", bool), + corruption_level=("Amount of noise to add to training data", "option", "c", float), + gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool), + out_loc=("Out location", "option", "o", str), + n_sents=("Number of training sentences", "option", "n", int), + n_iter=("Number of training iterations", "option", "i", int), + verbose=("Verbose error reporting", "flag", "v", bool), + debug=("Debug mode", "flag", "d", bool), +) +def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, + debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False): + if not eval_only: + gold_train = list(read_json_file(train_loc)) + train(English, gold_train, model_dir, + feat_set='basic' if not debug else 'debug', + gold_preproc=gold_preproc, n_sents=n_sents, + corruption_level=corruption_level, n_iter=n_iter, + verbose=verbose) + #if out_loc: + # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) + scorer = evaluate(English, list(read_json_file(dev_loc)), + model_dir, gold_preproc=gold_preproc, verbose=verbose) + print('TOK', scorer.token_acc) + print('POS', scorer.tags_acc) + print('UAS', scorer.uas) + print('LAS', scorer.las) + + print('NER P', scorer.ents_p) + print('NER R', scorer.ents_r) + print('NER F', scorer.ents_f) + + +if __name__ == '__main__': + plac.call(main) diff --git a/lang_data/de/infix.txt b/lang_data/de/infix.txt new file mode 100644 index 000000000..37eca7350 --- /dev/null +++ b/lang_data/de/infix.txt @@ -0,0 +1,3 @@ +\.\.\. 
+(?<=[a-z])\.(?=[A-Z]) +(?<=[a-zA-Z])-(?=[a-zA-z]) diff --git a/lang_data/de/lemma_rules.json b/lang_data/de/lemma_rules.json new file mode 100644 index 000000000..e69de29bb diff --git a/lang_data/de/morphs.json b/lang_data/de/morphs.json new file mode 100644 index 000000000..e69de29bb diff --git a/lang_data/de/prefix.txt b/lang_data/de/prefix.txt new file mode 100644 index 000000000..48c4fc549 --- /dev/null +++ b/lang_data/de/prefix.txt @@ -0,0 +1,21 @@ +, +" +( +[ +{ +* +< +$ +£ +“ +' +`` +` +# +US$ +C$ +A$ +a- +‘ +.... +... diff --git a/lang_data/de/sample.txt b/lang_data/de/sample.txt new file mode 100644 index 000000000..12c0bb787 --- /dev/null +++ b/lang_data/de/sample.txt @@ -0,0 +1,3 @@ +Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern. + +Mit Biografie: Ein Spiel wandte sich Frisch von der Parabelform seiner Erfolgsstücke Biedermann und die Brandstifter und Andorra ab und postulierte eine „Dramaturgie der Permutation“. Darin sollte nicht, wie im klassischen Theater, Sinn und Schicksal im Mittelpunkt stehen, sondern die Zufälligkeit von Ereignissen und die Möglichkeit ihrer Variation. Dennoch handelt Biografie: Ein Spiel gerade von der Unmöglichkeit seines Protagonisten, seinen Lebenslauf grundlegend zu verändern. Frisch empfand die Wirkung des Stücks im Nachhinein als zu fatalistisch und die Umsetzung seiner theoretischen Absichten als nicht geglückt. Obwohl das Stück 1968 als unpolitisch und nicht zeitgemäß kritisiert wurde und auch später eine geteilte Rezeption erfuhr, gehört es an deutschsprachigen Bühnen zu den häufiger aufgeführten Stücken Frischs. 
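
Note on the new lang_data/de tokenizer files above: prefix.txt and infix.txt (and suffix.txt further below) hold one pattern per line, literal strings for the prefixes and regular expressions for the infixes and suffixes, which the tokenizer compiles into its split rules. The sketch below only illustrates that file format; it is not spaCy's loading code, and the helper names, the pattern subset and the matching strategy are assumptions made for the example.

# Illustrative sketch only, not spaCy's tokenizer. It shows how the
# one-pattern-per-line files under lang_data/de/ could be applied to
# split tokens. The pattern lists are copied from the files in this diff;
# split_prefix/split_infix and the matching order are assumptions.
import re

PREFIXES = [',', '"', '(', '[', '{', '*', '<', '$']     # subset of prefix.txt (literals)
INFIXES = [r'\.\.\.',                                   # infix.txt (regular expressions)
           r'(?<=[a-z])\.(?=[A-Z])',
           r'(?<=[a-zA-Z])-(?=[a-zA-z])']


def split_prefix(token):
    # Strip one leading prefix, trying longer candidates first.
    for prefix in sorted(PREFIXES, key=len, reverse=True):
        if token.startswith(prefix):
            return prefix, token[len(prefix):]
    return None, token


def split_infix(token):
    # Split on the first infix match; returns None when no pattern matches.
    for pattern in INFIXES:
        match = re.search(pattern, token)
        if match:
            return token[:match.start()], token[match.start():match.end()], token[match.end():]
    return None


print(split_prefix('"Biografie'))   # ('"', 'Biografie')
print(split_infix('Ost-West'))      # ('Ost', '-', 'West')

Keeping these rules in plain per-language data files is what lets init_model.py above build a model directory for de, fi and it through the same code path as en.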
diff --git a/lang_data/de/specials.json b/lang_data/de/specials.json new file mode 100644 index 000000000..0e0986339 --- /dev/null +++ b/lang_data/de/specials.json @@ -0,0 +1,149 @@ +{ +"a.m.": [{"F": "a.m."}], +"p.m.": [{"F": "p.m."}], + +"1a.m.": [{"F": "1"}, {"F": "a.m."}], +"2a.m.": [{"F": "2"}, {"F": "a.m."}], +"3a.m.": [{"F": "3"}, {"F": "a.m."}], +"4a.m.": [{"F": "4"}, {"F": "a.m."}], +"5a.m.": [{"F": "5"}, {"F": "a.m."}], +"6a.m.": [{"F": "6"}, {"F": "a.m."}], +"7a.m.": [{"F": "7"}, {"F": "a.m."}], +"8a.m.": [{"F": "8"}, {"F": "a.m."}], +"9a.m.": [{"F": "9"}, {"F": "a.m."}], +"10a.m.": [{"F": "10"}, {"F": "a.m."}], +"11a.m.": [{"F": "11"}, {"F": "a.m."}], +"12a.m.": [{"F": "12"}, {"F": "a.m."}], +"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}], +"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}], +"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}], +"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}], +"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}], +"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}], +"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}], +"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}], +"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}], +"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}], +"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}], +"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}], + + +"1p.m.": [{"F": "1"}, {"F": "p.m."}], +"2p.m.": [{"F": "2"}, {"F": "p.m."}], +"3p.m.": [{"F": "3"}, {"F": "p.m."}], +"4p.m.": [{"F": "4"}, {"F": "p.m."}], +"5p.m.": [{"F": "5"}, {"F": "p.m."}], +"6p.m.": [{"F": "6"}, {"F": "p.m."}], +"7p.m.": [{"F": "7"}, {"F": "p.m."}], +"8p.m.": [{"F": "8"}, {"F": "p.m."}], +"9p.m.": [{"F": "9"}, {"F": "p.m."}], +"10p.m.": [{"F": "10"}, {"F": "p.m."}], +"11p.m.": [{"F": "11"}, {"F": "p.m."}], +"12p.m.": [{"F": "12"}, {"F": "p.m."}], +"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}], +"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}], +"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}], +"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}], +"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}], +"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}], +"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}], +"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}], +"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}], +"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}], +"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}], +"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}], + +"Jan.": [{"F": "Jan.", "L": "Januar"}], +"Feb.": [{"F": "Feb.", "L": "Februar"}], +"Mär.": [{"F": "Mär.", "L": "März"}], +"Apr.": [{"F": "Apr.", "L": "April"}], +"Mai.": [{"F": "Mai.", "L": "Mai"}], +"Jun.": [{"F": "Jun.", "L": "Juni"}], +"Jul.": [{"F": "Jul.", "L": "Juli"}], +"Aug.": [{"F": "Aug.", "L": "August"}], +"Sep.": [{"F": "Sep.", "L": "September"}], +"Sept.": [{"F": "Sept.", "L": "September"}], +"Okt.": [{"F": "Okt.", "L": "Oktober"}], +"Nov.": [{"F": "Nov.", "L": "November"}], +"Dez.": [{"F": "Dez.", "L": "Dezember"}], + +":)": [{"F": ":)"}], +"<3": [{"F": "<3"}], +";)": [{"F": ";)"}], +"(:": [{"F": "(:"}], +":(": [{"F": ":("}], +"-_-": [{"F": "-_-"}], +"=)": [{"F": "=)"}], +":/": [{"F": ":/"}], +":>": [{"F": ":>"}], +";-)": [{"F": ";-)"}], +":Y": [{"F": ":Y"}], +":P": [{"F": ":P"}], +":-P": [{"F": ":-P"}], +":3": [{"F": ":3"}], +"=3": [{"F": "=3"}], +"xD": [{"F": "xD"}], +"^_^": [{"F": "^_^"}], +"=]": [{"F": "=]"}], +"=D": [{"F": "=D"}], +"<333": [{"F": "<333"}], +":))": [{"F": ":))"}], +":0": [{"F": ":0"}], +"-__-": [{"F": "-__-"}], +"xDD": [{"F": "xDD"}], +"o_o": [{"F": "o_o"}], +"o_O": [{"F": "o_O"}], +"V_V": [{"F": "V_V"}], +"=[[": [{"F": "=[["}], +"<33": [{"F": 
"<33"}], +";p": [{"F": ";p"}], +";D": [{"F": ";D"}], +";-p": [{"F": ";-p"}], +";(": [{"F": ";("}], +":p": [{"F": ":p"}], +":]": [{"F": ":]"}], +":O": [{"F": ":O"}], +":-/": [{"F": ":-/"}], +":-)": [{"F": ":-)"}], +":(((": [{"F": ":((("}], +":((": [{"F": ":(("}], +":')": [{"F": ":')"}], +"(^_^)": [{"F": "(^_^)"}], +"(=": [{"F": "(="}], +"o.O": [{"F": "o.O"}], +"\")": [{"F": "\")"}], +"a.": [{"F": "a."}], +"b.": [{"F": "b."}], +"c.": [{"F": "c."}], +"d.": [{"F": "d."}], +"e.": [{"F": "e."}], +"f.": [{"F": "f."}], +"g.": [{"F": "g."}], +"h.": [{"F": "h."}], +"i.": [{"F": "i."}], +"j.": [{"F": "j."}], +"k.": [{"F": "k."}], +"l.": [{"F": "l."}], +"m.": [{"F": "m."}], +"n.": [{"F": "n."}], +"o.": [{"F": "o."}], +"p.": [{"F": "p."}], +"q.": [{"F": "q."}], +"s.": [{"F": "s."}], +"t.": [{"F": "t."}], +"u.": [{"F": "u."}], +"v.": [{"F": "v."}], +"w.": [{"F": "w."}], +"x.": [{"F": "x."}], +"y.": [{"F": "y."}], +"z.": [{"F": "z."}], + +"z.b.": [{"F": "z.b."}], +"e.h.": [{"F": "I.e."}], +"o.ä.": [{"F": "I.E."}], +"bzw.": [{"F": "bzw."}], +"usw.": [{"F": "usw."}], +"\n": [{"F": "\n", "pos": "SP"}], +"\t": [{"F": "\t", "pos": "SP"}], +" ": [{"F": " ", "pos": "SP"}] +} diff --git a/lang_data/de/suffix.txt b/lang_data/de/suffix.txt new file mode 100644 index 000000000..d8c6bc2c2 --- /dev/null +++ b/lang_data/de/suffix.txt @@ -0,0 +1,26 @@ +, +\" +\) +\] +\} +\* +\! +\? +% +\$ +> +: +; +' +” +'' +'s +'S +’s +’S +’ +\.\. +\.\.\. +\.\.\.\. +(?<=[a-z0-9)\]"'%\)])\. +(?<=[0-9])km diff --git a/lang_data/de/tag_map.json b/lang_data/de/tag_map.json new file mode 100644 index 000000000..ee1bb1b81 --- /dev/null +++ b/lang_data/de/tag_map.json @@ -0,0 +1,56 @@ +{ +"$(": {"pos": "PUNCT", "PunctType": "Brck"}, +"$,": {"pos": "PUNCT", "PunctType": "Comm"}, +"$.": {"pos": "PUNCT", "PunctType": "Peri"}, +"ADJA": {"pos": "ADJ"}, +"ADJD": {"pos": "ADJ", "Variant": "Short"}, +"ADV": {"pos": "ADV"}, +"APPO": {"pos": "ADP", "AdpType": "Post"}, +"APPR": {"pos": "ADP", "AdpType": "Prep"}, +"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"}, +"APZR": {"pos": "ADP", "AdpType": "Circ"}, +"ART": {"pos": "DET", "PronType": "Art"}, +"CARD": {"pos": "NUM", "NumType": "Card"}, +"FM": {"pos": "X", "Foreign": "Yes"}, +"ITJ": {"pos": "INTJ"}, +"KOKOM": {"pos": "CONJ", "ConjType": "Comp"}, +"KON": {"pos": "CONJ"}, +"KOUI": {"pos": "SCONJ"}, +"KOUS": {"pos": "SCONJ"}, +"NE": {"pos": "PROPN"}, +"NN": {"pos": "NOUN"}, +"PAV": {"pos": "ADV", "PronType": "Dem"}, +"PDAT": {"pos": "DET", "PronType": "Dem"}, +"PDS": {"pos": "PRON", "PronType": "Dem"}, +"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"}, +"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"}, +"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"}, +"PPER": {"pos": "PRON", "PronType": "Prs"}, +"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"}, +"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"}, +"PRELAT": {"pos": "DET", "PronType": "Rel"}, +"PRELS": {"pos": "PRON", "PronType": "Rel"}, +"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"}, +"PTKA": {"pos": "PART"}, +"PTKANT": {"pos": "PART", "PartType": "Res"}, +"PTKNEG": {"pos": "PART", "Negative": "Neg"}, +"PTKVZ": {"pos": "PART", "PartType": "Vbp"}, +"PTKZU": {"pos": "PART", "PartType": "Inf"}, +"PWAT": {"pos": "DET", "PronType": "Int"}, +"PWAV": {"pos": "ADV", "PronType": "Int"}, +"PWS": {"pos": "PRON", "PronType": "Int"}, +"TRUNC": {"pos": "X", "Hyph": "Yes"}, +"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"}, +"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"}, 
+"VAINF": {"pos": "AUX", "VerbForm": "Inf"}, +"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"}, +"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"}, +"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"}, +"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"}, +"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"}, +"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"}, +"VVINF": {"pos": "VERB", "VerbForm": "Inf"}, +"VVIZU": {"pos": "VERB", "VerbForm": "Inf"}, +"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"}, +"XY": {"pos": "X"} +} diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json new file mode 100644 index 000000000..c45eb1df6 --- /dev/null +++ b/lang_data/en/lemma_rules.json @@ -0,0 +1,31 @@ +{ + "noun": [ + ["s", ""], + ["ses", "s"], + ["ves", "f"], + ["xes", "x"], + ["zes", "z"], + ["ches", "ch"], + ["shes", "sh"], + ["men", "man"], + ["ies", "y"] + ], + + "verb": [ + ["s", ""], + ["ies", "y"], + ["es", "e"], + ["es", ""], + ["ed", "e"], + ["ed", ""], + ["ing", "e"], + ["ing", ""] + ], + + "adj": [ + ["er", ""], + ["est", ""], + ["er", "e"], + ["est", "e"] + ] +} diff --git a/lang_data/en/tag_map.json b/lang_data/en/tag_map.json new file mode 100644 index 000000000..8678e5afe --- /dev/null +++ b/lang_data/en/tag_map.json @@ -0,0 +1,60 @@ +{ +".": {"pos": "punct", "puncttype": "peri"}, +",": {"pos": "punct", "puncttype": "comm"}, +"-LRB-": {"pos": "punct", "puncttype": "brck", "punctside": "ini"}, +"-RRB-": {"pos": "punct", "puncttype": "brck", "punctside": "fin"}, +"``": {"pos": "punct", "puncttype": "quot", "punctside": "ini"}, +"\"\"": {"pos": "punct", "puncttype": "quot", "punctside": "fin"}, +"''": {"pos": "punct", "puncttype": "quot", "punctside": "fin"}, +":": {"pos": "punct"}, +"$": {"pos": "sym", "other": {"symtype": "currency"}}, +"#": {"pos": "sym", "other": {"symtype": "numbersign"}}, +"AFX": {"pos": "adj", "hyph": "hyph"}, +"CC": {"pos": "conj", "conjtype": "coor"}, +"CD": {"pos": "num", "numtype": "card"}, +"DT": {"pos": "adj", "prontype": "prn"}, +"EX": {"pos": "adv", "advtype": "ex"}, +"FW": {"pos": "x", "foreign": "foreign"}, +"HYPH": {"pos": "punct", "puncttype": "dash"}, +"IN": {"pos": "adp"}, +"JJ": {"pos": "adj", "degree": "pos"}, +"JJR": {"pos": "adj", "degree": "comp"}, +"JJS": {"pos": "adj", "degree": "sup"}, +"LS": {"pos": "punct", "numtype": "ord"}, +"MD": {"pos": "verb", "verbtype": "mod"}, +"NIL": {"pos": "no_tag"}, +"NN": {"pos": "noun", "number": "sing"}, +"NNP": {"pos": "noun", "nountype": "prop", "number": "sing"}, +"NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"}, +"NNS": {"pos": "noun", "number": "plur"}, +"PDT": {"pos": "adj", "adjtype": "pdt", "prontype": "prn"}, +"POS": {"pos": "part", "poss": "poss"}, +"PRP": {"pos": "noun", "prontype": "prs"}, +"PRP$": {"pos": "adj", "prontype": "prs", "poss": "poss"}, +"RB": {"pos": "adv", "degree": "pos"}, +"RBR": {"pos": "adv", "degree": "comp"}, +"RBS": {"pos": "adv", "degree": "sup"}, +"RP": {"pos": "part"}, +"SYM": {"pos": "sym"}, +"TO": {"pos": "part", "parttype": "inf", "verbform": "inf"}, +"UH": {"pos": "intJ"}, +"VB": {"pos": "verb", "verbform": "inf"}, +"VBD": {"pos": "verb", "verbform": "fin", "tense": "past"}, +"VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"}, +"VBN": {"pos": "verb", "verbform": "part", "tense": "past", "aspect": "perf"}, +"VBP": {"pos": "verb", "verbform": "fin", "tense": "pres"}, +"VBZ": {"pos": "verb", "verbform": 
"fin", "tense": "pres", "number": "sing", "person": 3}, +"WDT": {"pos": "adj", "prontype": "int|rel"}, +"WP": {"pos": "noun", "prontype": "int|rel"}, +"WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"}, +"WRB": {"pos": "adv", "prontype": "int|rel"}, +"SP": {"pos": "space"}, +"ADD": {"pos": "x"}, +"NFP": {"pos": "punct"}, +"GW": {"pos": "x"}, +"AFX": {"pos": "x"}, +"HYPH": {"pos": "punct"}, +"XX": {"pos": "x"}, +"BES": {"pos": "verb"}, +"HVS": {"pos": "verb"} +} diff --git a/setup.py b/setup.py index 218272504..fe55d0d5a 100644 --- a/setup.py +++ b/setup.py @@ -153,7 +153,7 @@ def main(modules, is_pypy): MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.lexeme', 'spacy.vocab', 'spacy.attrs', - 'spacy.morphology', + 'spacy.morphology', 'spacy.tagger', 'spacy.syntax.stateclass', 'spacy._ml', 'spacy._theano', 'spacy.tokenizer', 'spacy.en.attrs', diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index 18908e89e..56c080fa6 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -91,6 +91,8 @@ cdef class Model: count_feats(counts[guess], feats, n_feats, -cost) self._model.update(counts) - def end_training(self): + def end_training(self, model_loc=None): + if model_loc is None: + model_loc = self.model_loc self._model.end_training() - self._model.dump(self.model_loc, freq_thresh=0) + self._model.dump(model_loc, freq_thresh=0) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index d2ace1cff..c810762ef 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -84,3 +84,4 @@ cpdef enum attr_id_t: ENT_TYPE HEAD SPACY + PROB diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index a04b615da..f68ff196e 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -1,181 +1,12 @@ -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function + from os import path -import re -import struct -import json -from .. import orth -from ..vocab import Vocab -from ..tokenizer import Tokenizer -from ..syntax.arc_eager import ArcEager -from ..syntax.ner import BiluoPushDown -from ..syntax.parser import ParserFactory -from ..serialize.bits import BitArray -from ..matcher import Matcher +from ..language import Language -from ..tokens import Doc -from ..multi_words import RegexMerger - -from .pos import EnPosTagger -from .pos import POS_TAGS -from .attrs import get_flags -from . import regexes - -from ..util import read_lang_data - -from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB - - -def get_lex_props(string, oov_prob=-30, is_oov=False): - return { - 'flags': get_flags(string, is_oov=is_oov), - 'length': len(string), - 'orth': string, - 'lower': string.lower(), - 'norm': string, - 'shape': orth.word_shape(string), - 'prefix': string[0], - 'suffix': string[-3:], - 'cluster': 0, - 'prob': oov_prob, - 'sentiment': 0 - } - -if_model_present = -1 LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') - -class English(object): - """The English NLP pipeline. 
- - Example: - - Load data from default directory: - - >>> nlp = English() - >>> nlp = English(data_dir=u'') - - Load data from specified directory: - - >>> nlp = English(data_dir=u'path/to/data_directory') - - Disable (and avoid loading) parts of the processing pipeline: - - >>> nlp = English(vectors=False, parser=False, tagger=False, entity=False) - - Start with nothing loaded: - - >>> nlp = English(data_dir=None) - """ - ParserTransitionSystem = ArcEager - EntityTransitionSystem = BiluoPushDown - - def __init__(self, - data_dir=LOCAL_DATA_DIR, - Tokenizer=Tokenizer.from_dir, - Tagger=EnPosTagger, - Parser=ParserFactory(ParserTransitionSystem), - Entity=ParserFactory(EntityTransitionSystem), - Matcher=Matcher.from_dir, - Packer=None, - load_vectors=True - ): - self.data_dir = data_dir - - if path.exists(path.join(data_dir, 'vocab', 'oov_prob')): - oov_prob = float(open(path.join(data_dir, 'vocab', 'oov_prob')).read()) - else: - oov_prob = None - - self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None, - get_lex_props=get_lex_props, load_vectors=load_vectors, - pos_tags=POS_TAGS, - oov_prob=oov_prob) - if Tagger is True: - Tagger = EnPosTagger - if Parser is True: - transition_system = self.ParserTransitionSystem - Parser = lambda s, d: parser.Parser(s, d, transition_system) - if Entity is True: - transition_system = self.EntityTransitionSystem - Entity = lambda s, d: parser.Parser(s, d, transition_system) - - self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer')) - - if Tagger and path.exists(path.join(data_dir, 'pos')): - self.tagger = Tagger(self.vocab.strings, data_dir) - else: - self.tagger = None - if Parser and path.exists(path.join(data_dir, 'deps')): - self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps')) - else: - self.parser = None - if Entity and path.exists(path.join(data_dir, 'ner')): - self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner')) - else: - self.entity = None - if Matcher: - self.matcher = Matcher(self.vocab, data_dir) - else: - self.matcher = None - if Packer: - self.packer = Packer(self.vocab, data_dir) - else: - self.packer = None - self.mwe_merger = RegexMerger([ - ('IN', 'O', regexes.MW_PREPOSITIONS_RE), - ('CD', 'TIME', regexes.TIME_RE), - ('NNP', 'DATE', regexes.DAYS_RE), - ('CD', 'MONEY', regexes.MONEY_RE)]) - - def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False): - """Apply the pipeline to some text. The text can span multiple sentences, - and can contain arbtrary whitespace. Alignment into the original string - is preserved. - - Args: - text (unicode): The text to be processed. - - Returns: - tokens (spacy.tokens.Doc): - - >>> from spacy.en import English - >>> nlp = English() - >>> tokens = nlp('An example sentence. 
Another example sentence.') - >>> tokens[0].orth_, tokens[0].head.tag_ - ('An', 'NN') - """ - tokens = self.tokenizer(text) - if self.tagger and tag: - self.tagger(tokens) - if self.matcher and entity: - self.matcher(tokens) - if self.parser and parse: - self.parser(tokens) - if self.entity and entity: - self.entity(tokens) - if merge_mwes and self.mwe_merger is not None: - self.mwe_merger(tokens) - return tokens - - def end_training(self, data_dir=None): - if data_dir is None: - data_dir = self.data_dir - self.parser.model.end_training() - self.entity.model.end_training() - self.tagger.model.end_training() - self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt')) - - with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_: - file_.write( - json.dumps([ - (TAG, list(self.tagger.freqs[TAG].items())), - (DEP, list(self.parser.moves.freqs[DEP].items())), - (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())), - (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())), - (HEAD, list(self.parser.moves.freqs[HEAD].items()))])) - - @property - def tags(self): - """Deprecated. List of part-of-speech tag names.""" - return self.tagger.tag_names +class English(Language): + @classmethod + def default_data_dir(cls): + return LOCAL_DATA_DIR diff --git a/spacy/en/lemmatizer.py b/spacy/en/lemmatizer.py deleted file mode 100644 index 5883e12c8..000000000 --- a/spacy/en/lemmatizer.py +++ /dev/null @@ -1,105 +0,0 @@ -from __future__ import unicode_literals -from os import path -import codecs - - -NOUN_RULES = ( - ('s', ''), - ('ses', 's'), - ('ves', 'f'), - ('xes', 'x'), - ('zes', 'z'), - ('ches', 'ch'), - ('shes', 'sh'), - ('men', 'man'), - ('ies', 'y') -) - - -VERB_RULES = ( - ("s", ""), - ("ies", "y"), - ("es", "e"), - ("es", ""), - ("ed", "e"), - ("ed", ""), - ("ing", "e"), - ("ing", "") -) - - -ADJ_RULES = ( - ("er", ""), - ("est", ""), - ("er", "e"), - ("est", "e") -) - - -class Lemmatizer(object): - def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id): - self.noun_id = noun_id - self.verb_id = verb_id - self.adj_id = adj_id - self.index = {} - self.exc = {} - for pos in ['adj', 'adv', 'noun', 'verb']: - self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos)) - self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos)) - - def __call__(self, string, pos): - if pos == self.noun_id: - return self.noun(string) - elif pos == self.verb_id: - return self.verb(string) - elif pos == self.adj_id: - return self.adj(string) - else: - raise Exception("Cannot lemmatize with unknown pos: %s" % pos) - - def noun(self, string): - return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES) - - def verb(self, string): - return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES) - - def adj(self, string): - return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES) - - -def lemmatize(string, index, exceptions, rules): - string = string.lower() - forms = [] - if string in index: - forms.append(string) - forms.extend(exceptions.get(string, [])) - for old, new in rules: - if string.endswith(old): - form = string[:len(string) - len(old)] + new - if form in index: - forms.append(form) - if not forms: - forms.append(string) - return set(forms) - - -def read_index(loc): - index = set() - for line in codecs.open(loc, 'r', 'utf8'): - if line.startswith(' '): - continue - pieces = line.split() - word = pieces[0] - if word.count('_') == 0: - index.add(word) - return index - - -def read_exc(loc): - exceptions = {} - for line 
in codecs.open(loc, 'r', 'utf8'): - if line.startswith(' '): - continue - pieces = line.split() - exceptions[pieces[0]] = tuple(pieces[1:]) - return exceptions diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd index 2fc7b4ac7..213752cf5 100644 --- a/spacy/en/pos.pxd +++ b/spacy/en/pos.pxd @@ -1,26 +1,5 @@ -from preshed.maps cimport PreshMapArray -from preshed.counter cimport PreshCounter -from cymem.cymem cimport Pool - -from .._ml cimport Model -from ..strings cimport StringStore -from ..structs cimport TokenC, LexemeC, Morphology, PosTag -from ..parts_of_speech cimport univ_pos_t -from .lemmatizer import Lemmatizer +from ..tagger cimport Tagger -cdef class EnPosTagger: - cdef readonly Pool mem - cdef readonly StringStore strings - cdef readonly Model model - cdef public object lemmatizer - cdef PreshMapArray _morph_cache - cdef public dict freqs - - cdef PosTag* tags - cdef readonly object tag_names - cdef readonly object tag_map - cdef readonly int n_tags - - cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 - cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 +cdef class EnPosTagger(Tagger): + pass diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index 569b209fc..8e034eadf 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -1,389 +1,11 @@ from os import path -import json -import os -import shutil -from libc.string cimport memset +from ..parts_of_speech cimport NOUN, VERB, ADJ -from cymem.cymem cimport Address -from thinc.typedefs cimport atom_t, weight_t -from collections import defaultdict - -from ..parts_of_speech cimport univ_pos_t -from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON - -from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE -from ..structs cimport TokenC, Morphology, LexemeC -from ..tokens.doc cimport Doc -from ..morphology cimport set_morph_from_dict -from .._ml cimport arg_max - -from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL -from ..typedefs cimport attr_t - -from .lemmatizer import Lemmatizer +from ..lemmatizer import Lemmatizer -cpdef enum en_person_t: - NO_PERSON - FIRST - SECOND - THIRD - NON_THIRD - - -cpdef enum en_number_t: - NO_NUMBER - SINGULAR - PLURAL - MASS - - -cpdef enum en_gender_t: - NO_GENDER - MASCULINE - FEMININE - NEUTER - - -cpdef enum en_case_t: - NO_CASE - NOMINATIVE - GENITIVE - ACCUSATIVE - REFLEXIVE - DEMONYM - - -cpdef enum en_tenspect_t: - NO_TENSE - BASE_VERB - PRESENT - PAST - PASSIVE - ING - MODAL - - -cpdef enum misc_t: - NO_MISC - COMPARATIVE - SUPERLATIVE - RELATIVE - NAME - - -cpdef enum: - P2_orth - P2_cluster - P2_shape - P2_prefix - P2_suffix - P2_pos - P2_lemma - P2_flags - - P1_orth - P1_cluster - P1_shape - P1_prefix - P1_suffix - P1_pos - P1_lemma - P1_flags - - W_orth - W_cluster - W_shape - W_prefix - W_suffix - W_pos - W_lemma - W_flags - - N1_orth - N1_cluster - N1_shape - N1_prefix - N1_suffix - N1_pos - N1_lemma - N1_flags - - N2_orth - N2_cluster - N2_shape - N2_prefix - N2_suffix - N2_pos - N2_lemma - N2_flags - - N_CONTEXT_FIELDS - - -POS_TAGS = { - 'NULL': (NO_TAG, {}), - 'EOL': (EOL, {}), - 'CC': (CONJ, {}), - 'CD': (NUM, {}), - 'DT': (DET, {}), - 'EX': (DET, {}), - 'FW': (X, {}), - 'IN': (ADP, {}), - 'JJ': (ADJ, {}), - 'JJR': (ADJ, {'misc': COMPARATIVE}), - 'JJS': (ADJ, {'misc': SUPERLATIVE}), - 'LS': (X, {}), - 'MD': (VERB, {'tenspect': MODAL}), - 'NN': (NOUN, {}), - 'NNS': (NOUN, {'number': PLURAL}), - 'NNP': (NOUN, {'misc': NAME}), - 'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}), - 'PDT': 
(DET, {}), - 'POS': (PRT, {'case': GENITIVE}), - 'PRP': (PRON, {}), - 'PRP$': (PRON, {'case': GENITIVE}), - 'RB': (ADV, {}), - 'RBR': (ADV, {'misc': COMPARATIVE}), - 'RBS': (ADV, {'misc': SUPERLATIVE}), - 'RP': (PRT, {}), - 'SYM': (X, {}), - 'TO': (PRT, {}), - 'UH': (X, {}), - 'VB': (VERB, {}), - 'VBD': (VERB, {'tenspect': PAST}), - 'VBG': (VERB, {'tenspect': ING}), - 'VBN': (VERB, {'tenspect': PASSIVE}), - 'VBP': (VERB, {'tenspect': PRESENT}), - 'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}), - 'WDT': (DET, {'misc': RELATIVE}), - 'WP': (PRON, {'misc': RELATIVE}), - 'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}), - 'WRB': (ADV, {'misc': RELATIVE}), - '!': (PUNCT, {}), - '#': (PUNCT, {}), - '$': (PUNCT, {}), - "''": (PUNCT, {}), - "(": (PUNCT, {}), - ")": (PUNCT, {}), - "-LRB-": (PUNCT, {}), - "-RRB-": (PUNCT, {}), - ".": (PUNCT, {}), - ",": (PUNCT, {}), - "``": (PUNCT, {}), - ":": (PUNCT, {}), - "?": (PUNCT, {}), - "ADD": (X, {}), - "NFP": (PUNCT, {}), - "GW": (X, {}), - "AFX": (X, {}), - "HYPH": (PUNCT, {}), - "XX": (X, {}), - "BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}), - "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}), - "SP": (SPACE, {}) -} - - -POS_TEMPLATES = ( - (W_orth,), - (P1_lemma, P1_pos), - (P2_lemma, P2_pos), - (N1_orth,), - (N2_orth,), - - (W_suffix,), - (W_prefix,), - - (P1_pos,), - (P2_pos,), - (P1_pos, P2_pos), - (P1_pos, W_orth), - (P1_suffix,), - (N1_suffix,), - - (W_shape,), - (W_cluster,), - (N1_cluster,), - (N2_cluster,), - (P1_cluster,), - (P2_cluster,), - - (W_flags,), - (N1_flags,), - (N2_flags,), - (P1_flags,), - (P2_flags,), -) - - -cdef struct _CachedMorph: - Morphology morph - int lemma - - -def setup_model_dir(tag_names, tag_map, templates, model_dir): - if path.exists(model_dir): - shutil.rmtree(model_dir) - os.mkdir(model_dir) - config = { - 'templates': templates, - 'tag_names': tag_names, - 'tag_map': tag_map - } - with open(path.join(model_dir, 'config.json'), 'w') as file_: - json.dump(config, file_) - - -cdef class EnPosTagger: +cdef class EnPosTagger(Tagger): """A part-of-speech tagger for English""" - def __init__(self, StringStore strings, data_dir): - self.mem = Pool() - model_dir = path.join(data_dir, 'pos') - self.strings = strings - cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) - self.tag_names = sorted(cfg['tag_names']) - assert self.tag_names - self.n_tags = len(self.tag_names) - self.tag_map = cfg['tag_map'] - cdef int n_tags = len(self.tag_names) + 1 - - self.model = Model(n_tags, cfg['templates'], model_dir) - self._morph_cache = PreshMapArray(n_tags) - self.tags = self.mem.alloc(n_tags, sizeof(PosTag)) - for i, tag in enumerate(sorted(self.tag_names)): - pos, props = self.tag_map[tag] - self.tags[i].id = i - self.tags[i].pos = pos - set_morph_from_dict(&self.tags[i].morph, props) - if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')): - self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer', - 'morphs.json')))) - self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ) - self.freqs = {TAG: defaultdict(int)} - for tag in self.tag_names: - self.freqs[TAG][self.strings[tag]] = 1 - self.freqs[TAG][0] = 1 - - def __call__(self, Doc tokens): - """Apply the tagger, setting the POS tags onto the Doc object. - - Args: - tokens (Doc): The tokens to be tagged. 
- """ - if tokens.length == 0: - return 0 - cdef int i - cdef atom_t[N_CONTEXT_FIELDS] context - cdef const weight_t* scores - for i in range(tokens.length): - if tokens.data[i].pos == 0: - fill_context(context, i, tokens.data) - scores = self.model.score(context) - guess = arg_max(scores, self.model.n_classes) - tokens.data[i].tag = self.strings[self.tag_names[guess]] - self.set_morph(i, &self.tags[guess], tokens.data) - - tokens.is_tagged = True - tokens._py_tokens = [None] * tokens.length - - def tag_from_strings(self, Doc tokens, object tag_strs): - cdef int i - for i in range(tokens.length): - tokens.data[i].tag = self.strings[tag_strs[i]] - self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])], - tokens.data) - tokens.is_tagged = True - tokens._py_tokens = [None] * tokens.length - - def train(self, Doc tokens, object gold_tag_strs): - cdef int i - cdef int loss - cdef atom_t[N_CONTEXT_FIELDS] context - cdef const weight_t* scores - golds = [self.tag_names.index(g) if g is not None else -1 - for g in gold_tag_strs] - correct = 0 - for i in range(tokens.length): - fill_context(context, i, tokens.data) - scores = self.model.score(context) - guess = arg_max(scores, self.model.n_classes) - loss = guess != golds[i] if golds[i] != -1 else 0 - self.model.update(context, guess, golds[i], loss) - tokens.data[i].tag = self.strings[self.tag_names[guess]] - self.set_morph(i, &self.tags[guess], tokens.data) - correct += loss == 0 - self.freqs[TAG][tokens.data[i].tag] += 1 - return correct - - cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1: - tokens[i].pos = tag.pos - cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth) - if cached is NULL: - cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) - cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) - cached.morph = tag.morph - self._morph_cache.set(tag.id, tokens[i].lex.orth, cached) - tokens[i].lemma = cached.lemma - tokens[i].morph = cached.morph - - cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1: - if self.lemmatizer is None: - return lex.orth - cdef unicode py_string = self.strings[lex.orth] - if pos != NOUN and pos != VERB and pos != ADJ: - return lex.orth - cdef set lemma_strings - cdef unicode lemma_string - lemma_strings = self.lemmatizer(py_string, pos) - lemma_string = sorted(lemma_strings)[0] - lemma = self.strings[lemma_string] - return lemma - - def load_morph_exceptions(self, dict exc): - cdef unicode pos_str - cdef unicode form_str - cdef unicode lemma_str - cdef dict entries - cdef dict props - cdef int lemma - cdef attr_t orth - cdef int pos - for pos_str, entries in exc.items(): - pos = self.tag_names.index(pos_str) - for form_str, props in entries.items(): - lemma_str = props.get('L', form_str) - orth = self.strings[form_str] - cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) - cached.lemma = self.strings[lemma_str] - set_morph_from_dict(&cached.morph, props) - self._morph_cache.set(pos, orth, cached) - - -cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1: - _fill_from_token(&context[P2_orth], &tokens[i-2]) - _fill_from_token(&context[P1_orth], &tokens[i-1]) - _fill_from_token(&context[W_orth], &tokens[i]) - _fill_from_token(&context[N1_orth], &tokens[i+1]) - _fill_from_token(&context[N2_orth], &tokens[i+2]) - - -cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: - context[0] = t.lex.lower - context[1] = t.lex.cluster - context[2] = t.lex.shape - 
context[3] = t.lex.prefix - context[4] = t.lex.suffix - context[5] = t.tag - context[6] = t.lemma - if t.lex.flags & (1 << IS_ALPHA): - context[7] = 1 - elif t.lex.flags & (1 << IS_PUNCT): - context[7] = 2 - elif t.lex.flags & (1 << LIKE_URL): - context[7] = 3 - elif t.lex.flags & (1 << LIKE_NUM): - context[7] = 4 - else: - context[7] = 0 + def make_lemmatizer(self, data_dir): + return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ) diff --git a/spacy/fi/__init__.py b/spacy/fi/__init__.py new file mode 100644 index 000000000..8e7173767 --- /dev/null +++ b/spacy/fi/__init__.py @@ -0,0 +1,11 @@ +from __future__ import unicode_literals, print_function + +from os import path + +from ..language import Language + + +class Finnish(Language): + @classmethod + def default_data_dir(cls): + return path.join(path.dirname(__file__), 'data') diff --git a/spacy/language.py b/spacy/language.py new file mode 100644 index 000000000..881df7d1a --- /dev/null +++ b/spacy/language.py @@ -0,0 +1,252 @@ +from os import path + +try: + import ujson as json +except ImportError: + import json + +from .tokenizer import Tokenizer +from .vocab import Vocab +from .syntax.parser import Parser +from .tagger import Tagger +from .matcher import Matcher +from .serialize.packer import Packer +from ._ml import Model +from . import attrs +from . import orth +from .syntax.ner import BiluoPushDown +from .syntax.arc_eager import ArcEager + +from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD + + +class Language(object): + @staticmethod + def lower(string): + return string.lower() + + @staticmethod + def norm(string): + return string + + @staticmethod + def shape(string): + return orth.word_shape(string) + + @staticmethod + def prefix(string): + return string[0] + + @staticmethod + def suffix(string): + return string[-3:] + + @staticmethod + def prob(string): + return -30 + + @staticmethod + def cluster(string): + return 0 + + @staticmethod + def is_alpha(string): + return orth.is_alpha(string) + + @staticmethod + def is_ascii(string): + return orth.is_ascii(string) + + @staticmethod + def is_digit(string): + return string.isdigit() + + @staticmethod + def is_lower(string): + return orth.is_lower(string) + + @staticmethod + def is_punct(string): + return orth.is_punct(string) + + @staticmethod + def is_space(string): + return string.isspace() + + @staticmethod + def is_title(string): + return orth.is_title(string) + + @staticmethod + def is_upper(string): + return orth.is_upper(string) + + @staticmethod + def like_url(string): + return orth.like_url(string) + + @staticmethod + def like_number(string): + return orth.like_number(string) + + @staticmethod + def like_email(string): + return orth.like_email(string) + + @classmethod + def default_lex_attrs(cls, data_dir=None): + return { + attrs.LOWER: cls.lower, + attrs.NORM: cls.norm, + attrs.SHAPE: cls.shape, + attrs.PREFIX: cls.prefix, + attrs.SUFFIX: cls.suffix, + attrs.CLUSTER: cls.cluster, + attrs.PROB: lambda string: -10.0, + + attrs.IS_ALPHA: cls.is_alpha, + attrs.IS_ASCII: cls.is_ascii, + attrs.IS_DIGIT: cls.is_digit, + attrs.IS_LOWER: cls.is_lower, + attrs.IS_PUNCT: cls.is_punct, + attrs.IS_SPACE: cls.is_space, + attrs.IS_TITLE: cls.is_title, + attrs.IS_UPPER: cls.is_upper, + attrs.LIKE_URL: cls.like_url, + attrs.LIKE_NUM: cls.like_number, + attrs.LIKE_EMAIL: cls.like_email, + attrs.IS_STOP: lambda string: False, + attrs.IS_OOV: lambda string: True + } + + @classmethod + def default_dep_labels(cls): + return {0: {'ROOT': True}} + + @classmethod + def 
default_ner_labels(cls): + return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}} + + @classmethod + def default_data_dir(cls): + return path.join(path.dirname(__file__), 'data') + + @classmethod + def default_vectors(cls, data_dir): + return None + + @classmethod + def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None): + if data_dir is None: + data_dir = cls.default_data_dir() + if vectors is None: + vectors = cls.default_vectors(data_dir) + if get_lex_attr is None: + get_lex_attr = cls.default_lex_attrs(data_dir) + return Vocab.from_dir( + path.join(data_dir, 'vocab'), + get_lex_attr=get_lex_attr, + vectors=vectors) + + @classmethod + def default_tokenizer(cls, vocab, data_dir): + if path.exists(data_dir): + return Tokenizer.from_dir(vocab, data_dir) + else: + return Tokenizer(vocab, {}, None, None, None) + + @classmethod + def default_tagger(cls, vocab, data_dir): + if path.exists(data_dir): + return Tagger.from_dir(data_dir, vocab) + else: + return None + + @classmethod + def default_parser(cls, vocab, data_dir): + if path.exists(data_dir): + return Parser.from_dir(data_dir, vocab.strings, ArcEager) + else: + return None + + @classmethod + def default_entity(cls, vocab, data_dir): + if path.exists(data_dir): + return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) + else: + return None + + @classmethod + def default_matcher(cls, vocab, data_dir): + if path.exists(data_dir): + return Matcher.from_dir(data_dir, vocab) + else: + return None + + def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None, + parser=None, entity=None, matcher=None, serializer=None): + if data_dir is None: + data_dir = self.default_data_dir() + if vocab is None: + vocab = self.default_vocab(data_dir) + if tokenizer is None: + tokenizer = self.default_tokenizer(vocab, data_dir=path.join(data_dir, 'tokenizer')) + if tagger is None: + tagger = self.default_tagger(vocab, data_dir=path.join(data_dir, 'pos')) + if entity is None: + entity = self.default_entity(vocab, data_dir=path.join(data_dir, 'ner')) + if parser is None: + parser = self.default_parser(vocab, data_dir=path.join(data_dir, 'deps')) + if matcher is None: + matcher = self.default_matcher(vocab, data_dir=data_dir) + self.vocab = vocab + self.tokenizer = tokenizer + self.tagger = tagger + self.parser = parser + self.entity = entity + self.matcher = matcher + + def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False): + """Apply the pipeline to some text. The text can span multiple sentences, + and can contain arbtrary whitespace. Alignment into the original string + is preserved. + + Args: + text (unicode): The text to be processed. + + Returns: + tokens (spacy.tokens.Doc): + + >>> from spacy.en import English + >>> nlp = English() + >>> tokens = nlp('An example sentence. 
Another example sentence.') + >>> tokens[0].orth_, tokens[0].head.tag_ + ('An', 'NN') + """ + tokens = self.tokenizer(text) + if self.tagger and tag: + self.tagger(tokens) + if self.matcher and entity: + self.matcher(tokens) + if self.parser and parse: + self.parser(tokens) + if self.entity and entity: + self.entity(tokens) + return tokens + + def end_training(self, data_dir=None): + if data_dir is None: + data_dir = self.data_dir + self.parser.model.end_training(path.join(data_dir, 'deps', 'model')) + self.entity.model.end_training(path.join(data_dir, 'ner', 'model')) + self.tagger.model.end_training(path.join(data_dir, 'pos', 'model')) + self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt')) + + with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_: + file_.write( + json.dumps([ + (TAG, list(self.tagger.freqs[TAG].items())), + (DEP, list(self.parser.moves.freqs[DEP].items())), + (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())), + (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())), + (HEAD, list(self.parser.moves.freqs[HEAD].items()))])) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py new file mode 100644 index 000000000..5e08e80a4 --- /dev/null +++ b/spacy/lemmatizer.py @@ -0,0 +1,86 @@ +from __future__ import unicode_literals +from os import path +import codecs + +try: + import ujson as json +except ImportError: + import json + +from .parts_of_speech import NOUN, VERB, ADJ + + +class Lemmatizer(object): + @classmethod + def from_dir(cls, data_dir): + index = {} + exc = {} + for pos in ['adj', 'adv', 'noun', 'verb']: + index[pos] = read_index(path.join(data_dir, 'index.%s' % pos)) + exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos)) + rules = json.load(open(path.join(data_dir, 'lemma_rules.json'))) + return cls(index, exc, rules) + + def __init__(self, index, exceptions, rules): + self.index = index + self.exc = exceptions + self.rules = rules + + def __call__(self, string, pos): + if pos == NOUN: + pos = 'noun' + elif pos == VERB: + pos = 'verb' + elif pos == ADJ: + pos = 'adj' + else: + return string + lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules.get(pos, [])) + return min(lemmas) + + def noun(self, string): + return self(string, 'noun') + + def verb(self, string): + return self(string, 'verb') + + def adj(self, string): + return self(string, 'adj') + + +def lemmatize(string, index, exceptions, rules): + string = string.lower() + forms = [] + if string in index: + forms.append(string) + forms.extend(exceptions.get(string, [])) + for old, new in rules: + if string.endswith(old): + form = string[:len(string) - len(old)] + new + if form in index: + forms.append(form) + if not forms: + forms.append(string) + return set(forms) + + +def read_index(loc): + index = set() + for line in codecs.open(loc, 'r', 'utf8'): + if line.startswith(' '): + continue + pieces = line.split() + word = pieces[0] + if word.count('_') == 0: + index.add(word) + return index + + +def read_exc(loc): + exceptions = {} + for line in codecs.open(loc, 'r', 'utf8'): + if line.startswith(' '): + continue + pieces = line.split() + exceptions[pieces[0]] = tuple(pieces[1:]) + return exceptions diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 130966765..f4f8d1e7f 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -17,6 +17,7 @@ cdef class Lexeme: cdef readonly attr_t orth @staticmethod +<<<<<<< HEAD cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length): cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, 
lex.orth) self.c = lex @@ -41,11 +42,30 @@ cdef class Lexeme: lex.suffix = value elif name == CLUSTER: lex.cluster = value +======= + cdef inline int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: + lex.length = props['length'] + lex.orth = vocab.strings[props['orth']] + lex.lower = vocab.strings[props['lower']] + lex.norm = vocab.strings[props['norm']] + lex.shape = vocab.strings[props['shape']] + lex.prefix = vocab.strings[props['prefix']] + lex.suffix = vocab.strings[props['suffix']] + + lex.cluster = props['cluster'] + lex.prob = props['prob'] + lex.sentiment = props['sentiment'] + + lex.flags = props['flags'] +>>>>>>> de @staticmethod cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: if feat_name < (sizeof(flags_t) * 8): - return Lexeme.check_flag(lex, feat_name) + if Lexeme.check_flag(lex, feat_name): + return 1 + else: + return 0 elif feat_name == ID: return lex.id elif feat_name == ORTH: @@ -66,9 +86,29 @@ cdef class Lexeme: return lex.cluster else: return 0 + + @staticmethod + cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil: + if name < (sizeof(flags_t) * 8): + Lexeme.set_flag(lex, name, value) + elif name == ID: + lex.id = value + elif name == LOWER: + lex.lower = value + elif name == NORM: + lex.norm = value + elif name == SHAPE: + lex.shape = value + elif name == PREFIX: + lex.prefix = value + elif name == SUFFIX: + lex.suffix = value + elif name == CLUSTER: + lex.cluster = value @staticmethod cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: +<<<<<<< HEAD return lexeme.flags & (1 << flag_id) @staticmethod @@ -78,3 +118,17 @@ cdef class Lexeme: lexeme.flags |= one << flag_id else: lexeme.flags &= ~(one << flag_id) +======= + if lexeme.flags & (1 << flag_id): + return True + else: + return False + + @staticmethod + cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil: + cdef flags_t one = 1 + if value: + lex.flags |= one << flag_id + else: + lex.flags &= ~(one << flag_id) +>>>>>>> de diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 832f4fec7..8ec238e32 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -27,6 +27,17 @@ cdef class Lexeme: self.vocab = vocab self.orth = orth self.c = vocab.get_by_orth(vocab.mem, orth) + assert self.c.orth == orth + + def py_set_flag(self, attr_id_t flag_id): + Lexeme.set_flag(self.c, flag_id, True) + + def py_check_flag(self, attr_id_t flag_id): + return True if Lexeme.check_flag(self.c, flag_id) else False + + property orth_: + def __get__(self): + return self.vocab.strings[self.c.orth] property lower: def __get__(self): return self.c.lower @@ -48,9 +59,13 @@ cdef class Lexeme: def __get__(self): return self.c.suffix def __set__(self, int x): self.c.suffix = x - property orth_: - def __get__(self): - return self.vocab.strings[self.c.orth] + property cluster: + def __get__(self): return self.c.suffix + def __set__(self, int x): self.c.suffix = x + + property prob: + def __get__(self): return self.c.suffix + def __set__(self, int x): self.c.suffix = x property lower_: def __get__(self): return self.vocab.strings[self.c.lower] @@ -72,6 +87,10 @@ cdef class Lexeme: def __get__(self): return self.c.suffix def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x] + property flags: + def __get__(self): return self.c.flags + def __set__(self, flags_t x): self.c.flags = x + property is_oov: def __get__(self): return Lexeme.check_flag(self.c, IS_OOV) def __set__(self, bint x): Lexeme.set_flag(self.c, 
IS_OOV, x) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index caafe6498..b8a45d469 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -8,6 +8,7 @@ from cymem.cymem cimport Pool from libcpp.vector cimport vector from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE +from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 from .tokens.doc cimport get_token_attr from .tokens.doc cimport Doc from .vocab cimport Vocab @@ -53,6 +54,8 @@ cdef int match(const Pattern* pattern, const TokenC* token) except -1: cdef int i for i in range(pattern.length): if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value: + print "Pattern fail", pattern.spec[i].attr, pattern.spec[i].value + print get_token_attr(token, pattern.spec[i].attr) return False return True @@ -76,7 +79,10 @@ def _convert_strings(token_specs, string_store): attr = map_attr_name(attr) if isinstance(value, basestring): value = string_store[value] + if isinstance(value, bool): + value = int(value) converted[-1].append((attr, value)) + print "Converted", converted[-1] return converted @@ -92,6 +98,32 @@ def map_attr_name(attr): return SHAPE elif attr == 'NORM': return NORM + elif attr == 'FLAG13': + return FLAG13 + elif attr == 'FLAG14': + return FLAG14 + elif attr == 'FLAG15': + return FLAG15 + elif attr == 'FLAG16': + return FLAG16 + elif attr == 'FLAG17': + return FLAG17 + elif attr == 'FLAG18': + return FLAG18 + elif attr == 'FLAG19': + return FLAG19 + elif attr == 'FLAG20': + return FLAG20 + elif attr == 'FLAG21': + return FLAG21 + elif attr == 'FLAG22': + return FLAG22 + elif attr == 'FLAG23': + return FLAG23 + elif attr == 'FLAG24': + return FLAG24 + elif attr == 'FLAG25': + return FLAG25 else: raise Exception("TODO: Finish supporting attr mapping %s" % attr) @@ -99,14 +131,28 @@ def map_attr_name(attr): cdef class Matcher: cdef Pool mem cdef vector[Pattern*] patterns - cdef readonly int n_patterns + cdef readonly Vocab vocab def __init__(self, vocab, patterns): self.vocab = vocab self.mem = Pool() + self.vocab = vocab for entity_key, (etype, attrs, specs) in sorted(patterns.items()): self.add(entity_key, etype, attrs, specs) + @classmethod + def from_dir(cls, data_dir, Vocab vocab): + patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json') + if path.exists(patterns_loc): + patterns_data = open(patterns_loc).read() + patterns = json.loads(patterns_data) + return cls(vocab, patterns) + else: + return cls(vocab, {}) + + property n_patterns: + def __get__(self): return self.patterns.size() + def add(self, entity_key, etype, attrs, specs): if isinstance(entity_key, basestring): entity_key = self.vocab.strings[entity_key] @@ -120,16 +166,6 @@ cdef class Matcher: spec = _convert_strings(spec, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, spec, etype)) - @classmethod - def from_dir(cls, vocab, data_dir): - patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json') - if path.exists(patterns_loc): - patterns_data = open(patterns_loc).read() - patterns = json.loads(patterns_data) - return cls(vocab, patterns) - else: - return cls(vocab, {}) - def __call__(self, Doc doc): cdef vector[Pattern*] partials cdef int n_partials = 0 @@ -139,11 +175,13 @@ cdef class Matcher: cdef Pattern* state matches = [] for token_i in range(doc.length): + print 'check', doc[token_i].orth_ token = &doc.data[token_i] q = 0 for i in range(partials.size()): state = partials.at(i) if match(state, token): + print 'match!' 
if is_final(state): matches.append(get_entity(state, token, token_i)) else: @@ -153,6 +191,7 @@ cdef class Matcher: for i in range(self.n_patterns): state = self.patterns[i] if match(state, token): + print 'match!' if is_final(state): matches.append(get_entity(state, token, token_i)) else: diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 5dfee4250..2229da0ad 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,4 +1,755 @@ -from .structs cimport TokenC, Morphology, PosTag +from cymem.cymem cimport Pool +from preshed.maps cimport PreshMapArray +from libc.stdint cimport uint64_t + +from .structs cimport TokenC +from .strings cimport StringStore +from .typedefs cimport attr_t +from .parts_of_speech cimport univ_pos_t -cdef int set_morph_from_dict(Morphology* morph, dict props) except -1 +cdef struct RichTagC: + uint64_t morph + int id + univ_pos_t pos + attr_t name + + +cdef struct MorphAnalysisC: + RichTagC tag + attr_t lemma + + +cdef class Morphology: + cdef readonly Pool mem + cdef readonly StringStore strings + cdef public object lemmatizer + cdef public object n_tags + cdef public object reverse_index + cdef public object tag_names + + cdef RichTagC* rich_tags + cdef PreshMapArray _cache + + cdef int assign_tag(self, TokenC* token, tag) except -1 + + cdef int assign_feature(self, uint64_t* morph, feature, value) except -1 + + + +# +#cpdef enum Feature_t: +# Abbr +# AdpType +# AdvType +# ConjType +# Connegative +# Derivation +# Echo +# Foreign +# Gender_dat +# Gender_erg +# Gender_psor +# Hyph +# InfForm +# NameType +# NounType +# NumberAbs +# NumberDat +# NumberErg +# NumberPsee +# NumberPsor +# NumForm +# NumValue +# PartForm +# PartType +# Person_abs +# Person_dat +# Person_psor +# Polite +# Polite_abs +# Polite_dat +# Prefix +# PrepCase +# PunctSide +# PunctType +# Style +# Typo +# Variant +# VerbType +# +# +#cpdef enum Animacy: +# Anim +# Inam +# +# +#cpdef enum Aspect: +# Freq +# Imp +# Mod +# None_ +# Perf +# +# +#cpdef enum Case1: +# Nom +# Gen +# Acc +# Dat +# Voc +# Abl +# +#cdef enum Case2: +# Abe +# Abs +# Ade +# All +# Cau +# Com +# Del +# Dis +# +#cdef enum Case3: +# Ela +# Ess +# Ill +# Ine +# Ins +# Loc +# Lat +# Par +# +#cdef enum Case4: +# Sub +# Sup +# Tem +# Ter +# Tra +# +# +#cpdef enum Definite: +# Two +# Def +# Red +# Ind +# +# +#cpdef enum Degree: +# Cmp +# Comp +# None_ +# Pos +# Sup +# Abs +# Com +# Degree # du +# +# +#cpdef enum Gender: +# Com +# Fem +# Masc +# Neut +# +# +#cpdef enum Mood: +# Cnd +# Imp +# Ind +# N +# Pot +# Sub +# Opt +# +# +#cpdef enum Negative: +# Neg +# Pos +# Yes +# +# +#cpdef enum Number: +# Com +# Dual +# None_ +# Plur +# Sing +# Ptan # bg +# Count # bg +# +# +#cpdef enum NumType: +# Card +# Dist +# Frac +# Gen +# Mult +# None_ +# Ord +# Sets +# +# +#cpdef enum Person: +# One +# Two +# Three +# None_ +# +# +#cpdef enum Poss: +# Yes +# +# +#cpdef enum PronType1: +# AdvPart +# Art +# Default +# Dem +# Ind +# Int +# Neg +# +#cpdef enum PronType2: +# Prs +# Rcp +# Rel +# Tot +# Clit +# Exc # es, ca, it, fa +# Clit # it +# +# +#cpdef enum Reflex: +# Yes +# +# +#cpdef enum Tense: +# Fut +# Imp +# Past +# Pres +# +#cpdef enum VerbForm1: +# Fin +# Ger +# Inf +# None_ +# Part +# PartFut +# PartPast +# +#cpdef enum VerbForm2: +# PartPres +# Sup +# Trans +# Gdv # la +# +# +#cpdef enum Voice: +# Act +# Cau +# Pass +# Mid # gkc +# Int # hb +# +# +#cpdef enum Abbr: +# Yes # cz, fi, sl, U +# +#cpdef enum AdpType: +# Prep # cz, U +# Post # U +# Voc # cz +# Comprep # cz +# Circ # U +# Voc # U +# +# +#cpdef enum AdvType1: +# # 
U +# Man +# Loc +# Tim +# Deg +# Cau +# Mod +# Sta +# Ex +# +#cpdef enum AdvType2: +# Adadj +# +#cpdef enum ConjType: +# Oper # cz, U +# Comp # cz, U +# +#cpdef enum Connegative: +# Yes # fi +# +# +#cpdef enum Derivation1: +# Minen # fi +# Sti # fi +# Inen # fi +# Lainen # fi +# Ja # fi +# Ton # fi +# Vs # fi +# Ttain # fi +# +#cpdef enum Derivation2: +# Ttaa +# +# +#cpdef enum Echo: +# Rdp # U +# Ech # U +# +# +#cpdef enum Foreign: +# Foreign # cz, fi, U +# Fscript # cz, fi, U +# Tscript # cz, U +# Yes # sl +# +# +#cpdef enum Gender_dat: +# Masc # bq, U +# Fem # bq, U +# +# +#cpdef enum Gender_erg: +# Masc # bq +# Fem # bq +# +# +#cpdef enum Gender_psor: +# Masc # cz, sl, U +# Fem # cz, sl, U +# Neut # sl +# +# +#cpdef enum Hyph: +# Yes # cz, U +# +# +#cpdef enum InfForm: +# One # fi +# Two # fi +# Three # fi +# +# +#cpdef enum NameType: +# Geo # U, cz +# Prs # U, cz +# Giv # U, cz +# Sur # U, cz +# Nat # U, cz +# Com # U, cz +# Pro # U, cz +# Oth # U, cz +# +# +#cpdef enum NounType: +# Com # U +# Prop # U +# Class # U +# +#cpdef enum Number_abs: +# Sing # bq, U +# Plur # bq, U +# +#cpdef enum Number_dat: +# Sing # bq, U +# Plur # bq, U +# +#cpdef enum Number_erg: +# Sing # bq, U +# Plur # bq, U +# +#cpdef enum Number_psee: +# Sing # U +# Plur # U +# +# +#cpdef enum Number_psor: +# Sing # cz, fi, sl, U +# Plur # cz, fi, sl, U +# +# +#cpdef enum NumForm: +# Digit # cz, sl, U +# Roman # cz, sl, U +# Word # cz, sl, U +# +# +#cpdef enum NumValue: +# One # cz, U +# Two # cz, U +# Three # cz, U +# +# +#cpdef enum PartForm: +# Pres # fi +# Past # fi +# Agt # fi +# Neg # fi +# +# +#cpdef enum PartType: +# Mod # U +# Emp # U +# Res # U +# Inf # U +# Vbp # U +# +#cpdef enum Person_abs: +# One # bq, U +# Two # bq, U +# Three # bq, U +# +# +#cpdef enum Person_dat: +# One # bq, U +# Two # bq, U +# Three # bq, U +# +# +#cpdef enum Person_erg: +# One # bq, U +# Two # bq, U +# Three # bq, U +# +# +#cpdef enum Person_psor: +# One # fi, U +# Two # fi, U +# Three # fi, U +# +# +#cpdef enum Polite: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Polite_abs: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Polite_erg: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Polite_dat: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Prefix: +# Yes # U +# +# +#cpdef enum PrepCase: +# Npr # cz +# Pre # U +# +# +#cpdef enum PunctSide: +# Ini # U +# Fin # U +# +#cpdef enum PunctType1: +# Peri # U +# Qest # U +# Excl # U +# Quot # U +# Brck # U +# Comm # U +# Colo # U +# Semi # U +# +#cpdef enum PunctType2: +# Dash # U +# +# +#cpdef enum Style1: +# Arch # cz, fi, U +# Rare # cz, fi, U +# Poet # cz, U +# Norm # cz, U +# Coll # cz, U +# Vrnc # cz, U +# Sing # cz, U +# Expr # cz, U +# +# +#cpdef enum Style2: +# Derg # cz, U +# Vulg # cz, U +# +# +#cpdef enum Typo: +# Yes # fi, U +# +# +#cpdef enum Variant: +# Short # cz +# Bound # cz, sl +# +# +#cpdef enum VerbType: +# Aux # U +# Cop # U +# Mod # U +# Light # U +# + +cpdef enum Value_t: + Animacy_Anim + Animacy_Inam + Aspect_Freq + Aspect_Imp + Aspect_Mod + Aspect_None_ + Aspect_Perf + Case_Abe + Case_Abl + Case_Abs + Case_Acc + Case_Ade + Case_All + Case_Cau + Case_Com + Case_Dat + Case_Del + Case_Dis + Case_Ela + Case_Ess + Case_Gen + Case_Ill + Case_Ine + Case_Ins + Case_Loc + Case_Lat + Case_Nom + Case_Par + Case_Sub + Case_Sup + Case_Tem + Case_Ter + Case_Tra + Case_Voc + Definite_Two + Definite_Def + Definite_Red + Definite_Ind + Degree_Cmp + Degree_Comp + Degree_None + Degree_Pos + Degree_Sup + Degree_Abs + Degree_Com + Degree_Dim # du + Gender_Com + Gender_Fem + 
Gender_Masc + Gender_Neut + Mood_Cnd + Mood_Imp + Mood_Ind + Mood_N + Mood_Pot + Mood_Sub + Mood_Opt + Negative_Neg + Negative_Pos + Negative_Yes + Number_Com + Number_Dual + Number_None + Number_Plur + Number_Sing + Number_Ptan # bg + Number_Count # bg + NumType_Card + NumType_Dist + NumType_Frac + NumType_Gen + NumType_Mult + NumType_None + NumType_Ord + NumType_Sets + Person_One + Person_Two + Person_Three + Person_None + Poss_Yes + PronType_AdvPart + PronType_Art + PronType_Default + PronType_Dem + PronType_Ind + PronType_Int + PronType_Neg + PronType_Prs + PronType_Rcp + PronType_Rel + PronType_Tot + PronType_Clit + PronType_Exc # es, ca, it, fa + Reflex_Yes + Tense_Fut + Tense_Imp + Tense_Past + Tense_Pres + VerbForm_Fin + VerbForm_Ger + VerbForm_Inf + VerbForm_None + VerbForm_Part + VerbForm_PartFut + VerbForm_PartPast + VerbForm_PartPres + VerbForm_Sup + VerbForm_Trans + VerbForm_Gdv # la + Voice_Act + Voice_Cau + Voice_Pass + Voice_Mid # gkc + Voice_Int # hb + Abbr_Yes # cz, fi, sl, U + AdpType_Prep # cz, U + AdpType_Post # U + AdpType_Voc # cz + AdpType_Comprep # cz + AdpType_Circ # U + AdvType_Man + AdvType_Loc + AdvType_Tim + AdvType_Deg + AdvType_Cau + AdvType_Mod + AdvType_Sta + AdvType_Ex + AdvType_Adadj + ConjType_Oper # cz, U + ConjType_Comp # cz, U + Connegative_Yes # fi + Derivation_Minen # fi + Derivation_Sti # fi + Derivation_Inen # fi + Derivation_Lainen # fi + Derivation_Ja # fi + Derivation_Ton # fi + Derivation_Vs # fi + Derivation_Ttain # fi + Derivation_Ttaa # fi + Echo_Rdp # U + Echo_Ech # U + Foreign_Foreign # cz, fi, U + Foreign_Fscript # cz, fi, U + Foreign_Tscript # cz, U + Foreign_Yes # sl + Gender_dat_Masc # bq, U + Gender_dat_Fem # bq, U + Gender_erg_Masc # bq + Gender_erg_Fem # bq + Gender_psor_Masc # cz, sl, U + Gender_psor_Fem # cz, sl, U + Gender_psor_Neut # sl + Hyph_Yes # cz, U + InfForm_One # fi + InfForm_Two # fi + InfForm_Three # fi + NameType_Geo # U, cz + NameType_Prs # U, cz + NameType_Giv # U, cz + NameType_Sur # U, cz + NameType_Nat # U, cz + NameType_Com # U, cz + NameType_Pro # U, cz + NameType_Oth # U, cz + NounType_Com # U + NounType_Prop # U + NounType_Class # U + Number_abs_Sing # bq, U + Number_abs_Plur # bq, U + Number_dat_Sing # bq, U + Number_dat_Plur # bq, U + Number_erg_Sing # bq, U + Number_erg_Plur # bq, U + Number_psee_Sing # U + Number_psee_Plur # U + Number_psor_Sing # cz, fi, sl, U + Number_psor_Plur # cz, fi, sl, U + NumForm_Digit # cz, sl, U + NumForm_Roman # cz, sl, U + NumForm_Word # cz, sl, U + NumValue_One # cz, U + NumValue_Two # cz, U + NumValue_Three # cz, U + PartForm_Pres # fi + PartForm_Past # fi + PartForm_Agt # fi + PartForm_Neg # fi + PartType_Mod # U + PartType_Emp # U + PartType_Res # U + PartType_Inf # U + PartType_Vbp # U + Person_abs_One # bq, U + Person_abs_Two # bq, U + Person_abs_Three # bq, U + Person_dat_One # bq, U + Person_dat_Two # bq, U + Person_dat_Three # bq, U + Person_erg_One # bq, U + Person_erg_Two # bq, U + Person_erg_Three # bq, U + Person_psor_One # fi, U + Person_psor_Two # fi, U + Person_psor_Three # fi, U + Polite_Inf # bq, U + Polite_Pol # bq, U + Polite_abs_Inf # bq, U + Polite_abs_Pol # bq, U + Polite_erg_Inf # bq, U + Polite_erg_Pol # bq, U + Polite_dat_Inf # bq, U + Polite_dat_Pol # bq, U + Prefix_Yes # U + PrepCase_Npr # cz + PrepCase_Pre # U + PunctSide_Ini # U + PunctSide_Fin # U + PunctType_Peri # U + PunctType_Qest # U + PunctType_Excl # U + PunctType_Quot # U + PunctType_Brck # U + PunctType_Comm # U + PunctType_Colo # U + PunctType_Semi # U + PunctType_Dash # U + 
Style_Arch # cz, fi, U + Style_Rare # cz, fi, U + Style_Poet # cz, U + Style_Norm # cz, U + Style_Coll # cz, U + Style_Vrnc # cz, U + Style_Sing # cz, U + Style_Expr # cz, U + Style_Derg # cz, U + Style_Vulg # cz, U + Style_Yes # fi, U + StyleVariant_StyleShort # cz + StyleVariant_StyleBound # cz, sl + VerbType_Aux # U + VerbType_Cop # U + VerbType_Mod # U + VerbType_Light # U diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 96a4ba884..fc6a4936b 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,11 +1,89 @@ -# cython: embedsignature=True +from os import path +from .lemmatizer import Lemmatizer + +try: + import ujson as json +except ImportError: + import json + +from .parts_of_speech import UNIV_POS_NAMES +from .parts_of_speech cimport ADJ, VERB, NOUN -cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: - morph.number = props.get('number', 0) - morph.tenspect = props.get('tenspect', 0) - morph.mood = props.get('mood', 0) - morph.gender = props.get('gender', 0) - morph.person = props.get('person', 0) - morph.case = props.get('case', 0) - morph.misc = props.get('misc', 0) +cdef class Morphology: + def __init__(self, StringStore string_store, tag_map, lemmatizer): + self.mem = Pool() + self.strings = string_store + self.lemmatizer = lemmatizer + self.n_tags = len(tag_map) + 1 + self.tag_names = tuple(sorted(tag_map.keys())) + self.reverse_index = {} + + self.rich_tags = self.mem.alloc(self.n_tags, sizeof(RichTagC)) + for i, (tag_str, props) in enumerate(sorted(tag_map.items())): + self.rich_tags[i].id = i + self.rich_tags[i].name = self.strings[tag_str] + self.rich_tags[i].morph = 0 + self.reverse_index[self.rich_tags[i].name] = i + self._cache = PreshMapArray(self.n_tags) + + cdef int assign_tag(self, TokenC* token, tag) except -1: + cdef int tag_id + if isinstance(tag, basestring): + try: + tag_id = self.reverse_index[self.strings[tag]] + except KeyError: + print tag + raise + else: + tag_id = tag + analysis = self._cache.get(tag_id, token.lex.orth) + if analysis is NULL: + analysis = self.mem.alloc(1, sizeof(MorphAnalysisC)) + analysis.tag = self.rich_tags[tag_id] + analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth) + token.lemma = analysis.lemma + token.pos = analysis.tag.pos + token.tag = analysis.tag.name + token.morph = analysis.tag.morph + + cdef int assign_feature(self, uint64_t* morph, feature, value) except -1: + pass + + def load_morph_exceptions(self, dict exc): + # Map (form, pos) to (lemma, rich tag) + cdef unicode pos_str + cdef unicode form_str + cdef unicode lemma_str + cdef dict entries + cdef dict props + cdef int lemma + cdef attr_t orth + cdef int pos + for tag_str, entries in exc.items(): + tag = self.strings[tag_str] + rich_tag = self.rich_tags[self.reverse_index[tag]] + for form_str, props in entries.items(): + cached = self.mem.alloc(1, sizeof(MorphAnalysisC)) + orth = self.strings[form_str] + for name_str, value_str in props.items(): + if name_str == 'L': + cached.lemma = self.strings[value_str] + else: + self.assign_feature(&cached.tag.morph, name_str, value_str) + if cached.lemma == 0: + cached.lemma = self.lemmatize(rich_tag.pos, orth) + self._cache.set(rich_tag.pos, orth, cached) + + def lemmatize(self, const univ_pos_t pos, attr_t orth): + if self.lemmatizer is None: + return orth + cdef unicode py_string = self.strings[orth] + if pos != NOUN and pos != VERB and pos != ADJ: + return orth + cdef set lemma_strings + cdef unicode lemma_string + lemma_strings = self.lemmatizer(py_string, pos) + 
lemma_string = sorted(lemma_strings)[0] + lemma = self.strings[lemma_string] + return lemma diff --git a/spacy/orth.pyx b/spacy/orth.pyx index 6ffac839b..df4e2dc32 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -69,7 +69,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu cpdef bint like_url(unicode string): # We're looking for things that function in text like URLs. So, valid URL # or not, anything they say http:// is going to be good. - if string.startswith('http://'): + if string.startswith('http://') or string.startswith('https://'): return True elif string.startswith('www.') and len(string) >= 5: return True @@ -92,6 +92,7 @@ cpdef bint like_url(unicode string): return False +# TODO: This should live in the language.orth NUM_WORDS = set('zero one two three four five six seven eight nine ten' 'eleven twelve thirteen fourteen fifteen sixteen seventeen' 'eighteen nineteen twenty thirty forty fifty sixty seventy' diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd index b915b9dde..e410c6971 100644 --- a/spacy/parts_of_speech.pxd +++ b/spacy/parts_of_speech.pxd @@ -2,17 +2,22 @@ cpdef enum univ_pos_t: NO_TAG ADJ - ADV ADP + ADV + AUX CONJ DET + INTJ NOUN NUM + PART PRON - PRT + PROPN + PUNCT + SCONJ + SYM VERB X - PUNCT EOL SPACE N_UNIV_TAGS diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 994a48eba..8c2348a47 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -4,17 +4,22 @@ from __future__ import unicode_literals UNIV_POS_NAMES = { "NO_TAG": NO_TAG, "ADJ": ADJ, - "ADV": ADV, "ADP": ADP, + "ADV": ADV, + "AUX": AUX, "CONJ": CONJ, "DET": DET, + "INTJ": INTJ, "NOUN": NOUN, "NUM": NUM, + "PART": PART, "PRON": PRON, - "PRT": PRT, + "PROPN": PROPN, + "PUNCT": PUNCT, + "SCONJ": SCONJ, + "SYM": SYM, "VERB": VERB, "X": X, - "PUNCT": PUNCT, - "SPACE": SPACE, - "EOL": EOL + "EOL": EOL, + "SPACE": SPACE } diff --git a/spacy/strings.pyx b/spacy/strings.pyx index c187a6aa6..a4a470158 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -142,6 +142,8 @@ cdef class StringStore: def load(self, loc): with codecs.open(loc, 'r', 'utf8') as file_: strings = file_.read().split(SEPARATOR) + if strings == ['']: + return None cdef unicode string cdef bytes byte_string for string in strings: diff --git a/spacy/structs.pxd b/spacy/structs.pxd index f3095df51..a0a3d65a3 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -1,4 +1,4 @@ -from libc.stdint cimport uint8_t, uint32_t, int32_t +from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t from .typedefs cimport flags_t, attr_t, hash_t from .parts_of_speech cimport univ_pos_t @@ -26,22 +26,6 @@ cdef struct LexemeC: float l2_norm -cdef struct Morphology: - uint8_t number - uint8_t tenspect # Tense/aspect/voice - uint8_t mood - uint8_t gender - uint8_t person - uint8_t case - uint8_t misc - - -cdef struct PosTag: - Morphology morph - int id - univ_pos_t pos - - cdef struct Entity: int start int end @@ -59,8 +43,8 @@ cdef struct Constituent: cdef struct TokenC: const LexemeC* lex - Morphology morph const Constituent* ctnt + uint64_t morph univ_pos_t pos bint spacy int tag diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 4ee30341a..70a0229c2 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -11,7 +11,6 @@ from .stateclass cimport StateClass cdef class Parser: - cdef readonly object cfg cdef readonly Model model cdef readonly TransitionSystem moves diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx 
index 59b90920c..cf61647b9 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -67,16 +67,22 @@ def ParserFactory(transition_system): cdef class Parser: - def __init__(self, StringStore strings, model_dir, transition_system): + def __init__(self, StringStore strings, transition_system, model): + self.moves = transition_system + self.model = model + + @classmethod + def from_dir(cls, model_dir, strings, transition_system): if not os.path.exists(model_dir): print >> sys.stderr, "Warning: No model found at", model_dir elif not os.path.isdir(model_dir): print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory" - else: - self.cfg = Config.read(model_dir, 'config') - self.moves = transition_system(strings, self.cfg.labels) - templates = get_templates(self.cfg.features) - self.model = Model(self.moves.n_moves, templates, model_dir) + cfg = Config.read(model_dir, 'config') + moves = transition_system(strings, cfg.labels) + templates = get_templates(cfg.features) + model = Model(moves.n_moves, templates, model_dir) + return cls(strings, moves, model) + def __call__(self, Doc tokens): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd new file mode 100644 index 000000000..28d7fc711 --- /dev/null +++ b/spacy/tagger.pxd @@ -0,0 +1,12 @@ +from ._ml cimport Model +from .structs cimport TokenC +from .vocab cimport Vocab + + +cdef class Tagger: + cdef readonly Vocab vocab + cdef readonly Model model + cdef public dict freqs + + cdef int predict(self, int i, const TokenC* tokens) except -1 + cdef int update(self, int i, const TokenC* tokens, int gold) except -1 diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx new file mode 100644 index 000000000..756bb7ea4 --- /dev/null +++ b/spacy/tagger.pyx @@ -0,0 +1,220 @@ +import json +from os import path +from collections import defaultdict + +from thinc.typedefs cimport atom_t, weight_t + +from .typedefs cimport attr_t +from .tokens.doc cimport Doc +from .attrs cimport TAG +from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON +from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE + +from .attrs cimport * +from ._ml cimport arg_max + + +cpdef enum: + P2_orth + P2_cluster + P2_shape + P2_prefix + P2_suffix + P2_pos + P2_lemma + P2_flags + + P1_orth + P1_cluster + P1_shape + P1_prefix + P1_suffix + P1_pos + P1_lemma + P1_flags + + W_orth + W_cluster + W_shape + W_prefix + W_suffix + W_pos + W_lemma + W_flags + + N1_orth + N1_cluster + N1_shape + N1_prefix + N1_suffix + N1_pos + N1_lemma + N1_flags + + N2_orth + N2_cluster + N2_shape + N2_prefix + N2_suffix + N2_pos + N2_lemma + N2_flags + + N_CONTEXT_FIELDS + + +cdef class Tagger: + """A part-of-speech tagger for English""" + @classmethod + def read_config(cls, data_dir): + return json.load(open(path.join(data_dir, 'pos', 'config.json'))) + + @classmethod + def default_templates(cls): + return ( + (W_orth,), + (P1_lemma, P1_pos), + (P2_lemma, P2_pos), + (N1_orth,), + (N2_orth,), + + (W_suffix,), + (W_prefix,), + + (P1_pos,), + (P2_pos,), + (P1_pos, P2_pos), + (P1_pos, W_orth), + (P1_suffix,), + (N1_suffix,), + + (W_shape,), + (W_cluster,), + (N1_cluster,), + (N2_cluster,), + (P1_cluster,), + (P2_cluster,), + + (W_flags,), + (N1_flags,), + (N2_flags,), + (P1_flags,), + (P2_flags,), + ) + + @classmethod + def blank(cls, vocab, templates): + model = Model(vocab.morphology.n_tags, templates, model_loc=None) + return cls(vocab, model) + + @classmethod + def from_dir(cls, data_dir, vocab): + if 
path.exists(path.join(data_dir, 'templates.json')):
+            templates = json.load(open(path.join(data_dir, 'templates.json')))
+        else:
+            templates = cls.default_templates()
+        model = Model(vocab.morphology.n_tags, templates, data_dir)
+        return cls(vocab, model)
+
+    def __init__(self, Vocab vocab, model):
+        self.vocab = vocab
+        self.model = model
+
+        # TODO: Move this to tag map
+        self.freqs = {TAG: defaultdict(int)}
+        for tag in self.tag_names:
+            self.freqs[TAG][self.vocab.strings[tag]] = 1
+        self.freqs[TAG][0] = 1
+
+    @property
+    def tag_names(self):
+        return self.vocab.morphology.tag_names
+
+    def __call__(self, Doc tokens):
+        """Apply the tagger, setting the POS tags onto the Doc object.
+
+        Args:
+            tokens (Doc): The tokens to be tagged.
+        """
+        if tokens.length == 0:
+            return 0
+        cdef int i
+        cdef const weight_t* scores
+        for i in range(tokens.length):
+            if tokens.data[i].pos == 0:
+                guess = self.predict(i, tokens.data)
+                self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+
+        tokens.is_tagged = True
+        tokens._py_tokens = [None] * tokens.length
+
+    def tag_from_strings(self, Doc tokens, object tag_strs):
+        cdef int i
+        for i in range(tokens.length):
+            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
+        tokens.is_tagged = True
+        tokens._py_tokens = [None] * tokens.length
+
+    def train(self, Doc tokens, object gold_tag_strs):
+        assert len(tokens) == len(gold_tag_strs)
+        cdef int i
+        cdef int loss
+        cdef const weight_t* scores
+        try:
+            golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
+        except ValueError:
+            raise ValueError(
+                [g for g in gold_tag_strs if g is not None and g not in self.tag_names])
+        correct = 0
+        for i in range(tokens.length):
+            guess = self.update(i, tokens.data, golds[i])
+            loss = golds[i] != -1 and guess != golds[i]
+
+            self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+
+            correct += loss == 0
+            self.freqs[TAG][tokens.data[i].tag] += 1
+        return correct
+
+    cdef int predict(self, int i, const TokenC* tokens) except -1:
+        cdef atom_t[N_CONTEXT_FIELDS] context
+        _fill_from_token(&context[P2_orth], &tokens[i-2])
+        _fill_from_token(&context[P1_orth], &tokens[i-1])
+        _fill_from_token(&context[W_orth], &tokens[i])
+        _fill_from_token(&context[N1_orth], &tokens[i+1])
+        _fill_from_token(&context[N2_orth], &tokens[i+2])
+        scores = self.model.score(context)
+        return arg_max(scores, self.model.n_classes)
+
+    cdef int update(self, int i, const TokenC* tokens, int gold) except -1:
+        cdef atom_t[N_CONTEXT_FIELDS] context
+        _fill_from_token(&context[P2_orth], &tokens[i-2])
+        _fill_from_token(&context[P1_orth], &tokens[i-1])
+        _fill_from_token(&context[W_orth], &tokens[i])
+        _fill_from_token(&context[N1_orth], &tokens[i+1])
+        _fill_from_token(&context[N2_orth], &tokens[i+2])
+        scores = self.model.score(context)
+        guess = arg_max(scores, self.model.n_classes)
+        loss = guess != gold if gold != -1 else 0
+        self.model.update(context, guess, gold, loss)
+        return guess
+
+
+cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
+    context[0] = t.lex.lower
+    context[1] = t.lex.cluster
+    context[2] = t.lex.shape
+    context[3] = t.lex.prefix
+    context[4] = t.lex.suffix
+    context[5] = t.tag
+    context[6] = t.lemma
+    if t.lex.flags & (1 << IS_ALPHA):
+        context[7] = 1
+    elif t.lex.flags & (1 << IS_PUNCT):
+        context[7] = 2
+    elif t.lex.flags & (1 << LIKE_URL):
+        context[7] = 3
+    elif t.lex.flags & (1 << LIKE_NUM):
+        context[7] = 4
+    else:
+        context[7] = 0
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index
a7f69c5aa..9d60d2a6e 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -4,15 +4,10 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t -from .structs cimport LexemeC, TokenC, Morphology +from .structs cimport LexemeC, TokenC from .strings cimport StringStore from .tokens.doc cimport Doc -from .vocab cimport Vocab, _Cached - - -cdef union LexemesOrTokens: - const LexemeC* const* lexemes - TokenC* tokens +from .vocab cimport Vocab, LexemesOrTokens, _Cached cdef class Tokenizer: diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 1e857aefc..d54770d2b 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE from cymem.cymem cimport Pool from preshed.maps cimport PreshMap -from .morphology cimport set_morph_from_dict from .strings cimport hash_string cimport cython @@ -29,7 +28,7 @@ cdef class Tokenizer: self._suffix_re = suffix_re self._infix_re = infix_re self.vocab = vocab - self._load_special_tokenization(rules, self.vocab.pos_tags) + self._load_special_tokenization(rules) @classmethod def from_dir(cls, Vocab vocab, data_dir): @@ -193,9 +192,7 @@ cdef class Tokenizer: tokens.push_back(prefixes[0][i], False) if string: cache_hit = self._try_cache(hash_string(string), tokens) - if cache_hit: - pass - else: + if not cache_hit: match = self.find_infix(string) if match is None: tokens.push_back(self.vocab.get(tokens.mem, string), False) @@ -242,7 +239,7 @@ cdef class Tokenizer: match = self._suffix_re.search(string) return (match.end() - match.start()) if match is not None else 0 - def _load_special_tokenization(self, object rules, object tag_map): + def _load_special_tokenization(self, special_cases): '''Add a special-case tokenization rule. ''' cdef int i @@ -253,29 +250,11 @@ cdef class Tokenizer: cdef dict props cdef LexemeC** lexemes cdef hash_t hashed - for chunk, substrings in sorted(rules.items()): - tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) - for i, props in enumerate(substrings): - form = props['F'] - lemma = props.get("L", None) - tokens[i].lex = self.vocab.get(self.vocab.mem, form) - if lemma is not None: - tokens[i].lemma = self.vocab.strings[lemma] - else: - tokens[i].lemma = 0 - if 'pos' in props: - tokens[i].tag = self.vocab.strings[props['pos']] - tokens[i].pos = tag_map[props['pos']][0] - # These are defaults, which can be over-ridden by the - # token-specific props. 
- set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1]) - if tokens[i].lemma == 0: - tokens[i].lemma = tokens[i].lex.orth - set_morph_from_dict(&tokens[i].morph, props) + for chunk, substrings in sorted(special_cases.items()): cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = len(substrings) cached.is_lex = False - cached.data.tokens = tokens - hashed = hash_string(chunk) - self._specials.set(hashed, cached) - self._cache.set(hashed, cached) + cached.data.tokens = self.vocab.make_fused_token(substrings) + key = hash_string(chunk) + self._specials.set(key, cached) + self._cache.set(key, cached) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 121018770..a13858175 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -12,11 +12,11 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil ctypedef const LexemeC* const_Lexeme_ptr -ctypedef TokenC* TokenC_ptr +ctypedef const TokenC* const_TokenC_ptr ctypedef fused LexemeOrToken: const_Lexeme_ptr - TokenC_ptr + const_TokenC_ptr cdef class Doc: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4ba0d675a..41d24d8ac 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -14,6 +14,7 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech cimport CONJ, PUNCT, NOUN from ..parts_of_speech cimport univ_pos_t +from ..lexeme cimport Lexeme from .spans cimport Span from .token cimport Token from ..serialize.bits cimport BitArray @@ -210,7 +211,7 @@ cdef class Doc: if self.length == self.max_length: self._realloc(self.length * 2) cdef TokenC* t = &self.data[self.length] - if LexemeOrToken is TokenC_ptr: + if LexemeOrToken is const_TokenC_ptr: t[0] = lex_or_tok[0] else: t.lex = lex_or_tok @@ -218,6 +219,7 @@ cdef class Doc: t.idx = 0 else: t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy + assert t.lex.orth != 0 t.spacy = has_space self.length += 1 self._py_tokens.append(None) diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index f1c19f308..e2aa1a7f9 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -1,7 +1,7 @@ from __future__ import unicode_literals from collections import defaultdict -from ..structs cimport Morphology, TokenC, LexemeC +from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t from ..attrs cimport attr_id_t from ..parts_of_speech cimport univ_pos_t diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 2fa1366a1..f3b9aa056 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -20,6 +20,8 @@ from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_OOV +from ..lexeme cimport Lexeme + cdef class Token: """An individual token --- i.e. a word, a punctuation symbol, etc. 
Created diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 710a1b5ec..e491a48e3 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -7,6 +7,7 @@ from murmurhash.mrmr cimport hash64 from .structs cimport LexemeC, TokenC from .typedefs cimport utf8_t, attr_t, hash_t from .strings cimport StringStore +from .morphology cimport Morphology cdef LexemeC EMPTY_LEXEME @@ -14,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME cdef union LexemesOrTokens: const LexemeC* const* lexemes - TokenC* tokens + const TokenC* tokens cdef struct _Cached: @@ -27,15 +28,18 @@ cdef class Vocab: cpdef public lexeme_props_getter cdef Pool mem cpdef readonly StringStore strings - cdef readonly object pos_tags + cpdef readonly Morphology morphology cdef readonly int length cdef public object _serializer cdef public object data_dir - cdef public float oov_prob + cdef public object get_lex_attr + cdef public object pos_tags cdef const LexemeC* get(self, Pool mem, unicode string) except NULL cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL + cdef const TokenC* make_fused_token(self, substrings) except NULL + cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 2d67e59f2..596570a98 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -17,10 +17,12 @@ from .strings cimport hash_string from .orth cimport word_shape from .typedefs cimport attr_t from .cfile cimport CFile +from .lemmatizer import Lemmatizer from cymem.cymem cimport Address from . import util from .serialize.packer cimport Packer +from .attrs cimport PROB DEF MAX_VEC_SIZE = 100000 @@ -35,30 +37,31 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. ''' - def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=False): + def __init__(self, get_lex_attr=None, tag_map=None, vectors=None): self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() - #self.pos_tags = pos_tags if pos_tags is not None else {} - self.pos_tags = {} - self.get_lex_attr = get_lex_attr - self.repvec_length = 0 - self.length = 0 - self._add_lex_to_vocab(0, &EMPTY_LEXEME) - if data_dir is not None: - if not path.exists(data_dir): - raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) - if not path.isdir(data_dir): - raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) - self.load_lexemes(path.join(data_dir, 'strings.txt'), - path.join(data_dir, 'lexemes.bin')) - if load_vectors and path.exists(path.join(data_dir, 'vec.bin')): - self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) - + self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {})) + + self.length = 1 self._serializer = None - self.data_dir = data_dir + + @classmethod + def from_dir(cls, data_dir, get_lex_attr=None, vectors=None): + if not path.exists(data_dir): + raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) + if not path.isdir(data_dir): + raise IOError("Path %s is a file, not a dir -- cannot load Vocab." 
% data_dir) + + tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) + cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map) + + self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin')) + if vectors is None and path.exists(path.join(data_dir, 'vec.bin')): + self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) + return self property serializer: def __get__(self): @@ -84,7 +87,9 @@ cdef class Vocab: cdef LexemeC* lex cdef hash_t key = hash_string(string) lex = self._by_hash.get(key) + cdef size_t addr if lex != NULL: + assert lex.orth == self.strings[string] return lex else: return self._new_lexeme(mem, string) @@ -103,16 +108,29 @@ cdef class Vocab: return self._new_lexeme(mem, self.strings[orth]) cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: + cdef hash_t key cdef bint is_oov = mem is not self.mem + mem = self.mem if len(string) < 3: mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) - for attr, func in self.lex_attr_getters.items(): - Lexeme.set_struct_attr(lex, attr, func(string)) + lex.orth = self.strings[string] + lex.length = len(string) + lex.id = self.length + if self.get_lex_attr is not None: + for attr, func in self.get_lex_attr.items(): + value = func(string) + if isinstance(value, unicode): + value = self.strings[value] + if attr == PROB: + lex.prob = value + else: + Lexeme.set_struct_attr(lex, attr, value) if is_oov: lex.id = 0 else: - self._add_lex_to_vocab(hash_string(string), lex) + key = hash_string(string) + self._add_lex_to_vocab(key, lex) assert lex != NULL, string return lex @@ -125,7 +143,7 @@ cdef class Vocab: cdef attr_t orth cdef size_t addr for orth, addr in self._by_orth.items(): - yield Lexeme.from_ptr(addr, self, self.repvec_length) + yield Lexeme(self, orth) def __getitem__(self, id_or_string): '''Retrieve a lexeme, given an int ID or a unicode string. If a previously @@ -142,23 +160,29 @@ cdef class Vocab: An instance of the Lexeme Python class, with data copied on instantiation. ''' - cdef const LexemeC* lexeme cdef attr_t orth - if type(id_or_string) == int: - orth = id_or_string - lexeme = self._by_orth.get(orth) - if lexeme == NULL: - raise KeyError(id_or_string) - assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth)) - elif type(id_or_string) == unicode: - lexeme = self.get(self.mem, id_or_string) - assert lexeme.orth == self.strings[id_or_string] + if type(id_or_string) == unicode: + orth = self.strings[id_or_string] else: - raise ValueError("Vocab unable to map type: " - "%s. 
Maps unicode --> Lexeme or " - "int --> Lexeme" % str(type(id_or_string))) - return Lexeme.from_ptr(lexeme, self, self.repvec_length) + orth = id_or_string + return Lexeme(self, orth) + cdef const TokenC* make_fused_token(self, substrings) except NULL: + cdef int i + tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) + for i, props in enumerate(substrings): + token = &tokens[i] + # Set the special tokens up to have morphology and lemmas if + # specified, otherwise use the part-of-speech tag (if specified) + token.lex = self.get(self.mem, props['F']) + if 'pos' in props: + self.morphology.assign_tag(token, props['pos']) + if 'L' in props: + tokens[i].lemma = self.strings[props['L']] + for feature, value in props.get('morph', {}).items(): + self.morphology.assign_feature(&token.morph, feature, value) + return tokens + def dump(self, loc): if path.exists(loc): assert not path.isdir(loc) diff --git a/tests/parser/test_initial_actions_parse.py b/tests/parser/test_initial_actions_parse.py index 9f570d8be..a4a57e5b2 100644 --- a/tests/parser/test_initial_actions_parse.py +++ b/tests/parser/test_initial_actions_parse.py @@ -1,6 +1,7 @@ import pytest +@pytest.mark.models def test_initial(EN): doc = EN.tokenizer(u'I ate the pizza with anchovies.') EN.tagger(doc) diff --git a/tests/serialize/test_codecs.py b/tests/serialize/test_codecs.py index ad9012068..00177f21a 100644 --- a/tests/serialize/test_codecs.py +++ b/tests/serialize/test_codecs.py @@ -41,25 +41,10 @@ def test_attribute(): def test_vocab_codec(): - def get_lex_props(string, prob): - return { - 'flags': 0, - 'length': len(string), - 'orth': string, - 'lower': string, - 'norm': string, - 'shape': string, - 'prefix': string[0], - 'suffix': string[-3:], - 'cluster': 0, - 'prob': prob, - 'sentiment': 0 - } - vocab = Vocab() - vocab['dog'] = get_lex_props('dog', 0.001) - vocab['the'] = get_lex_props('the', 0.05) - vocab['jumped'] = get_lex_props('jumped', 0.005) + lex = vocab['dog'] + lex = vocab['the'] + lex = vocab['jumped'] codec = HuffmanCodec([(lex.orth, lex.prob) for lex in vocab]) diff --git a/tests/serialize/test_packer.py b/tests/serialize/test_packer.py index 5770a8938..6ec583d08 100644 --- a/tests/serialize/test_packer.py +++ b/tests/serialize/test_packer.py @@ -5,6 +5,7 @@ import re import pytest import numpy +from spacy.language import Language from spacy.vocab import Vocab from spacy.tokens.doc import Doc from spacy.tokenizer import Tokenizer @@ -17,30 +18,14 @@ from spacy.serialize.packer import Packer from spacy.serialize.bits import BitArray -def get_lex_props(string, prob=-22, is_oov=False): - return { - 'flags': 0, - 'length': len(string), - 'orth': string, - 'lower': string, - 'norm': string, - 'shape': string, - 'prefix': string[0], - 'suffix': string[-3:], - 'cluster': 0, - 'prob': prob, - 'sentiment': 0 - } - - @pytest.fixture def vocab(): - vocab = Vocab(get_lex_props=get_lex_props) - vocab['dog'] = get_lex_props('dog', 0.001) + vocab = Vocab(Language.default_lex_attrs()) + lex = vocab['dog'] assert vocab[vocab.strings['dog']].orth_ == 'dog' - vocab['the'] = get_lex_props('the', 0.01) - vocab['quick'] = get_lex_props('quick', 0.005) - vocab['jumped'] = get_lex_props('jumped', 0.007) + lex = vocab['the'] + lex = vocab['quick'] + lex = vocab['jumped'] return vocab diff --git a/tests/tagger/test_spaces.py b/tests/tagger/test_spaces.py index c3052160e..0ef05637b 100644 --- a/tests/tagger/test_spaces.py +++ b/tests/tagger/test_spaces.py @@ -14,6 +14,7 @@ def tagged(EN): tokens = EN(string, tag=True, parse=False) 
return tokens +@pytest.mark.models def test_spaces(tagged): assert tagged[0].pos != SPACE assert tagged[0].pos_ != 'SPACE' diff --git a/tests/test_docs.py b/tests/test_docs.py index 70c8b8c63..4b0831dfd 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -1,80 +1,81 @@ # -*- coding: utf-8 -*- """Sphinx doctest is just too hard. Manually paste doctest examples here""" +import pytest -@pytest.mark.models -def test_1(): - import spacy.en - from spacy.parts_of_speech import ADV - # Load the pipeline, and call it with some text. - nlp = spacy.en.English() - tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", - tag=True, parse=False) - o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens) - assert u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’" - - o = nlp.vocab[u'back'].prob - assert o == -7.033305644989014 - o = nlp.vocab[u'not'].prob - assert o == -5.332601070404053 - o = nlp.vocab[u'quietly'].prob - assert o == -11.994928359985352 - - -@pytest.mark.models -def test2(): - import spacy.en - from spacy.parts_of_speech import ADV - nlp = spacy.en.English() - # Find log probability of Nth most frequent word - probs = [lex.prob for lex in nlp.vocab] - probs.sort() - is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] - tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") - o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) - o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' - -@pytest.mark.models -def test3(): - import spacy.en - from spacy.parts_of_speech import ADV - nlp = spacy.en.English() - # Find log probability of Nth most frequent word - probs = [lex.prob for lex in nlp.vocab] - probs.sort() - is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] - tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") - o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) - assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' - - pleaded = tokens[7] - assert pleaded.repvec.shape == (300,) - o = pleaded.repvec[:5] - assert sum(o) != 0 - from numpy import dot - from numpy.linalg import norm - - cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) - words = [w for w in nlp.vocab if w.is_lower and w.has_repvec] - words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) - words.reverse() - o = [w.orth_ for w in words[0:20]] - assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded', - u'pleads', u'testified', u'conspired', u'motioned', u'demurred', - u'countersued', u'remonstrated', u'begged', u'apologised', - u'consented', u'acquiesced', u'petitioned', u'quarreled', - u'appealed', u'pleading'] - o = [w.orth_ for w in words[50:60]] - assert o == [u'martialed', u'counselled', u'bragged', - u'backtracked', u'caucused', u'refiled', u'dueled', u'mused', - u'dissented', u'yearned'] - o = [w.orth_ for w in words[100:110]] - assert o == [u'acquits', u'cabled', u'ducked', u'sentenced', - u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed', - u'clerked'] - - #o = [w.orth_ for w in words[1000:1010]] - #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled', - # u'posited', u'firebombed', u'slimed', u'deferred', u'sagged'] - #o = [w.orth_ for w in words[50000:50010]] - #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid', - # u'dirty', u'rims', u'artists'] +#@pytest.mark.models +#def test_1(): +# import spacy.en +# from spacy.parts_of_speech 
import ADV +# # Load the pipeline, and call it with some text. +# nlp = spacy.en.English() +# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", +# tag=True, parse=False) +# o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens) +# assert u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’" +# +# o = nlp.vocab[u'back'].prob +# assert o == -7.033305644989014 +# o = nlp.vocab[u'not'].prob +# assert o == -5.332601070404053 +# o = nlp.vocab[u'quietly'].prob +# assert o == -11.994928359985352 +# +# +#@pytest.mark.m +#def test2(): +# import spacy.en +# from spacy.parts_of_speech import ADV +# nlp = spacy.en.English() +# # Find log probability of Nth most frequent word +# probs = [lex.prob for lex in nlp.vocab] +# probs.sort() +# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] +# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") +# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) +# o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' +# +#@pytest.mark.models +#def test3(): +# import spacy.en +# from spacy.parts_of_speech import ADV +# nlp = spacy.en.English() +# # Find log probability of Nth most frequent word +# probs = [lex.prob for lex in nlp.vocab] +# probs.sort() +# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] +# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") +# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) +# assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' +# +# pleaded = tokens[7] +# assert pleaded.repvec.shape == (300,) +# o = pleaded.repvec[:5] +# assert sum(o) != 0 +# from numpy import dot +# from numpy.linalg import norm +# +# cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) +# words = [w for w in nlp.vocab if w.is_lower and w.has_repvec] +# words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) +# words.reverse() +# o = [w.orth_ for w in words[0:20]] +# assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded', +# u'pleads', u'testified', u'conspired', u'motioned', u'demurred', +# u'countersued', u'remonstrated', u'begged', u'apologised', +# u'consented', u'acquiesced', u'petitioned', u'quarreled', +# u'appealed', u'pleading'] +# o = [w.orth_ for w in words[50:60]] +# assert o == [u'martialed', u'counselled', u'bragged', +# u'backtracked', u'caucused', u'refiled', u'dueled', u'mused', +# u'dissented', u'yearned'] +# o = [w.orth_ for w in words[100:110]] +# assert o == [u'acquits', u'cabled', u'ducked', u'sentenced', +# u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed', +# u'clerked'] +# +# #o = [w.orth_ for w in words[1000:1010]] +# #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled', +# # u'posited', u'firebombed', u'slimed', u'deferred', u'sagged'] +# #o = [w.orth_ for w in words[50000:50010]] +# #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid', +# # u'dirty', u'rims', u'artists']
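
The sketch below is not part of the patch; it is a hedged illustration of how the reworked Vocab/Lexeme/Tagger API introduced above might be exercised in memory, without any model directory. The two-entry tag map, the example probability and cluster values, and the raw flag index 25 are assumptions made purely for illustration, not values prescribed by this change.

from spacy.language import Language
from spacy.vocab import Vocab
from spacy.tagger import Tagger

# A Vocab is now built from per-attribute getter functions plus a tag map,
# rather than from whole property dicts (see the removal of get_lex_props in
# the serialization tests above). The minimal tag map here is a placeholder.
tag_map = {u'NN': {u'pos': u'NOUN'}, u'VB': {u'pos': u'VERB'}}
vocab = Vocab(get_lex_attr=Language.default_lex_attrs(), tag_map=tag_map)

# Lexemes are created lazily on first lookup and expose their struct fields
# as properties, so frequencies and clusters are set on the Lexeme directly.
lex = vocab[u'dog']
lex.prob = -8.0        # assumed example value
lex.cluster = 19       # assumed example value
lex.is_oov = False
assert vocab[u'dog'].orth_ == u'dog'

# Boolean features live in the flags bit-field; 25 is an arbitrary bit chosen
# for illustration (the matcher maps names like 'FLAG25' onto these bits).
lex.py_set_flag(25)
assert lex.py_check_flag(25)

# A blank tagger can be constructed straight from the vocab's morphology,
# since the number of classes now comes from the tag map.
tagger = Tagger.blank(vocab, Tagger.default_templates())

The design point this illustrates is that lexeme attributes are computed and stored on the C struct at creation time, so the old dict-of-properties pathway (get_lex_props) is no longer needed anywhere in the pipeline.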