diff --git a/bin/init_model.py b/bin/init_model.py index 991b5dd58..19cfcdc25 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -98,7 +98,7 @@ def _read_probs(loc): return probs, probs['-OOV-'] -def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200): +def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200): if not loc.exists(): print("Warning: Frequencies file not found") return {}, 0.0 @@ -125,7 +125,8 @@ def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200): doc_freq = int(doc_freq) freq = int(freq) if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: - word = literal_eval(key) +# word = literal_eval(key) + word = key smooth_count = counts.smoother(int(freq)) log_smooth_count = math.log(smooth_count) probs[word] = math.log(smooth_count) - log_total @@ -165,7 +166,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir): clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: - probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz') + probs, oov_prob = _read_freqs(src_dir / 'freqs.txt') if not probs: oov_prob = -20 else: @@ -223,9 +224,8 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir): copyfile(str(lang_data_dir / 'gazetteer.json'), str(model_dir / 'vocab' / 'gazetteer.json')) - if (lang_data_dir / 'tag_map.json').exists(): - copyfile(str(lang_data_dir / 'tag_map.json'), - str(model_dir / 'vocab' / 'tag_map.json')) + copyfile(str(lang_data_dir / 'tag_map.json'), + str(model_dir / 'vocab' / 'tag_map.json')) if (lang_data_dir / 'lemma_rules.json').exists(): copyfile(str(lang_data_dir / 'lemma_rules.json'), diff --git a/bin/parser/train.py b/bin/parser/train.py index a7dc74770..642ed53e7 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -14,6 +14,7 @@ import re import spacy.util from spacy.en import English +from spacy.de import German from spacy.syntax.util import Config from spacy.gold import read_json_file @@ -25,6 +26,7 @@ from spacy.syntax.arc_eager import ArcEager from spacy.syntax.ner import BiluoPushDown from spacy.tagger import Tagger from spacy.syntax.parser import Parser +from spacy.syntax.nonproj import PseudoProjectivity def _corrupt(c, noise_level): @@ -82,7 +84,7 @@ def _merge_sents(sents): def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, gold_preproc=False, n_sents=0, corruption_level=0, beam_width=1, verbose=False, - use_orig_arc_eager=False): + use_orig_arc_eager=False, pseudoprojective=False): dep_model_dir = path.join(model_dir, 'deps') ner_model_dir = path.join(model_dir, 'ner') pos_model_dir = path.join(model_dir, 'pos') @@ -96,9 +98,13 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', os.mkdir(ner_model_dir) os.mkdir(pos_model_dir) + if pseudoprojective: + # preprocess training data here before ArcEager.get_labels() is called + gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples) + Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, labels=ArcEager.get_labels(gold_tuples), - beam_width=beam_width) + beam_width=beam_width,projectivize=pseudoprojective) Config.write(ner_model_dir, 'config', features='ner', seed=seed, labels=BiluoPushDown.get_labels(gold_tuples), beam_width=0) @@ -107,6 +113,8 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', gold_tuples = gold_tuples[:n_sents] nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) + if nlp.lang == 'de': + nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string]) nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates()) nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager) nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown) @@ -131,12 +139,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', raw_text = add_noise(raw_text, corruption_level) tokens = nlp.tokenizer(raw_text) nlp.tagger(tokens) - gold = GoldParse(tokens, annot_tuples, make_projective=True) + gold = GoldParse(tokens, annot_tuples) if not gold.is_projective: - raise Exception( - "Non-projective sentence in training, after we should " - "have enforced projectivity: %s" % annot_tuples - ) + raise Exception("Non-projective sentence in training: %s" % annot_tuples) loss += nlp.parser.train(tokens, gold) nlp.entity.train(tokens, gold) nlp.tagger.train(tokens, gold.tags) @@ -152,6 +157,8 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, beam_width=None, cand_preproc=None): nlp = Language(data_dir=model_dir) + if nlp.lang == 'de': + nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string]) if beam_width is not None: nlp.parser.cfg.beam_width = beam_width scorer = Scorer() @@ -200,6 +207,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc): @plac.annotations( + language=("The language to train", "positional", None, str, ['en','de']), train_loc=("Location of training file or directory"), dev_loc=("Location of development file or directory"), model_dir=("Location of output model directory",), @@ -211,19 +219,22 @@ def write_parses(Language, dev_loc, model_dir, out_loc): n_iter=("Number of training iterations", "option", "i", int), verbose=("Verbose error reporting", "flag", "v", bool), debug=("Debug mode", "flag", "d", bool), + pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool), ) -def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, - debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False): +def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, + debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False): + lang = {'en':English, 'de':German}.get(language) + if not eval_only: gold_train = list(read_json_file(train_loc)) - train(English, gold_train, model_dir, + train(lang, gold_train, model_dir, feat_set='basic' if not debug else 'debug', gold_preproc=gold_preproc, n_sents=n_sents, corruption_level=corruption_level, n_iter=n_iter, - verbose=verbose) + verbose=verbose,pseudoprojective=pseudoprojective) if out_loc: - write_parses(English, dev_loc, model_dir, out_loc) - scorer = evaluate(English, list(read_json_file(dev_loc)), + write_parses(lang, dev_loc, model_dir, out_loc) + scorer = evaluate(lang, list(read_json_file(dev_loc)), model_dir, gold_preproc=gold_preproc, verbose=verbose) print('TOK', scorer.token_acc) print('POS', scorer.tags_acc) diff --git a/bin/tagger/train_german_tagger.py b/bin/tagger/train_german_tagger.py new file mode 100644 index 000000000..4927a6e9a --- /dev/null +++ b/bin/tagger/train_german_tagger.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python +from __future__ import division +from __future__ import unicode_literals + +import os +from os import path +import shutil +import io +import random +import time +import gzip +import ujson + +import plac +import cProfile +import pstats + +import spacy.util +from spacy.de import German +from spacy.gold import GoldParse +from spacy.tagger import Tagger +from spacy.scorer import PRFScore + +from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags +from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags +from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags +from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags +from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS + + +def default_templates(): + return spacy.tagger.Tagger.default_templates() + +def default_templates_without_clusters(): + return ( + (W_orth,), + (P1_lemma, P1_pos), + (P2_lemma, P2_pos), + (N1_orth,), + (N2_orth,), + + (W_suffix,), + (W_prefix,), + + (P1_pos,), + (P2_pos,), + (P1_pos, P2_pos), + (P1_pos, W_orth), + (P1_suffix,), + (N1_suffix,), + + (W_shape,), + + (W_flags,), + (N1_flags,), + (N2_flags,), + (P1_flags,), + (P2_flags,), + ) + + +def make_tagger(vocab, templates): + model = spacy.tagger.TaggerModel(templates) + return spacy.tagger.Tagger(vocab,model) + + +def read_conll(file_): + def sentences(): + words, tags = [], [] + for line in file_: + line = line.strip() + if line: + word, tag = line.split('\t')[1::3][:2] # get column 1 and 4 (CoNLL09) + words.append(word) + tags.append(tag) + elif words: + yield words, tags + words, tags = [], [] + if words: + yield words, tags + return [ s for s in sentences() ] + + +def score_model(score, nlp, words, gold_tags): + tokens = nlp.tokenizer.tokens_from_list(words) + assert(len(tokens) == len(gold_tags)) + nlp.tagger(tokens) + + for token, gold_tag in zip(tokens,gold_tags): + score.score_set(set([token.tag_]),set([gold_tag])) + + +def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21): + # make shuffling deterministic + random.seed(seed) + + # set up directory for model + pos_model_dir = path.join(model_dir, 'pos') + if path.exists(pos_model_dir): + shutil.rmtree(pos_model_dir) + os.mkdir(pos_model_dir) + + nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) + nlp.tagger = make_tagger(nlp.vocab,default_templates()) + + print("Itn.\ttrain acc %\tdev acc %") + for itn in range(n_iter): + # train on train set + #train_acc = PRFScore() + correct, total = 0., 0. + for words, gold_tags in train_sents: + tokens = nlp.tokenizer.tokens_from_list(words) + correct += nlp.tagger.train(tokens, gold_tags) + total += len(words) + train_acc = correct/total + + # test on dev set + dev_acc = PRFScore() + for words, gold_tags in dev_sents: + score_model(dev_acc, nlp, words, gold_tags) + + random.shuffle(train_sents) + print('%d:\t%6.2f\t%6.2f' % (itn, 100*train_acc, 100*dev_acc.precision)) + + + print('end training') + nlp.end_training(model_dir) + print('done') + + +@plac.annotations( + train_loc=("Location of CoNLL 09 formatted training file"), + dev_loc=("Location of CoNLL 09 formatted development file"), + model_dir=("Location of output model directory"), + eval_only=("Skip training, and only evaluate", "flag", "e", bool), + n_iter=("Number of training iterations", "option", "i", int), +) +def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15): + # training + if not eval_only: + with io.open(train_loc, 'r', encoding='utf8') as trainfile_, \ + io.open(dev_loc, 'r', encoding='utf8') as devfile_: + train_sents = read_conll(trainfile_) + dev_sents = read_conll(devfile_) + train(German, train_sents, dev_sents, model_dir, n_iter=n_iter) + + # testing + with io.open(dev_loc, 'r', encoding='utf8') as file_: + dev_sents = read_conll(file_) + nlp = German(data_dir=model_dir) + + dev_acc = PRFScore() + for words, gold_tags in dev_sents: + score_model(dev_acc, nlp, words, gold_tags) + + print('POS: %6.2f %%' % (100*dev_acc.precision)) + + +if __name__ == '__main__': + plac.call(main) diff --git a/lang_data/de/abbrev.de.tab b/lang_data/de/abbrev.de.tab new file mode 100644 index 000000000..97374c83d --- /dev/null +++ b/lang_data/de/abbrev.de.tab @@ -0,0 +1,319 @@ +# surface form lemma pos +# multiple values are separated by | +# empty lines and lines starting with # are being ignored + +'' '' +\") \") +\n \n SP +\t \t SP + SP + +# example: Wie geht's? +'s 's es +'S 'S es + +# example: Haste mal 'nen Euro? +'n 'n ein +'ne 'ne eine +'nen 'nen einen + +# example: Kommen S’ nur herein! +s' s' sie +S' S' sie + +# example: Da haben wir's! +ich's ich|'s ich|es +du's du|'s du|es +er's er|'s er|es +sie's sie|'s sie|es +wir's wir|'s wir|es +ihr's ihr|'s ihr|es + +# example: Die katze auf'm dach. +auf'm auf|'m auf|dem +unter'm unter|'m unter|dem +über'm über|'m über|dem +vor'm vor|'m vor|dem +hinter'm hinter|'m hinter|dem + +# persons +B.A. B.A. +B.Sc. B.Sc. +Dipl. Dipl. +Dipl.-Ing. Dipl.-Ing. +Dr. Dr. +Fr. Fr. +Frl. Frl. +Hr. Hr. +Hrn. Hrn. +Frl. Frl. +Prof. Prof. +St. St. +Hrgs. Hrgs. +Hg. Hg. +a.Z. a.Z. +a.D. a.D. +h.c. h.c. +Jr. Jr. +jr. jr. +jun. jun. +sen. sen. +rer. rer. +Ing. Ing. +M.A. M.A. +Mr. Mr. +M.Sc. M.Sc. +nat. nat. +phil. phil. + +# companies +Co. Co. +co. co. +Cie. Cie. +A.G. A.G. +G.m.b.H. G.m.b.H. +i.G. i.G. +e.V. e.V. + +# popular german abbreviations +Abb. Abb. +Abk. Abk. +Abs. Abs. +Abt. Abt. +abzgl. abzgl. +allg. allg. +a.M. a.M. +Bd. Bd. +betr. betr. +Betr. Betr. +Biol. Biol. +biol. biol. +Bf. Bf. +Bhf. Bhf. +Bsp. Bsp. +bspw. bspw. +bzgl. bzgl. +bzw. bzw. +d.h. d.h. +dgl. dgl. +ebd. ebd. +ehem. ehem. +eigtl. eigtl. +entspr. entspr. +erm. erm. +ev. ev. +evtl. evtl. +Fa. Fa. +Fam. Fam. +geb. geb. +Gebr. Gebr. +gem. gem. +ggf. ggf. +ggü. ggü. +ggfs. ggfs. +gegr. gegr. +Hbf. Hbf. +Hrsg. Hrsg. +hrsg. hrsg. +i.A. i.A. +i.d.R. i.d.R. +inkl. inkl. +insb. insb. +i.O. i.O. +i.Tr. i.Tr. +i.V. i.V. +jur. jur. +kath. kath. +K.O. K.O. +lt. lt. +max. max. +m.E. m.E. +m.M. m.M. +mtl. mtl. +min. min. +mind. mind. +MwSt. MwSt. +Nr. Nr. +o.a. o.a. +o.ä. o.ä. +o.Ä. o.Ä. +o.g. o.g. +o.k. o.k. +O.K. O.K. +Orig. Orig. +orig. orig. +pers. pers. +Pkt. Pkt. +Red. Red. +röm. röm. +s.o. s.o. +sog. sog. +std. std. +stellv. stellv. +Str. Str. +tägl. tägl. +Tel. Tel. +u.a. u.a. +usf. usf. +u.s.w. u.s.w. +usw. usw. +u.U. u.U. +u.v.m. u.v.m. +uvm. uvm. +v.a. v.a. +vgl. vgl. +vllt. vllt. +v.l.n.r. v.l.n.r. +vlt. vlt. +Vol. Vol. +wiss. wiss. +Univ. Univ. +z.B. z.B. +z.b. z.b. +z.Bsp. z.Bsp. +z.T. z.T. +z.Z. z.Z. +zzgl. zzgl. +z.Zt. z.Zt. + +# popular latin abbreviations +vs. vs. +adv. adv. +Chr. Chr. +A.C. A.C. +A.D. A.D. +e.g. e.g. +i.e. i.e. +al. al. +p.a. p.a. +P.S. P.S. +q.e.d. q.e.d. +R.I.P. R.I.P. +etc. etc. +incl. incl. +ca. ca. +n.Chr. n.Chr. +p.s. p.s. +v.Chr. v.Chr. + +# popular english abbreviations +D.C. D.C. +N.Y. N.Y. +N.Y.C. N.Y.C. +U.S. U.S. +U.S.A. U.S.A. +L.A. L.A. +U.S.S. U.S.S. + +# dates & time +Jan. Jan. +Feb. Feb. +Mrz. Mrz. +Mär. Mär. +Apr. Apr. +Jun. Jun. +Jul. Jul. +Aug. Aug. +Sep. Sep. +Sept. Sept. +Okt. Okt. +Nov. Nov. +Dez. Dez. +Mo. Mo. +Di. Di. +Mi. Mi. +Do. Do. +Fr. Fr. +Sa. Sa. +So. So. +Std. Std. +Jh. Jh. +Jhd. Jhd. + +# numbers +Tsd. Tsd. +Mio. Mio. +Mrd. Mrd. + +# countries & languages +engl. engl. +frz. frz. +lat. lat. +österr. österr. + +# smileys +:) :) +<3 <3 +;) ;) +(: (: +:( :( +-_- -_- +=) =) +:/ :/ +:> :> +;-) ;-) +:Y :Y +:P :P +:-P :-P +:3 :3 +=3 =3 +xD xD +^_^ ^_^ +=] =] +=D =D +<333 <333 +:)) :)) +:0 :0 +-__- -__- +xDD xDD +o_o o_o +o_O o_O +V_V V_V +=[[ =[[ +<33 <33 +;p ;p +;D ;D +;-p ;-p +;( ;( +:p :p +:] :] +:O :O +:-/ :-/ +:-) :-) +:((( :((( +:(( :(( +:') :') +(^_^) (^_^) +(= (= +o.O o.O + +# single letters +a. a. +b. b. +c. c. +d. d. +e. e. +f. f. +g. g. +h. h. +i. i. +j. j. +k. k. +l. l. +m. m. +n. n. +o. o. +p. p. +q. q. +r. r. +s. s. +t. t. +u. u. +v. v. +w. w. +x. x. +y. y. +z. z. +ä. ä. +ö. ö. +ü. ü. diff --git a/lang_data/de/gazetteer.json b/lang_data/de/gazetteer.json new file mode 100644 index 000000000..d52fed839 --- /dev/null +++ b/lang_data/de/gazetteer.json @@ -0,0 +1,194 @@ +{ + "Reddit": [ + "PRODUCT", + {}, + [ + [{"lower": "reddit"}] + ] + ], + "SeptemberElevenAttacks": [ + "EVENT", + {}, + [ + [ + {"orth": "9/11"} + ], + [ + {"lower": "september"}, + {"orth": "11"} + ] + ] + ], + "Linux": [ + "PRODUCT", + {}, + [ + [{"lower": "linux"}] + ] + ], + "Haskell": [ + "PRODUCT", + {}, + [ + [{"lower": "haskell"}] + ] + ], + "HaskellCurry": [ + "PERSON", + {}, + [ + [ + {"lower": "haskell"}, + {"lower": "curry"} + ] + ] + ], + "Javascript": [ + "PRODUCT", + {}, + [ + [{"lower": "javascript"}] + ] + ], + "CSS": [ + "PRODUCT", + {}, + [ + [{"lower": "css"}], + [{"lower": "css3"}] + ] + ], + "displaCy": [ + "PRODUCT", + {}, + [ + [{"lower": "displacy"}] + ] + ], + "spaCy": [ + "PRODUCT", + {}, + [ + [{"orth": "spaCy"}] + ] + ], + + "HTML": [ + "PRODUCT", + {}, + [ + [{"lower": "html"}], + [{"lower": "html5"}] + ] + ], + "Python": [ + "PRODUCT", + {}, + [ + [{"orth": "Python"}] + ] + ], + "Ruby": [ + "PRODUCT", + {}, + [ + [{"orth": "Ruby"}] + ] + ], + "Digg": [ + "PRODUCT", + {}, + [ + [{"lower": "digg"}] + ] + ], + "FoxNews": [ + "ORG", + {}, + [ + [{"orth": "Fox"}], + [{"orth": "News"}] + ] + ], + "Google": [ + "ORG", + {}, + [ + [{"lower": "google"}] + ] + ], + "Mac": [ + "PRODUCT", + {}, + [ + [{"lower": "mac"}] + ] + ], + "Wikipedia": [ + "PRODUCT", + {}, + [ + [{"lower": "wikipedia"}] + ] + ], + "Windows": [ + "PRODUCT", + {}, + [ + [{"orth": "Windows"}] + ] + ], + "Dell": [ + "ORG", + {}, + [ + [{"lower": "dell"}] + ] + ], + "Facebook": [ + "ORG", + {}, + [ + [{"lower": "facebook"}] + ] + ], + "Blizzard": [ + "ORG", + {}, + [ + [{"orth": "Blizzard"}] + ] + ], + "Ubuntu": [ + "ORG", + {}, + [ + [{"orth": "Ubuntu"}] + ] + ], + "Youtube": [ + "PRODUCT", + {}, + [ + [{"lower": "youtube"}] + ] + ], + "false_positives": [ + null, + {}, + [ + [{"orth": "Shit"}], + [{"orth": "Weed"}], + [{"orth": "Cool"}], + [{"orth": "Btw"}], + [{"orth": "Bah"}], + [{"orth": "Bullshit"}], + [{"orth": "Lol"}], + [{"orth": "Yo"}, {"lower": "dawg"}], + [{"orth": "Yay"}], + [{"orth": "Ahh"}], + [{"orth": "Yea"}], + [{"orth": "Bah"}] + ] + ] +} diff --git a/lang_data/de/generate_specials.py b/lang_data/de/generate_specials.py index 44e674800..b3dc52e4f 100644 --- a/lang_data/de/generate_specials.py +++ b/lang_data/de/generate_specials.py @@ -1,5 +1,7 @@ # coding=utf8 import json +import io +import itertools contractions = {} @@ -262,14 +264,30 @@ def get_token_properties(token, capitalize=False, remove_contractions=False): props["F"] = token return props + def create_entry(token, endings, capitalize=False, remove_contractions=False): - properties = [] properties.append(get_token_properties(token, capitalize=capitalize, remove_contractions=remove_contractions)) for e in endings: properties.append(get_token_properties(e, remove_contractions=remove_contractions)) return properties + +FIELDNAMES = ['F','L','pos'] +def read_hardcoded(stream): + hc_specials = {} + for line in stream: + line = line.strip() + if line.startswith('#') or not line: + continue + key,_,rest = line.partition('\t') + values = [] + for annotation in zip(*[ e.split('|') for e in rest.split('\t') ]): + values.append({ k:v for k,v in itertools.izip_longest(FIELDNAMES,annotation) if v }) + hc_specials[key] = values + return hc_specials + + def generate_specials(): specials = {} @@ -303,7 +321,10 @@ def generate_specials(): specials[special] = create_entry(token, endings, capitalize=True, remove_contractions=True) # add in hardcoded specials - specials = dict(specials, **hardcoded_specials) + # changed it so it generates them from a file + with io.open('abbrev.de.tab','r',encoding='utf8') as abbrev_: + hc_specials = read_hardcoded(abbrev_) + specials = dict(specials, **hc_specials) return specials diff --git a/lang_data/de/infix.txt b/lang_data/de/infix.txt index 37eca7350..8398d5d42 100644 --- a/lang_data/de/infix.txt +++ b/lang_data/de/infix.txt @@ -1,3 +1,6 @@ \.\.\. (?<=[a-z])\.(?=[A-Z]) -(?<=[a-zA-Z])-(?=[a-zA-z]) +(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ]) +(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ]) +(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ]) +(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ]) diff --git a/lang_data/de/prefix.txt b/lang_data/de/prefix.txt index 1082bef7d..e37542a9c 100644 --- a/lang_data/de/prefix.txt +++ b/lang_data/de/prefix.txt @@ -5,6 +5,7 @@ { * < +> $ £ „ @@ -20,3 +21,7 @@ a- ‘ .... ... +‚ +» +_ +§ diff --git a/lang_data/de/specials.json b/lang_data/de/specials.json index b8d084503..282ec6df4 100644 --- a/lang_data/de/specials.json +++ b/lang_data/de/specials.json @@ -1,27 +1,4 @@ { - "\t": [ - { - "F": "\t", - "pos": "SP" - } - ], - "\n": [ - { - "F": "\n", - "pos": "SP" - } - ], - " ": [ - { - "F": " ", - "pos": "SP" - } - ], - "\")": [ - { - "F": "\")" - } - ], "''": [ { "F": "''" @@ -217,6 +194,11 @@ "F": "<333" } ], + "": [ + { + "F": "SP" + } + ], "=)": [ { "F": "=)" @@ -267,6 +249,16 @@ "F": "Abk." } ], + "Abs.": [ + { + "F": "Abs." + } + ], + "Abt.": [ + { + "F": "Abt." + } + ], "Apr.": [ { "F": "Apr." @@ -277,6 +269,26 @@ "F": "Aug." } ], + "B.A.": [ + { + "F": "B.A." + } + ], + "B.Sc.": [ + { + "F": "B.Sc." + } + ], + "Bd.": [ + { + "F": "Bd." + } + ], + "Betr.": [ + { + "F": "Betr." + } + ], "Bf.": [ { "F": "Bf." @@ -292,6 +304,11 @@ "F": "Biol." } ], + "Bsp.": [ + { + "F": "Bsp." + } + ], "Chr.": [ { "F": "Chr." @@ -342,6 +359,16 @@ "F": "Dr." } ], + "Fa.": [ + { + "F": "Fa." + } + ], + "Fam.": [ + { + "F": "Fam." + } + ], "Feb.": [ { "F": "Feb." @@ -387,6 +414,16 @@ "F": "Hrgs." } ], + "Hrn.": [ + { + "F": "Hrn." + } + ], + "Hrsg.": [ + { + "F": "Hrsg." + } + ], "Ing.": [ { "F": "Ing." @@ -397,11 +434,21 @@ "F": "Jan." } ], + "Jh.": [ + { + "F": "Jh." + } + ], "Jhd.": [ { "F": "Jhd." } ], + "Jr.": [ + { + "F": "Jr." + } + ], "Jul.": [ { "F": "Jul." @@ -412,21 +459,61 @@ "F": "Jun." } ], + "K.O.": [ + { + "F": "K.O." + } + ], + "L.A.": [ + { + "F": "L.A." + } + ], + "M.A.": [ + { + "F": "M.A." + } + ], + "M.Sc.": [ + { + "F": "M.Sc." + } + ], "Mi.": [ { "F": "Mi." } ], + "Mio.": [ + { + "F": "Mio." + } + ], "Mo.": [ { "F": "Mo." } ], + "Mr.": [ + { + "F": "Mr." + } + ], + "Mrd.": [ + { + "F": "Mrd." + } + ], "Mrz.": [ { "F": "Mrz." } ], + "MwSt.": [ + { + "F": "MwSt." + } + ], "M\u00e4r.": [ { "F": "M\u00e4r." @@ -452,16 +539,31 @@ "F": "Nr." } ], + "O.K.": [ + { + "F": "O.K." + } + ], "Okt.": [ { "F": "Okt." } ], + "Orig.": [ + { + "F": "Orig." + } + ], "P.S.": [ { "F": "P.S." } ], + "Pkt.": [ + { + "F": "Pkt." + } + ], "Prof.": [ { "F": "Prof." @@ -472,6 +574,11 @@ "F": "R.I.P." } ], + "Red.": [ + { + "F": "Red." + } + ], "S'": [ { "F": "S'", @@ -503,6 +610,41 @@ "F": "St." } ], + "Std.": [ + { + "F": "Std." + } + ], + "Str.": [ + { + "F": "Str." + } + ], + "Tel.": [ + { + "F": "Tel." + } + ], + "Tsd.": [ + { + "F": "Tsd." + } + ], + "U.S.": [ + { + "F": "U.S." + } + ], + "U.S.A.": [ + { + "F": "U.S.A." + } + ], + "U.S.S.": [ + { + "F": "U.S.S." + } + ], "Univ.": [ { "F": "Univ." @@ -513,6 +655,30 @@ "F": "V_V" } ], + "Vol.": [ + { + "F": "Vol." + } + ], + "\\\")": [ + { + "F": "\\\")" + } + ], + "\\n": [ + { + "F": "\\n", + "L": "", + "pos": "SP" + } + ], + "\\t": [ + { + "F": "\\t", + "L": "", + "pos": "SP" + } + ], "^_^": [ { "F": "^_^" @@ -528,6 +694,11 @@ "F": "a.D." } ], + "a.M.": [ + { + "F": "a.M." + } + ], "a.Z.": [ { "F": "a.Z." @@ -548,9 +719,15 @@ "F": "al." } ], + "allg.": [ + { + "F": "allg." + } + ], "auf'm": [ { - "F": "auf" + "F": "auf", + "L": "auf" }, { "F": "'m", @@ -572,11 +749,31 @@ "F": "biol." } ], + "bspw.": [ + { + "F": "bspw." + } + ], + "bzgl.": [ + { + "F": "bzgl." + } + ], + "bzw.": [ + { + "F": "bzw." + } + ], "c.": [ { "F": "c." } ], + "ca.": [ + { + "F": "ca." + } + ], "co.": [ { "F": "co." @@ -587,9 +784,20 @@ "F": "d." } ], + "d.h.": [ + { + "F": "d.h." + } + ], + "dgl.": [ + { + "F": "dgl." + } + ], "du's": [ { - "F": "du" + "F": "du", + "L": "du" }, { "F": "'s", @@ -611,19 +819,35 @@ "F": "e.g." } ], + "ebd.": [ + { + "F": "ebd." + } + ], "ehem.": [ { "F": "ehem." } ], + "eigtl.": [ + { + "F": "eigtl." + } + ], "engl.": [ { "F": "engl." } ], + "entspr.": [ + { + "F": "entspr." + } + ], "er's": [ { - "F": "er" + "F": "er", + "L": "er" }, { "F": "'s", @@ -640,11 +864,26 @@ "F": "etc." } ], + "ev.": [ + { + "F": "ev." + } + ], + "evtl.": [ + { + "F": "evtl." + } + ], "f.": [ { "F": "f." } ], + "frz.": [ + { + "F": "frz." + } + ], "g.": [ { "F": "g." @@ -660,6 +899,11 @@ "F": "gegr." } ], + "gem.": [ + { + "F": "gem." + } + ], "ggf.": [ { "F": "ggf." @@ -687,23 +931,39 @@ ], "hinter'm": [ { - "F": "hinter" + "F": "hinter", + "L": "hinter" }, { "F": "'m", "L": "dem" } ], + "hrsg.": [ + { + "F": "hrsg." + } + ], "i.": [ { "F": "i." } ], + "i.A.": [ + { + "F": "i.A." + } + ], "i.G.": [ { "F": "i.G." } ], + "i.O.": [ + { + "F": "i.O." + } + ], "i.Tr.": [ { "F": "i.Tr." @@ -714,6 +974,11 @@ "F": "i.V." } ], + "i.d.R.": [ + { + "F": "i.d.R." + } + ], "i.e.": [ { "F": "i.e." @@ -721,7 +986,8 @@ ], "ich's": [ { - "F": "ich" + "F": "ich", + "L": "ich" }, { "F": "'s", @@ -730,7 +996,8 @@ ], "ihr's": [ { - "F": "ihr" + "F": "ihr", + "L": "ihr" }, { "F": "'s", @@ -757,6 +1024,11 @@ "F": "j." } ], + "jr.": [ + { + "F": "jr." + } + ], "jun.": [ { "F": "jun." @@ -772,11 +1044,21 @@ "F": "k." } ], + "kath.": [ + { + "F": "kath." + } + ], "l.": [ { "F": "l." } ], + "lat.": [ + { + "F": "lat." + } + ], "lt.": [ { "F": "lt." @@ -787,11 +1069,46 @@ "F": "m." } ], + "m.E.": [ + { + "F": "m.E." + } + ], + "m.M.": [ + { + "F": "m.M." + } + ], + "max.": [ + { + "F": "max." + } + ], + "min.": [ + { + "F": "min." + } + ], + "mind.": [ + { + "F": "mind." + } + ], + "mtl.": [ + { + "F": "mtl." + } + ], "n.": [ { "F": "n." } ], + "n.Chr.": [ + { + "F": "n.Chr." + } + ], "nat.": [ { "F": "nat." @@ -807,6 +1124,31 @@ "F": "o.O" } ], + "o.a.": [ + { + "F": "o.a." + } + ], + "o.g.": [ + { + "F": "o.g." + } + ], + "o.k.": [ + { + "F": "o.k." + } + ], + "o.\u00c4.": [ + { + "F": "o.\u00c4." + } + ], + "o.\u00e4.": [ + { + "F": "o.\u00e4." + } + ], "o_O": [ { "F": "o_O" @@ -817,6 +1159,11 @@ "F": "o_o" } ], + "orig.": [ + { + "F": "orig." + } + ], "p.": [ { "F": "p." @@ -827,6 +1174,21 @@ "F": "p.a." } ], + "p.s.": [ + { + "F": "p.s." + } + ], + "pers.": [ + { + "F": "pers." + } + ], + "phil.": [ + { + "F": "phil." + } + ], "q.": [ { "F": "q." @@ -847,6 +1209,11 @@ "F": "rer." } ], + "r\u00f6m.": [ + { + "F": "r\u00f6m." + } + ], "s'": [ { "F": "s'", @@ -858,6 +1225,11 @@ "F": "s." } ], + "s.o.": [ + { + "F": "s.o." + } + ], "sen.": [ { "F": "sen." @@ -865,23 +1237,49 @@ ], "sie's": [ { - "F": "sie" + "F": "sie", + "L": "sie" }, { "F": "'s", "L": "es" } ], + "sog.": [ + { + "F": "sog." + } + ], + "std.": [ + { + "F": "std." + } + ], + "stellv.": [ + { + "F": "stellv." + } + ], "t.": [ { "F": "t." } ], + "t\u00e4gl.": [ + { + "F": "t\u00e4gl." + } + ], "u.": [ { "F": "u." } ], + "u.U.": [ + { + "F": "u.U." + } + ], "u.a.": [ { "F": "u.a." @@ -892,28 +1290,75 @@ "F": "u.s.w." } ], + "u.v.m.": [ + { + "F": "u.v.m." + } + ], "unter'm": [ { - "F": "unter" + "F": "unter", + "L": "unter" }, { "F": "'m", "L": "dem" } ], + "usf.": [ + { + "F": "usf." + } + ], + "usw.": [ + { + "F": "usw." + } + ], + "uvm.": [ + { + "F": "uvm." + } + ], "v.": [ { "F": "v." } ], + "v.Chr.": [ + { + "F": "v.Chr." + } + ], + "v.a.": [ + { + "F": "v.a." + } + ], + "v.l.n.r.": [ + { + "F": "v.l.n.r." + } + ], "vgl.": [ { "F": "vgl." } ], + "vllt.": [ + { + "F": "vllt." + } + ], + "vlt.": [ + { + "F": "vlt." + } + ], "vor'm": [ { - "F": "vor" + "F": "vor", + "L": "vor" }, { "F": "'m", @@ -932,13 +1377,19 @@ ], "wir's": [ { - "F": "wir" + "F": "wir", + "L": "wir" }, { "F": "'s", "L": "es" } ], + "wiss.": [ + { + "F": "wiss." + } + ], "x.": [ { "F": "x." @@ -969,19 +1420,60 @@ "F": "z.B." } ], + "z.Bsp.": [ + { + "F": "z.Bsp." + } + ], + "z.T.": [ + { + "F": "z.T." + } + ], "z.Z.": [ { "F": "z.Z." } ], + "z.Zt.": [ + { + "F": "z.Zt." + } + ], + "z.b.": [ + { + "F": "z.b." + } + ], "zzgl.": [ { "F": "zzgl." } ], + "\u00e4.": [ + { + "F": "\u00e4." + } + ], + "\u00f6.": [ + { + "F": "\u00f6." + } + ], + "\u00f6sterr.": [ + { + "F": "\u00f6sterr." + } + ], + "\u00fc.": [ + { + "F": "\u00fc." + } + ], "\u00fcber'm": [ { - "F": "\u00fcber" + "F": "\u00fcber", + "L": "\u00fcber" }, { "F": "'m", diff --git a/lang_data/de/suffix.txt b/lang_data/de/suffix.txt index d8c6bc2c2..aeecb85a2 100644 --- a/lang_data/de/suffix.txt +++ b/lang_data/de/suffix.txt @@ -13,14 +13,61 @@ ; ' ” +“ +« +_ '' 's 'S ’s ’S ’ +‘ +° +€ \.\. \.\.\. \.\.\.\. -(?<=[a-z0-9)\]"'%\)])\. +(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\. +\-\- +´ +(?<=[0-9])km² +(?<=[0-9])m² +(?<=[0-9])cm² +(?<=[0-9])mm² +(?<=[0-9])km³ +(?<=[0-9])m³ +(?<=[0-9])cm³ +(?<=[0-9])mm³ +(?<=[0-9])ha (?<=[0-9])km +(?<=[0-9])m +(?<=[0-9])cm +(?<=[0-9])mm +(?<=[0-9])µm +(?<=[0-9])nm +(?<=[0-9])yd +(?<=[0-9])in +(?<=[0-9])ft +(?<=[0-9])kg +(?<=[0-9])g +(?<=[0-9])mg +(?<=[0-9])µg +(?<=[0-9])t +(?<=[0-9])lb +(?<=[0-9])oz +(?<=[0-9])m/s +(?<=[0-9])km/h +(?<=[0-9])mph +(?<=[0-9])°C +(?<=[0-9])°K +(?<=[0-9])°F +(?<=[0-9])hPa +(?<=[0-9])Pa +(?<=[0-9])mbar +(?<=[0-9])mb +(?<=[0-9])T +(?<=[0-9])G +(?<=[0-9])M +(?<=[0-9])K +(?<=[0-9])kb diff --git a/setup.py b/setup.py index c571441d4..e646cc0f8 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ MOD_NAMES = [ 'spacy.syntax._state', 'spacy.tokenizer', 'spacy.syntax.parser', + 'spacy.syntax.nonproj', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', 'spacy.syntax._parse_features', diff --git a/spacy/de/__init__.py b/spacy/de/__init__.py index d7cc3dc65..76817ccff 100644 --- a/spacy/de/__init__.py +++ b/spacy/de/__init__.py @@ -6,4 +6,4 @@ from ..language import Language class German(Language): - pass + lang = 'de' diff --git a/spacy/gold.pyx b/spacy/gold.pyx index d8b100744..67716b0ab 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -14,6 +14,8 @@ try: except ImportError: import json +from .syntax import nonproj + def tags_to_entities(tags): entities = [] @@ -236,34 +238,14 @@ cdef class GoldParse: self.heads[i] = self.gold_to_cand[annot_tuples[3][gold_i]] self.labels[i] = annot_tuples[4][gold_i] self.ner[i] = annot_tuples[5][gold_i] - - # If we have any non-projective arcs, i.e. crossing brackets, consider - # the heads for those words missing in the gold-standard. - # This way, we can train from these sentences - cdef int w1, w2, h1, h2 - if make_projective: - heads = list(self.heads) - for w1 in range(self.length): - if heads[w1] is not None: - h1 = heads[w1] - for w2 in range(w1+1, self.length): - if heads[w2] is not None: - h2 = heads[w2] - if _arcs_cross(w1, h1, w2, h2): - self.heads[w1] = None - self.labels[w1] = '' - self.heads[w2] = None - self.labels[w2] = '' - # Check there are no cycles in the dependencies, i.e. we are a tree - for w in range(self.length): - seen = set([w]) - head = w - while self.heads[head] != head and self.heads[head] != None: - head = self.heads[head] - if head in seen: - raise Exception("Cycle found: %s" % seen) - seen.add(head) + cycle = nonproj.contains_cycle(self.heads) + if cycle != None: + raise Exception("Cycle found: %s" % cycle) + + if make_projective: + proj_heads,_ = nonproj.PseudoProjectivity.projectivize(self.heads,self.labels) + self.heads = proj_heads self.brackets = {} for (gold_start, gold_end, label_str) in brackets: @@ -278,25 +260,18 @@ cdef class GoldParse: @property def is_projective(self): - heads = list(self.heads) - for w1 in range(self.length): - if heads[w1] is not None: - h1 = heads[w1] - for w2 in range(self.length): - if heads[w2] is not None and _arcs_cross(w1, h1, w2, heads[w2]): - return False - return True - - -cdef int _arcs_cross(int w1, int h1, int w2, int h2) except -1: - if w1 > h1: - w1, h1 = h1, w1 - if w2 > h2: - w2, h2 = h2, w2 - if w1 > w2: - w1, h1, w2, h2 = w2, h2, w1, h1 - return w1 < w2 < h1 < h2 or w1 < w2 == h2 < h1 + return not nonproj.is_nonproj_tree(self.heads) def is_punct_label(label): return label == 'P' or label.lower() == 'punct' + + + + + + + + + + diff --git a/spacy/syntax/nonproj.pxd b/spacy/syntax/nonproj.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx new file mode 100644 index 000000000..dbc5555c3 --- /dev/null +++ b/spacy/syntax/nonproj.pyx @@ -0,0 +1,200 @@ +from copy import copy +from collections import Counter + +from ..tokens.doc cimport Doc +from spacy.attrs import DEP, HEAD + + +def ancestors(tokenid, heads): + # returns all words going from the word up the path to the root + # the path to root cannot be longer than the number of words in the sentence + # this function ends after at most len(heads) steps + # because it would otherwise loop indefinitely on cycles + head = tokenid + cnt = 0 + while heads[head] != head and cnt < len(heads): + head = heads[head] + cnt += 1 + yield head + if head == None: + break + + +def contains_cycle(heads): + # in an acyclic tree, the path from each word following + # the head relation upwards always ends at the root node + for tokenid in range(len(heads)): + seen = set([tokenid]) + for ancestor in ancestors(tokenid,heads): + if ancestor in seen: + return seen + seen.add(ancestor) + return None + + +def is_nonproj_arc(tokenid, heads): + # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective + # if there is a token k, h < k < d such that h is not + # an ancestor of k. Same for h -> d, h > d + head = heads[tokenid] + if head == tokenid: # root arcs cannot be non-projective + return False + elif head == None: # unattached tokens cannot be non-projective + return False + + start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head) + for k in range(start,end): + for ancestor in ancestors(k,heads): + if ancestor == None: # for unattached tokens/subtrees + break + elif ancestor == head: # normal case: k dominated by h + break + else: # head not in ancestors: d -> h is non-projective + return True + return False + + +def is_nonproj_tree(heads): + # a tree is non-projective if at least one arc is non-projective + return any( is_nonproj_arc(word,heads) for word in range(len(heads)) ) + + +cdef class PseudoProjectivity: + # implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 + # for doing pseudo-projective parsing + # implementation uses the HEAD decoration scheme + + delimiter = '||' + + @classmethod + def decompose(cls, label): + return label.partition(cls.delimiter)[::2] + + @classmethod + def is_decorated(cls, label): + return label.find(cls.delimiter) != -1 + + @classmethod + def preprocess_training_data(cls, gold_tuples, label_freq_cutoff=30): + preprocessed = [] + freqs = Counter() + for raw_text, sents in gold_tuples: + prepro_sents = [] + for (ids, words, tags, heads, labels, iob), ctnts in sents: + proj_heads,deco_labels = cls.projectivize(heads,labels) + # set the label to ROOT for each root dependent + deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ] + # count label frequencies + if label_freq_cutoff > 0: + freqs.update( label for label in deco_labels if cls.is_decorated(label) ) + prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts)) + preprocessed.append((raw_text, prepro_sents)) + + if label_freq_cutoff > 0: + return cls._filter_labels(preprocessed,label_freq_cutoff,freqs) + return preprocessed + + + @classmethod + def projectivize(cls, heads, labels): + # use the algorithm by Nivre & Nilsson 2005 + # assumes heads to be a proper tree, i.e. connected and cycle-free + # returns a new pair (heads,labels) which encode + # a projective and decorated tree + proj_heads = copy(heads) + smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads) + if smallest_np_arc == None: # this sentence is already projective + return proj_heads, copy(labels) + while smallest_np_arc != None: + cls._lift(smallest_np_arc, proj_heads) + smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads) + deco_labels = cls._decorate(heads, proj_heads, labels) + return proj_heads, deco_labels + + + @classmethod + def deprojectivize(cls, Doc tokens): + # reattach arcs with decorated labels (following HEAD scheme) + # for each decorated arc X||Y, search top-down, left-to-right, + # breadth-first until hitting a Y then make this the new head + parse = tokens.to_array([HEAD, DEP]) + labels = [ tokens.vocab.strings[int(p[1])] for p in parse ] + for token in tokens: + if cls.is_decorated(token.dep_): + newlabel,headlabel = cls.decompose(token.dep_) + newhead = cls._find_new_head(token,headlabel) + parse[token.i,1] = tokens.vocab.strings[newlabel] + parse[token.i,0] = newhead.i - token.i + tokens.from_array([HEAD, DEP],parse) + + + @classmethod + def _decorate(cls, heads, proj_heads, labels): + # uses decoration scheme HEAD from Nivre & Nilsson 2005 + assert(len(heads) == len(proj_heads) == len(labels)) + deco_labels = [] + for tokenid,head in enumerate(heads): + if head != proj_heads[tokenid]: + deco_labels.append('%s%s%s' % (labels[tokenid],cls.delimiter,labels[head])) + else: + deco_labels.append(labels[tokenid]) + return deco_labels + + + @classmethod + def _get_smallest_nonproj_arc(cls, heads): + # return the smallest non-proj arc or None + # where size is defined as the distance between dep and head + # and ties are broken left to right + smallest_size = float('inf') + smallest_np_arc = None + for tokenid,head in enumerate(heads): + size = abs(tokenid-head) + if size < smallest_size and is_nonproj_arc(tokenid,heads): + smallest_size = size + smallest_np_arc = tokenid + return smallest_np_arc + + + @classmethod + def _lift(cls, tokenid, heads): + # reattaches a word to it's grandfather + head = heads[tokenid] + ghead = heads[head] + # attach to ghead if head isn't attached to root else attach to root + heads[tokenid] = ghead if head != ghead else tokenid + + + @classmethod + def _find_new_head(cls, token, headlabel): + # search through the tree starting from root + # returns the id of the first descendant with the given label + # if there is none, return the current head (no change) + queue = [token.head] + while queue: + next_queue = [] + for qtoken in queue: + for child in qtoken.children: + if child == token: + continue + if child.dep_ == headlabel: + return child + next_queue.append(child) + queue = next_queue + return token.head + + + @classmethod + def _filter_labels(cls, gold_tuples, cutoff, freqs): + # throw away infrequent decorated labels + # can't learn them reliably anyway and keeps label set smaller + filtered = [] + for raw_text, sents in gold_tuples: + filtered_sents = [] + for (ids, words, tags, heads, labels, iob), ctnts in sents: + filtered_labels = [ cls.decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ] + filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts)) + filtered.append((raw_text, filtered_sents)) + return filtered + + diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 77ea376a1..e10049fb6 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -15,5 +15,6 @@ cdef class ParserModel(AveragedPerceptron): cdef class Parser: cdef readonly ParserModel model cdef readonly TransitionSystem moves + cdef int _projectivize cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 6c77c6c96..a83c397dc 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -12,12 +12,12 @@ from cpython.exc cimport PyErr_CheckSignals from libc.stdint cimport uint32_t, uint64_t from libc.string cimport memset, memcpy from libc.stdlib cimport malloc, calloc, free -import random import os.path from os import path import shutil import json import sys +from .nonproj import PseudoProjectivity from cymem.cymem cimport Pool, Address from murmurhash.mrmr cimport hash64 @@ -79,9 +79,10 @@ cdef class ParserModel(AveragedPerceptron): cdef class Parser: - def __init__(self, StringStore strings, transition_system, ParserModel model): + def __init__(self, StringStore strings, transition_system, ParserModel model, int projectivize = 0): self.moves = transition_system self.model = model + self._projectivize = projectivize @classmethod def from_dir(cls, model_dir, strings, transition_system): @@ -93,9 +94,10 @@ cdef class Parser: moves = transition_system(strings, cfg.labels) templates = get_templates(cfg.features) model = ParserModel(templates) + project = cfg.projectivize if hasattr(cfg,'projectivize') else False if path.exists(path.join(model_dir, 'model')): model.load(path.join(model_dir, 'model')) - return cls(strings, moves, model) + return cls(strings, moves, model, project) @classmethod def load(cls, pkg_or_str_or_file, vocab): @@ -114,6 +116,9 @@ cdef class Parser: tokens.is_parsed = True # Check for KeyboardInterrupt etc. Untested PyErr_CheckSignals() + # projectivize output + if self._projectivize: + PseudoProjectivity.deprojectivize(tokens) def pipe(self, stream, int batch_size=1000, int n_threads=2): cdef Pool mem = Pool() diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 11a6a2005..9e4c8ac43 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -143,7 +143,7 @@ cdef class Tagger: @classmethod def blank(cls, vocab, templates): - model = TaggerModel(N_CONTEXT_FIELDS, templates) + model = TaggerModel(templates) return cls(vocab, model) @classmethod @@ -153,10 +153,8 @@ cdef class Tagger: @classmethod def from_package(cls, pkg, vocab): # TODO: templates.json deprecated? not present in latest package - templates = cls.default_templates() - # templates = package.load_utf8(json.load, - # 'pos', 'templates.json', - # default=cls.default_templates()) + # templates = cls.default_templates() + templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates()) model = TaggerModel(templates) if pkg.has_file('pos', 'model'): @@ -203,7 +201,7 @@ cdef class Tagger: nr_class=self.vocab.morphology.n_tags, nr_feat=self.model.nr_feat) for i in range(tokens.length): - if tokens.c[i].pos == 0: + if tokens.c[i].pos == 0: self.model.set_featuresC(&eg.c, tokens.c, i) self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) @@ -221,7 +219,7 @@ cdef class Tagger: def train(self, Doc tokens, object gold_tag_strs): assert len(tokens) == len(gold_tag_strs) for tag in gold_tag_strs: - if tag not in self.tag_names: + if tag != None and tag not in self.tag_names: msg = ("Unrecognized gold tag: %s. tag_map.json must contain all" "gold tags, to maintain coarse-grained mapping.") raise ValueError(msg % tag) @@ -234,10 +232,9 @@ cdef class Tagger: nr_feat=self.model.nr_feat) for i in range(tokens.length): self.model.set_featuresC(&eg.c, tokens.c, i) - eg.set_label(golds[i]) + eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ] self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) - self.model.updateC(&eg.c) self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess) diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py new file mode 100644 index 000000000..71b20bf74 --- /dev/null +++ b/spacy/tests/parser/test_nonproj.py @@ -0,0 +1,136 @@ +from __future__ import unicode_literals +import pytest + +from spacy.attrs import DEP, HEAD +import numpy + +from spacy.syntax.nonproj import ancestors, contains_cycle, is_nonproj_arc, is_nonproj_tree, PseudoProjectivity + +def test_ancestors(): + tree = [1,2,2,4,5,2,2] + cyclic_tree = [1,2,2,4,5,3,2] + partial_tree = [1,2,2,4,5,None,2] + multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3] + assert([ a for a in ancestors(3,tree) ] == [4,5,2]) + assert([ a for a in ancestors(3,cyclic_tree) ] == [4,5,3,4,5,3,4]) + assert([ a for a in ancestors(3,partial_tree) ] == [4,5,None]) + assert([ a for a in ancestors(17,multirooted_tree) ] == []) + +def test_contains_cycle(): + tree = [1,2,2,4,5,2,2] + cyclic_tree = [1,2,2,4,5,3,2] + partial_tree = [1,2,2,4,5,None,2] + multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3] + assert(contains_cycle(tree) == None) + assert(contains_cycle(cyclic_tree) == set([3,4,5])) + assert(contains_cycle(partial_tree) == None) + assert(contains_cycle(multirooted_tree) == None) + +def test_is_nonproj_arc(): + nonproj_tree = [1,2,2,4,5,2,7,4,2] + partial_tree = [1,2,2,4,5,None,7,4,2] + multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3] + assert(is_nonproj_arc(0,nonproj_tree) == False) + assert(is_nonproj_arc(1,nonproj_tree) == False) + assert(is_nonproj_arc(2,nonproj_tree) == False) + assert(is_nonproj_arc(3,nonproj_tree) == False) + assert(is_nonproj_arc(4,nonproj_tree) == False) + assert(is_nonproj_arc(5,nonproj_tree) == False) + assert(is_nonproj_arc(6,nonproj_tree) == False) + assert(is_nonproj_arc(7,nonproj_tree) == True) + assert(is_nonproj_arc(8,nonproj_tree) == False) + assert(is_nonproj_arc(7,partial_tree) == False) + assert(is_nonproj_arc(17,multirooted_tree) == False) + assert(is_nonproj_arc(16,multirooted_tree) == True) + +def test_is_nonproj_tree(): + proj_tree = [1,2,2,4,5,2,7,5,2] + nonproj_tree = [1,2,2,4,5,2,7,4,2] + partial_tree = [1,2,2,4,5,None,7,4,2] + multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3] + assert(is_nonproj_tree(proj_tree) == False) + assert(is_nonproj_tree(nonproj_tree) == True) + assert(is_nonproj_tree(partial_tree) == False) + assert(is_nonproj_tree(multirooted_tree) == True) + + +def deprojectivize(proj_heads, deco_labels, EN): + slen = len(proj_heads) + sent = EN.tokenizer.tokens_from_list(['whatever'] * slen) + rel_proj_heads = [ head-i for i,head in enumerate(proj_heads) ] + labelids = [ EN.vocab.strings[label] for label in deco_labels ] + parse = numpy.asarray(zip(rel_proj_heads,labelids), dtype=numpy.int32) + sent.from_array([HEAD,DEP],parse) + PseudoProjectivity.deprojectivize(sent) + parse = sent.to_array([HEAD,DEP]) + deproj_heads = [ i+head for i,head in enumerate(parse[:,0]) ] + undeco_labels = [ EN.vocab.strings[int(labelid)] for labelid in parse[:,1] ] + return deproj_heads, undeco_labels + + +@pytest.mark.models +def test_pseudoprojectivity(EN): + tree = [1,2,2] + nonproj_tree = [1,2,2,4,5,2,7,4,2] + labels = ['det','nsubj','root','det','dobj','aux','nsubj','acl','punct'] + nonproj_tree2 = [9,1,3,1,5,6,9,8,6,1,6,12,13,10,1] + labels2 = ['advmod','root','det','nsubj','advmod','det','dobj','det','nmod','aux','nmod','advmod','det','amod','punct'] + + assert(PseudoProjectivity.decompose('X||Y') == ('X','Y')) + assert(PseudoProjectivity.decompose('X') == ('X','')) + + assert(PseudoProjectivity.is_decorated('X||Y') == True) + assert(PseudoProjectivity.is_decorated('X') == False) + + PseudoProjectivity._lift(0,tree) + assert(tree == [2,2,2]) + + np_arc = PseudoProjectivity._get_smallest_nonproj_arc(nonproj_tree) + assert(np_arc == 7) + + np_arc = PseudoProjectivity._get_smallest_nonproj_arc(nonproj_tree2) + assert(np_arc == 10) + + proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree,labels) + assert(proj_heads == [1,2,2,4,5,2,7,5,2]) + assert(deco_labels == ['det','nsubj','root','det','dobj','aux','nsubj','acl||dobj','punct']) + deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN) + assert(deproj_heads == nonproj_tree) + assert(undeco_labels == labels) + + proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree2,labels2) + assert(proj_heads == [1,1,3,1,5,6,9,8,6,1,9,12,13,10,1]) + assert(deco_labels == ['advmod||aux','root','det','nsubj','advmod','det','dobj','det','nmod','aux','nmod||dobj','advmod','det','amod','punct']) + deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN) + assert(deproj_heads == nonproj_tree2) + assert(undeco_labels == labels2) + + # if decoration is wrong such that there is no head with the desired label + # the structure is kept and the label is undecorated + proj_heads = [1,2,2,4,5,2,7,5,2] + deco_labels = ['det','nsubj','root','det','dobj','aux','nsubj','acl||iobj','punct'] + deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN) + assert(deproj_heads == proj_heads) + assert(undeco_labels == ['det','nsubj','root','det','dobj','aux','nsubj','acl','punct']) + + # if there are two potential new heads, the first one is chosen even if it's wrong + proj_heads = [1,1,3,1,5,6,9,8,6,1,9,12,13,10,1] + deco_labels = ['advmod||aux','root','det','aux','advmod','det','dobj','det','nmod','aux','nmod||dobj','advmod','det','amod','punct'] + deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN) + assert(deproj_heads == [3,1,3,1,5,6,9,8,6,1,6,12,13,10,1]) + assert(undeco_labels == ['advmod','root','det','aux','advmod','det','dobj','det','nmod','aux','nmod','advmod','det','amod','punct']) + + + + + + + + + + + + + + + diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 342bcf409..0ff574f1b 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -201,17 +201,9 @@ cdef class Token: cdef int nr_iter = 0 cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge) while ptr < self.c: - # If this head is still to the right of us, we can skip to it - # No token that's between this token and this head could be our - # child. - if (ptr.head >= 1) and (ptr + ptr.head) < self.c: - ptr += ptr.head - - elif ptr + ptr.head == self.c: + if ptr + ptr.head == self.c: yield self.doc[ptr - (self.c - self.i)] - ptr += 1 - else: - ptr += 1 + ptr += 1 nr_iter += 1 # This is ugly, but it's a way to guard out infinite loops if nr_iter >= 10000000: @@ -226,16 +218,10 @@ cdef class Token: tokens = [] cdef int nr_iter = 0 while ptr > self.c: - # If this head is still to the right of us, we can skip to it - # No token that's between this token and this head could be our - # child. - if (ptr.head < 0) and ((ptr + ptr.head) > self.c): - ptr += ptr.head - elif ptr + ptr.head == self.c: + if ptr + ptr.head == self.c: tokens.append(self.doc[ptr - (self.c - self.i)]) - ptr -= 1 - else: - ptr -= 1 + ptr -= 1 + nr_iter += 1 if nr_iter >= 10000000: raise RuntimeError( "Possibly infinite loop encountered while looking for token.rights")