From 080d29e092d6032d65541d78da53fb6122d1f71d Mon Sep 17 00:00:00 2001
From: root
Date: Fri, 25 Nov 2016 08:55:33 -0600
Subject: [PATCH 01/14] Fix train.py for 1.0

---
 spacy/train.py | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/spacy/train.py b/spacy/train.py
index 097218310..a86569100 100644
--- a/spacy/train.py
+++ b/spacy/train.py
@@ -14,22 +14,31 @@ class Trainer(object):
         self.gold_tuples = gold_tuples
 
     def epochs(self, nr_epoch, augment_data=None, gold_preproc=False):
-        def _epoch():
-            for raw_text, paragraph_tuples in self.gold_tuples:
+        cached_golds = {}
+        def _epoch(indices):
+            for i in indices:
+                raw_text, paragraph_tuples = self.gold_tuples[i]
                 if gold_preproc:
                     raw_text = None
                 else:
                     paragraph_tuples = merge_sents(paragraph_tuples)
-                if augment_data is not None:
+                if augment_data is None:
+                    docs = self.make_docs(raw_text, paragraph_tuples)
+                    if i in cached_golds:
+                        golds = cached_golds[i]
+                    else:
+                        golds = self.make_golds(docs, paragraph_tuples)
+                else:
                     raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples)
-                docs = self.make_docs(raw_text, paragraph_tuples)
-                golds = self.make_golds(docs, paragraph_tuples)
+                    docs = self.make_docs(raw_text, paragraph_tuples)
+                    golds = self.make_golds(docs, paragraph_tuples)
                 for doc, gold in zip(docs, golds):
                     yield doc, gold
 
+        indices = list(range(len(self.gold_tuples)))
         for itn in range(nr_epoch):
-            random.shuffle(self.gold_tuples)
-            yield _epoch()
+            random.shuffle(indices)
+            yield _epoch(indices)
 
     def update(self, doc, gold):
         for process in self.nlp.pipeline:
@@ -48,7 +57,7 @@ class Trainer(object):
             docs = self.make_docs(raw_text, paragraph_tuples)
             golds = self.make_golds(docs, paragraph_tuples)
             for doc, gold in zip(docs, golds):
-                for process in self.nlp.pipeline[1:]:
+                for process in self.nlp.pipeline:
                     process(doc)
                 scorer.score(doc, gold)
         return scorer
@@ -62,8 +71,8 @@ class Trainer(object):
 
     def make_golds(self, docs, paragraph_tuples):
         if len(docs) == 1:
-            return [GoldParse(docs[0], sent_tuples[0])
+            return [GoldParse.from_annot_tuples(docs[0], sent_tuples[0])
                     for sent_tuples in paragraph_tuples]
         else:
-            return [GoldParse(doc, sent_tuples[0])
+            return [GoldParse.from_annot_tuples(doc, sent_tuples[0])
                     for doc, sent_tuples in zip(docs, paragraph_tuples)]

From 314bc8d34fd38887208f94b17bb9a204f1d435a4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 25 Nov 2016 08:57:37 -0600
Subject: [PATCH 02/14] Fix train script for 1.0

---
 bin/parser/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/parser/train.py b/bin/parser/train.py
index 5d588a317..574797ba5 100755
--- a/bin/parser/train.py
+++ b/bin/parser/train.py
@@ -100,7 +100,7 @@ def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False
             nlp.entity(tokens)
         else:
             tokens = nlp(raw_text)
-        gold = GoldParse(tokens, annot_tuples)
+        gold = GoldParse.from_annot_tuples(tokens, annot_tuples)
         scorer.score(tokens, gold, verbose=verbose)
     return scorer
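The core idea in PATCH 01 is to shuffle a list of indices rather than
self.gold_tuples itself, so the gold parses built on the first pass can be
cached under stable keys. A minimal standalone sketch of that pattern, with
the Trainer internals elided (epochs, examples and make_gold are illustrative
names, not spaCy's API):

    import random

    def epochs(examples, make_gold, nr_epoch):
        # Gold parses are cached by position; shuffling indices instead
        # of the data keeps the cache keys valid across epochs.
        cached_golds = {}
        def _epoch(indices):
            for i in indices:
                if i not in cached_golds:
                    cached_golds[i] = make_gold(examples[i])
                yield examples[i], cached_golds[i]
        indices = list(range(len(examples)))
        for itn in range(nr_epoch):
            random.shuffle(indices)
            yield _epoch(indices)

    for batch in epochs(['a', 'b', 'c'], str.upper, nr_epoch=2):
        print(list(batch))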
From cc7e607a8affb16b34a5b319dd20d9cbf4ec3b67 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 25 Nov 2016 08:57:59 -0600
Subject: [PATCH 03/14] Fix gold.pyx for 1.0

---
 spacy/gold.pxd | 1 +
 spacy/gold.pyx | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/spacy/gold.pxd b/spacy/gold.pxd
index 5122fe41a..0afdab46d 100644
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@@ -19,6 +19,7 @@ cdef class GoldParse:
     cdef int length
     cdef readonly int loss
 
+    cdef readonly list words
     cdef readonly list tags
     cdef readonly list heads
     cdef readonly list labels
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 985887630..75de4034a 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -19,6 +19,8 @@ def tags_to_entities(tags):
     entities = []
     start = None
     for i, tag in enumerate(tags):
+        if tag is None:
+            continue
         if tag.startswith('O'):
             # TODO: We shouldn't be getting these malformed inputs. Fix this.
             if start is not None:
@@ -229,7 +231,7 @@ cdef class GoldParse:
         if deps is None:
             deps = [None for _ in doc]
         if entities is None:
-            entities = [None for _ in doc]
+            entities = ['-' for _ in doc]
         elif len(entities) == 0:
             entities = ['O' for _ in doc]
         elif not isinstance(entities[0], basestring):
@@ -246,6 +248,7 @@ cdef class GoldParse:
         self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))
         self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
 
+        self.words = [None] * len(doc)
         self.tags = [None] * len(doc)
         self.heads = [None] * len(doc)
         self.labels = [''] * len(doc)
@@ -259,6 +262,7 @@ cdef class GoldParse:
 
         for i, gold_i in enumerate(self.cand_to_gold):
             if doc[i].text.isspace():
+                self.words[i] = doc[i].text
                 self.tags[i] = 'SP'
                 self.heads[i] = None
                 self.labels[i] = None
@@ -266,6 +270,7 @@ cdef class GoldParse:
             if gold_i is None:
                 pass
             else:
+                self.words[i] = words[gold_i]
                 self.tags[i] = tags[gold_i]
                 self.heads[i] = self.gold_to_cand[heads[gold_i]]
                 self.labels[i] = deps[gold_i]

From 608d8f5421c318d8b3ced985be9214871b0a4f0a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 25 Nov 2016 09:00:21 -0600
Subject: [PATCH 04/14] Pass cfg through parser, and have is_valid default to
 1, not 0 when resetting state

---
 spacy/syntax/parser.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index 62b61c37b..f9d1a1663 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -75,11 +75,11 @@ cdef class ParserModel(AveragedPerceptron):
 
 cdef class Parser:
     @classmethod
-    def load(cls, path, Vocab vocab, TransitionSystem=None, require=False):
+    def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
         with (path / 'config.json').open() as file_:
             cfg = json.load(file_)
         # TODO: remove this shim when we don't have to support older data
-        if 'labels' in cfg:
+        if 'labels' in cfg and 'actions' not in cfg:
             cfg['actions'] = cfg.pop('labels')
         self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg)
         if (path / 'model').exists():
@@ -215,7 +215,7 @@ cdef class Parser:
                 loss += eg.costs[eg.guess]
             eg.fill_scores(0, eg.nr_class)
             eg.fill_costs(0, eg.nr_class)
-            eg.fill_is_valid(0, eg.nr_class)
+            eg.fill_is_valid(1, eg.nr_class)
         return loss
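PATCH 04's config shim is easy to miss in the diff, so here it is in
isolation (upgrade_config is an illustrative name; the real logic lives
inline in Parser.load): older saved configs store the transition labels under
'labels', newer ones under 'actions', and the rename only fires when
'actions' is absent, so newer configs are never clobbered.

    def upgrade_config(cfg):
        # Rename the legacy key only if the new one isn't already set.
        if 'labels' in cfg and 'actions' not in cfg:
            cfg['actions'] = cfg.pop('labels')
        return cfg

    print(upgrade_config({'labels': ['nsubj', 'dobj']}))
    # {'actions': ['nsubj', 'dobj']}
    print(upgrade_config({'actions': ['nsubj'], 'labels': ['stale']}))
    # {'actions': ['nsubj'], 'labels': ['stale']}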
From a2f55e701551fc5616083daec0c54f3fc8343478 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 25 Nov 2016 09:01:20 -0600
Subject: [PATCH 05/14] Pass cfg through loading, for training.

---
 spacy/language.py | 45 ++++++++++++++++++++++-----------------------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index e62431bf1..6eb2d150b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -31,6 +31,8 @@ from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
+from .syntax.arc_eager import ArcEager
+from .syntax.ner import BiluoPushDown
 
 
 class BaseDefaults(object):
@@ -65,7 +67,7 @@ class BaseDefaults(object):
         prefix_search = util.compile_prefix_regex(cls.prefixes).search
         suffix_search = util.compile_suffix_regex(cls.suffixes).search
         infix_finditer = util.compile_infix_regex(cls.infixes).finditer
-        vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
         return Tokenizer(nlp.vocab, rules=rules,
                          prefix_search=prefix_search, suffix_search=suffix_search,
                          infix_finditer=infix_finditer)
@@ -82,26 +84,27 @@ class BaseDefaults(object):
         return Tagger.load(nlp.path / 'pos', nlp.vocab)
 
     @classmethod
-    def create_parser(cls, nlp=None):
+    def create_parser(cls, nlp=None, **cfg):
         if nlp is None:
-            return DependencyParser(cls.create_vocab(), features=cls.parser_features)
+            return DependencyParser(cls.create_vocab(), features=cls.parser_features,
+                                    **cfg)
         elif nlp.path is False:
-            return DependencyParser(nlp.vocab, features=cls.parser_features)
+            return DependencyParser(nlp.vocab, features=cls.parser_features, **cfg)
         elif nlp.path is None or not (nlp.path / 'deps').exists():
             return None
         else:
-            return DependencyParser.load(nlp.path / 'deps', nlp.vocab)
+            return DependencyParser.load(nlp.path / 'deps', nlp.vocab, **cfg)
 
     @classmethod
-    def create_entity(cls, nlp=None):
+    def create_entity(cls, nlp=None, **cfg):
         if nlp is None:
-            return EntityRecognizer(cls.create_vocab(), features=cls.entity_features)
+            return EntityRecognizer(cls.create_vocab(), features=cls.entity_features, **cfg)
         elif nlp.path is False:
-            return EntityRecognizer(nlp.vocab, features=cls.entity_features)
+            return EntityRecognizer(nlp.vocab, features=cls.entity_features, **cfg)
         elif nlp.path is None or not (nlp.path / 'ner').exists():
             return None
         else:
-            return EntityRecognizer.load(nlp.path / 'ner', nlp.vocab)
+            return EntityRecognizer.load(nlp.path / 'ner', nlp.vocab, **cfg)
 
     @classmethod
     def create_matcher(cls, nlp=None):
@@ -202,8 +205,8 @@ class Language(object):
             # preprocess training data here before ArcEager.get_labels() is called
             gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
 
-            parser_cfg['labels'] = ArcEager.get_labels(gold_tuples)
-            entity_cfg['labels'] = BiluoPushDown.get_labels(gold_tuples)
+            parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
+            entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
 
             with (dep_model_dir / 'config.json').open('wb') as file_:
                 json.dump(parser_cfg, file_)
@@ -224,22 +227,18 @@ class Language(object):
                 vectors=False,
                 pipeline=False)
 
-            self.defaults.parser_labels = parser_cfg['labels']
-            self.defaults.entity_labels = entity_cfg['labels']
-
-            self.vocab = self.defaults.Vocab()
-            self.tokenizer = self.defaults.Tokenizer(self.vocab)
-            self.tagger = self.defaults.Tagger(self.vocab, **tagger_cfg)
-            self.parser = self.defaults.Parser(self.vocab, **parser_cfg)
-            self.entity = self.defaults.Entity(self.vocab, **entity_cfg)
-            self.pipeline = self.defaults.Pipeline(self)
+            self.vocab = self.Defaults.create_vocab(self)
+            self.tokenizer = self.Defaults.create_tokenizer(self)
+            self.tagger = self.Defaults.create_tagger(self)
+            self.parser = self.Defaults.create_parser(self)
+            self.entity = self.Defaults.create_entity(self)
+            self.pipeline = self.Defaults.create_pipeline(self)
             yield Trainer(self, gold_tuples)
             self.end_training()
 
     def __init__(self, path=True, **overrides):
-        if 'data_dir' in overrides and 'path' not in overrides:
+        if 'data_dir' in overrides and 'path' is True:
             raise ValueError("The argument 'data_dir' has been renamed to 'path'")
-        path = overrides.get('path', True)
         if isinstance(path, basestring):
             path = pathlib.Path(path)
         if path is True:
@@ -253,7 +252,7 @@ class Language(object):
         add_vectors = self.Defaults.add_vectors(self) \
                       if 'add_vectors' not in overrides \
                       else overrides['add_vectors']
-        if add_vectors:
+        if self.vocab and add_vectors:
             add_vectors(self.vocab)
         self.tokenizer = self.Defaults.create_tokenizer(self) \
                          if 'tokenizer' not in overrides \
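The point of PATCH 05 is that keyword arguments now flow from the create_*
factories down to the components they build, so the training context can
inject settings such as actions=... without changing each factory's
signature. A minimal sketch of the passthrough, using stand-in classes rather
than spaCy's:

    class Component(object):
        def __init__(self, vocab, **cfg):
            self.vocab = vocab
            self.cfg = cfg

    class Defaults(object):
        @classmethod
        def create_parser(cls, vocab, **cfg):
            # Whatever the caller passes ends up on the component.
            return Component(vocab, **cfg)

    parser = Defaults.create_parser('the vocab', actions={'LEFT': ['nsubj']})
    print(parser.cfg)  # {'actions': {'LEFT': ['nsubj']}}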
From ca773a1f5300649e0edc20d255e4f5e8dfdc0f77 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 25 Nov 2016 09:01:52 -0600
Subject: [PATCH 06/14] Tweak arc_eager n_gold to deal with negative costs,
 and improve error message.

---
 spacy/syntax/arc_eager.pyx | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 9a10a97cf..08eb23d1c 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -439,7 +439,7 @@ cdef class ArcEager(TransitionSystem):
                 if move_costs[move] == -1:
                     move_costs[move] = move_cost_funcs[move](stcls, &gold.c)
                 costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label)
-                n_gold += costs[i] == 0
+                n_gold += costs[i] <= 0
             else:
                 is_valid[i] = False
                 costs[i] = 9000
@@ -456,8 +456,14 @@ cdef class ArcEager(TransitionSystem):
                     "before training and after parsing. Either pass make_projective=True "
                     "to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
             else:
+                print(gold.words)
+                print(gold.heads)
+                print(gold.labels)
                 raise ValueError(
                     "Could not find a gold-standard action to supervise the dependency "
                     "parser.\n"
-                    "The GoldParse was projective.")
+                    "The GoldParse was projective.\n"
+                    "The transition system has %d actions.\n"
+                    "State at failure:\n"
+                    "%s" % (self.n_moves, stcls.print_state(gold.words)))
         assert n_gold >= 1

From 39341598bb9d267c83c4940e781bd5ac4ee4661c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 25 Nov 2016 09:02:22 -0600
Subject: [PATCH 07/14] Fix NER label calculation

---
 spacy/syntax/ner.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index 53eb1496d..a5db70b68 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -65,7 +65,7 @@ cdef class BiluoPushDown(TransitionSystem):
             for action in (BEGIN, IN, LAST, UNIT):
                 actions[action][entity_type] = True
         moves = ('M', 'B', 'I', 'L', 'U')
-        for raw_text, sents in kwargs.get('gold_tuples', []):
+        for raw_text, sents in kwargs.get('gold_parses', []):
             for (ids, words, tags, heads, labels, biluo), _ in sents:
                 for i, ner_tag in enumerate(biluo):
                     if ner_tag != 'O' and ner_tag != '-':
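For PATCH 07, the gold_parses keyword carries sentences whose final
annotation column holds BILUO tags such as 'B-ORG'. A rough sketch of the
label calculation it fixes (the function and names below are illustrative,
not spaCy's; the real code also registers the B/I/L/U actions per entity
type): skip 'O' and the '-' placeholder for missing annotation, then split
off the position prefix.

    def entity_types_from_biluo(biluo_tags):
        # Collect entity types from tags like 'B-ORG', ignoring
        # 'O' (outside) and '-' (no annotation available).
        types = set()
        for tag in biluo_tags:
            if tag is None or tag in ('O', '-'):
                continue
            types.add(tag.split('-', 1)[1])
        return types

    print(entity_types_from_biluo(['B-ORG', 'L-ORG', 'O', '-', 'U-GPE']))
    # {'ORG', 'GPE'} (set order may vary)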
From 664f2dd1c014ecf51895a8dab0ad5e909413007f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 25 Nov 2016 09:02:49 -0600
Subject: [PATCH 08/14] Allow dep to be None in scorer, for missing labels.

---
 spacy/scorer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/scorer.py b/spacy/scorer.py
index c668845e5..3f8d73e6a 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -87,7 +87,7 @@ class Scorer(object):
         gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
         for id_, word, tag, head, dep, ner in gold.orig_annot:
             gold_tags.add((id_, tag))
-            if dep.lower() not in punct_labels:
+            if dep is not None and dep.lower() not in punct_labels:
                 gold_deps.add((id_, head, dep.lower()))
         cand_deps = set()
         cand_tags = set()

From 6dd3b94fa66924c9e08b2235f8522432bb763651 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 25 Nov 2016 09:57:18 -0600
Subject: [PATCH 09/14] Filter out deprecated attributes when reading
 special-case tokenization rules.

---
 spacy/attrs.pyx | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index f5c94a5b9..ce4005324 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -116,6 +116,10 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
             stringy_attrs["TAG"] = stringy_attrs.pop("pos")
         if 'morph' in stringy_attrs:
             morphs = stringy_attrs.pop('morph')
+        if 'number' in stringy_attrs:
+            stringy_attrs.pop('number')
+        if 'tenspect' in stringy_attrs:
+            stringy_attrs.pop('tenspect')
         # for name, value in morphs.items():
         #     stringy_attrs[name] = value
     for name, value in stringy_attrs.items():
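The filtering PATCH 09 adds to intify_attrs, reduced to its essentials
(strip_deprecated is an illustrative name): deprecated per-token keys found
in older tokenizer exception files are silently dropped instead of being
passed through to the attribute intifier.

    DEPRECATED_KEYS = ('number', 'tenspect')

    def strip_deprecated(stringy_attrs):
        # pop with a default is a no-op when the key is absent.
        for key in DEPRECATED_KEYS:
            stringy_attrs.pop(key, None)
        return stringy_attrs

    print(strip_deprecated({'F': 'wanna', 'number': 1, 'tenspect': 0}))
    # {'F': 'wanna'}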
From da5f0cce36025f1e3f8c877815c399ad30bc5348 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 25 Nov 2016 11:19:33 -0600
Subject: [PATCH 10/14] Fix train_ud script, which trains models from the
 Universal Dependencies format.

---
 bin/parser/train_ud.py | 150 +++++++++++++----------------------------
 1 file changed, 46 insertions(+), 104 deletions(-)

diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py
index 213591804..f02aa2a73 100644
--- a/bin/parser/train_ud.py
+++ b/bin/parser/train_ud.py
@@ -6,103 +6,24 @@ import os
 import random
 import io
 
-from spacy.syntax.util import Config
+from spacy.tokens import Doc
+from spacy.syntax.nonproj import PseudoProjectivity
+from spacy.language import Language
 from spacy.gold import GoldParse
-from spacy.tokenizer import Tokenizer
 from spacy.vocab import Vocab
 from spacy.tagger import Tagger
-from spacy.syntax.parser import Parser
-from spacy.syntax.arc_eager import ArcEager
+from spacy.pipeline import DependencyParser
 from spacy.syntax.parser import get_templates
+from spacy.syntax.arc_eager import ArcEager
 from spacy.scorer import Scorer
 import spacy.attrs
 
-from spacy.language import Language
-
-from spacy.tagger import W_orth
-
-TAGGER_TEMPLATES = (
-    (W_orth,),
-)
-
 try:
     from codecs import open
 except ImportError:
     pass
 
-class TreebankParser(object):
-    @staticmethod
-    def setup_model_dir(model_dir, labels, templates, feat_set='basic', seed=0):
-        dep_model_dir = path.join(model_dir, 'deps')
-        pos_model_dir = path.join(model_dir, 'pos')
-        if path.exists(dep_model_dir):
-            shutil.rmtree(dep_model_dir)
-        if path.exists(pos_model_dir):
-            shutil.rmtree(pos_model_dir)
-        os.mkdir(dep_model_dir)
-        os.mkdir(pos_model_dir)
-
-        Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
-                     labels=labels)
-
-    @classmethod
-    def from_dir(cls, tag_map, model_dir):
-        vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
-        vocab.get_lex_attr[spacy.attrs.LANG] = lambda _: 0
-        tokenizer = Tokenizer(vocab, {}, None, None, None)
-        tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)
-
-        cfg = Config.read(path.join(model_dir, 'deps'), 'config')
-        parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
-        return cls(vocab, tokenizer, tagger, parser)
-
-    def __init__(self, vocab, tokenizer, tagger, parser):
-        self.vocab = vocab
-        self.tokenizer = tokenizer
-        self.tagger = tagger
-        self.parser = parser
-
-    def train(self, words, tags, heads, deps):
-        tokens = self.tokenizer.tokens_from_list(list(words))
-        self.tagger.train(tokens, tags)
-
-        tokens = self.tokenizer.tokens_from_list(list(words))
-        ids = range(len(words))
-        ner = ['O'] * len(words)
-        gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)),
-                         make_projective=False)
-        self.tagger(tokens)
-        if gold.is_projective:
-            try:
-                self.parser.train(tokens, gold)
-            except:
-                for id_, word, head, dep in zip(ids, words, heads, deps):
-                    print(id_, word, head, dep)
-                raise
-
-    def __call__(self, words, tags=None):
-        tokens = self.tokenizer.tokens_from_list(list(words))
-        if tags is None:
-            self.tagger(tokens)
-        else:
-            self.tagger.tag_from_strings(tokens, tags)
-        self.parser(tokens)
-        return tokens
-
-    def end_training(self, data_dir):
-        self.parser.model.end_training()
-        self.parser.model.dump(path.join(data_dir, 'deps', 'model'))
-        self.tagger.model.end_training()
-        self.tagger.model.dump(path.join(data_dir, 'pos', 'model'))
-        strings_loc = path.join(data_dir, 'vocab', 'strings.json')
-        with io.open(strings_loc, 'w', encoding='utf8') as file_:
-            self.vocab.strings.dump(file_)
-        self.vocab.dump(path.join(data_dir, 'vocab', 'lexemes.bin'))
-
-
-
 def read_conllx(loc):
     with open(loc, 'r', 'utf8') as file_:
         text = file_.read()
@@ -113,24 +34,30 @@ def read_conllx(loc):
             lines.pop(0)
         tokens = []
         for line in lines:
-            id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
+            id_, word, lemma, tag, pos, morph, head, dep, _1, _2 = line.split()
             if '-' in id_:
                 continue
-            id_ = int(id_) - 1
-            head = (int(head) - 1) if head != '0' else id_
-            dep = 'ROOT' if dep == 'root' else dep
-            tokens.append((id_, word, tag, head, dep, 'O'))
-        tuples = zip(*tokens)
-        yield (None, [(tuples, [])])
+            try:
+                id_ = int(id_) - 1
+                head = (int(head) - 1) if head != '0' else id_
+                dep = 'ROOT' if dep == 'root' else dep
+                tokens.append((id_, word, tag, head, dep, 'O'))
+            except:
+                print(line)
+                raise
+        tuples = [list(t) for t in zip(*tokens)]
+        yield (None, [[tuples, []]])
 
 
-def score_model(nlp, gold_docs, verbose=False):
+def score_model(vocab, tagger, parser, gold_docs, verbose=False):
     scorer = Scorer()
     for _, gold_doc in gold_docs:
-        for annot_tuples, _ in gold_doc:
-            tokens = nlp(list(annot_tuples[1]), tags=list(annot_tuples[2]))
-            gold = GoldParse(tokens, annot_tuples)
-            scorer.score(tokens, gold, verbose=verbose)
+        for (ids, words, tags, heads, deps, entities), _ in gold_doc:
+            doc = Doc(vocab, words=words)
+            tagger(doc)
+            parser(doc)
+            gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
+            scorer.score(doc, gold, verbose=verbose)
     return scorer
 
 
@@ -138,22 +65,37 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc):
     with open(tag_map_loc) as file_:
         tag_map = json.loads(file_.read())
     train_sents = list(read_conllx(train_loc))
-    labels = ArcEager.get_labels(train_sents)
-    templates = get_templates('basic')
+    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
+    actions = ArcEager.get_actions(gold_parses=train_sents)
+    features = get_templates('basic')
 
-    TreebankParser.setup_model_dir(model_dir, labels, templates)
+    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
+    # Populate vocab
+    for _, doc_sents in train_sents:
+        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
+            for word in words:
+                _ = vocab[word]
+            for tag in tags:
+                assert tag in tag_map, repr(tag)
+    print(tags)
+    tagger = Tagger(vocab, tag_map=tag_map)
+    parser = DependencyParser(vocab, actions=actions, features=features)
 
-    nlp = TreebankParser.from_dir(tag_map, model_dir)
-
     for itn in range(15):
         for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
-                nlp.train(words, tags, heads, deps)
+                doc = Doc(vocab, words=words)
+                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
+                tagger(doc)
+                parser.update(doc, gold)
+                doc = Doc(vocab, words=words)
+                tagger.update(doc, gold)
         random.shuffle(train_sents)
-        scorer = score_model(nlp, read_conllx(dev_loc))
+        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
         print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
+    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
     nlp.end_training(model_dir)
-    scorer = score_model(nlp, read_conllx(dev_loc))
+    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
     print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))

From bc0a202c9cf6ea7c3ceb80d3ac5d7e8b524705c5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 25 Nov 2016 17:29:17 -0600
Subject: [PATCH 11/14] Fix unicode problem in nonproj module

---
 spacy/syntax/nonproj.pyx | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx
index 28e6dc9e7..566588da4 100644
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
 from copy import copy
 
 from ..tokens.doc cimport Doc
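A cut-down, runnable version of the read_conllx logic after PATCH 10, for a
single row (parse_conllx_line is an illustrative helper, not part of the
script): each CoNLL-X row has ten whitespace-separated fields, multi-token
ranges (ids like '4-5') are skipped, ids and heads are rebased to 0-based
offsets with the root token attached to itself, and 'root' is normalised to
spaCy's 'ROOT'.

    def parse_conllx_line(line):
        id_, word, lemma, tag, pos, morph, head, dep, _1, _2 = line.split()
        if '-' in id_:
            return None  # skip multi-token range rows
        id_ = int(id_) - 1
        head = (int(head) - 1) if head != '0' else id_  # root attaches to itself
        dep = 'ROOT' if dep == 'root' else dep
        return (id_, word, tag, head, dep, 'O')

    print(parse_conllx_line('2 city city NOUN NN _ 0 root _ _'))
    # (1, 'city', 'NOUN', 1, 'ROOT', 'O')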
From 22189e60dbf92fe22ad49e296c4e41fc84b71f9d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 25 Nov 2016 17:45:45 -0600
Subject: [PATCH 12/14] Use unicode literals in train_ud

---
 bin/parser/train_ud.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py
index f02aa2a73..4efc063d7 100644
--- a/bin/parser/train_ud.py
+++ b/bin/parser/train_ud.py
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
 import plac
 import json
 from os import path
@@ -5,6 +6,7 @@ import shutil
 import os
 import random
 import io
+import pathlib
 
 from spacy.tokens import Doc
 from spacy.syntax.nonproj import PseudoProjectivity
 from spacy.language import Language
@@ -17,15 +19,12 @@ from spacy.syntax.parser import get_templates
 from spacy.syntax.arc_eager import ArcEager
 from spacy.scorer import Scorer
 import spacy.attrs
+import io
 
-try:
-    from codecs import open
-except ImportError:
-    pass
 
 
 def read_conllx(loc):
-    with open(loc, 'r', 'utf8') as file_:
+    with io.open(loc, 'r', encoding='utf8') as file_:
         text = file_.read()
     for sent in text.strip().split('\n\n'):
         lines = sent.strip().split('\n')
@@ -56,6 +55,7 @@ def score_model(vocab, tagger, parser, gold_docs, verbose=False):
             doc = Doc(vocab, words=words)
             tagger(doc)
             parser(doc)
+            PseudoProjectivity.deprojectivize(doc)
             gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
             scorer.score(doc, gold, verbose=verbose)
     return scorer
@@ -66,8 +66,13 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc):
         tag_map = json.loads(file_.read())
     train_sents = list(read_conllx(train_loc))
     train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
+
     actions = ArcEager.get_actions(gold_parses=train_sents)
     features = get_templates('basic')
+
+    model_dir = pathlib.Path(model_dir)
+    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
+        json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_)
 
     vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
     # Populate vocab
@@ -75,9 +80,12 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc):
         for (ids, words, tags, heads, deps, ner), _ in doc_sents:
             for word in words:
                 _ = vocab[word]
+            for dep in deps:
+                _ = vocab[dep]
+            for tag in tags:
+                _ = vocab[tag]
             for tag in tags:
                 assert tag in tag_map, repr(tag)
-    print(tags)
     tagger = Tagger(vocab, tag_map=tag_map)
     parser = DependencyParser(vocab, actions=actions, features=features)

From c7889492f9c5c9f06053a0606f96b34c7c19ac64 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 25 Nov 2016 18:04:30 -0600
Subject: [PATCH 13/14] Fix model saving error for Python 3

---
 bin/parser/train_ud.py |  2 +-
 spacy/language.py      | 13 +++++--------
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py
index 4efc063d7..62256cc14 100644
--- a/bin/parser/train_ud.py
+++ b/bin/parser/train_ud.py
@@ -71,7 +71,7 @@ def main(train_loc, dev_loc, model_dir, tag_map_loc):
     features = get_templates('basic')
 
     model_dir = pathlib.Path(model_dir)
-    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
+    with (model_dir / 'deps' / 'config.json').open('w') as file_:
         json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_)
 
diff --git a/spacy/language.py b/spacy/language.py
index a1aad54b7..99ba46889 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -5,10 +5,7 @@ import pathlib
 from contextlib import contextmanager
 import shutil
 
-try:
-    import ujson as json
-except ImportError:
-    import json
+import ujson as json
 
 
 try:
@@ -222,11 +219,11 @@ class Language(object):
             parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
             entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
 
-            with (dep_model_dir / 'config.json').open('wb') as file_:
+            with (dep_model_dir / 'config.json').open('w') as file_:
                 json.dump(parser_cfg, file_)
-            with (ner_model_dir / 'config.json').open('wb') as file_:
+            with (ner_model_dir / 'config.json').open('w') as file_:
                 json.dump(entity_cfg, file_)
-            with (pos_model_dir / 'config.json').open('wb') as file_:
+            with (pos_model_dir / 'config.json').open('w') as file_:
                 json.dump(tagger_cfg, file_)
 
             self = cls(
@@ -387,7 +384,7 @@ class Language(object):
         else:
             entity_iob_freqs = []
             entity_type_freqs = []
-        with (path / 'vocab' / 'serializer.json').open('wb') as file_:
+        with (path / 'vocab' / 'serializer.json').open('w') as file_:
             file_.write(
                 json.dumps([
                     (TAG, tagger_freqs),
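The reason PATCH 13 switches open('wb') to open('w'): on Python 3, json.dump
produces str, so writing into a binary-mode file raises TypeError. A sketch
of the failure mode using the stdlib json module and a throwaway directory
(the patched code uses ujson, but the text/binary distinction is the same
issue):

    import json
    import pathlib
    import tempfile

    model_dir = pathlib.Path(tempfile.mkdtemp())
    (model_dir / 'deps').mkdir()
    # Text mode works; open('wb') would raise TypeError on Python 3,
    # because json.dump writes str, not bytes.
    with (model_dir / 'deps' / 'config.json').open('w') as file_:
        json.dump({'pseudoprojective': True}, file_)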
From 8373115cbd696dfee9dce179795fba59371dd028 Mon Sep 17 00:00:00 2001
From: Jimi Smoot
Date: Fri, 25 Nov 2016 18:22:52 -0800
Subject: [PATCH 14/14] Minor typos

---
 website/docs/usage/entity-recognition.jade | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade
index d0e642759..1b65f3331 100644
--- a/website/docs/usage/entity-recognition.jade
+++ b/website/docs/usage/entity-recognition.jade
@@ -36,10 +36,10 @@ p
 
     +code("Example").
         doc = nlp(u'London is a big city in the United Kingdom.')
-        print(doc[0].text, doc[0].ent_iob, doc[0].ent_type_))
+        print(doc[0].text, doc[0].ent_iob, doc[0].ent_type_)
         # (u'London', 2, u'GPE')
-        print(doc[1].text, doc[1].ent_iob, doc[1].ent_type_))
-        (u'is', 3, u'')]
+        print(doc[1].text, doc[1].ent_iob, doc[1].ent_type_)
+        # (u'is', 3, u'')
 
 +h(2, "setting") Setting entity annotations
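The corrected snippet from PATCH 14, extended with the higher-level doc.ents
view (this assumes nlp is a loaded English model; the exact spans and labels
printed depend on the model's predictions):

    doc = nlp(u'London is a big city in the United Kingdom.')
    # Per-token view: IOB code plus entity type, as in the docs example.
    print(doc[0].text, doc[0].ent_iob, doc[0].ent_type_)
    # (u'London', 2, u'GPE')
    # Span view: iterate the recognised entities directly.
    for ent in doc.ents:
        print(ent.text, ent.label_)
    # e.g. London GPE, United Kingdom GPE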