Merge branch 'master' into rethinc2

Matthew Honnibal 2016-02-02 23:05:34 +01:00
commit fcfc17a164
8 changed files with 88 additions and 26 deletions

View File

@@ -5,7 +5,7 @@ from __future__ import unicode_literals
 import os
 from os import path
 import shutil
-import codecs
+import io
 import random
 import time
 import gzip
@@ -56,12 +56,20 @@ def _parse_line(line):
     if len(pieces) == 4:
         word, pos, head_idx, label = pieces
         head_idx = int(head_idx)
+    elif len(pieces) == 15:
+        id_ = int(pieces[0].split('_')[-1])
+        word = pieces[1]
+        pos = pieces[4]
+        head_idx = int(pieces[8])-1
+        label = pieces[10]
     else:
-        id_ = int(pieces[0])
+        id_ = int(pieces[0].split('_')[-1])
         word = pieces[1]
         pos = pieces[4]
         head_idx = int(pieces[6])-1
         label = pieces[7]
     if head_idx == 0:
         label = 'ROOT'
     return word, pos, head_idx, label
@@ -69,8 +77,8 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
     tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
     nlp.tagger(tokens)
     nlp.parser(tokens)
-    gold = GoldParse(tokens, annot_tuples)
-    scorer.score(tokens, gold, verbose=verbose)
+    gold = GoldParse(tokens, annot_tuples, make_projective=False)
+    scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))


 def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
@@ -121,12 +129,20 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0
     print('done')


-def main(train_loc, dev_loc, model_dir):
-    with codecs.open(train_loc, 'r', 'utf8') as file_:
+@plac.annotations(
+    train_loc=("Location of CoNLL 09 formatted training file"),
+    dev_loc=("Location of CoNLL 09 formatted development file"),
+    model_dir=("Location of output model directory"),
+    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
+    n_iter=("Number of training iterations", "option", "i", int),
+)
+def main(train_loc, dev_loc, model_dir, n_iter=15, eval_only=False):
+    with io.open(train_loc, 'r', encoding='utf8') as file_:
         train_sents = read_conll(file_)
-    train(English, train_sents, model_dir)
+    if not eval_only:
+        train(English, train_sents, model_dir, n_iter=n_iter)
     nlp = English(data_dir=model_dir)
-    dev_sents = read_conll(open(dev_loc))
+    dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
     scorer = Scorer()
     for _, sents in dev_sents:
         for annot_tuples, _ in sents:
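
The plac decorator above replaces the bare main(). A minimal, self-contained sketch of how those annotations surface on the command line (script name and argument values are hypothetical, not part of this commit):

    # Minimal sketch of the plac pattern used above; not the spaCy script itself.
    import plac

    @plac.annotations(
        train_loc=("Location of CoNLL 09 formatted training file"),
        eval_only=("Skip training, and only evaluate", "flag", "e", bool),
        n_iter=("Number of training iterations", "option", "i", int),
    )
    def main(train_loc, eval_only=False, n_iter=15):
        # plac maps positional parameters to positional arguments,
        # "flag" entries to switches (-e) and "option" entries to
        # valued options (-i 20), e.g.: python demo.py train.conll -e -i 20
        print(train_loc, eval_only, n_iter)

    if __name__ == '__main__':
        plac.call(main)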

View File

@@ -62,8 +62,10 @@ def main(output_dir):
         tokens = tokenizer.tokens_from_list(words)
         tagger.train(tokens, tags)
         random.shuffle(DATA)
-    tagger.model.end_training(path.join(output_dir, 'pos', 'model'))
-    vocab.strings.dump(path.join(output_dir, 'vocab', 'strings.txt'))
+    tagger.model.end_training()
+    tagger.model.dump(path.join(output_dir, 'pos', 'model'))
+    with io.open(path.join(output_dir, 'vocab', 'strings.json'), 'w', encoding='utf8') as file_:
+        tagger.vocab.strings.dump(file_)


 if __name__ == '__main__':
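
Since the string table is now dumped as JSON rather than the old strings.txt, it can be inspected with the standard library alone. A hedged sketch (the model path is illustrative, and the top-level JSON type depends on the StringStore version):

    # Inspect the dumped string table; the path is illustrative.
    import io
    import json
    from os import path

    strings_loc = path.join('/tmp/model', 'vocab', 'strings.json')
    with io.open(strings_loc, 'r', encoding='utf8') as file_:
        strings = json.load(file_)
    print(type(strings).__name__, len(strings))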

View File

@@ -0,0 +1 @@
+{}

View File

@@ -18,8 +18,10 @@
     "KOUI": {"pos": "SCONJ"},
     "KOUS": {"pos": "SCONJ"},
     "NE": {"pos": "PROPN"},
+    "NNE": {"pos": "PROPN"},
     "NN": {"pos": "NOUN"},
-    "PAV": {"pos": "ADV", "PronType": "Dem"},
+    "PROAV": {"pos": "ADV", "PronType": "Dem"},
     "PDAT": {"pos": "DET", "PronType": "Dem"},
     "PDS": {"pos": "PRON", "PronType": "Dem"},
     "PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
@@ -52,5 +54,6 @@
     "VVINF": {"pos": "VERB", "VerbForm": "Inf"},
     "VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
     "VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
-    "XY": {"pos": "X"}
+    "XY": {"pos": "X"},
+    "SP": {"pos": "SPACE"}
 }
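
For reference, this is how a tag_map.json of the shape above reduces fine-grained STTS/TIGER tags to coarse universal POS tags. A standalone sketch with the new entries inlined:

    # Standalone sketch of the fine-to-coarse mapping; entries copied from
    # the tag map above.
    tag_map = {
        "NNE": {"pos": "PROPN"},
        "PROAV": {"pos": "ADV", "PronType": "Dem"},
        "SP": {"pos": "SPACE"},
    }

    for fine, props in tag_map.items():
        # Everything except "pos" is treated as a morphological feature.
        feats = {k: v for k, v in props.items() if k != "pos"}
        print(fine, "->", props["pos"], feats)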

View File

@@ -283,22 +283,43 @@ class Language(object):
     def end_training(self, data_dir=None):
         if data_dir is None:
             data_dir = self.data_dir
-        self.parser.model.end_training()
-        self.parser.model.dump(path.join(data_dir, 'deps', 'model'))
-        self.entity.model.end_training()
-        self.entity.model.dump(path.join(data_dir, 'ner', 'model'))
-        self.tagger.model.end_training()
-        self.tagger.model.dump(path.join(data_dir, 'pos', 'model'))
+        if self.parser:
+            self.parser.model.end_training()
+            self.parser.model.dump(path.join(data_dir, 'deps', 'model'))
+        if self.entity:
+            self.entity.model.end_training()
+            self.entity.model.dump(path.join(data_dir, 'ner', 'model'))
+        if self.tagger:
+            self.tagger.model.end_training()
+            self.tagger.model.dump(path.join(data_dir, 'pos', 'model'))
         strings_loc = path.join(data_dir, 'vocab', 'strings.json')
         with io.open(strings_loc, 'w', encoding='utf8') as file_:
             self.vocab.strings.dump(file_)
         self.vocab.dump(path.join(data_dir, 'vocab', 'lexemes.bin'))

+        if self.tagger:
+            tagger_freqs = list(self.tagger.freqs[TAG].items())
+        else:
+            tagger_freqs = []
+        if self.parser:
+            dep_freqs = list(self.parser.moves.freqs[DEP].items())
+            head_freqs = list(self.parser.moves.freqs[HEAD].items())
+        else:
+            dep_freqs = []
+            head_freqs = []
+        if self.entity:
+            entity_iob_freqs = list(self.entity.moves.freqs[ENT_IOB].items())
+            entity_type_freqs = list(self.entity.moves.freqs[ENT_TYPE].items())
+        else:
+            entity_iob_freqs = []
+            entity_type_freqs = []
         with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
             file_.write(
                 json.dumps([
-                    (TAG, list(self.tagger.freqs[TAG].items())),
-                    (DEP, list(self.parser.moves.freqs[DEP].items())),
-                    (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
-                    (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
-                    (HEAD, list(self.parser.moves.freqs[HEAD].items()))]))
+                    (TAG, tagger_freqs),
+                    (DEP, dep_freqs),
+                    (ENT_IOB, entity_iob_freqs),
+                    (ENT_TYPE, entity_type_freqs),
+                    (HEAD, head_freqs)
+                ]))
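
The guards above let end_training() run when a component was never loaded; serializer.json keeps all five entries either way, with empty frequency lists for missing components. A self-contained sketch of the resulting layout, with dummy data standing in for the real models (in spaCy these keys are integer attribute IDs, not strings):

    # Dummy data illustrating the serializer.json layout written above.
    import json
    from collections import Counter

    # Stand-in keys; the real code uses spaCy's integer attribute IDs.
    TAG, DEP, ENT_IOB, ENT_TYPE, HEAD = 'TAG', 'DEP', 'ENT_IOB', 'ENT_TYPE', 'HEAD'

    tagger_freqs = list(Counter({'NN': 120, 'NE': 30}).items())  # tagger present
    dep_freqs = []          # parser disabled -> empty list, as in the guard above
    head_freqs = []
    entity_iob_freqs = []   # entity recogniser disabled
    entity_type_freqs = []

    print(json.dumps([
        (TAG, tagger_freqs),
        (DEP, dep_freqs),
        (ENT_IOB, entity_iob_freqs),
        (ENT_TYPE, entity_type_freqs),
        (HEAD, head_freqs),
    ]))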

View File

@@ -70,7 +70,7 @@ class Scorer(object):
     def ents_f(self):
         return self.ner.fscore * 100

-    def score(self, tokens, gold, verbose=False):
+    def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
         assert len(tokens) == len(gold)

         gold_deps = set()
@@ -78,7 +78,7 @@ class Scorer(object):
         gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
         for id_, word, tag, head, dep, ner in gold.orig_annot:
             gold_tags.add((id_, tag))
-            if dep.lower() not in ('p', 'punct'):
+            if dep.lower() not in punct_labels:
                 gold_deps.add((id_, head, dep.lower()))
         cand_deps = set()
         cand_tags = set()
@@ -87,12 +87,12 @@ class Scorer(object):
                 continue
             gold_i = gold.cand_to_gold[token.i]
             if gold_i is None:
-                if token.dep_.lower() not in ('p', 'punct'):
+                if token.dep_.lower() not in punct_labels:
                     self.tokens.fp += 1
             else:
                 self.tokens.tp += 1
             cand_tags.add((gold_i, token.tag_))
-            if token.dep_.lower() not in ('p', 'punct') and token.orth_.strip():
+            if token.dep_.lower() not in punct_labels and token.orth_.strip():
                 gold_head = gold.cand_to_gold[token.head.i]
                 # None is indistinct, so we can't just add it to the set
                 # Multiple (None, None) deps are possible
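
With punct_labels exposed, callers can match their treebank's punctuation convention instead of the hard-coded ('p', 'punct'). A hedged usage sketch (scorer, tokens and gold are assumed to come from score_model() in the first file):

    # Hedged usage sketch for the new punct_labels parameter.
    def evaluate_german(scorer, tokens, gold):
        # TIGER marks punctuation with '--', so German evaluation extends
        # the default ('p', 'punct') labels, as in score_model() above.
        scorer.score(tokens, gold, punct_labels=('--', 'p', 'punct'))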

View File

@@ -215,6 +215,11 @@ cdef class Tagger:
     def train(self, Doc tokens, object gold_tag_strs):
         assert len(tokens) == len(gold_tag_strs)
+        for tag in gold_tag_strs:
+            if tag not in self.tag_names:
+                msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "
+                       "gold tags, to maintain coarse-grained mapping.")
+                raise ValueError(msg % tag)
         golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
         cdef int correct = 0
         cdef Pool mem = Pool()
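
Training scripts can now detect a mismatch between the gold tags and tag_map.json early. A hedged sketch of a caller reacting to the new ValueError (tagger, tokens and tags are assumed to come from a training loop like the one in the second file):

    # Hedged sketch: fail fast, or skip, when a gold tag is missing
    # from tag_map.json.
    def train_one(tagger, tokens, tags):
        try:
            tagger.train(tokens, tags)
        except ValueError as e:
            # Raised by Tagger.train() above for unrecognized gold tags.
            print('Skipping sentence: %s' % e)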

View File

@@ -145,6 +145,9 @@ mixin LexemeDistributional
   +Define("vector")
     | A “word embedding” representation: a dense real-valued vector that supports similarity queries between words. By default, spaCy currently loads vectors produced by the Levy and Goldberg (2014) dependency-based word2vec model.

+  +Define("has_vector")
+    | A boolean value indicating whether a word vector is associated with the word.
+
 mixin Func(type1, type2)
   #{"λ " + type1 + ", " + type2}
@@ -373,6 +376,17 @@ mixin Func(type1, type2)
   +Define("whitespace_")
     | The whitespace following the word in the original string, if any.

+details(open=true)
+  summary: h4 Part-of-Speech Tags
+  ul
+    +Define("pos / pos_")
+      | A coarse-grained, less detailed tag that represents the word-class of the token. The set of #[code .pos] tags is consistent across languages. The available tags are ADJ, ADP, ADV, AUX, CONJ, DET, INTJ, NOUN, NUM, PART, PRON, PROPN, PUNCT, SCONJ, SYM, VERB, X, EOL, SPACE.
+  ul
+    +Define("tag / tag_")
+      | A fine-grained, more detailed tag that represents the word-class and some basic morphological information for the token. These tags are primarily designed to be good features for subsequent models, particularly the syntactic parser. They are language and treebank dependent. The tagger is trained to predict these fine-grained tags, and then a mapping table is used to reduce them to the coarse-grained #[code .pos] tags.
+
 details(open=true)
   summary: h4 Navigating the Parse Tree
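
To make the pos_/tag_ distinction documented above concrete, a hedged sketch against the English pipeline of the era (API details as of spaCy 0.x; assumes the English model data is installed):

    # Hedged sketch of coarse vs. fine tags.
    from spacy.en import English

    nlp = English()
    tokens = nlp(u'Give it back, he pleaded.')
    for token in tokens:
        # pos_ is the coarse universal tag; tag_ is the fine treebank tag.
        print(token.orth_, token.pos_, token.tag_)   # e.g. Give VERB VB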