Merge branch 'master' into rethinc2

Matthew Honnibal 2016-02-02 23:05:34 +01:00
commit fcfc17a164
8 changed files with 88 additions and 26 deletions

View File

@@ -5,7 +5,7 @@ from __future__ import unicode_literals
 import os
 from os import path
 import shutil
-import codecs
+import io
 import random
 import time
 import gzip
@@ -56,12 +56,20 @@ def _parse_line(line):
     if len(pieces) == 4:
         word, pos, head_idx, label = pieces
         head_idx = int(head_idx)
+    elif len(pieces) == 15:
+        id_ = int(pieces[0].split('_')[-1])
+        word = pieces[1]
+        pos = pieces[4]
+        head_idx = int(pieces[8])-1
+        label = pieces[10]
     else:
-        id_ = int(pieces[0])
+        id_ = int(pieces[0].split('_')[-1])
         word = pieces[1]
         pos = pieces[4]
         head_idx = int(pieces[6])-1
         label = pieces[7]
     if head_idx == 0:
         label = 'ROOT'
     return word, pos, head_idx, label
@@ -69,8 +77,8 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
     tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
     nlp.tagger(tokens)
     nlp.parser(tokens)
-    gold = GoldParse(tokens, annot_tuples)
-    scorer.score(tokens, gold, verbose=verbose)
+    gold = GoldParse(tokens, annot_tuples, make_projective=False)
+    scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))


 def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
@@ -121,12 +129,20 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0
     print('done')


-def main(train_loc, dev_loc, model_dir):
-    with codecs.open(train_loc, 'r', 'utf8') as file_:
+@plac.annotations(
+    train_loc=("Location of CoNLL 09 formatted training file"),
+    dev_loc=("Location of CoNLL 09 formatted development file"),
+    model_dir=("Location of output model directory"),
+    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
+    n_iter=("Number of training iterations", "option", "i", int),
+)
+def main(train_loc, dev_loc, model_dir, n_iter=15, eval_only=False):
+    with io.open(train_loc, 'r', encoding='utf8') as file_:
         train_sents = read_conll(file_)
-    train(English, train_sents, model_dir)
+    if not eval_only:
+        train(English, train_sents, model_dir, n_iter=n_iter)
     nlp = English(data_dir=model_dir)
-    dev_sents = read_conll(open(dev_loc))
+    dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
     scorer = Scorer()
     for _, sents in dev_sents:
         for annot_tuples, _ in sents:
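
The plac decorator above replaces the bare main(). A minimal, self-contained sketch of how those annotations surface on the command line (script name and argument values are hypothetical, not part of this commit):

    # Minimal sketch of the plac pattern used above; not the spaCy script itself.
    import plac

    @plac.annotations(
        train_loc=("Location of CoNLL 09 formatted training file"),
        eval_only=("Skip training, and only evaluate", "flag", "e", bool),
        n_iter=("Number of training iterations", "option", "i", int),
    )
    def main(train_loc, eval_only=False, n_iter=15):
        # plac maps positional parameters to positional arguments,
        # "flag" entries to switches (-e) and "option" entries to
        # valued options (-i 20), e.g.: python demo.py train.conll -e -i 20
        print(train_loc, eval_only, n_iter)

    if __name__ == '__main__':
        plac.call(main)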

View File

@@ -62,8 +62,10 @@ def main(output_dir):
         tokens = tokenizer.tokens_from_list(words)
         tagger.train(tokens, tags)
         random.shuffle(DATA)
-    tagger.model.end_training(path.join(output_dir, 'pos', 'model'))
-    vocab.strings.dump(path.join(output_dir, 'vocab', 'strings.txt'))
+    tagger.model.end_training()
+    tagger.model.dump(path.join(output_dir, 'pos', 'model'))
+    with io.open(path.join(output_dir, 'vocab', 'strings.json'), 'w', encoding='utf8') as file_:
+        tagger.vocab.strings.dump(file_)


 if __name__ == '__main__':
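
Since the string table is now dumped as JSON rather than the old strings.txt, it can be inspected with the standard library alone. A hedged sketch (the model path is illustrative, and the top-level JSON type depends on the StringStore version):

    # Inspect the dumped string table; the path is illustrative.
    import io
    import json
    from os import path

    strings_loc = path.join('/tmp/model', 'vocab', 'strings.json')
    with io.open(strings_loc, 'r', encoding='utf8') as file_:
        strings = json.load(file_)
    print(type(strings).__name__, len(strings))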

View File

@@ -0,0 +1 @@
+{}

View File

@@ -18,8 +18,10 @@
     "KOUI": {"pos": "SCONJ"},
     "KOUS": {"pos": "SCONJ"},
     "NE": {"pos": "PROPN"},
+    "NNE": {"pos": "PROPN"},
     "NN": {"pos": "NOUN"},
-    "PAV": {"pos": "ADV", "PronType": "Dem"},
+    "PROAV": {"pos": "ADV", "PronType": "Dem"},
     "PDAT": {"pos": "DET", "PronType": "Dem"},
     "PDS": {"pos": "PRON", "PronType": "Dem"},
     "PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
@@ -52,5 +54,6 @@
     "VVINF": {"pos": "VERB", "VerbForm": "Inf"},
     "VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
     "VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
-    "XY": {"pos": "X"}
+    "XY": {"pos": "X"},
+    "SP": {"pos": "SPACE"}
 }
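
For reference, this is how a tag_map.json of the shape above reduces fine-grained STTS/TIGER tags to coarse universal POS tags. A standalone sketch with the new entries inlined:

    # Standalone sketch of the fine-to-coarse mapping; entries copied from
    # the tag map above.
    tag_map = {
        "NNE": {"pos": "PROPN"},
        "PROAV": {"pos": "ADV", "PronType": "Dem"},
        "SP": {"pos": "SPACE"},
    }

    for fine, props in tag_map.items():
        # Everything except "pos" is treated as a morphological feature.
        feats = {k: v for k, v in props.items() if k != "pos"}
        print(fine, "->", props["pos"], feats)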

View File

@@ -283,22 +283,43 @@ class Language(object):
     def end_training(self, data_dir=None):
         if data_dir is None:
             data_dir = self.data_dir
-        self.parser.model.end_training()
-        self.parser.model.dump(path.join(data_dir, 'deps', 'model'))
-        self.entity.model.end_training()
-        self.entity.model.dump(path.join(data_dir, 'ner', 'model'))
-        self.tagger.model.end_training()
-        self.tagger.model.dump(path.join(data_dir, 'pos', 'model'))
+        if self.parser:
+            self.parser.model.end_training()
+            self.parser.model.dump(path.join(data_dir, 'deps', 'model'))
+        if self.entity:
+            self.entity.model.end_training()
+            self.entity.model.dump(path.join(data_dir, 'ner', 'model'))
+        if self.tagger:
+            self.tagger.model.end_training()
+            self.tagger.model.dump(path.join(data_dir, 'pos', 'model'))
         strings_loc = path.join(data_dir, 'vocab', 'strings.json')
         with io.open(strings_loc, 'w', encoding='utf8') as file_:
             self.vocab.strings.dump(file_)
         self.vocab.dump(path.join(data_dir, 'vocab', 'lexemes.bin'))

+        if self.tagger:
+            tagger_freqs = list(self.tagger.freqs[TAG].items())
+        else:
+            tagger_freqs = []
+        if self.parser:
+            dep_freqs = list(self.parser.moves.freqs[DEP].items())
+            head_freqs = list(self.parser.moves.freqs[HEAD].items())
+        else:
+            dep_freqs = []
+            head_freqs = []
+        if self.entity:
+            entity_iob_freqs = list(self.entity.moves.freqs[ENT_IOB].items())
+            entity_type_freqs = list(self.entity.moves.freqs[ENT_TYPE].items())
+        else:
+            entity_iob_freqs = []
+            entity_type_freqs = []
         with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
             file_.write(
                 json.dumps([
-                    (TAG, list(self.tagger.freqs[TAG].items())),
-                    (DEP, list(self.parser.moves.freqs[DEP].items())),
-                    (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
-                    (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
-                    (HEAD, list(self.parser.moves.freqs[HEAD].items()))]))
+                    (TAG, tagger_freqs),
+                    (DEP, dep_freqs),
+                    (ENT_IOB, entity_iob_freqs),
+                    (ENT_TYPE, entity_type_freqs),
+                    (HEAD, head_freqs)
+                ]))
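
The guards above let end_training() run when a component was never loaded; serializer.json keeps all five entries either way, with empty frequency lists for missing components. A self-contained sketch of the resulting layout, with dummy data standing in for the real models (in spaCy these keys are integer attribute IDs, not strings):

    # Dummy data illustrating the serializer.json layout written above.
    import json
    from collections import Counter

    # Stand-in keys; the real code uses spaCy's integer attribute IDs.
    TAG, DEP, ENT_IOB, ENT_TYPE, HEAD = 'TAG', 'DEP', 'ENT_IOB', 'ENT_TYPE', 'HEAD'

    tagger_freqs = list(Counter({'NN': 120, 'NE': 30}).items())  # tagger present
    dep_freqs = []          # parser disabled -> empty list, as in the guard above
    head_freqs = []
    entity_iob_freqs = []   # entity recogniser disabled
    entity_type_freqs = []

    print(json.dumps([
        (TAG, tagger_freqs),
        (DEP, dep_freqs),
        (ENT_IOB, entity_iob_freqs),
        (ENT_TYPE, entity_type_freqs),
        (HEAD, head_freqs),
    ]))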

View File

@@ -70,7 +70,7 @@ class Scorer(object):
     def ents_f(self):
         return self.ner.fscore * 100

-    def score(self, tokens, gold, verbose=False):
+    def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
         assert len(tokens) == len(gold)

         gold_deps = set()
@@ -78,7 +78,7 @@ class Scorer(object):
         gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
         for id_, word, tag, head, dep, ner in gold.orig_annot:
             gold_tags.add((id_, tag))
-            if dep.lower() not in ('p', 'punct'):
+            if dep.lower() not in punct_labels:
                 gold_deps.add((id_, head, dep.lower()))
         cand_deps = set()
         cand_tags = set()
@@ -87,12 +87,12 @@ class Scorer(object):
                 continue
             gold_i = gold.cand_to_gold[token.i]
             if gold_i is None:
-                if token.dep_.lower() not in ('p', 'punct'):
+                if token.dep_.lower() not in punct_labels:
                     self.tokens.fp += 1
             else:
                 self.tokens.tp += 1
             cand_tags.add((gold_i, token.tag_))
-            if token.dep_.lower() not in ('p', 'punct') and token.orth_.strip():
+            if token.dep_.lower() not in punct_labels and token.orth_.strip():
                 gold_head = gold.cand_to_gold[token.head.i]
                 # None is indistinct, so we can't just add it to the set
                 # Multiple (None, None) deps are possible
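
With punct_labels exposed, callers can match their treebank's punctuation convention instead of the hard-coded ('p', 'punct'). A hedged usage sketch (scorer, tokens and gold are assumed to come from score_model() in the first file):

    # Hedged usage sketch for the new punct_labels parameter.
    def evaluate_german(scorer, tokens, gold):
        # TIGER marks punctuation with '--', so German evaluation extends
        # the default ('p', 'punct') labels, as in score_model() above.
        scorer.score(tokens, gold, punct_labels=('--', 'p', 'punct'))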

View File

@@ -215,6 +215,11 @@ cdef class Tagger:
     def train(self, Doc tokens, object gold_tag_strs):
         assert len(tokens) == len(gold_tag_strs)
+        for tag in gold_tag_strs:
+            if tag not in self.tag_names:
+                msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "
+                       "gold tags, to maintain coarse-grained mapping.")
+                raise ValueError(msg % tag)
         golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
         cdef int correct = 0
         cdef Pool mem = Pool()
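
Training scripts can now detect a mismatch between the gold tags and tag_map.json early. A hedged sketch of a caller reacting to the new ValueError (tagger, tokens and tags are assumed to come from a training loop like the one in the second file):

    # Hedged sketch: fail fast, or skip, when a gold tag is missing
    # from tag_map.json.
    def train_one(tagger, tokens, tags):
        try:
            tagger.train(tokens, tags)
        except ValueError as e:
            # Raised by Tagger.train() above for unrecognized gold tags.
            print('Skipping sentence: %s' % e)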

View File

@@ -145,6 +145,9 @@ mixin LexemeDistributional
   +Define("vector")
     | A “word embedding” representation: a dense real-valued vector that supports similarity queries between words. By default, spaCy currently loads vectors produced by the Levy and Goldberg (2014) dependency-based word2vec model.

+  +Define("has_vector")
+    | A boolean value indicating whether a word vector is associated with the word.
+
 mixin Func(type1, type2)
   #{"λ " + type1 + ", " + type2}
@@ -373,6 +376,17 @@ mixin Func(type1, type2)
   +Define("whitespace_")
     | The whitespace following the word in the original string, if any.

+details(open=true)
+  summary: h4 Part-of-Speech Tags
+  ul
+    +Define("pos / pos_")
+      | A coarse-grained, less detailed tag that represents the word-class of the token. The set of #[code .pos] tags is consistent across languages. The available tags are ADJ, ADP, ADV, AUX, CONJ, DET, INTJ, NOUN, NUM, PART, PRON, PROPN, PUNCT, SCONJ, SYM, VERB, X, EOL, SPACE.
+  ul
+    +Define("tag / tag_")
+      | A fine-grained, more detailed tag that represents the word-class and some basic morphological information for the token. These tags are primarily designed to be good features for subsequent models, particularly the syntactic parser. They are language and treebank dependent. The tagger is trained to predict these fine-grained tags, and then a mapping table is used to reduce them to the coarse-grained #[code .pos] tags.
+
 details(open=true)
   summary: h4 Navigating the Parse Tree
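
To make the pos_/tag_ distinction documented above concrete, a hedged sketch against the English pipeline of the era (API details as of spaCy 0.x; assumes the English model data is installed):

    # Hedged sketch of coarse vs. fine tags.
    from spacy.en import English

    nlp = English()
    tokens = nlp(u'Give it back, he pleaded.')
    for token in tokens:
        # pos_ is the coarse universal tag; tag_ is the fine treebank tag.
        print(token.orth_, token.pos_, token.tag_)   # e.g. Give VERB VB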