Update matcher branch with v2.0.8 master

2025-08-08 22:24:55 +03:00 · 2018-02-18 13:42:58 +01:00 · 2018-02-18 13:42:58 +01:00 · 3d7285870b
commit 3d7285870b
parent f7dc64d2a3 1b3c98e01b
8 changed files with 44 additions and 13 deletions
--- a/spacy/about.py
+++ b/spacy/about.py
@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy'
-__version__ = '2.0.7'
+__version__ = '2.0.8'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -7,6 +7,7 @@ import tqdm
 from thinc.neural._classes.model import Model
 from timeit import default_timer as timer

+from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus, minibatch
 from ..util import prints
 from .. import util
@ -29,11 +30,14 @@ from ..compat import json_dumps
    no_tagger=("Don't train tagger", "flag", "T", bool),
    no_parser=("Don't train parser", "flag", "P", bool),
    no_entities=("Don't train NER", "flag", "N", bool),
+    parser_multitasks=("Side objectives for parser CNN, e.g. dep dep,tag", "option", "pt", str),
+    entity_multitasks=("Side objectives for ner CNN, e.g. dep dep,tag", "option", "et", str),
    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
    version=("Model version", "option", "V", str),
    meta_path=("Optional path to meta.json. All relevant properties will be "
               "overwritten.", "option", "m", Path))
 def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
+         parser_multitasks='', entity_multitasks='',
          use_gpu=-1, vectors=None, no_tagger=False,
          no_parser=False, no_entities=False, gold_preproc=False,
          version="0.0.0", meta_path=None):
@ -90,8 +94,23 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
    nlp.meta.update(meta)
    if vectors:
        util.load_model(vectors, vocab=nlp.vocab)
+        for lex in nlp.vocab:
+            values = {}
+            for attr, func in nlp.vocab.lex_attr_getters.items():
+                # These attrs are expected to be set by data. Others should
+                # be set by calling the language functions.
+                if attr not in (CLUSTER, PROB, IS_OOV, LANG):
+                    values[lex.vocab.strings[attr]] = func(lex.orth_)
+            lex.set_attrs(**values)
+            lex.is_oov = False
    for name in pipeline:
        nlp.add_pipe(nlp.create_pipe(name), name=name)
+    if parser_multitasks:
+        for objective in parser_multitasks.split(','):
+            nlp.parser.add_multitask_objective(objective)
+    if entity_multitasks:
+        for objective in entity_multitasks.split(','):
+            nlp.entity.add_multitask_objective(objective)
    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    nlp._optimizer = None

--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@ -681,13 +681,19 @@ class MultitaskObjective(Tagger):
        return tokvecs, scores

    def get_loss(self, docs, golds, scores):
+        assert len(docs) == len(golds)
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype='i')
        guesses = scores.argmax(axis=1)
-        for gold in golds:
-            for i in range(len(gold.labels)):
-                label = self.make_label(i, gold.words, gold.tags, gold.heads,
-                                        gold.labels, gold.ents)
+        for i, gold in enumerate(golds):
+            for j in range(len(docs[i])):
+                # Handles alignment for tokenization differences
+                gold_idx = gold.cand_to_gold[j]
+                if gold_idx is None:
+                    idx += 1
+                    continue
+                label = self.make_label(gold_idx, gold.words, gold.tags,
+                                        gold.heads, gold.labels, gold.ents)
                if label is None or label not in self.labels:
                    correct[idx] = guesses[idx]
                else:
@ -892,12 +898,10 @@ cdef class DependencyParser(Parser):
        self._multitasks.append(labeller)

    def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
-        self.add_multitask_objective('tag')
        for labeller in self._multitasks:
            tok2vec = self.model[0]
            labeller.begin_training(gold_tuples, pipeline=pipeline,
                                    tok2vec=tok2vec, sgd=sgd)
-            pipeline.append((labeller.name, labeller))

    def __reduce__(self):
        return (DependencyParser, (self.vocab, self.moves, self.model),
@ -919,7 +923,6 @@ cdef class EntityRecognizer(Parser):
            tok2vec = self.model[0]
            labeller.begin_training(gold_tuples, pipeline=pipeline,
                                    tok2vec=tok2vec)
-            pipeline.append((labeller.name, labeller))

    def __reduce__(self):
        return (EntityRecognizer, (self.vocab, self.moves, self.model),
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@ -457,3 +457,4 @@ cdef enum symbol_t:

    acl
    LAW
+    LANG
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@ -90,6 +90,7 @@ IDS = {
    "SENT_START": SENT_START,
    "SPACY": SPACY,
    "PROB": PROB,
+    "LANG": LANG,

    "ADJ": ADJ,
    "ADP": ADP,
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -542,6 +542,7 @@ cdef class Parser:
    def update(self, docs, golds, drop=0., sgd=None, losses=None):
        if not any(self.moves.has_gold(gold) for gold in golds):
            return None
+        assert len(docs) == len(golds)
        if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0:
            return self.update_beam(docs, golds,
                    self.cfg['beam_width'], self.cfg['beam_density'],
@ -551,6 +552,8 @@ cdef class Parser:
        if isinstance(docs, Doc) and isinstance(golds, GoldParse):
            docs = [docs]
            golds = [golds]
+        for multitask in self._multitasks:
+            multitask.update(docs, golds, drop=drop, sgd=sgd)
        cuda_stream = util.get_cuda_stream()
        states, golds, max_steps = self._init_gold_batch(docs, golds)
        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
@ -605,7 +608,7 @@ cdef class Parser:
                break
        self._make_updates(d_tokvecs,
            bp_tokvecs, backprops, sgd, cuda_stream)
-
+    
    def update_beam(self, docs, golds, width=None, density=None,
            drop=0., sgd=None, losses=None):
        if not any(self.moves.has_gold(gold) for gold in golds):
--- a/spacy/tests/regression/test_issue1450.py
+++ b/spacy/tests/regression/test_issue1450.py
@ -13,8 +13,8 @@ from ...vocab import Vocab
        ('a b', 0, 2),
        ('a c', 0, 1),
        ('a b c', 0, 2),
-        ('a b b c', 0, 3),
-        ('a b b', 0, 3),
+        ('a b b c', 0, 2),
+        ('a b b', 0, 2),
    ]
 )
 def test_issue1450_matcher_end_zero_plus(string, start, end):
--- a/website/usage/_linguistic-features/_rule-based-matching.jade
+++ b/website/usage/_linguistic-features/_rule-based-matching.jade
@ -179,8 +179,12 @@ p
        +cell Allow the pattern to match zero or more times.

 p
-    |  The #[code +] and #[code *] operators are usually interpretted
-    |  "greedily", i.e. longer matches are returned where possible.
+    |  In versions before v2.1.0, the semantics of the #[code +] and #[code *]
+    |  operators are inconsistent. They are usually interpreted
+    |  "greedily", i.e. longer matches are returned where possible. However, if
+    |  you specify two #[code +] or #[code *] patterns in a row and their
+    |  matches overlap, the first operator will behave non-greedily. This quirk
+    |  in the semantics is corrected in spaCy v2.1.0.

 +h(3, "adding-phrase-patterns") Adding phrase patterns