Update matcher branch with v2.0.8 master

This commit is contained in:
Matthew Honnibal 2018-02-18 13:42:58 +01:00
commit 3d7285870b
8 changed files with 44 additions and 13 deletions

View File

@ -3,7 +3,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy'
__version__ = '2.0.7'
__version__ = '2.0.8'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'

View File

@ -7,6 +7,7 @@ import tqdm
from thinc.neural._classes.model import Model
from timeit import default_timer as timer
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
from ..gold import GoldCorpus, minibatch
from ..util import prints
from .. import util
@ -29,11 +30,14 @@ from ..compat import json_dumps
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool),
parser_multitasks=("Side objectives for parser CNN, e.g. dep dep,tag", "option", "pt", str),
entity_multitasks=("Side objectives for ner CNN, e.g. dep dep,tag", "option", "et", str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
version=("Model version", "option", "V", str),
meta_path=("Optional path to meta.json. All relevant properties will be "
"overwritten.", "option", "m", Path))
def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
parser_multitasks='', entity_multitasks='',
use_gpu=-1, vectors=None, no_tagger=False,
no_parser=False, no_entities=False, gold_preproc=False,
version="0.0.0", meta_path=None):
@ -90,8 +94,23 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
nlp.meta.update(meta)
if vectors:
util.load_model(vectors, vocab=nlp.vocab)
for lex in nlp.vocab:
values = {}
for attr, func in nlp.vocab.lex_attr_getters.items():
# These attrs are expected to be set by data. Others should
# be set by calling the language functions.
if attr not in (CLUSTER, PROB, IS_OOV, LANG):
values[lex.vocab.strings[attr]] = func(lex.orth_)
lex.set_attrs(**values)
lex.is_oov = False
for name in pipeline:
nlp.add_pipe(nlp.create_pipe(name), name=name)
if parser_multitasks:
for objective in parser_multitasks.split(','):
nlp.parser.add_multitask_objective(objective)
if entity_multitasks:
for objective in entity_multitasks.split(','):
nlp.entity.add_multitask_objective(objective)
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
nlp._optimizer = None

View File

@ -681,13 +681,19 @@ class MultitaskObjective(Tagger):
return tokvecs, scores
def get_loss(self, docs, golds, scores):
assert len(docs) == len(golds)
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype='i')
guesses = scores.argmax(axis=1)
for gold in golds:
for i in range(len(gold.labels)):
label = self.make_label(i, gold.words, gold.tags, gold.heads,
gold.labels, gold.ents)
for i, gold in enumerate(golds):
for j in range(len(docs[i])):
# Handles alignment for tokenization differences
gold_idx = gold.cand_to_gold[j]
if gold_idx is None:
idx += 1
continue
label = self.make_label(gold_idx, gold.words, gold.tags,
gold.heads, gold.labels, gold.ents)
if label is None or label not in self.labels:
correct[idx] = guesses[idx]
else:
@ -892,12 +898,10 @@ cdef class DependencyParser(Parser):
self._multitasks.append(labeller)
def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
self.add_multitask_objective('tag')
for labeller in self._multitasks:
tok2vec = self.model[0]
labeller.begin_training(gold_tuples, pipeline=pipeline,
tok2vec=tok2vec, sgd=sgd)
pipeline.append((labeller.name, labeller))
def __reduce__(self):
return (DependencyParser, (self.vocab, self.moves, self.model),
@ -919,7 +923,6 @@ cdef class EntityRecognizer(Parser):
tok2vec = self.model[0]
labeller.begin_training(gold_tuples, pipeline=pipeline,
tok2vec=tok2vec)
pipeline.append((labeller.name, labeller))
def __reduce__(self):
return (EntityRecognizer, (self.vocab, self.moves, self.model),

View File

@ -457,3 +457,4 @@ cdef enum symbol_t:
acl
LAW
LANG

View File

@ -90,6 +90,7 @@ IDS = {
"SENT_START": SENT_START,
"SPACY": SPACY,
"PROB": PROB,
"LANG": LANG,
"ADJ": ADJ,
"ADP": ADP,

View File

@ -542,6 +542,7 @@ cdef class Parser:
def update(self, docs, golds, drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):
return None
assert len(docs) == len(golds)
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0:
return self.update_beam(docs, golds,
self.cfg['beam_width'], self.cfg['beam_density'],
@ -551,6 +552,8 @@ cdef class Parser:
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs]
golds = [golds]
for multitask in self._multitasks:
multitask.update(docs, golds, drop=drop, sgd=sgd)
cuda_stream = util.get_cuda_stream()
states, golds, max_steps = self._init_gold_batch(docs, golds)
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
@ -605,7 +608,7 @@ cdef class Parser:
break
self._make_updates(d_tokvecs,
bp_tokvecs, backprops, sgd, cuda_stream)
def update_beam(self, docs, golds, width=None, density=None,
drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):

View File

@ -13,8 +13,8 @@ from ...vocab import Vocab
('a b', 0, 2),
('a c', 0, 1),
('a b c', 0, 2),
('a b b c', 0, 3),
('a b b', 0, 3),
('a b b c', 0, 2),
('a b b', 0, 2),
]
)
def test_issue1450_matcher_end_zero_plus(string, start, end):

View File

@ -179,8 +179,12 @@ p
+cell Allow the pattern to match zero or more times.
p
| The #[code +] and #[code *] operators are usually interpreted
| "greedily", i.e. longer matches are returned where possible.
| In versions before v2.1.0, the #[code +] and #[code *] operators
| behaved inconsistently. They were usually interpreted
| "greedily", i.e. longer matches were returned where possible. However, if
| you specified two #[code +] or #[code *] patterns in a row and their
| matches overlapped, the first operator behaved non-greedily. This quirk
| in the semantics is corrected in spaCy v2.1.0.
+h(3, "adding-phrase-patterns") Adding phrase patterns