mirror of https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00

Update matcher branch with v2.0.8 master

This commit is contained in:
commit 3d7285870b
@@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy'
-__version__ = '2.0.7'
+__version__ = '2.0.8'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
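The version string lives in spacy.about, so the bump can be checked at runtime. A minimal sketch, assuming a v2.x install:

    from spacy import about

    # Prints the package name and installed release, e.g. 'spacy 2.0.8'.
    print(about.__title__, about.__version__)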
@@ -7,6 +7,7 @@ import tqdm
 from thinc.neural._classes.model import Model
 from timeit import default_timer as timer

+from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus, minibatch
 from ..util import prints
 from .. import util
@@ -29,11 +30,14 @@ from ..compat import json_dumps
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool),
+    parser_multitasks=("Side objectives for parser CNN, e.g. dep dep,tag", "option", "pt", str),
+    entity_multitasks=("Side objectives for ner CNN, e.g. dep dep,tag", "option", "et", str),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
     version=("Model version", "option", "V", str),
     meta_path=("Optional path to meta.json. All relevant properties will be "
                "overwritten.", "option", "m", Path))
 def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
+          parser_multitasks='', entity_multitasks='',
           use_gpu=-1, vectors=None, no_tagger=False,
           no_parser=False, no_entities=False, gold_preproc=False,
           version="0.0.0", meta_path=None):
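The new options can also be exercised from Python, since the CLI function is importable. A hedged sketch; the paths are placeholders, and train_data/dev_data are corpora in spaCy's JSON training format:

    from spacy.cli import train

    # parser_multitasks takes a comma-separated list of side objectives,
    # matching the option help text above.
    train('en', '/tmp/model', 'train.json', 'dev.json',
          n_iter=10, parser_multitasks='tag,dep')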
@@ -90,8 +94,23 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     nlp.meta.update(meta)
     if vectors:
         util.load_model(vectors, vocab=nlp.vocab)
+        for lex in nlp.vocab:
+            values = {}
+            for attr, func in nlp.vocab.lex_attr_getters.items():
+                # These attrs are expected to be set by data. Others should
+                # be set by calling the language functions.
+                if attr not in (CLUSTER, PROB, IS_OOV, LANG):
+                    values[lex.vocab.strings[attr]] = func(lex.orth_)
+            lex.set_attrs(**values)
+            lex.is_oov = False
     for name in pipeline:
         nlp.add_pipe(nlp.create_pipe(name), name=name)
+    if parser_multitasks:
+        for objective in parser_multitasks.split(','):
+            nlp.parser.add_multitask_objective(objective)
+    if entity_multitasks:
+        for objective in entity_multitasks.split(','):
+            nlp.entity.add_multitask_objective(objective)
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
     nlp._optimizer = None

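At runtime those option strings expand into add_multitask_objective() calls before training begins. A minimal sketch of the same wiring done by hand, assuming a blank v2.x pipeline with a parser:

    import spacy

    nlp = spacy.blank('en')
    parser = nlp.create_pipe('parser')
    nlp.add_pipe(parser)
    # Equivalent to passing parser_multitasks='tag,dep' to the CLI;
    # must happen before nlp.begin_training().
    for objective in 'tag,dep'.split(','):
        parser.add_multitask_objective(objective)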
@@ -681,13 +681,19 @@ class MultitaskObjective(Tagger):
         return tokvecs, scores

     def get_loss(self, docs, golds, scores):
+        assert len(docs) == len(golds)
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype='i')
         guesses = scores.argmax(axis=1)
-        for gold in golds:
-            for i in range(len(gold.labels)):
-                label = self.make_label(i, gold.words, gold.tags, gold.heads,
-                                        gold.labels, gold.ents)
+        for i, gold in enumerate(golds):
+            for j in range(len(docs[i])):
+                # Handles alignment for tokenization differences
+                gold_idx = gold.cand_to_gold[j]
+                if gold_idx is None:
+                    idx += 1
+                    continue
+                label = self.make_label(gold_idx, gold.words, gold.tags,
+                                        gold.heads, gold.labels, gold.ents)
                 if label is None or label not in self.labels:
                     correct[idx] = guesses[idx]
                 else:
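The cand_to_gold table consulted here is built by GoldParse when the doc's tokenization differs from the gold tokenization. A minimal sketch of inspecting it (v2.x API):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.gold import GoldParse

    vocab = Vocab()
    # The doc splits "'m" differently from the gold words below.
    doc = Doc(vocab, words=['I', "'m", 'here'])
    gold = GoldParse(doc, words=['I', "'", 'm', 'here'])
    # One entry per doc token: the aligned gold index, or None where no
    # one-to-one alignment exists -- the case the loop above skips over.
    print(gold.cand_to_gold)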
@@ -892,12 +898,10 @@ cdef class DependencyParser(Parser):
         self._multitasks.append(labeller)

     def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
         self.add_multitask_objective('tag')
         for labeller in self._multitasks:
             tok2vec = self.model[0]
             labeller.begin_training(gold_tuples, pipeline=pipeline,
                                     tok2vec=tok2vec, sgd=sgd)
             pipeline.append((labeller.name, labeller))

     def __reduce__(self):
         return (DependencyParser, (self.vocab, self.moves, self.model),
@@ -919,7 +923,6 @@ cdef class EntityRecognizer(Parser):
             tok2vec = self.model[0]
             labeller.begin_training(gold_tuples, pipeline=pipeline,
                                     tok2vec=tok2vec)
             pipeline.append((labeller.name, labeller))

     def __reduce__(self):
         return (EntityRecognizer, (self.vocab, self.moves, self.model),
@@ -457,3 +457,4 @@ cdef enum symbol_t:

     acl
     LAW
+    LANG
@@ -90,6 +90,7 @@ IDS = {
     "SENT_START": SENT_START,
     "SPACY": SPACY,
     "PROB": PROB,
+    "LANG": LANG,

     "ADJ": ADJ,
     "ADP": ADP,
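Both tables have to stay in sync: the enum defines the integer ID and the IDS dict exposes it by name. A minimal sketch of checking the mapping from Python, assuming the v2.x module layout:

    from spacy.symbols import IDS, LANG

    # IDS maps symbol names to the integer IDs defined in the enum above.
    print(IDS['LANG'] == LANG)  # True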
@@ -542,6 +542,7 @@ cdef class Parser:
     def update(self, docs, golds, drop=0., sgd=None, losses=None):
         if not any(self.moves.has_gold(gold) for gold in golds):
             return None
+        assert len(docs) == len(golds)
         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0:
             return self.update_beam(docs, golds,
                 self.cfg['beam_width'], self.cfg['beam_density'],
@@ -551,6 +552,8 @@
         if isinstance(docs, Doc) and isinstance(golds, GoldParse):
             docs = [docs]
             golds = [golds]
+        for multitask in self._multitasks:
+            multitask.update(docs, golds, drop=drop, sgd=sgd)
         cuda_stream = util.get_cuda_stream()
         states, golds, max_steps = self._init_gold_batch(docs, golds)
         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
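With that hook in place, a single Parser.update() call also steps each registered side objective. A hedged sketch of a v2-style training step that exercises it; the labels and toy example are illustrative:

    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.blank('en')
    parser = nlp.create_pipe('parser')
    nlp.add_pipe(parser)
    for label in ('ROOT', 'det', 'dobj'):
        parser.add_label(label)
    parser.add_multitask_objective('tag')  # side objective, as above
    optimizer = nlp.begin_training()

    doc = nlp.make_doc('send the report')
    gold = GoldParse(doc, heads=[0, 2, 0], deps=['ROOT', 'det', 'dobj'])
    losses = {}
    # update() now forwards the batch to each multitask objective too.
    nlp.update([doc], [gold], sgd=optimizer, drop=0.2, losses=losses)
    print(losses)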
@@ -605,7 +608,7 @@ cdef class Parser:
                 break
         self._make_updates(d_tokvecs,
                            bp_tokvecs, backprops, sgd, cuda_stream)

     def update_beam(self, docs, golds, width=None, density=None,
                     drop=0., sgd=None, losses=None):
         if not any(self.moves.has_gold(gold) for gold in golds):
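As the condition in update() shows, beam training kicks in when the component's cfg sets beam_width to 2 or more. A hedged sketch of flipping that switch; the values are illustrative, and the cfg keys are the ones read above:

    import spacy

    nlp = spacy.blank('en')
    parser = nlp.create_pipe('parser')
    nlp.add_pipe(parser)
    # Parser.update() checks these before delegating to update_beam().
    parser.cfg['beam_width'] = 8
    parser.cfg['beam_density'] = 0.0001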
@@ -13,8 +13,8 @@ from ...vocab import Vocab
     ('a b', 0, 2),
     ('a c', 0, 1),
     ('a b c', 0, 2),
-    ('a b b c', 0, 3),
-    ('a b b', 0, 3),
+    ('a b b c', 0, 2),
+    ('a b b', 0, 2),
 ]
 )
 def test_issue1450_matcher_end_zero_plus(string, start, end):
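The parametrised cases pin down where a pattern ending in a zero-or-more token should stop. A minimal sketch of the kind of pattern the test exercises (v2 Matcher API; the exact pattern lives in the test file, not in this hunk):

    from spacy.matcher import Matcher
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    matcher = Matcher(vocab)
    # 'a' followed by zero or more 'b' tokens.
    matcher.add('AB', None, [{'ORTH': 'a'}, {'ORTH': 'b', 'OP': '*'}])
    doc = Doc(vocab, words='a b b c'.split())
    for match_id, start, end in matcher(doc):
        print(start, end)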
@@ -179,8 +179,12 @@ p
     +cell Allow the pattern to match zero or more times.

 p
-    | The #[code +] and #[code *] operators are usually interpreted
-    | "greedily", i.e. longer matches are returned where possible.
+    | In versions before v2.1.0, the semantics of the #[code +] and #[code *]
+    | operators behaved inconsistently. They were usually interpreted
+    | "greedily", i.e. longer matches are returned where possible. However, if
+    | you specify two #[code +] and #[code *] patterns in a row and their
+    | matches overlap, the first operator will behave non-greedily. This quirk
+    | in the semantics is corrected in spaCy v2.1.0.

 +h(3, "adding-phrase-patterns") Adding phrase patterns
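To illustrate the documented quirk, a hedged sketch of two operator tokens in a row whose matches can overlap (v2 Matcher API; the spans printed will differ between pre- and post-v2.1.0 versions):

    from spacy.matcher import Matcher
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    matcher = Matcher(vocab)
    # A wildcard '+' followed by a literal '+': 'b' tokens satisfy both,
    # so the matches overlap and the first operator turns non-greedy
    # in pre-v2.1.0 versions, as described above.
    matcher.add('XB', None, [{'OP': '+'}, {'ORTH': 'b', 'OP': '+'}])
    doc = Doc(vocab, words='a b b'.split())
    print([(start, end) for _, start, end in matcher(doc)])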