Mirror of https://github.com/explosion/spaCy.git (synced 2025-03-13 07:55:49 +03:00)

Commit 3d7285870b: Update matcher branch with v2.0.8 master
@@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy'
-__version__ = '2.0.7'
+__version__ = '2.0.8'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
@@ -7,6 +7,7 @@ import tqdm
 from thinc.neural._classes.model import Model
 from timeit import default_timer as timer

+from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus, minibatch
 from ..util import prints
 from .. import util
@@ -29,11 +30,14 @@ from ..compat import json_dumps
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool),
+    parser_multitasks=("Side objectives for parser CNN, e.g. dep dep,tag", "option", "pt", str),
+    entity_multitasks=("Side objectives for ner CNN, e.g. dep dep,tag", "option", "et", str),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
     version=("Model version", "option", "V", str),
     meta_path=("Optional path to meta.json. All relevant properties will be "
                "overwritten.", "option", "m", Path))
 def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
+          parser_multitasks='', entity_multitasks='',
           use_gpu=-1, vectors=None, no_tagger=False,
           no_parser=False, no_entities=False, gold_preproc=False,
           version="0.0.0", meta_path=None):
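
The two new plac options thread straight through to train() as keyword arguments. A minimal usage sketch from Python, assuming a standard spaCy v2.0.8 install; the paths and objective names below are illustrative placeholders, not part of the commit:

    # Hypothetical invocation of the updated train entry point; 'dep' and 'tag'
    # name side objectives added to the parser/NER CNNs during training.
    from spacy.cli import train

    train('en', '/tmp/model', 'train.json', 'dev.json',
          parser_multitasks='dep,tag',   # comma-separated objectives for the parser CNN
          entity_multitasks='dep')       # side objectives for the NER CNN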
@@ -90,8 +94,23 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     nlp.meta.update(meta)
     if vectors:
         util.load_model(vectors, vocab=nlp.vocab)
+        for lex in nlp.vocab:
+            values = {}
+            for attr, func in nlp.vocab.lex_attr_getters.items():
+                # These attrs are expected to be set by data. Others should
+                # be set by calling the language functions.
+                if attr not in (CLUSTER, PROB, IS_OOV, LANG):
+                    values[lex.vocab.strings[attr]] = func(lex.orth_)
+            lex.set_attrs(**values)
+            lex.is_oov = False
     for name in pipeline:
         nlp.add_pipe(nlp.create_pipe(name), name=name)
+    if parser_multitasks:
+        for objective in parser_multitasks.split(','):
+            nlp.parser.add_multitask_objective(objective)
+    if entity_multitasks:
+        for objective in entity_multitasks.split(','):
+            nlp.entity.add_multitask_objective(objective)
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
     nlp._optimizer = None
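
For context on the lexeme loop above: Vocab.lex_attr_getters maps attribute IDs to functions that recompute an attribute from a token's text, which is why everything except the data-derived CLUSTER, PROB, IS_OOV and LANG can be refreshed after the vectors model is loaded. A small sketch of that mechanism, assuming a blank English pipeline (example mine, not from the commit):

    # Illustrative only: a language-defined attribute is recomputed from the
    # orth string by its getter function.
    import spacy
    from spacy.attrs import IS_DIGIT

    nlp = spacy.blank('en')
    func = nlp.vocab.lex_attr_getters[IS_DIGIT]
    print(func('2018'))   # True, derived purely from the text
    print(func('dog'))    # False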
@@ -681,13 +681,19 @@ class MultitaskObjective(Tagger):
         return tokvecs, scores

     def get_loss(self, docs, golds, scores):
+        assert len(docs) == len(golds)
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype='i')
         guesses = scores.argmax(axis=1)
-        for gold in golds:
-            for i in range(len(gold.labels)):
-                label = self.make_label(i, gold.words, gold.tags, gold.heads,
-                                        gold.labels, gold.ents)
+        for i, gold in enumerate(golds):
+            for j in range(len(docs[i])):
+                # Handles alignment for tokenization differences
+                gold_idx = gold.cand_to_gold[j]
+                if gold_idx is None:
+                    idx += 1
+                    continue
+                label = self.make_label(gold_idx, gold.words, gold.tags,
+                                        gold.heads, gold.labels, gold.ents)
                 if label is None or label not in self.labels:
                     correct[idx] = guesses[idx]
                 else:
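
The key change here is indexing labels through GoldParse.cand_to_gold, which maps each candidate (predicted) token to its position in the gold tokenization, or None where the two tokenizations cannot be aligned one-to-one. A hedged sketch of those semantics; the exact None positions depend on spaCy's alignment algorithm:

    # Illustrative alignment example (not from the commit).
    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.gold import GoldParse

    doc = Doc(Vocab(), words=['I', 'ca', "n't", 'swim'])
    gold = GoldParse(doc, words=['I', 'cannot', 'swim'])
    print(gold.cand_to_gold)  # e.g. [0, None, None, 2]: unalignable tokens map to None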
@@ -892,12 +898,10 @@ cdef class DependencyParser(Parser):
         self._multitasks.append(labeller)

     def init_multitask_objectives(self, gold_tuples, pipeline, sgd=None, **cfg):
-        self.add_multitask_objective('tag')
         for labeller in self._multitasks:
             tok2vec = self.model[0]
             labeller.begin_training(gold_tuples, pipeline=pipeline,
                                     tok2vec=tok2vec, sgd=sgd)
-            pipeline.append((labeller.name, labeller))

     def __reduce__(self):
         return (DependencyParser, (self.vocab, self.moves, self.model),
@@ -919,7 +923,6 @@ cdef class EntityRecognizer(Parser):
             tok2vec = self.model[0]
             labeller.begin_training(gold_tuples, pipeline=pipeline,
                                     tok2vec=tok2vec)
-            pipeline.append((labeller.name, labeller))

     def __reduce__(self):
         return (EntityRecognizer, (self.vocab, self.moves, self.model),
@@ -457,3 +457,4 @@ cdef enum symbol_t:

     acl
     LAW
+    LANG
@@ -90,6 +90,7 @@ IDS = {
     "SENT_START": SENT_START,
     "SPACY": SPACY,
     "PROB": PROB,
+    "LANG": LANG,

     "ADJ": ADJ,
     "ADP": ADP,
@@ -542,6 +542,7 @@ cdef class Parser:
     def update(self, docs, golds, drop=0., sgd=None, losses=None):
         if not any(self.moves.has_gold(gold) for gold in golds):
             return None
+        assert len(docs) == len(golds)
         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0:
             return self.update_beam(docs, golds,
                 self.cfg['beam_width'], self.cfg['beam_density'],
@@ -551,6 +552,8 @@
         if isinstance(docs, Doc) and isinstance(golds, GoldParse):
             docs = [docs]
             golds = [golds]
+        for multitask in self._multitasks:
+            multitask.update(docs, golds, drop=drop, sgd=sgd)
         cuda_stream = util.get_cuda_stream()
         states, golds, max_steps = self._init_gold_batch(docs, golds)
         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
@@ -605,7 +608,7 @@
                 break
         self._make_updates(d_tokvecs,
             bp_tokvecs, backprops, sgd, cuda_stream)

     def update_beam(self, docs, golds, width=None, density=None,
                     drop=0., sgd=None, losses=None):
         if not any(self.moves.has_gold(gold) for gold in golds):
@@ -13,8 +13,8 @@ from ...vocab import Vocab
         ('a b', 0, 2),
         ('a c', 0, 1),
         ('a b c', 0, 2),
-        ('a b b c', 0, 3),
-        ('a b b', 0, 3),
+        ('a b b c', 0, 2),
+        ('a b b', 0, 2),
     ]
 )
 def test_issue1450_matcher_end_zero_plus(string, start, end):
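
The updated expectations drop the end offsets from 3 to 2 because a trailing zero-or-more token no longer extends the match greedily. A sketch of the behavior under test, assuming a pattern of the shape below; the regression test's actual pattern may differ:

    # Hypothetical pattern reproducing the end-offset change checked above.
    from spacy.matcher import Matcher
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    vocab = Vocab()
    matcher = Matcher(vocab)
    matcher.add('AB', None, [{'ORTH': 'a'}, {'ORTH': 'b', 'OP': '*'}])
    doc = Doc(vocab, words='a b b'.split())
    for match_id, start, end in matcher(doc):
        print(start, end)   # per the new expectations, a match ending at 2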
@@ -179,8 +179,12 @@ p
     +cell Allow the pattern to match zero or more times.

 p
-    | The #[code +] and #[code *] operators are usually interpreted
-    | "greedily", i.e. longer matches are returned where possible.
+    | In versions before v2.1.0, the #[code +] and #[code *] operators
+    | behave inconsistently. They are usually interpreted
+    | "greedily", i.e. longer matches are returned where possible. However, if
+    | you specify two #[code +] or #[code *] patterns in a row and their
+    | matches overlap, the first operator will behave non-greedily. This quirk
+    | in the semantics is corrected in spaCy v2.1.0.

 +h(3, "adding-phrase-patterns") Adding phrase patterns

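
To make the documented quirk concrete: the sketch below (example mine, not from the docs) builds two consecutive operator tokens whose matches can overlap; per the note above, the first + then behaves non-greedily in spaCy versions before v2.1.0.

    from spacy.matcher import Matcher
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    vocab = Vocab()
    matcher = Matcher(vocab)
    # Two '+' tokens in a row over the same orth; their matches overlap on 'b b b'.
    matcher.add('BB', None, [{'ORTH': 'b', 'OP': '+'}, {'ORTH': 'b', 'OP': '+'}])
    doc = Doc(vocab, words='b b b'.split())
    print(matcher(doc))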