Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 17:36:30 +03:00)

Commit e0a9b02b67: Merge Span._ and Span.as_doc methods
@@ -21,7 +21,6 @@ import thinc.neural._classes.layernorm
 thinc.neural._classes.layernorm.set_compat_six_eight(False)


 def train_textcat(tokenizer, textcat,
                   train_texts, train_cats, dev_texts, dev_cats,
                   n_iter=20):

@@ -57,18 +56,20 @@ def evaluate(tokenizer, textcat, texts, cats):
     for i, doc in enumerate(textcat.pipe(docs)):
         gold = cats[i]
         for label, score in doc.cats.items():
-            if score >= 0.5 and label in gold:
+            if label not in gold:
+                continue
+            if score >= 0.5 and gold[label] >= 0.5:
                 tp += 1.
-            elif score >= 0.5 and label not in gold:
+            elif score >= 0.5 and gold[label] < 0.5:
                 fp += 1.
-            elif score < 0.5 and label not in gold:
+            elif score < 0.5 and gold[label] < 0.5:
                 tn += 1
-            if score < 0.5 and label in gold:
+            elif score < 0.5 and gold[label] >= 0.5:
                 fn += 1
     precis = tp / (tp + fp)
     recall = tp / (tp + fn)
     fscore = 2 * (precis * recall) / (precis + recall)
     return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}

@@ -80,7 +81,7 @@ def load_data(limit=0):
         train_data = train_data[-limit:]

     texts, labels = zip(*train_data)
-    cats = [(['POSITIVE'] if y else []) for y in labels]
+    cats = [{'POSITIVE': bool(y)} for y in labels]

     split = int(len(train_data) * 0.8)

@@ -97,7 +98,7 @@ def main(model_loc=None):
     textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])

     print("Load IMDB data")
-    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=1000)
+    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)

     print("Itn.\tLoss\tP\tR\tF")
     progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
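The evaluate() rewrite above switches the gold annotations from a list of label names to a dict of label to value, where a true example carries 1.0, a negative example 0.0, and an absent key is ignored. A standalone sketch of how the counts and scores come out under that convention; the document scores and gold values below are invented for illustration:

    # Standalone sketch of the dict-based evaluation convention used above.
    def evaluate_cats(predicted, gold):
        tp = fp = tn = fn = 0.0
        for scores, truth in zip(predicted, gold):
            for label, score in scores.items():
                if label not in truth:
                    continue  # label missing from gold: ignored entirely
                if score >= 0.5 and truth[label] >= 0.5:
                    tp += 1.
                elif score >= 0.5 and truth[label] < 0.5:
                    fp += 1.
                elif score < 0.5 and truth[label] < 0.5:
                    tn += 1.
                elif score < 0.5 and truth[label] >= 0.5:
                    fn += 1.
        precis = tp / (tp + fp)
        recall = tp / (tp + fn)
        return {'textcat_p': precis, 'textcat_r': recall,
                'textcat_f': 2 * (precis * recall) / (precis + recall)}

    predicted = [{'POSITIVE': 0.9}, {'POSITIVE': 0.2}, {'POSITIVE': 0.7}]
    gold = [{'POSITIVE': 1.0}, {'POSITIVE': 1.0}, {'POSITIVE': 0.0}]
    print(evaluate_cats(predicted, gold))  # P=0.5, R=0.5, F=0.5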
@@ -264,7 +264,8 @@ def HistoryFeatures(nr_class, hist_size=8, nr_dim=8):
         return layerize(noop())
     embed_tables = [Embed(nr_dim, nr_class, column=i, name='embed%d')
                     for i in range(hist_size)]
-    embed = concatenate(*embed_tables)
+    embed = chain(concatenate(*embed_tables),
+                  LN(Maxout(hist_size*nr_dim, hist_size*nr_dim)))
     ops = embed.ops
     def add_history_fwd(vectors_hists, drop=0.):
         vectors, hist_ids = vectors_hists

@@ -742,5 +743,3 @@ def concatenate_lists(*layers, **kwargs): # pragma: no cover
         return ys, concatenate_lists_bwd
     model = wrap(concatenate_lists_fwd, concat)
     return model
@@ -3,13 +3,13 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy-nightly'
-__version__ = '2.0.0a16'
+__version__ = '2.0.0a17'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
 __email__ = 'contact@explosion.ai'
 __license__ = 'MIT'
-__release__ = True
+__release__ = False

 __docs_models__ = 'https://alpha.spacy.io/usage/models'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import plac
 from pathlib import Path

-from .converters import conllu2json, iob2json
+from .converters import conllu2json, iob2json, conll_ner2json
 from ..util import prints

 # Converters are matched by file extension. To add a converter, add a new entry
@@ -12,9 +12,10 @@ from ..util import prints
 # from /converters.

 CONVERTERS = {
-    '.conllu': conllu2json,
-    '.conll': conllu2json,
-    '.iob': iob2json,
+    'conllu': conllu2json,
+    'conll': conllu2json,
+    'ner': conll_ner2json,
+    'iob': iob2json,
 }

@@ -22,9 +23,11 @@ CONVERTERS = {
     input_file=("input file", "positional", None, str),
     output_dir=("output directory for converted file", "positional", None, str),
     n_sents=("Number of sentences per doc", "option", "n", int),
+    converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
     morphology=("Enable appending morphology to tags", "flag", "m", bool)
 )
-def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
+def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
+            converter='auto'):
     """
     Convert files into JSON format for use with train command and other
     experiment management functions.
@@ -35,9 +38,11 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
         prints(input_path, title="Input file not found", exits=1)
     if not output_path.exists():
         prints(output_path, title="Output directory not found", exits=1)
-    file_ext = input_path.suffix
-    if not file_ext in CONVERTERS:
-        prints("Can't find converter for %s" % input_path.parts[-1],
-               title="Unknown format", exits=1)
-    CONVERTERS[file_ext](input_path, output_path,
-                         n_sents=n_sents, use_morphology=morphology)
+    if converter == 'auto':
+        converter = input_path.suffix[1:]
+    if not converter in CONVERTERS:
+        prints("Can't find converter for %s" % converter,
+               title="Unknown format", exits=1)
+    func = CONVERTERS[converter]
+    func(input_path, output_path,
+         n_sents=n_sents, use_morphology=morphology)

@@ -1,2 +1,3 @@
 from .conllu2json import conllu2json
 from .iob2json import iob2json
+from .conll_ner2json import conll_ner2json
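The converter lookup above is a small registry-plus-dispatch pattern: keys are bare format names, 'auto' falls back to the input file's extension with the dot stripped, and an explicit name (the new -c option) overrides detection. A standalone sketch of the same dispatch logic; the converter functions here are placeholders, not the real spaCy converters:

    # Standalone sketch of the registry/dispatch pattern used by the convert command.
    from pathlib import Path

    def conllu2json(path): return 'converted %s as conllu' % path.name
    def iob2json(path):    return 'converted %s as iob' % path.name

    CONVERTERS = {'conllu': conllu2json, 'conll': conllu2json, 'iob': iob2json}

    def convert(input_file, converter='auto'):
        input_path = Path(input_file)
        if converter == 'auto':
            # '.conllu' -> 'conllu': strip the leading dot from the suffix
            converter = input_path.suffix[1:]
        if converter not in CONVERTERS:
            raise ValueError("Can't find converter for %s" % converter)
        return CONVERTERS[converter](input_path)

    print(convert('train.conllu'))                 # auto-detected from the extension
    print(convert('train.txt', converter='iob'))   # explicit override, like -c iob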
@@ -114,15 +114,33 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
                 nlp.to_disk(epoch_model_path)
                 nlp_loaded = lang_class(pipeline=pipeline)
                 nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
-                scorer = nlp_loaded.evaluate(
-                    list(corpus.dev_docs(
-                        nlp_loaded,
-                        gold_preproc=gold_preproc)))
+                dev_docs = list(corpus.dev_docs(
+                    nlp_loaded,
+                    gold_preproc=gold_preproc))
+                nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
+                start_time = timer()
+                scorer = nlp_loaded.evaluate(dev_docs)
+                end_time = timer()
+                if use_gpu < 0:
+                    gpu_wps = None
+                    cpu_wps = nwords/(end_time-start_time)
+                else:
+                    gpu_wps = nwords/(end_time-start_time)
+                    with Model.use_device('cpu'):
+                        nlp_loaded = lang_class(pipeline=pipeline)
+                        nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
+                        dev_docs = list(corpus.dev_docs(
+                            nlp_loaded, gold_preproc=gold_preproc))
+                        start_time = timer()
+                        scorer = nlp_loaded.evaluate(dev_docs)
+                        end_time = timer()
+                        cpu_wps = nwords/(end_time-start_time)
                 acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
                 with acc_loc.open('w') as file_:
                     file_.write(json_dumps(scorer.scores))
                 meta_loc = output_path / ('model%d' % i) / 'meta.json'
                 meta['accuracy'] = scorer.scores
+                meta['speed'] = {'nwords': nwords, 'cpu':cpu_wps, 'gpu': gpu_wps}
                 meta['lang'] = nlp.lang
                 meta['pipeline'] = pipeline
                 meta['spacy_version'] = '>=%s' % about.__version__

@@ -132,7 +150,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
             with meta_loc.open('w') as file_:
                 file_.write(json_dumps(meta))
             util.set_env_log(True)
-            print_progress(i, losses, scorer.scores)
+            print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
     finally:
         print("Saving model...")
         try:

@@ -153,16 +171,17 @@ def _render_parses(i, to_render):
         file_.write(html)


-def print_progress(itn, losses, dev_scores, wps=0.0):
+def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
     scores = {}
     for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
-                'ents_p', 'ents_r', 'ents_f', 'wps']:
+                'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
         scores[col] = 0.0
     scores['dep_loss'] = losses.get('parser', 0.0)
     scores['ner_loss'] = losses.get('ner', 0.0)
     scores['tag_loss'] = losses.get('tagger', 0.0)
     scores.update(dev_scores)
-    scores['wps'] = wps
+    scores['cpu_wps'] = cpu_wps
+    scores['gpu_wps'] = gpu_wps or 0.0
     tpl = '\t'.join((
         '{:d}',
         '{dep_loss:.3f}',

@@ -173,7 +192,9 @@ def print_progress(itn, losses, dev_scores, wps=0.0):
         '{ents_f:.3f}',
         '{tags_acc:.3f}',
         '{token_acc:.3f}',
-        '{wps:.1f}'))
+        '{cpu_wps:.1f}',
+        '{gpu_wps:.1f}',
+        ))
     print(tpl.format(itn, **scores))
@@ -387,7 +387,7 @@ cdef class GoldParse:

     def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                  deps=None, entities=None, make_projective=False,
-                 cats=tuple()):
+                 cats=None):
         """Create a GoldParse.

         doc (Doc): The document the annotations refer to.

@@ -398,12 +398,15 @@ cdef class GoldParse:
         entities (iterable): A sequence of named entity annotations, either as
             BILUO tag strings, or as `(start_char, end_char, label)` tuples,
             representing the entity positions.
-        cats (iterable): A sequence of labels for text classification. Each
-            label may be a string or an int, or a `(start_char, end_char, label)`
+        cats (dict): Labels for text classification. Each key in the dictionary
+            may be a string or an int, or a `(start_char, end_char, label)`
             tuple, indicating that the label is applied to only part of the
             document (usually a sentence). Unlike entity annotations, label
             annotations can overlap, i.e. a single word can be covered by
-            multiple labelled spans.
+            multiple labelled spans. The TextCategorizer component expects
+            true examples of a label to have the value 1.0, and negative examples
+            of a label to have the value 0.0. Labels not in the dictionary are
+            treated as missing -- the gradient for those labels will be zero.
         RETURNS (GoldParse): The newly constructed object.
         """
         if words is None:

@@ -434,7 +437,7 @@ cdef class GoldParse:
         self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
         self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))

-        self.cats = list(cats)
+        self.cats = {} if cats is None else dict(cats)
         self.words = [None] * len(doc)
         self.tags = [None] * len(doc)
         self.heads = [None] * len(doc)
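The docstring above pins down the new gold format for text classification: cats is a dict mapping label names to values, with 1.0 for a positive example, 0.0 for an explicit negative, and absent keys treated as missing. A minimal sketch of constructing such a gold standard, assuming the spacy-nightly 2.0 API that appears elsewhere in this diff (Vocab, Doc, GoldParse):

    # Sketch only: constructing gold-standard cats under the new dict convention.
    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.gold import GoldParse

    vocab = Vocab()
    doc = Doc(vocab, words=['This', 'movie', 'was', 'great'])
    # POSITIVE is a true label, NEGATIVE an explicit negative example;
    # any label not listed here (e.g. 'NEUTRAL') is treated as missing,
    # so it contributes no gradient during TextCategorizer training.
    gold = GoldParse(doc, cats={'POSITIVE': 1.0, 'NEGATIVE': 0.0})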
@@ -126,7 +126,7 @@ def word_shape(text):
 LEX_ATTRS = {
     attrs.LOWER: lambda string: string.lower(),
     attrs.NORM: lambda string: string.lower(),
-    attrs.PREFIX: lambda string: string[0],
+    attrs.PREFIX: lambda string: string[:3],
     attrs.SUFFIX: lambda string: string[-3:],
     attrs.CLUSTER: lambda string: 0,
     attrs.IS_ALPHA: lambda string: string.isalpha(),
@@ -158,11 +158,13 @@ class BaseThincComponent(object):

     def to_bytes(self, **exclude):
         """Serialize the pipe to a bytestring."""
-        serialize = OrderedDict((
-            ('cfg', lambda: json_dumps(self.cfg)),
-            ('model', lambda: self.model.to_bytes()),
-            ('vocab', lambda: self.vocab.to_bytes())
-        ))
+        serialize = OrderedDict()
+        serialize['cfg'] = lambda: json_dumps(self.cfg)
+        if self.model in (True, False, None):
+            serialize['model'] = lambda: self.model
+        else:
+            serialize['model'] = self.model.to_bytes
+        serialize['vocab'] = self.vocab.to_bytes
         return util.to_bytes(serialize, exclude)

     def from_bytes(self, bytes_data, **exclude):

@@ -183,11 +185,11 @@ class BaseThincComponent(object):

     def to_disk(self, path, **exclude):
         """Serialize the pipe to disk."""
-        serialize = OrderedDict((
-            ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
-            ('vocab', lambda p: self.vocab.to_disk(p)),
-            ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
-        ))
+        serialize = OrderedDict()
+        serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg))
+        serialize['vocab'] = lambda p: self.vocab.to_disk(p)
+        if self.model not in (None, True, False):
+            serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
         util.to_disk(path, serialize, exclude)

     def from_disk(self, path, **exclude):

@@ -438,13 +440,16 @@ class NeuralTagger(BaseThincComponent):
         yield

     def to_bytes(self, **exclude):
-        serialize = OrderedDict((
-            ('model', lambda: self.model.to_bytes()),
-            ('vocab', lambda: self.vocab.to_bytes()),
-            ('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map,
-                                              use_bin_type=True,
-                                              encoding='utf8'))
-        ))
+        serialize = OrderedDict()
+        if self.model in (None, True, False):
+            serialize['model'] = lambda: self.model
+        else:
+            serialize['model'] = self.model.to_bytes
+        serialize['vocab'] = self.vocab.to_bytes
+
+        serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map,
+                                                     use_bin_type=True,
+                                                     encoding='utf8')
         return util.to_bytes(serialize, exclude)

     def from_bytes(self, bytes_data, **exclude):

@@ -552,7 +557,6 @@ class NeuralLabeller(NeuralTagger):
             label = self.make_label(i, words, tags, heads, deps, ents)
             if label is not None and label not in self.labels:
                 self.labels[label] = len(self.labels)
-        print(len(self.labels))
         if self.model is True:
             token_vector_width = util.env_opt('token_vector_width')
             self.model = chain(
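The serialization changes above all follow one pattern: build an ordered mapping of field name to serializer callable, and only register a real 'model' serializer once a model has actually been built (self.model is True/False/None while the component is uninitialised). A standalone sketch of that guard with made-up field names, independent of spaCy's util helpers:

    # Standalone sketch of guarded serialization; names and fields are illustrative.
    from collections import OrderedDict
    import json

    class ToyComponent(object):
        def __init__(self, cfg, model=True):
            self.cfg = cfg
            self.model = model   # True means "not built yet", like the pipes above

        def to_bytes(self):
            serialize = OrderedDict()
            serialize['cfg'] = lambda: json.dumps(self.cfg).encode('utf8')
            if self.model in (True, False, None):
                # nothing to serialize yet; record the placeholder instead of crashing
                serialize['model'] = lambda: str(self.model).encode('utf8')
            else:
                serialize['model'] = self.model.to_bytes
            return {key: getter() for key, getter in serialize.items()}

    print(ToyComponent({'labels': ['POSITIVE']}).to_bytes())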
@@ -721,11 +725,17 @@ class TextCategorizer(BaseThincComponent):

     def get_loss(self, docs, golds, scores):
         truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
+        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
         for i, gold in enumerate(golds):
             for j, label in enumerate(self.labels):
-                truths[i, j] = label in gold.cats
+                if label in gold.cats:
+                    truths[i, j] = gold.cats[label]
+                else:
+                    not_missing[i, j] = 0.
         truths = self.model.ops.asarray(truths)
+        not_missing = self.model.ops.asarray(not_missing)
         d_scores = (scores-truths) / scores.shape[0]
+        d_scores *= not_missing
         mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
         return mean_square_error, d_scores
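The get_loss() change above is missing-value masking: gold values are copied into a truth matrix where present, and a parallel not_missing mask zeroes the gradient wherever a label is absent from gold.cats. A small numpy-only sketch of the same idea, with invented labels and scores:

    # numpy-only sketch of the masked gradient; the data below is invented.
    import numpy

    labels = ['POSITIVE', 'NEGATIVE']
    golds = [{'POSITIVE': 1.0}, {'POSITIVE': 0.0, 'NEGATIVE': 1.0}]
    scores = numpy.asarray([[0.8, 0.3], [0.4, 0.6]], dtype='f')

    truths = numpy.zeros((len(golds), len(labels)), dtype='f')
    not_missing = numpy.ones((len(golds), len(labels)), dtype='f')
    for i, gold in enumerate(golds):
        for j, label in enumerate(labels):
            if label in gold:
                truths[i, j] = gold[label]
            else:
                not_missing[i, j] = 0.   # NEGATIVE is missing for the first doc

    d_scores = (scores - truths) / scores.shape[0]
    d_scores *= not_missing              # no gradient flows for missing labels
    print(d_scores)                      # row 0, column NEGATIVE is exactly 0.0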
@@ -61,13 +61,13 @@ cdef struct TokenC:
     attr_t sense
     int head
     attr_t dep
-    bint sent_start

     uint32_t l_kids
     uint32_t r_kids
     uint32_t l_edge
     uint32_t r_edge

+    int sent_start
     int ent_iob
     attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
     hash_t ent_id
@@ -307,6 +307,8 @@ cdef cppclass StateC:
         this._stack[this._s_i] = this.B(0)
         this._s_i += 1
         this._b_i += 1
+        if this.B_(0).sent_start == 1:
+            this.set_break(this.B(0))
         if this._b_i > this._break:
             this._break = -1

@@ -383,7 +385,7 @@ cdef cppclass StateC:

     void set_break(int i) nogil:
         if 0 <= i < this.length:
-            this._sent[i].sent_start = True
+            this._sent[i].sent_start = 1
             this._break = this._b_i

     void clone(const StateC* src) nogil:
@@ -118,7 +118,7 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
 cdef class Shift:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
-        return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
+        return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and st.B_(0).sent_start != 1

     @staticmethod
     cdef int transition(StateC* st, attr_t label) nogil:

@@ -178,7 +178,7 @@ cdef class Reduce:
 cdef class LeftArc:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
-        return not st.B_(0).sent_start
+        return st.B_(0).sent_start != 1

     @staticmethod
     cdef int transition(StateC* st, attr_t label) nogil:

@@ -212,7 +212,7 @@ cdef class LeftArc:
 cdef class RightArc:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
-        return not st.B_(0).sent_start
+        return st.B_(0).sent_start != 1

     @staticmethod
     cdef int transition(StateC* st, attr_t label) nogil:

@@ -248,6 +248,10 @@ cdef class Break:
             return False
         elif st.stack_depth() < 1:
             return False
+        elif st.B_(0).l_edge < 0:
+            return False
+        elif st._sent[st.B_(0).l_edge].sent_start < 0:
+            return False
         else:
             return True
@@ -219,30 +219,28 @@ cdef class BiluoPushDown(TransitionSystem):
             raise Exception(move)
         return t

-    #def add_action(self, int action, label_name):
-    #    cdef attr_t label_id
-    #    if not isinstance(label_name, (int, long)):
-    #        label_id = self.strings.add(label_name)
-    #    else:
-    #        label_id = label_name
-    #    if action == OUT and label_id != 0:
-    #        return
-    #    if action == MISSING or action == ISNT:
-    #        return
-    #    # Check we're not creating a move we already have, so that this is
-    #    # idempotent
-    #    for trans in self.c[:self.n_moves]:
-    #        if trans.move == action and trans.label == label_id:
-    #            return 0
-    #    if self.n_moves >= self._size:
-    #        self._size *= 2
-    #        self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
-    #    self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
-    #    assert self.c[self.n_moves].label == label_id
-    #    self.n_moves += 1
-    #    return 1
+    def add_action(self, int action, label_name):
+        cdef attr_t label_id
+        if not isinstance(label_name, (int, long)):
+            label_id = self.strings.add(label_name)
+        else:
+            label_id = label_name
+        if action == OUT and label_id != 0:
+            return
+        if action == MISSING or action == ISNT:
+            return
+        # Check we're not creating a move we already have, so that this is
+        # idempotent
+        for trans in self.c[:self.n_moves]:
+            if trans.move == action and trans.label == label_id:
+                return 0
+        if self.n_moves >= self._size:
+            self._size *= 2
+            self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
+        self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
+        assert self.c[self.n_moves].label == label_id
+        self.n_moves += 1
+        return 1

     cdef int initialize_state(self, StateC* st) nogil:
         # This is especially necessary when we use limited training data.
@@ -51,7 +51,7 @@ from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
 from .._ml import Residual, drop_layer, flatten
 from .._ml import link_vectors_to_models
 from .._ml import HistoryFeatures
-from ..compat import json_dumps
+from ..compat import json_dumps, copy_array

 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE

@@ -239,13 +239,13 @@ cdef class Parser:
     """
     @classmethod
     def Model(cls, nr_class, **cfg):
-        depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 2))
+        depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 0))
         token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
         hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
-        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 1))
+        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 3))
         embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
-        hist_size = util.env_opt('history_feats', cfg.get('hist_size', 4))
-        hist_width = util.env_opt('history_width', cfg.get('hist_width', 16))
+        hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
+        hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
         if hist_size >= 1 and depth == 0:
             raise ValueError("Inconsistent hyper-params: "
                              "history_feats >= 1 but parser_hidden_depth==0")

@@ -789,12 +789,22 @@ cdef class Parser:
         return []

     def add_label(self, label):
+        resized = False
         for action in self.moves.action_types:
             added = self.moves.add_action(action, label)
             if added:
                 # Important that the labels be stored as a list! We need the
                 # order, or the model goes out of synch
                 self.cfg.setdefault('extra_labels', []).append(label)
+                resized = True
+        if self.model not in (True, False, None) and resized:
+            # Weights are stored in (nr_out, nr_in) format, so we're basically
+            # just adding rows here.
+            smaller = self.model[-1]._layers[-1]
+            larger = Affine(self.moves.n_moves, smaller.nI)
+            copy_array(larger.W[:smaller.nO], smaller.W)
+            copy_array(larger.b[:smaller.nO], smaller.b)
+            self.model[-1]._layers[-1] = larger

     def begin_training(self, gold_tuples, pipeline=None, **cfg):
         if 'model' in cfg:
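Parser.add_label() above grows the final Affine layer when a new transition is registered: a larger (nr_out, nr_in) weight matrix is allocated and the old rows are copied into it, so existing classes keep their learned weights while the new classes start from zero. A numpy-only sketch of that row-growing trick; the toy layer class and sizes are invented for illustration:

    # numpy-only sketch of growing an output layer by copying old rows; sizes invented.
    import numpy

    class ToyAffine(object):
        def __init__(self, n_out, n_in):
            self.nO, self.nI = n_out, n_in
            self.W = numpy.zeros((n_out, n_in), dtype='f')
            self.b = numpy.zeros((n_out,), dtype='f')

    smaller = ToyAffine(3, 8)
    smaller.W += 0.5                       # pretend these are trained weights

    larger = ToyAffine(5, smaller.nI)      # two new output classes
    larger.W[:smaller.nO] = smaller.W      # old rows preserved...
    larger.b[:smaller.nO] = smaller.b
    print(larger.W[:3, 0], larger.W[3:, 0])  # ...new rows start at zero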
spacy/tests/parser/test_add_label.py (new file, 68 lines)
@@ -0,0 +1,68 @@
'''Test the ability to add a label to a (potentially trained) parsing model.'''
from __future__ import unicode_literals
import pytest
import numpy.random
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps

from ...attrs import NORM
from ...gold import GoldParse
from ...vocab import Vocab
from ...tokens import Doc
from ...pipeline import NeuralDependencyParser

numpy.random.seed(0)


@pytest.fixture
def vocab():
    return Vocab(lex_attr_getters={NORM: lambda s: s})


@pytest.fixture
def parser(vocab):
    parser = NeuralDependencyParser(vocab)
    parser.cfg['token_vector_width'] = 4
    parser.cfg['hidden_width'] = 6
    parser.cfg['hist_size'] = 0
    parser.add_label('left')
    parser.begin_training([], **parser.cfg)
    sgd = Adam(NumpyOps(), 0.001)

    for i in range(30):
        losses = {}
        doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
        gold = GoldParse(doc, heads=[1, 1, 3, 3],
                         deps=['left', 'ROOT', 'left', 'ROOT'])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    return parser


def test_add_label(parser):
    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
    doc = parser(doc)
    assert doc[0].head.i == 1
    assert doc[0].dep_ == 'left'
    assert doc[1].head.i == 1
    assert doc[2].head.i == 3
    assert doc[2].head.i == 3
    parser.add_label('right')
    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
    doc = parser(doc)
    assert doc[0].head.i == 1
    assert doc[0].dep_ == 'left'
    assert doc[1].head.i == 1
    assert doc[2].head.i == 3
    assert doc[2].head.i == 3
    sgd = Adam(NumpyOps(), 0.001)
    for i in range(10):
        losses = {}
        doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
        gold = GoldParse(doc, heads=[1, 1, 3, 3],
                         deps=['right', 'ROOT', 'left', 'ROOT'])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
    doc = parser(doc)
    assert doc[0].dep_ == 'right'
    assert doc[2].dep_ == 'left'
@@ -35,7 +35,8 @@ def parser(vocab, arc_eager):

 @pytest.fixture
 def model(arc_eager, tok2vec):
-    return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]
+    return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO,
+                        hist_size=0)[0]

 @pytest.fixture
 def doc(vocab):

@@ -51,7 +52,7 @@ def test_can_init_nn_parser(parser):


 def test_build_model(parser):
-    parser.model = Parser.Model(parser.moves.n_moves)[0]
+    parser.model = Parser.Model(parser.moves.n_moves, hist_size=0)[0]
     assert parser.model is not None
spacy/tests/parser/test_preset_sbd.py (new file, 73 lines)
@@ -0,0 +1,73 @@
'''Test that the parser respects preset sentence boundaries.'''
from __future__ import unicode_literals
import pytest
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps

from ...attrs import NORM
from ...gold import GoldParse
from ...vocab import Vocab
from ...tokens import Doc
from ...pipeline import NeuralDependencyParser

@pytest.fixture
def vocab():
    return Vocab(lex_attr_getters={NORM: lambda s: s})

@pytest.fixture
def parser(vocab):
    parser = NeuralDependencyParser(vocab)
    parser.cfg['token_vector_width'] = 4
    parser.cfg['hidden_width'] = 32
    #parser.add_label('right')
    parser.add_label('left')
    parser.begin_training([], **parser.cfg)
    sgd = Adam(NumpyOps(), 0.001)

    for i in range(10):
        losses = {}
        doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
        gold = GoldParse(doc, heads=[1, 1, 3, 3],
                         deps=['left', 'ROOT', 'left', 'ROOT'])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    return parser

def test_no_sentences(parser):
    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
    doc = parser(doc)
    assert len(list(doc.sents)) == 2


def test_sents_1(parser):
    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
    doc[2].sent_start = True
    doc = parser(doc)
    assert len(list(doc.sents)) >= 2
    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
    doc[1].sent_start = False
    doc[2].sent_start = True
    doc[3].sent_start = False
    doc = parser(doc)
    assert len(list(doc.sents)) == 2


def test_sents_1_2(parser):
    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
    doc[1].sent_start = True
    doc[2].sent_start = True
    doc = parser(doc)
    assert len(list(doc.sents)) == 3


def test_sents_1_3(parser):
    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
    doc[1].sent_start = True
    doc[3].sent_start = True
    doc = parser(doc)
    assert len(list(doc.sents)) == 4
    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
    doc[1].sent_start = True
    doc[2].sent_start = False
    doc[3].sent_start = True
    doc = parser(doc)
    assert len(list(doc.sents)) == 3
spacy/tests/serialize/test_serialize_empty_model.py (new file, 9 lines)
@@ -0,0 +1,9 @@
import spacy
import spacy.lang.en
from spacy.pipeline import TextCategorizer

def test_bytes_serialize_issue_1105():
    nlp = spacy.lang.en.English()
    tokenizer = nlp.tokenizer
    textcat = TextCategorizer(tokenizer.vocab, labels=['ENTITY', 'ACTION', 'MODIFIER'])
    textcat_bytes = textcat.to_bytes()
@@ -506,7 +506,7 @@ cdef class Doc:
         cdef int i
         start = 0
         for i in range(1, self.length):
-            if self.c[i].sent_start:
+            if self.c[i].sent_start == 1:
                 yield Span(self, start, i)
                 start = i
         if start != self.length:
@@ -25,7 +25,7 @@ cdef class Span:
     @classmethod
     def set_extension(cls, name, default=None, method=None,
                       getter=None, setter=None):
-        Underscore.span_extensions[name] = (default, method, getter, setter)
+        Underscore.span_extensions[name] = (default, method, getter, setter)

     @classmethod
     def get_extension(cls, name):

@@ -129,6 +129,29 @@ cdef class Span:
     def _(self):
         return Underscore(Underscore.span_extensions, self,
                           start=self.start_char, end=self.end_char)
+
+    def as_doc(self):
+        '''Create a Doc object view of the Span's data.
+
+        This is mostly useful for C-typed interfaces.
+        '''
+        cdef Doc doc = Doc(self.doc.vocab)
+        doc.length = self.end-self.start
+        doc.c = &self.doc.c[self.start]
+        doc.mem = self.doc.mem
+        doc.is_parsed = self.doc.is_parsed
+        doc.is_tagged = self.doc.is_tagged
+        doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
+        doc.user_hooks = self.doc.user_hooks
+        doc.user_span_hooks = self.doc.user_span_hooks
+        doc.user_token_hooks = self.doc.user_token_hooks
+        doc.vector = self.vector
+        doc.vector_norm = self.vector_norm
+        for key, value in self.doc.cats.items():
+            if hasattr(key, '__len__') and len(key) == 3:
+                cat_start, cat_end, cat_label = key
+                if cat_start == self.start_char and cat_end == self.end_char:
+                    doc.cats[cat_label] = value
+        return doc

     def merge(self, *args, **attributes):
         """Retokenize the document, such that the span is merged into a single
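The new Span.as_doc() above is effectively a zero-copy view: the returned Doc points at the parent document's token array (doc.c = &self.doc.c[self.start]) and copies over hooks, vectors, and any span-scoped cats entries whose (start_char, end_char, label) keys match the span exactly. A short usage sketch, assuming the spacy-nightly 2.0 API used elsewhere in this diff; the words are made up:

    # Usage sketch for Span.as_doc(); assumes the spacy-nightly 2.0 API in this diff.
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    doc = Doc(Vocab(), words=['Berlin', 'is', 'nice', '.', 'Paris', 'too', '.'])
    span = doc[4:7]                  # "Paris too ."
    span_doc = span.as_doc()         # Doc-typed view over the same token data
    print(len(span_doc))             # 3 tokens, backed by the parent Doc's memory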
@@ -300,13 +300,21 @@ cdef class Token:
         def __get__(self):
             return self.c.sent_start

-        def __set__(self, bint value):
+        def __set__(self, value):
             if self.doc.is_parsed:
                 raise ValueError(
                     'Refusing to write to token.sent_start if its document is parsed, '
                     'because this may cause inconsistent state. '
                     'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
-            self.c.sent_start = value
+            if value is None:
+                self.c.sent_start = 0
+            elif value is True:
+                self.c.sent_start = 1
+            elif value is False:
+                self.c.sent_start = -1
+            else:
+                raise ValueError("Invalid value for token.sent_start -- must be one of "
+                                 "None, True, False")

     property lefts:
         def __get__(self):
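Together with the struct change from bint to int, the setter above gives sent_start three states: unknown (None, stored as 0), preset boundary (True, stored as 1), and preset non-boundary (False, stored as -1); per the hunks above, Doc.sents and the parser's Shift/LeftArc/RightArc checks treat only the stored value 1 as a boundary. A minimal sketch of the same tri-state encoding, independent of spaCy:

    # Standalone sketch of the tri-state sent_start encoding used above.
    def encode_sent_start(value):
        if value is None:
            return 0     # unknown: the parser is free to decide
        elif value is True:
            return 1     # preset boundary: a new sentence starts here
        elif value is False:
            return -1    # preset non-boundary: the parser may not break here
        raise ValueError("must be one of None, True, False")

    print([encode_sent_start(v) for v in (None, True, False)])  # [0, 1, -1]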