Merge Span._ and Span.as_doc methods

Matthew Honnibal 2017-10-09 22:00:15 -05:00
commit e0a9b02b67
21 changed files with 333 additions and 97 deletions

View File

@ -21,7 +21,6 @@ import thinc.neural._classes.layernorm
thinc.neural._classes.layernorm.set_compat_six_eight(False)
def train_textcat(tokenizer, textcat,
train_texts, train_cats, dev_texts, dev_cats,
n_iter=20):
@ -57,13 +56,15 @@ def evaluate(tokenizer, textcat, texts, cats):
for i, doc in enumerate(textcat.pipe(docs)):
gold = cats[i]
for label, score in doc.cats.items():
if score >= 0.5 and label in gold:
if label not in gold:
continue
if score >= 0.5 and gold[label] >= 0.5:
tp += 1.
elif score >= 0.5 and label not in gold:
elif score >= 0.5 and gold[label] < 0.5:
fp += 1.
elif score < 0.5 and label not in gold:
elif score < 0.5 and gold[label] < 0.5:
tn += 1
if score < 0.5 and label in gold:
elif score < 0.5 and gold[label] >= 0.5:
fn += 1
precis = tp / (tp + fp)
recall = tp / (tp + fn)
@ -80,7 +81,7 @@ def load_data(limit=0):
train_data = train_data[-limit:]
texts, labels = zip(*train_data)
cats = [(['POSITIVE'] if y else []) for y in labels]
cats = [{'POSITIVE': bool(y)} for y in labels]
split = int(len(train_data) * 0.8)
@ -97,7 +98,7 @@ def main(model_loc=None):
textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
print("Load IMDB data")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=1000)
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)
print("Itn.\tLoss\tP\tR\tF")
progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
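
Note on the evaluation change above: with `cats` now a dict of label -> score, the loop can tell explicit negatives (value < 0.5) apart from missing annotations (key absent, which is simply skipped). A self-contained sketch of the same counting logic, not part of the commit, with made-up predictions and gold values:

def evaluate_cats(predicted, gold):
    # predicted, gold: parallel lists of dicts mapping label -> score in [0, 1]
    tp = fp = tn = fn = 0.0
    for scores, truth in zip(predicted, gold):
        for label, score in scores.items():
            if label not in truth:      # missing annotation: contributes nothing
                continue
            if score >= 0.5 and truth[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and truth[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and truth[label] < 0.5:
                tn += 1.0
            elif score < 0.5 and truth[label] >= 0.5:
                fn += 1.0
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f_score = (2 * precision * recall / (precision + recall)
               if precision + recall else 0.0)
    return precision, recall, f_score

print(evaluate_cats([{'POSITIVE': 0.9}, {'POSITIVE': 0.2}],
                    [{'POSITIVE': 1.0}, {'POSITIVE': 0.0}]))   # (1.0, 1.0, 1.0)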

View File

@ -264,7 +264,8 @@ def HistoryFeatures(nr_class, hist_size=8, nr_dim=8):
return layerize(noop())
embed_tables = [Embed(nr_dim, nr_class, column=i, name='embed%d')
for i in range(hist_size)]
embed = concatenate(*embed_tables)
embed = chain(concatenate(*embed_tables),
LN(Maxout(hist_size*nr_dim, hist_size*nr_dim)))
ops = embed.ops
def add_history_fwd(vectors_hists, drop=0.):
vectors, hist_ids = vectors_hists
@ -742,5 +743,3 @@ def concatenate_lists(*layers, **kwargs): # pragma: no cover
return ys, concatenate_lists_bwd
model = wrap(concatenate_lists_fwd, concat)
return model
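
For orientation, the `chain(concatenate(...), LN(Maxout(...)))` change above adds a mixing layer on top of the per-slot history embeddings. A conceptual numpy sketch of what that pipeline computes, under assumed shapes and with randomly initialised weights (this is illustrative only, not the thinc implementation):

import numpy as np

nr_class, nr_dim, hist_size, pieces = 20, 8, 8, 3
rng = np.random.RandomState(0)
tables = [rng.normal(size=(nr_class, nr_dim)) for _ in range(hist_size)]   # one embed table per history slot
W = rng.normal(size=(pieces, hist_size * nr_dim, hist_size * nr_dim))
b = np.zeros((pieces, hist_size * nr_dim))

def history_features(hist_ids):
    # hist_ids: (batch, hist_size) int array of previous action IDs
    embedded = np.concatenate(
        [tables[i][hist_ids[:, i]] for i in range(hist_size)], axis=1)
    # Maxout: take the elementwise max over `pieces` affine projections
    pieces_out = np.einsum('bi,pij->bpj', embedded, W) + b
    maxed = pieces_out.max(axis=1)
    # Layer normalisation
    mean = maxed.mean(axis=1, keepdims=True)
    var = maxed.var(axis=1, keepdims=True)
    return (maxed - mean) / np.sqrt(var + 1e-6)

print(history_features(rng.randint(0, nr_class, size=(4, hist_size))).shape)   # (4, 64)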

View File

@ -3,13 +3,13 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy-nightly'
__version__ = '2.0.0a16'
__version__ = '2.0.0a17'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'
__email__ = 'contact@explosion.ai'
__license__ = 'MIT'
__release__ = True
__release__ = False
__docs_models__ = 'https://alpha.spacy.io/usage/models'
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import plac
from pathlib import Path
from .converters import conllu2json, iob2json
from .converters import conllu2json, iob2json, conll_ner2json
from ..util import prints
# Converters are matched by file extension. To add a converter, add a new entry
@ -12,9 +12,10 @@ from ..util import prints
# from /converters.
CONVERTERS = {
'.conllu': conllu2json,
'.conll': conllu2json,
'.iob': iob2json,
'conllu': conllu2json,
'conll': conllu2json,
'ner': conll_ner2json,
'iob': iob2json,
}
@ -22,9 +23,11 @@ CONVERTERS = {
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", int),
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
converter='auto'):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
@ -35,9 +38,11 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
prints(input_path, title="Input file not found", exits=1)
if not output_path.exists():
prints(output_path, title="Output directory not found", exits=1)
file_ext = input_path.suffix
if not file_ext in CONVERTERS:
prints("Can't find converter for %s" % input_path.parts[-1],
if converter == 'auto':
converter = input_path.suffix[1:]
if not converter in CONVERTERS:
prints("Can't find converter for %s" % converter,
title="Unknown format", exits=1)
CONVERTERS[file_ext](input_path, output_path,
func = CONVERTERS[converter]
func(input_path, output_path,
n_sents=n_sents, use_morphology=morphology)
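
In short, `--converter` now overrides the extension-based lookup, with `'auto'` falling back to the file suffix. A standalone sketch of the dispatch (not part of the commit; the registry values here are placeholder strings rather than the real converter functions):

from pathlib import Path

# Stand-in registry; in the CLI the values are the actual converter functions.
CONVERTERS = {'conllu': 'conllu2json', 'conll': 'conllu2json',
              'ner': 'conll_ner2json', 'iob': 'iob2json'}

def pick_converter(input_file, converter='auto'):
    if converter == 'auto':
        converter = Path(input_file).suffix[1:]     # 'train.conllu' -> 'conllu'
    if converter not in CONVERTERS:
        raise ValueError("Can't find converter for %s" % converter)
    return CONVERTERS[converter]

print(pick_converter('data/train.conllu'))          # conllu2json
print(pick_converter('data/train.txt', 'ner'))      # conll_ner2json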

View File

@ -1,2 +1,3 @@
from .conllu2json import conllu2json
from .iob2json import iob2json
from .conll_ner2json import conll_ner2json

View File

@ -114,15 +114,33 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
nlp.to_disk(epoch_model_path)
nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
scorer = nlp_loaded.evaluate(
list(corpus.dev_docs(
dev_docs = list(corpus.dev_docs(
nlp_loaded,
gold_preproc=gold_preproc)))
gold_preproc=gold_preproc))
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs)
end_time = timer()
if use_gpu < 0:
gpu_wps = None
cpu_wps = nwords/(end_time-start_time)
else:
gpu_wps = nwords/(end_time-start_time)
with Model.use_device('cpu'):
nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
dev_docs = list(corpus.dev_docs(
nlp_loaded, gold_preproc=gold_preproc))
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs)
end_time = timer()
cpu_wps = nwords/(end_time-start_time)
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_:
file_.write(json_dumps(scorer.scores))
meta_loc = output_path / ('model%d' % i) / 'meta.json'
meta['accuracy'] = scorer.scores
meta['speed'] = {'nwords': nwords, 'cpu':cpu_wps, 'gpu': gpu_wps}
meta['lang'] = nlp.lang
meta['pipeline'] = pipeline
meta['spacy_version'] = '>=%s' % about.__version__
@ -132,7 +150,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
with meta_loc.open('w') as file_:
file_.write(json_dumps(meta))
util.set_env_log(True)
print_progress(i, losses, scorer.scores)
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
finally:
print("Saving model...")
try:
@ -153,16 +171,17 @@ def _render_parses(i, to_render):
file_.write(html)
def print_progress(itn, losses, dev_scores, wps=0.0):
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
scores = {}
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
'ents_p', 'ents_r', 'ents_f', 'wps']:
'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
scores[col] = 0.0
scores['dep_loss'] = losses.get('parser', 0.0)
scores['ner_loss'] = losses.get('ner', 0.0)
scores['tag_loss'] = losses.get('tagger', 0.0)
scores.update(dev_scores)
scores['wps'] = wps
scores['cpu_wps'] = cpu_wps
scores['gpu_wps'] = gpu_wps or 0.0
tpl = '\t'.join((
'{:d}',
'{dep_loss:.3f}',
@ -173,7 +192,9 @@ def print_progress(itn, losses, dev_scores, wps=0.0):
'{ents_f:.3f}',
'{tags_acc:.3f}',
'{token_acc:.3f}',
'{wps:.1f}'))
'{cpu_wps:.1f}',
'{gpu_wps:.1f}',
))
print(tpl.format(itn, **scores))
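
The timing changes above report separate CPU and GPU words-per-second figures for the evaluation pass. A simplified, standalone sketch of the measurement; `evaluate` and the document format are placeholders, and the timer import is an assumption rather than a quote from the commit:

from timeit import default_timer as timer

def measure_wps(evaluate, dev_docs):
    # dev_docs: list of (doc, gold) pairs; wps = tokens evaluated per second
    nwords = sum(len(doc) for doc, gold in dev_docs)
    start_time = timer()
    scorer = evaluate(dev_docs)
    end_time = timer()
    return scorer, nwords / (end_time - start_time)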

View File

@ -387,7 +387,7 @@ cdef class GoldParse:
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
deps=None, entities=None, make_projective=False,
cats=tuple()):
cats=None):
"""Create a GoldParse.
doc (Doc): The document the annotations refer to.
@ -398,12 +398,15 @@ cdef class GoldParse:
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
cats (iterable): A sequence of labels for text classification. Each
label may be a string or an int, or a `(start_char, end_char, label)`
cats (dict): Labels for text classification. Each key in the dictionary
may be a string or an int, or a `(start_char, end_char, label)`
tuple, indicating that the label is applied to only part of the
document (usually a sentence). Unlike entity annotations, label
annotations can overlap, i.e. a single word can be covered by
multiple labelled spans.
multiple labelled spans. The TextCategorizer component expects
true examples of a label to have the value 1.0, and negative examples
of a label to have the value 0.0. Labels not in the dictionary are
treated as missing -- the gradient for those labels will be zero.
RETURNS (GoldParse): The newly constructed object.
"""
if words is None:
@ -434,7 +437,7 @@ cdef class GoldParse:
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.cats = list(cats)
self.cats = {} if cats is None else dict(cats)
self.words = [None] * len(doc)
self.tags = [None] * len(doc)
self.heads = [None] * len(doc)
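
Usage-wise, `cats` is now a dict: 1.0 marks a positive example of a label, 0.0 an explicit negative, and labels left out of the dict are treated as missing and receive zero gradient. A small sketch, not part of the commit:

from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.gold import GoldParse

doc = Doc(Vocab(), words=['This', 'movie', 'was', 'great'])
# POSITIVE is explicitly true, NEGATIVE explicitly false; SARCASM is simply missing
gold = GoldParse(doc, cats={'POSITIVE': 1.0, 'NEGATIVE': 0.0})
assert gold.cats == {'POSITIVE': 1.0, 'NEGATIVE': 0.0}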

View File

@ -126,7 +126,7 @@ def word_shape(text):
LEX_ATTRS = {
attrs.LOWER: lambda string: string.lower(),
attrs.NORM: lambda string: string.lower(),
attrs.PREFIX: lambda string: string[0],
attrs.PREFIX: lambda string: string[:3],
attrs.SUFFIX: lambda string: string[-3:],
attrs.CLUSTER: lambda string: 0,
attrs.IS_ALPHA: lambda string: string.isalpha(),
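
The PREFIX change makes the prefix feature three characters long, mirroring SUFFIX. For example:

PREFIX = lambda string: string[:3]
SUFFIX = lambda string: string[-3:]
print(PREFIX('spacy'), SUFFIX('spacy'))   # spa acy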

View File

@ -158,11 +158,13 @@ class BaseThincComponent(object):
def to_bytes(self, **exclude):
"""Serialize the pipe to a bytestring."""
serialize = OrderedDict((
('cfg', lambda: json_dumps(self.cfg)),
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes())
))
serialize = OrderedDict()
serialize['cfg'] = lambda: json_dumps(self.cfg)
if self.model in (True, False, None):
serialize['model'] = lambda: self.model
else:
serialize['model'] = self.model.to_bytes
serialize['vocab'] = self.vocab.to_bytes
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
@ -183,11 +185,11 @@ class BaseThincComponent(object):
def to_disk(self, path, **exclude):
"""Serialize the pipe to disk."""
serialize = OrderedDict((
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
('vocab', lambda p: self.vocab.to_disk(p)),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
))
serialize = OrderedDict()
serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg))
serialize['vocab'] = lambda p: self.vocab.to_disk(p)
if self.model not in (None, True, False):
serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
@ -438,13 +440,16 @@ class NeuralTagger(BaseThincComponent):
yield
def to_bytes(self, **exclude):
serialize = OrderedDict((
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes()),
('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map,
serialize = OrderedDict()
if self.model in (None, True, False):
serialize['model'] = lambda: self.model
else:
serialize['model'] = self.model.to_bytes
serialize['vocab'] = self.vocab.to_bytes
serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map,
use_bin_type=True,
encoding='utf8'))
))
encoding='utf8')
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
@ -552,7 +557,6 @@ class NeuralLabeller(NeuralTagger):
label = self.make_label(i, words, tags, heads, deps, ents)
if label is not None and label not in self.labels:
self.labels[label] = len(self.labels)
print(len(self.labels))
if self.model is True:
token_vector_width = util.env_opt('token_vector_width')
self.model = chain(
@ -721,11 +725,17 @@ class TextCategorizer(BaseThincComponent):
def get_loss(self, docs, golds, scores):
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
for i, gold in enumerate(golds):
for j, label in enumerate(self.labels):
truths[i, j] = label in gold.cats
if label in gold.cats:
truths[i, j] = gold.cats[label]
else:
not_missing[i, j] = 0.
truths = self.model.ops.asarray(truths)
not_missing = self.model.ops.asarray(not_missing)
d_scores = (scores-truths) / scores.shape[0]
d_scores *= not_missing
mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
return mean_square_error, d_scores
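
The `not_missing` mask above means unannotated labels contribute neither loss nor gradient. A numpy sketch of the same computation outside the class, with a made-up label set and scores:

import numpy as np

labels = ['POSITIVE', 'NEGATIVE']
golds = [{'POSITIVE': 1.0}, {'POSITIVE': 0.0, 'NEGATIVE': 1.0}]     # doc 0 has no NEGATIVE annotation
scores = np.asarray([[0.8, 0.3], [0.4, 0.9]], dtype='f')            # model outputs

truths = np.zeros((len(golds), len(labels)), dtype='f')
not_missing = np.ones((len(golds), len(labels)), dtype='f')
for i, gold in enumerate(golds):
    for j, label in enumerate(labels):
        if label in gold:
            truths[i, j] = gold[label]
        else:
            not_missing[i, j] = 0.0                 # missing label: no gradient

d_scores = (scores - truths) / scores.shape[0]      # gradient of the squared error
d_scores *= not_missing                             # zero out missing labels
mean_square_error = ((scores - truths) ** 2).sum(axis=1).mean()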

View File

@ -61,13 +61,13 @@ cdef struct TokenC:
attr_t sense
int head
attr_t dep
bint sent_start
uint32_t l_kids
uint32_t r_kids
uint32_t l_edge
uint32_t r_edge
int sent_start
int ent_iob
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
hash_t ent_id

View File

@ -307,6 +307,8 @@ cdef cppclass StateC:
this._stack[this._s_i] = this.B(0)
this._s_i += 1
this._b_i += 1
if this.B_(0).sent_start == 1:
this.set_break(this.B(0))
if this._b_i > this._break:
this._break = -1
@ -383,7 +385,7 @@ cdef cppclass StateC:
void set_break(int i) nogil:
if 0 <= i < this.length:
this._sent[i].sent_start = True
this._sent[i].sent_start = 1
this._break = this._b_i
void clone(const StateC* src) nogil:

View File

@ -118,7 +118,7 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
cdef class Shift:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and st.B_(0).sent_start != 1
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
@ -178,7 +178,7 @@ cdef class Reduce:
cdef class LeftArc:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start
return st.B_(0).sent_start != 1
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
@ -212,7 +212,7 @@ cdef class LeftArc:
cdef class RightArc:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start
return st.B_(0).sent_start != 1
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
@ -248,6 +248,10 @@ cdef class Break:
return False
elif st.stack_depth() < 1:
return False
elif st.B_(0).l_edge < 0:
return False
elif st._sent[st.B_(0).l_edge].sent_start < 0:
return False
else:
return True

View File

@ -219,30 +219,28 @@ cdef class BiluoPushDown(TransitionSystem):
raise Exception(move)
return t
#def add_action(self, int action, label_name):
# cdef attr_t label_id
# if not isinstance(label_name, (int, long)):
# label_id = self.strings.add(label_name)
# else:
# label_id = label_name
# if action == OUT and label_id != 0:
# return
# if action == MISSING or action == ISNT:
# return
# # Check we're not creating a move we already have, so that this is
# # idempotent
# for trans in self.c[:self.n_moves]:
# if trans.move == action and trans.label == label_id:
# return 0
# if self.n_moves >= self._size:
# self._size *= 2
# self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
# self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
# assert self.c[self.n_moves].label == label_id
# self.n_moves += 1
# return 1
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, (int, long)):
label_id = self.strings.add(label_name)
else:
label_id = label_name
if action == OUT and label_id != 0:
return
if action == MISSING or action == ISNT:
return
# Check we're not creating a move we already have, so that this is
# idempotent
for trans in self.c[:self.n_moves]:
if trans.move == action and trans.label == label_id:
return 0
if self.n_moves >= self._size:
self._size *= 2
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
assert self.c[self.n_moves].label == label_id
self.n_moves += 1
return 1
cdef int initialize_state(self, StateC* st) nogil:
# This is especially necessary when we use limited training data.

View File

@ -51,7 +51,7 @@ from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
from .._ml import Residual, drop_layer, flatten
from .._ml import link_vectors_to_models
from .._ml import HistoryFeatures
from ..compat import json_dumps
from ..compat import json_dumps, copy_array
from . import _parse_features
from ._parse_features cimport CONTEXT_SIZE
@ -239,13 +239,13 @@ cdef class Parser:
"""
@classmethod
def Model(cls, nr_class, **cfg):
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 2))
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 0))
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 1))
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 3))
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 4))
hist_width = util.env_opt('history_width', cfg.get('hist_width', 16))
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
if hist_size >= 1 and depth == 0:
raise ValueError("Inconsistent hyper-params: "
"history_feats >= 1 but parser_hidden_depth==0")
@ -789,12 +789,22 @@ cdef class Parser:
return []
def add_label(self, label):
resized = False
for action in self.moves.action_types:
added = self.moves.add_action(action, label)
if added:
# Important that the labels be stored as a list! We need the
# order, or the model goes out of synch
self.cfg.setdefault('extra_labels', []).append(label)
resized = True
if self.model not in (True, False, None) and resized:
# Weights are stored in (nr_out, nr_in) format, so we're basically
# just adding rows here.
smaller = self.model[-1]._layers[-1]
larger = Affine(self.moves.n_moves, smaller.nI)
copy_array(larger.W[:smaller.nO], smaller.W)
copy_array(larger.b[:smaller.nO], smaller.b)
self.model[-1]._layers[-1] = larger
def begin_training(self, gold_tuples, pipeline=None, **cfg):
if 'model' in cfg:
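
The `add_label` change resizes the model's final layer when new moves are added after the weights already exist: the old rows are copied into a larger matrix, so trained classes keep their parameters and the new rows start at zero. A numpy sketch of that resize, not tied to thinc, using the (nr_out, nr_in) layout mentioned in the comment:

import numpy as np

def resize_output_layer(W, b, new_nr_out):
    # W: (nr_out, nr_in), b: (nr_out,) -- parameters of the existing output layer
    nr_out, nr_in = W.shape
    W_new = np.zeros((new_nr_out, nr_in), dtype=W.dtype)
    b_new = np.zeros((new_nr_out,), dtype=b.dtype)
    W_new[:nr_out] = W          # trained classes keep their weights
    b_new[:nr_out] = b          # rows for the new classes stay zero
    return W_new, b_new

W, b = np.random.randn(5, 16), np.zeros(5)
W2, b2 = resize_output_layer(W, b, 7)               # e.g. two new actions were added
assert (W2[:5] == W).all() and not W2[5:].any()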

View File

@ -0,0 +1,68 @@
'''Test the ability to add a label to a (potentially trained) parsing model.'''
from __future__ import unicode_literals
import pytest
import numpy.random
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
from ...attrs import NORM
from ...gold import GoldParse
from ...vocab import Vocab
from ...tokens import Doc
from ...pipeline import NeuralDependencyParser
numpy.random.seed(0)
@pytest.fixture
def vocab():
return Vocab(lex_attr_getters={NORM: lambda s: s})
@pytest.fixture
def parser(vocab):
parser = NeuralDependencyParser(vocab)
parser.cfg['token_vector_width'] = 4
parser.cfg['hidden_width'] = 6
parser.cfg['hist_size'] = 0
parser.add_label('left')
parser.begin_training([], **parser.cfg)
sgd = Adam(NumpyOps(), 0.001)
for i in range(30):
losses = {}
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
gold = GoldParse(doc, heads=[1, 1, 3, 3],
deps=['left', 'ROOT', 'left', 'ROOT'])
parser.update([doc], [gold], sgd=sgd, losses=losses)
return parser
def test_add_label(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = parser(doc)
assert doc[0].head.i == 1
assert doc[0].dep_ == 'left'
assert doc[1].head.i == 1
assert doc[2].head.i == 3
assert doc[2].head.i == 3
parser.add_label('right')
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = parser(doc)
assert doc[0].head.i == 1
assert doc[0].dep_ == 'left'
assert doc[1].head.i == 1
assert doc[2].head.i == 3
assert doc[2].head.i == 3
sgd = Adam(NumpyOps(), 0.001)
for i in range(10):
losses = {}
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
gold = GoldParse(doc, heads=[1, 1, 3, 3],
deps=['right', 'ROOT', 'left', 'ROOT'])
parser.update([doc], [gold], sgd=sgd, losses=losses)
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = parser(doc)
assert doc[0].dep_ == 'right'
assert doc[2].dep_ == 'left'

View File

@ -35,7 +35,8 @@ def parser(vocab, arc_eager):
@pytest.fixture
def model(arc_eager, tok2vec):
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO,
hist_size=0)[0]
@pytest.fixture
def doc(vocab):
@ -51,7 +52,7 @@ def test_can_init_nn_parser(parser):
def test_build_model(parser):
parser.model = Parser.Model(parser.moves.n_moves)[0]
parser.model = Parser.Model(parser.moves.n_moves, hist_size=0)[0]
assert parser.model is not None

View File

@ -0,0 +1,73 @@
'''Test that the parser respects preset sentence boundaries.'''
from __future__ import unicode_literals
import pytest
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
from ...attrs import NORM
from ...gold import GoldParse
from ...vocab import Vocab
from ...tokens import Doc
from ...pipeline import NeuralDependencyParser
@pytest.fixture
def vocab():
return Vocab(lex_attr_getters={NORM: lambda s: s})
@pytest.fixture
def parser(vocab):
parser = NeuralDependencyParser(vocab)
parser.cfg['token_vector_width'] = 4
parser.cfg['hidden_width'] = 32
#parser.add_label('right')
parser.add_label('left')
parser.begin_training([], **parser.cfg)
sgd = Adam(NumpyOps(), 0.001)
for i in range(10):
losses = {}
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
gold = GoldParse(doc, heads=[1, 1, 3, 3],
deps=['left', 'ROOT', 'left', 'ROOT'])
parser.update([doc], [gold], sgd=sgd, losses=losses)
return parser
def test_no_sentences(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = parser(doc)
assert len(list(doc.sents)) == 2
def test_sents_1(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc[2].sent_start = True
doc = parser(doc)
assert len(list(doc.sents)) >= 2
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc[1].sent_start = False
doc[2].sent_start = True
doc[3].sent_start = False
doc = parser(doc)
assert len(list(doc.sents)) == 2
def test_sents_1_2(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc[1].sent_start = True
doc[2].sent_start = True
doc = parser(doc)
assert len(list(doc.sents)) == 3
def test_sents_1_3(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc[1].sent_start = True
doc[3].sent_start = True
doc = parser(doc)
assert len(list(doc.sents)) == 4
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc[1].sent_start = True
doc[2].sent_start = False
doc[3].sent_start = True
doc = parser(doc)
assert len(list(doc.sents)) == 3

View File

@ -0,0 +1,9 @@
import spacy
import spacy.lang.en
from spacy.pipeline import TextCategorizer
def test_bytes_serialize_issue_1105():
nlp = spacy.lang.en.English()
tokenizer = nlp.tokenizer
textcat = TextCategorizer(tokenizer.vocab, labels=['ENTITY', 'ACTION', 'MODIFIER'])
textcat_bytes = textcat.to_bytes()

View File

@ -506,7 +506,7 @@ cdef class Doc:
cdef int i
start = 0
for i in range(1, self.length):
if self.c[i].sent_start:
if self.c[i].sent_start == 1:
yield Span(self, start, i)
start = i
if start != self.length:

View File

@ -129,6 +129,29 @@ cdef class Span:
def _(self):
return Underscore(Underscore.span_extensions, self,
start=self.start_char, end=self.end_char)
def as_doc(self):
'''Create a Doc object view of the Span's data.
This is mostly useful for C-typed interfaces.
'''
cdef Doc doc = Doc(self.doc.vocab)
doc.length = self.end-self.start
doc.c = &self.doc.c[self.start]
doc.mem = self.doc.mem
doc.is_parsed = self.doc.is_parsed
doc.is_tagged = self.doc.is_tagged
doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
doc.user_hooks = self.doc.user_hooks
doc.user_span_hooks = self.doc.user_span_hooks
doc.user_token_hooks = self.doc.user_token_hooks
doc.vector = self.vector
doc.vector_norm = self.vector_norm
for key, value in self.doc.cats.items():
if hasattr(key, '__len__') and len(key) == 3:
cat_start, cat_end, cat_label = key
if cat_start == self.start_char and cat_end == self.end_char:
doc.cats[cat_label] = value
return doc
def merge(self, *args, **attributes):
"""Retokenize the document, such that the span is merged into a single

View File

@ -300,13 +300,21 @@ cdef class Token:
def __get__(self):
return self.c.sent_start
def __set__(self, bint value):
def __set__(self, value):
if self.doc.is_parsed:
raise ValueError(
'Refusing to write to token.sent_start if its document is parsed, '
'because this may cause inconsistent state. '
'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
self.c.sent_start = value
if value is None:
self.c.sent_start = 0
elif value is True:
self.c.sent_start = 1
elif value is False:
self.c.sent_start = -1
else:
raise ValueError("Invalid value for token.sent_start -- must be one of "
"None, True, False")
property lefts:
def __get__(self):
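
`Token.sent_start` is now ternary rather than boolean: `None` stores 0 (unknown, left to the parser), `True` stores 1 (force a sentence boundary), and `False` stores -1 (forbid one), which is why the transition checks above compare against 1 explicitly. A short usage sketch mirroring the preset-boundary tests:

from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=['a', 'b', 'c', 'd'])
doc[2].sent_start = True       # stored as 1: a sentence must start here
doc[1].sent_start = False      # stored as -1: a sentence may not start here
doc[3].sent_start = None       # stored as 0: left for the parser to decide
# Any other value raises ValueError, and writing is refused once doc.is_parsed is set.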