Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-11-04 00:25:02 +01:00
commit 3ca16ddbd4
19 changed files with 290 additions and 138 deletions

View File

@@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply.
 7. Please place an “x” on one of the applicable statement below. Please do NOT
 mark both statements:
-* [ ] I am signing on behalf of myself as an individual and no other person
+* [x] I am signing on behalf of myself as an individual and no other person
 or entity, including my employer, has or will have rights with respect to my
 contributions.
@@ -98,9 +98,9 @@ mark both statements:
 | Field                          | Entry                            |
 |------------------------------- | -------------------------------- |
-| Name                           |                                  |
-| Company name (if applicable)   |                                  |
-| Title or role (if applicable)  |                                  |
-| Date                           |                                  |
-| GitHub username                |                                  |
-| Website (optional)             |                                  |
+| Name                           | Abhinav Sharma                   |
+| Company name (if applicable)   | Fourtek I.T. Solutions Pvt. Ltd. |
+| Title or role (if applicable)  | Machine Learning Engineer        |
+| Date                           | 3 November 2017                  |
+| GitHub username                | abhi18av                         |
+| Website (optional)             | https://abhi18av.github.io/      |

View File

@@ -409,12 +409,14 @@ def build_tagger_model(nr_class, **cfg):
     else:
         tok2vec = Tok2Vec(token_vector_width, embed_size,
                           pretrained_dims=pretrained_dims)
+    softmax = with_flatten(Softmax(nr_class, token_vector_width))
     model = (
         tok2vec
-        >> with_flatten(Softmax(nr_class, token_vector_width))
+        >> softmax
     )
     model.nI = None
     model.tok2vec = tok2vec
+    model.softmax = softmax
     return model
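
Exposing the softmax as a named sublayer, alongside model.tok2vec, lets callers drive the embedding stage and the prediction stage separately; Tagger.predict later in this commit does exactly that. A minimal sketch of the intended use, assuming `model` was returned by build_tagger_model and `docs` is a list of Doc objects:

    # Hedged sketch: the two stages can now be called independently.
    tokvecs = model.tok2vec(docs)     # embedding + CNN features, per doc
    scores = model.softmax(tokvecs)   # tag probabilities, per doc
    for doc_scores in scores:
        tag_ids = doc_scores.argmax(axis=1)   # one tag id per token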

View File

@@ -391,6 +391,7 @@ class Language(object):
         for name, proc in pipes:
             if not hasattr(proc, 'update'):
                 continue
+            grads = {}
             proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
             for key, (W, dW) in grads.items():
                 sgd(W, dW, key=key)
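
Re-initialising `grads` for each component keeps one pipe's gradients from being applied again during the next pipe's pass. A minimal sketch of the accumulate-then-apply pattern, with `get_grads` standing in for the closure `Language.update` defines just above this hunk:

    grads = {}

    def get_grads(W, dW, key=None):
        # Stash (weights, gradient) pairs instead of updating immediately.
        grads[key] = (W, dW)

    # proc.update(docs, golds, sgd=get_grads)   # fills `grads`
    # for key, (W, dW) in grads.items():
    #     sgd(W, dW, key=key)                   # one optimizer pass per pipe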

View File

@@ -129,8 +129,14 @@ cdef class Morphology:
         tag (unicode): The part-of-speech tag to key the exception.
         orth (unicode): The word-form to key the exception.
         """
+        # TODO: Currently we've assumed that we know the number of tags --
+        # RichTagC is an array, and _cache is a PreshMapArray
+        # This is really bad: it makes the morphology typed to the tagger
+        # classes, which is all wrong.
         self.exc[(tag_str, orth_str)] = dict(attrs)
         tag = self.strings.add(tag_str)
+        if tag not in self.reverse_index:
+            return
         tag_id = self.reverse_index[tag]
         orth = self.strings[orth_str]
         cdef RichTagC rich_tag = self.rich_tags[tag_id]

View File

@@ -11,7 +11,7 @@ import ujson
 import msgpack
 from thinc.api import chain
-from thinc.v2v import Affine, Softmax
+from thinc.v2v import Affine, SELU, Softmax
 from thinc.t2v import Pooling, max_pool, mean_pool
 from thinc.neural.util import to_categorical, copy_array
 from thinc.neural._classes.difference import Siamese, CauchySimilarity

@@ -29,7 +29,7 @@ from .compat import json_dumps
 from .attrs import POS
 from .parts_of_speech import X
 from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
-from ._ml import link_vectors_to_models
+from ._ml import link_vectors_to_models, zero_init, flatten
 from . import util

@@ -91,8 +91,8 @@ class Pipe(object):
         Both __call__ and pipe should delegate to the `predict()`
         and `set_annotations()` methods.
         """
-        scores = self.predict([doc])
-        self.set_annotations([doc], scores)
+        scores, tensors = self.predict([doc])
+        self.set_annotations([doc], scores, tensors=tensors)
         return doc
     def pipe(self, stream, batch_size=128, n_threads=-1):
@@ -103,8 +103,8 @@ class Pipe(object):
         """
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
-            scores = self.predict(docs)
-            self.set_annotations(docs, scores)
+            scores, tensors = self.predict(docs)
+            self.set_annotations(docs, scores, tensors=tensors)
             yield from docs
     def predict(self, docs):
@@ -113,7 +113,7 @@ class Pipe(object):
         """
         raise NotImplementedError

-    def set_annotations(self, docs, scores):
+    def set_annotations(self, docs, scores, tensors=None):
         """Modify a batch of documents, using pre-computed scores."""
         raise NotImplementedError
@@ -216,7 +216,7 @@ class Tensorizer(Pipe):
     name = 'tensorizer'

     @classmethod
-    def Model(cls, width=128, embed_size=4000, **cfg):
+    def Model(cls, output_size=300, input_size=384, **cfg):
         """Create a new statistical model for the class.

         width (int): Output size of the model.
@@ -224,9 +224,11 @@ class Tensorizer(Pipe):
         **cfg: Config parameters.
         RETURNS (Model): A `thinc.neural.Model` or similar instance.
         """
-        width = util.env_opt('token_vector_width', width)
-        embed_size = util.env_opt('embed_size', embed_size)
-        return Tok2Vec(width, embed_size, **cfg)
+        model = chain(
+            SELU(output_size, input_size),
+            SELU(output_size, output_size),
+            zero_init(Affine(output_size, output_size)))
+        return model
     def __init__(self, vocab, model=True, **cfg):
         """Construct a new statistical model. Weights are not allocated on
@@ -244,6 +246,7 @@ class Tensorizer(Pipe):
         """
         self.vocab = vocab
         self.model = model
+        self.input_models = []
         self.cfg = dict(cfg)
         self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
         self.cfg.setdefault('cnn_maxout_pieces', 3)
@@ -269,8 +272,8 @@ class Tensorizer(Pipe):
         """
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
-            tokvecses = self.predict(docs)
-            self.set_annotations(docs, tokvecses)
+            tensors = self.predict(docs)
+            self.set_annotations(docs, tensors)
             yield from docs
     def predict(self, docs):
@@ -279,18 +282,19 @@ class Tensorizer(Pipe):
         docs (iterable): A sequence of `Doc` objects.
         RETURNS (object): Vector representations for each token in the docs.
         """
-        tokvecs = self.model(docs)
-        return tokvecs
+        inputs = self.model.ops.flatten([doc.tensor for doc in docs])
+        outputs = self.model(inputs)
+        return self.model.ops.unflatten(outputs, [len(d) for d in docs])

-    def set_annotations(self, docs, tokvecses):
+    def set_annotations(self, docs, tensors):
         """Set the tensor attribute for a batch of documents.

         docs (iterable): A sequence of `Doc` objects.
-        tokvecs (object): Vector representation for each token in the docs.
+        tensors (object): Vector representation for each token in the docs.
         """
-        for doc, tokvecs in zip(docs, tokvecses):
-            assert tokvecs.shape[0] == len(doc)
-            doc.tensor = tokvecs
+        for doc, tensor in zip(docs, tensors):
+            assert tensor.shape[0] == len(doc)
+            doc.tensor = tensor
     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
         """Update the model.
@@ -303,11 +307,34 @@ class Tensorizer(Pipe):
         """
         if isinstance(docs, Doc):
             docs = [docs]
-        tokvecs, bp_tokvecs = self.model.begin_update(docs, drop=drop)
-        return tokvecs, bp_tokvecs
+        inputs = []
+        bp_inputs = []
+        for tok2vec in self.input_models:
+            tensor, bp_tensor = tok2vec.begin_update(docs, drop=drop)
+            inputs.append(tensor)
+            bp_inputs.append(bp_tensor)
+        inputs = self.model.ops.xp.hstack(inputs)
+        scores, bp_scores = self.model.begin_update(inputs, drop=drop)
+        loss, d_scores = self.get_loss(docs, golds, scores)
+        d_inputs = bp_scores(d_scores, sgd=sgd)
+        d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
+        for d_input, bp_input in zip(d_inputs, bp_inputs):
+            bp_input(d_input, sgd=sgd)
+        if losses is not None:
+            losses.setdefault(self.name, 0.)
+            losses[self.name] += loss
+        return loss

-    def get_loss(self, docs, golds, scores):
-        raise NotImplementedError
+    def get_loss(self, docs, golds, prediction):
+        target = []
+        i = 0
+        for doc in docs:
+            vectors = self.model.ops.xp.vstack([w.vector for w in doc])
+            target.append(vectors)
+        target = self.model.ops.xp.vstack(target)
+        d_scores = (prediction - target) / prediction.shape[0]
+        loss = (d_scores**2).sum()
+        return loss, d_scores
     def begin_training(self, gold_tuples=tuple(), pipeline=None):
         """Allocate models, pre-process training data and acquire a trainer and
@@ -316,8 +343,13 @@ class Tensorizer(Pipe):
         gold_tuples (iterable): Gold-standard training data.
         pipeline (list): The pipeline the model is part of.
         """
+        for name, model in pipeline:
+            if getattr(model, 'tok2vec', None):
+                self.input_models.append(model.tok2vec)
         if self.model is True:
-            self.cfg['pretrained_dims'] = self.vocab.vectors_length
+            self.cfg['input_size'] = 384
+            self.cfg['output_size'] = 300
+            #self.cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model = self.Model(**self.cfg)
         link_vectors_to_models(self.vocab)
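
The redesigned Tensorizer no longer embeds tokens itself: it takes the concatenated tok2vec outputs of the other pipeline components and is trained to reproduce each token's pretrained vector. A hedged sketch of the objective from get_loss above, with plain numpy standing in for `self.model.ops.xp`:

    import numpy

    def tensorizer_loss(prediction, target):
        # As in Tensorizer.get_loss: the gradient is the row-averaged
        # difference, and the reported loss is its summed square.
        d_scores = (prediction - target) / prediction.shape[0]
        loss = (d_scores ** 2).sum()
        return loss, d_scores

    prediction = numpy.zeros((4, 300), dtype='f')  # model output
    target = numpy.ones((4, 300), dtype='f')       # stacked token vectors
    loss, d_scores = tensorizer_loss(prediction, target)
    assert d_scores.shape == prediction.shape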
@@ -337,28 +369,37 @@ class Tagger(Pipe):
     def labels(self):
         return self.vocab.morphology.tag_names

+    @property
+    def tok2vec(self):
+        if self.model in (None, True, False):
+            return None
+        else:
+            return chain(self.model.tok2vec, flatten)
+
     def __call__(self, doc):
-        tags = self.predict([doc])
-        self.set_annotations([doc], tags)
+        tags, tokvecs = self.predict([doc])
+        self.set_annotations([doc], tags, tensors=tokvecs)
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
-            tag_ids = self.predict(docs)
-            self.set_annotations(docs, tag_ids)
+            tag_ids, tokvecs = self.predict(docs)
+            self.set_annotations(docs, tag_ids, tensors=tokvecs)
             yield from docs
     def predict(self, docs):
-        scores = self.model(docs)
-        scores = self.model.ops.flatten(scores)
-        guesses = scores.argmax(axis=1)
-        if not isinstance(guesses, numpy.ndarray):
-            guesses = guesses.get()
-        guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs])
-        return guesses
+        tokvecs = self.model.tok2vec(docs)
+        scores = self.model.softmax(tokvecs)
+        guesses = []
+        for doc_scores in scores:
+            doc_guesses = doc_scores.argmax(axis=1)
+            if not isinstance(doc_guesses, numpy.ndarray):
+                doc_guesses = doc_guesses.get()
+            guesses.append(doc_guesses)
+        return guesses, tokvecs

-    def set_annotations(self, docs, batch_tag_ids):
+    def set_annotations(self, docs, batch_tag_ids, tensors=None):
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc
@@ -373,6 +414,8 @@ class Tagger(Pipe):
                 if doc.c[j].tag == 0 and doc.c[j].pos == 0:
                     vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
                 idx += 1
+            if tensors is not None:
+                doc.extend_tensor(tensors[i])
             doc.is_tagged = True

     def update(self, docs, golds, drop=0., sgd=None, losses=None):

@@ -573,7 +616,7 @@ class MultitaskObjective(Tagger):
     def labels(self, value):
         self.cfg['labels'] = value

-    def set_annotations(self, docs, dep_ids):
+    def set_annotations(self, docs, dep_ids, tensors=None):
         pass

     def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):
@@ -720,15 +763,15 @@ class TextCategorizer(Pipe):
         self.cfg['labels'] = value

     def __call__(self, doc):
-        scores = self.predict([doc])
-        self.set_annotations([doc], scores)
+        scores, tensors = self.predict([doc])
+        self.set_annotations([doc], scores, tensors=tensors)
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
-            scores = self.predict(docs)
-            self.set_annotations(docs, scores)
+            scores, tensors = self.predict(docs)
+            self.set_annotations(docs, scores, tensors=tensors)
             yield from docs
     def predict(self, docs):
@@ -736,8 +779,10 @@ class TextCategorizer(Pipe):
         scores = self.model.ops.asarray(scores)
         return scores

-    def set_annotations(self, docs, scores):
+    def set_annotations(self, docs, scores, tensors=None):
         for i, doc in enumerate(docs):
+            if tensors is not None:
+                doc.extend_tensor(tensors[i])
             for j, label in enumerate(self.labels):
                 doc.cats[label] = float(scores[i, j])

View File

@@ -1,6 +1,7 @@
 # cython: infer_types=True
 # cython: cdivision=True
 # cython: boundscheck=False
+# cython: profile=True
 # coding: utf-8
 from __future__ import unicode_literals, print_function

@@ -322,15 +323,17 @@ cdef class Parser:
         beam_density = self.cfg.get('beam_density', 0.0)
         cdef Beam beam
         if beam_width == 1:
-            states = self.parse_batch([doc])
-            self.set_annotations([doc], states)
+            states, tokvecs = self.parse_batch([doc])
+            self.set_annotations([doc], states, tensors=tokvecs)
             return doc
         else:
-            beam = self.beam_parse([doc],
-                beam_width=beam_width, beam_density=beam_density)[0]
+            beams, tokvecs = self.beam_parse([doc],
+                                             beam_width=beam_width,
+                                             beam_density=beam_density)
+            beam = beams[0]
             output = self.moves.get_beam_annot(beam)
             state = <StateClass>beam.at(0)
-            self.set_annotations([doc], [state])
+            self.set_annotations([doc], [state], tensors=tokvecs)
             _cleanup(beam)
             return output
@@ -356,15 +359,16 @@ cdef class Parser:
             for subbatch in cytoolz.partition_all(8, by_length):
                 subbatch = list(subbatch)
                 if beam_width == 1:
-                    parse_states = self.parse_batch(subbatch)
+                    parse_states, tokvecs = self.parse_batch(subbatch)
                     beams = []
                 else:
-                    beams = self.beam_parse(subbatch, beam_width=beam_width,
-                                            beam_density=beam_density)
+                    beams, tokvecs = self.beam_parse(subbatch,
+                                                     beam_width=beam_width,
+                                                     beam_density=beam_density)
                     parse_states = []
                     for beam in beams:
                         parse_states.append(<StateClass>beam.at(0))
-                self.set_annotations(subbatch, parse_states)
+                self.set_annotations(subbatch, parse_states, tensors=tokvecs)
             yield from batch
     def parse_batch(self, docs):
@@ -411,7 +415,9 @@ cdef class Parser:
                           feat_weights, bias, hW, hb,
                           nr_class, nr_hidden, nr_feat, nr_piece)
             PyErr_CheckSignals()
-        return state_objs
+        tokvecs = self.model[0].ops.unflatten(tokvecs,
+                                              [len(doc) for doc in docs])
+        return state_objs, tokvecs

     cdef void _parseC(self, StateC* state,
                       const float* feat_weights, const float* bias,
@@ -508,7 +514,9 @@ cdef class Parser:
                 beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
                 beam.check_done(_check_final_state, NULL)
             beams.append(beam)
-        return beams
+        tokvecs = self.model[0].ops.unflatten(tokvecs,
+                                              [len(doc) for doc in docs])
+        return beams, tokvecs
     def update(self, docs, golds, drop=0., sgd=None, losses=None):
         if not any(self.moves.has_gold(gold) for gold in golds):
@@ -735,18 +743,29 @@ cdef class Parser:
             c_d_scores += d_scores.shape[1]
         return d_scores

-    def set_annotations(self, docs, states):
+    def set_annotations(self, docs, states, tensors=None):
         cdef StateClass state
         cdef Doc doc
-        for state, doc in zip(states, docs):
+        for i, (state, doc) in enumerate(zip(states, docs)):
             self.moves.finalize_state(state.c)
-            for i in range(doc.length):
-                doc.c[i] = state.c._sent[i]
+            for j in range(doc.length):
+                doc.c[j] = state.c._sent[j]
+            if tensors is not None:
+                doc.extend_tensor(tensors[i])
             self.moves.finalize_doc(doc)
         for hook in self.postprocesses:
             for doc in docs:
                 hook(doc)

+    @property
+    def tok2vec(self):
+        '''Return the embedding and convolutional layer of the model.'''
+        if self.model in (None, True, False):
+            return None
+        else:
+            return self.model[0]
+
     @property
     def postprocesses(self):
         # Available for subclasses, e.g. to deprojectivize
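
With both the tagger and the parser exposing a tok2vec property, Tensorizer.begin_training (above) can discover its input models by walking the pipeline. A hedged sketch of that wiring, where `pipeline` is a list of (name, component) pairs:

    # Hedged sketch: collect the embedding/CNN layer of every component
    # that exposes one, as Tensorizer.begin_training does.
    input_models = []
    for name, component in pipeline:
        tok2vec = getattr(component, 'tok2vec', None)
        if tok2vec is not None:
            input_models.append(tok2vec)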

View File

@@ -22,35 +22,37 @@ def test_doc_lemmatization(EN):
                                          ("ring", ["ring"]),
                                          ("axes", ["axis", "axe", "ax"])])
 def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
-    assert en_lemmatizer.noun(text) == set(lemmas)
+    assert en_lemmatizer.noun(text) == lemmas


 @pytest.mark.models('en')
 @pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
                                          ("feed", ["feed"]),
                                          ("need", ["need"]),
-                                         ("ring", ["ring"]),
-                                         ("axes", ["axis", "axe", "ax"])])
+                                         ("ring", ["ring"])])
 def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
-    assert en_lemmatizer.noun(text) == set(lemmas)
+    # Cases like this are problematic -- not clear what we should do to resolve
+    # ambiguity?
+    # ("axes", ["ax", "axes", "axis"])])
+    assert en_lemmatizer.noun(text) == lemmas


 @pytest.mark.xfail
 @pytest.mark.models('en')
 def test_en_lemmatizer_base_forms(en_lemmatizer):
-    assert en_lemmatizer.noun('dive', {'number': 'sing'}) == set(['dive'])
-    assert en_lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva'])
+    assert en_lemmatizer.noun('dive', {'number': 'sing'}) == ['dive']
+    assert en_lemmatizer.noun('dive', {'number': 'plur'}) == ['diva']


 @pytest.mark.models('en')
 def test_en_lemmatizer_base_form_verb(en_lemmatizer):
-    assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
+    assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == ['see']


 @pytest.mark.models('en')
 def test_en_lemmatizer_punct(en_lemmatizer):
-    assert en_lemmatizer.punct('“') == set(['"'])
-    assert en_lemmatizer.punct('”') == set(['"'])
+    assert en_lemmatizer.punct('“') == ['"']
+    assert en_lemmatizer.punct('”') == ['"']


 @pytest.mark.models('en')

View File

@@ -75,3 +75,11 @@ def test_en_models_probs(example):
     assert not prob0 == prob1
     assert not prob0 == prob2
     assert not prob1 == prob2


+@pytest.mark.models('en')
+def test_no_vectors_similarity(EN):
+    doc1 = EN(u'hallo')
+    doc2 = EN(u'hi')
+    assert doc1.similarity(doc2) > 0

View File

@@ -56,7 +56,7 @@ def test_sents_1_2(parser):
     doc[1].sent_start = True
     doc[2].sent_start = True
     doc = parser(doc)
-    assert len(list(doc.sents)) == 3
+    assert len(list(doc.sents)) >= 3


 def test_sents_1_3(parser):

View File

@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import regex as re
+
+from ...lang.en import English
+from ...tokenizer import Tokenizer
+
+
+def test_issue1488():
+    prefix_re = re.compile(r'''[\[\("']''')
+    suffix_re = re.compile(r'''[\]\)"']''')
+    infix_re = re.compile(r'''[-~\.]''')
+    simple_url_re = re.compile(r'''^https?://''')
+
+    def my_tokenizer(nlp):
+        return Tokenizer(nlp.vocab, {},
+                         prefix_search=prefix_re.search,
+                         suffix_search=suffix_re.search,
+                         infix_finditer=infix_re.finditer,
+                         token_match=simple_url_re.match)
+
+    nlp = English()
+    nlp.tokenizer = my_tokenizer(nlp)
+    doc = nlp("This is a test.")
+    for token in doc:
+        assert token.text

View File

@@ -72,7 +72,17 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
     dY = model.ops.allocate((15, nO, nP))
     ids = model.ops.allocate((15, nF))
     ids[1,2] = -1
-    dY[1,2] = 1
+    dY[1] = 1
     assert model.d_pad[0, 2, 0, 0] == 0.
     model._backprop_padding(dY, ids)
     assert model.d_pad[0, 2, 0, 0] == 1.
+    model.d_pad.fill(0.)
+    ids.fill(0.)
+    dY.fill(0.)
+    ids[1,2] = -1
+    ids[1,1] = -1
+    ids[1,0] = -1
+    dY[1] = 1
+    assert model.d_pad[0, 2, 0, 0] == 0.
+    model._backprop_padding(dY, ids)
+    assert model.d_pad[0, 2, 0, 0] == 3.

View File

@@ -10,6 +10,7 @@ import numpy.linalg
 import struct
 import dill
 import msgpack
+from thinc.neural.util import get_array_module, copy_array

 from libc.string cimport memcpy, memset
 from libc.math cimport sqrt

@@ -306,9 +307,9 @@ cdef class Doc:
         def __get__(self):
             if 'has_vector' in self.user_hooks:
                 return self.user_hooks['has_vector'](self)
-            elif any(token.has_vector for token in self):
+            elif self.vocab.vectors.data.size:
                 return True
-            elif self.tensor is not None:
+            elif self.tensor.size:
                 return True
             else:
                 return False
@@ -329,13 +330,13 @@ cdef class Doc:
                 self._vector = numpy.zeros((self.vocab.vectors_length,),
                                            dtype='f')
                 return self._vector
-            elif self.has_vector:
+            elif self.vocab.vectors.data.size > 0:
                 vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
                 for token in self.c[:self.length]:
                     vector += self.vocab.get_vector(token.lex.orth)
                 self._vector = vector / len(self)
                 return self._vector
-            elif self.tensor is not None:
+            elif self.tensor.size > 0:
                 self._vector = self.tensor.mean(axis=0)
                 return self._vector
             else:
@@ -827,6 +828,23 @@ cdef class Doc:
                       attrs[:, 2:])
         return self

+    def extend_tensor(self, tensor):
+        '''Concatenate a new tensor onto the doc.tensor object.
+
+        The doc.tensor attribute holds dense feature vectors
+        computed by the models in the pipeline. Let's say a
+        document with 30 words has a tensor with 128 dimensions
+        per word. doc.tensor.shape will be (30, 128). After
+        calling doc.extend_tensor with an array of shape (30, 64),
+        doc.tensor.shape will be (30, 192).
+        '''
+        xp = get_array_module(self.tensor)
+        if self.tensor.size == 0:
+            self.tensor.resize(tensor.shape)
+            copy_array(self.tensor, tensor)
+        else:
+            self.tensor = xp.hstack((self.tensor, tensor))
+
     def merge(self, int start_idx, int end_idx, *args, **attributes):
         """Retokenize the document, such that the span at
         `doc.text[start_idx : end_idx]` is merged into a single token. If
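
A hedged usage sketch of extend_tensor, following the shapes described in the docstring; numpy stands in for whichever array module the doc's tensor actually uses:

    import numpy
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    doc = Doc(Vocab(), words=[u'a', u'b', u'c'])
    doc.tensor = numpy.zeros((3, 128), dtype='f')     # e.g. set by the tagger
    doc.extend_tensor(numpy.ones((3, 64), dtype='f'))
    assert doc.tensor.shape == (3, 192)               # columns are stacked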

View File

@@ -283,7 +283,12 @@ cdef class Span:
         def __get__(self):
             if 'has_vector' in self.doc.user_span_hooks:
                 return self.doc.user_span_hooks['has_vector'](self)
-            return any(token.has_vector for token in self)
+            elif self.vocab.vectors.data.size > 0:
+                return any(token.has_vector for token in self)
+            elif self.doc.tensor.size > 0:
+                return True
+            else:
+                return False

     property vector:
         """A real-valued meaning representation. Defaults to an average of the

View File

@@ -292,6 +292,8 @@ cdef class Token:
         def __get__(self):
             if 'has_vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['has_vector'](self)
+            if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
+                return True
             return self.vocab.has_vector(self.c.lex.orth)

     property vector:
@@ -303,6 +305,9 @@ cdef class Token:
         def __get__(self):
             if 'vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['vector'](self)
-            return self.vocab.get_vector(self.c.lex.orth)
+            if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
+                return self.doc.tensor[self.i]
+            else:
+                return self.vocab.get_vector(self.c.lex.orth)

     property vector_norm:

View File

@@ -11,9 +11,8 @@ if environment == "deploy"
         script(src="/assets/js/vendor/prism.min.js")

-if SECTION == "models"
+if compare_models
     script(src="/assets/js/vendor/chart.min.js")
-    script(src="/assets/js/models.js?v#{V_JS}" type="module")

 script
     if quickstart

@@ -24,15 +23,15 @@ script
     | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
     | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');

-script
+if IS_PAGE
+    script
         | ((window.gitter = {}).chat = {}).options = {
         |     useStyles: false,
         |     activationElement: '.js-gitter-button',
         |     targetElement: '.js-gitter',
         |     room: '!{SOCIAL.gitter}'
         | };
-if IS_PAGE
     script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
@@ -48,10 +47,23 @@ if IS_PAGE
     - ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");"
     - ModelComparer = "new ModelComparer('" + MODELS_REPO + "'," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + "," + JSON.stringify(LANGUAGES) + "," + JSON.stringify(MODEL_META) + "," + JSON.stringify(default_models || false) + ");"

-//- Browsers with JS module support.
-    Will be ignored otherwise.
-
-script(type="module")
+if environment == "deploy"
+    //- DEPLOY: use compiled rollup.js and instantiate classes directly
+    script(src="/assets/js/rollup.js")
+    script
+        !=ProgressBar
+        if changelog
+            !=Changelog
+        if IS_PAGE
+            !=NavHighlighter
+            !=GitHubEmbed
+        if HAS_MODELS
+            !=ModelLoader
+        if compare_models
+            !=ModelComparer
+else
+    //- DEVELOPMENT: Use ES6 modules
+    script(type="module")
         | import ProgressBar from '/assets/js/progress.js';
         !=ProgressBar
         if changelog

@@ -68,19 +80,3 @@ script(type="module")
     if compare_models
         | import { ModelComparer } from '/assets/js/models.js';
         !=ModelComparer
-
-//- Browsers with no JS module support.
-    Won't be fetched or interpreted otherwise.
-
-script(nomodule src="/assets/js/rollup.js")
-script(nomodule)
-    !=ProgressBar
-    if changelog
-        !=Changelog
-    if IS_PAGE
-        !=NavHighlighter
-        !=GitHubEmbed
-    if HAS_MODELS
-        !=ModeLoader
-    if compare_models
-        !=ModelComparer

View File

@@ -198,6 +198,7 @@ export class ModelComparer {
         this.fonts = CHART_FONTS;
         this.defaultModels = defaultModels;
         this.tpl.get('result').style.display = 'block';
+        this.tpl.get('error').style.display = 'none';
         this.fetchCompat()
             .then(compat => this.init(compat))
             .catch(this.showError.bind(this))

View File

@@ -40,13 +40,10 @@
     },
     "MODELS": {
-        "en": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_vectors_web_lg"],
-        "de": ["de_core_news_sm", "de_core_news_md"],
-        "es": ["es_core_news_sm", "es_core_news_md", "es_vectors_web_lg"],
-        "pt": ["pt_core_news_sm"],
-        "fr": ["fr_core_news_sm", "fr_core_news_md", "fr_vectors_web_lg"],
+        "en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
+        "de": ["de_core_news_sm"],
+        "es": ["es_core_news_sm", "es_core_news_md"],
         "it": ["it_core_news_sm"],
-        "nl": ["nl_core_news_sm"],
         "xx": ["xx_ent_wiki_sm"]
     },

View File

@@ -218,7 +218,7 @@ p
     |  If an exception consists of more than one token, the #[code ORTH] values
     |  combined always need to #[strong match the original string]. The way the
     |  original string is split up can be pretty arbitrary sometimes for
-    |  example "gonna" is split into "gon" (lemma "go") nad "na" (lemma "to").
+    |  example "gonna" is split into "gon" (lemma "go") and "na" (lemma "to").
     |  Because of how the tokenizer works, it's currently not possible to split
     |  single-letter strings into multiple tokens.

View File

@@ -198,11 +198,11 @@ p
     |  #[code .finditer()] methods:

 +code.
-    import re
+    import regex as re
     from spacy.tokenizer import Tokenizer

-    prefix_re = re.compile(r'''[\[\(&quot;&apos;]''')
-    suffix_re = re.compile(r'''[\]\)&quot;&apos;]''')
+    prefix_re = re.compile(r'''^[\[\(&quot;&apos;]''')
+    suffix_re = re.compile(r'''[\]\)&quot;&apos;]$''')
     infix_re = re.compile(r'''[-~]''')
     simple_url_re = re.compile(r'''^https?://''')

@@ -220,6 +220,17 @@ p
     |  specialize are #[code find_prefix], #[code find_suffix] and
     |  #[code find_infix].

++infobox("Important note", "⚠️")
+    |  When customising the prefix, suffix and infix handling, remember that
+    |  you're passing in #[strong functions] for spaCy to execute, e.g.
+    |  #[code prefix_re.search], not just the regular expressions. This means
+    |  that your functions also need to define how the rules should be applied.
+    |  For example, if you're adding your own prefix rules, you need
+    |  to make sure they're only applied to characters at the
+    |  #[strong beginning of a token], e.g. by adding #[code ^]. Similarly,
+    |  suffix rules should only be applied at the #[strong end of a token],
+    |  so your expression should end with a #[code $].
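
A hedged end-to-end sketch of that advice, mirroring the test_issue1488 regression test added in this commit; the tokenization shown in the final comment is an assumption:

    import regex as re
    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer

    # Anchored rules: prefixes match only at the start of a token (^),
    # suffixes only at the end ($).
    prefix_re = re.compile(r'''^[\[\("']''')
    suffix_re = re.compile(r'''[\]\)"']$''')

    nlp = English()
    nlp.tokenizer = Tokenizer(nlp.vocab, {},
                              prefix_search=prefix_re.search,
                              suffix_search=suffix_re.search)
    doc = nlp(u'"Hello"')
    print([t.text for t in doc])   # expected: ['"', 'Hello', '"']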
 +h(3, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline

 p