mirror of https://github.com/explosion/spaCy.git
synced 2025-02-04 05:34:10 +03:00

Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in commit 3ca16ddbd4
.github/CONTRIBUTOR_AGREEMENT.md (vendored): 14 lines changed

@@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply.
 7. Please place an “x” on one of the applicable statement below. Please do NOT
    mark both statements:

-    * [ ] I am signing on behalf of myself as an individual and no other person
+    * [x] I am signing on behalf of myself as an individual and no other person
     or entity, including my employer, has or will have rights with respect to my
     contributions.

@@ -98,9 +98,9 @@ mark both statements:

 | Field                          | Entry                |
 |------------------------------- | -------------------- |
-| Name                           |                      |
-| Company name (if applicable)   |                      |
-| Title or role (if applicable)  |                      |
-| Date                           |                      |
-| GitHub username                |                      |
-| Website (optional)             |                      |
+| Name                           | Abhinav Sharma       |
+| Company name (if applicable)   | Fourtek I.T. Solutions Pvt. Ltd. |
+| Title or role (if applicable)  | Machine Learning Engineer |
+| Date                           | 3 Novermber 2017     |
+| GitHub username                | abhi18av             |
+| Website (optional)             | https://abhi18av.github.io/ |

@@ -409,12 +409,14 @@ def build_tagger_model(nr_class, **cfg):
     else:
         tok2vec = Tok2Vec(token_vector_width, embed_size,
                           pretrained_dims=pretrained_dims)
+    softmax = with_flatten(Softmax(nr_class, token_vector_width))
     model = (
         tok2vec
-        >> with_flatten(Softmax(nr_class, token_vector_width))
+        >> softmax
     )
     model.nI = None
     model.tok2vec = tok2vec
+    model.softmax = softmax
     return model

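The hunk above gives the softmax layer a name and stores it on the returned model next to tok2vec, so later code (see the Tagger.predict change further down) can run the embedding step and the classification step separately. A minimal sketch of that decomposition using plain Python callables rather than thinc layers (hypothetical names, illustrative only):

def make_tagger_model(tok2vec, softmax):
    # Compose the two stages into one callable, but keep each stage reachable.
    def model(docs):
        return softmax(tok2vec(docs))      # full path: embed, then classify
    model.tok2vec = tok2vec                # reusable by other pipeline components
    model.softmax = softmax                # runnable on cached token vectors
    return model
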
@@ -391,6 +391,7 @@ class Language(object):
         for name, proc in pipes:
             if not hasattr(proc, 'update'):
                 continue
+            grads = {}
             proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
             for key, (W, dW) in grads.items():
                 sgd(W, dW, key=key)

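The added line resets grads before each component's update() call: the component records its weight/gradient pairs through the get_grads callback it receives as sgd, and the real optimizer is only applied once the component has finished. A minimal sketch of that deferred-update pattern (hypothetical helper, not the actual Language.update implementation):

def update_components(components, docs, golds, sgd, drop=0.):
    for proc in components:
        grads = {}                                  # fresh gradient store per component
        def get_grads(W, dW, key=None):
            grads[key] = (W, dW)                    # record instead of applying immediately
        proc.update(docs, golds, drop=drop, sgd=get_grads)
        for key, (W, dW) in grads.items():
            sgd(W, dW, key=key)                     # apply all collected updates afterwards
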
@@ -129,8 +129,14 @@ cdef class Morphology:
         tag (unicode): The part-of-speech tag to key the exception.
         orth (unicode): The word-form to key the exception.
         """
+        # TODO: Currently we've assumed that we know the number of tags --
+        # RichTagC is an array, and _cache is a PreshMapArray
+        # This is really bad: it makes the morphology typed to the tagger
+        # classes, which is all wrong.
        self.exc[(tag_str, orth_str)] = dict(attrs)
        tag = self.strings.add(tag_str)
+        if tag not in self.reverse_index:
+            return
        tag_id = self.reverse_index[tag]
        orth = self.strings[orth_str]
        cdef RichTagC rich_tag = self.rich_tags[tag_id]

@@ -11,7 +11,7 @@ import ujson
 import msgpack

 from thinc.api import chain
-from thinc.v2v import Affine, Softmax
+from thinc.v2v import Affine, SELU, Softmax
 from thinc.t2v import Pooling, max_pool, mean_pool
 from thinc.neural.util import to_categorical, copy_array
 from thinc.neural._classes.difference import Siamese, CauchySimilarity

@@ -29,7 +29,7 @@ from .compat import json_dumps
 from .attrs import POS
 from .parts_of_speech import X
 from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
-from ._ml import link_vectors_to_models
+from ._ml import link_vectors_to_models, zero_init, flatten
 from . import util

@@ -91,8 +91,8 @@ class Pipe(object):
         Both __call__ and pipe should delegate to the `predict()`
         and `set_annotations()` methods.
         """
-        scores = self.predict([doc])
-        self.set_annotations([doc], scores)
+        scores, tensors = self.predict([doc])
+        self.set_annotations([doc], scores, tensors=tensors)
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):

@@ -103,8 +103,8 @@ class Pipe(object):
         """
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
-            scores = self.predict(docs)
-            self.set_annotations(docs, scores)
+            scores, tensors = self.predict(docs)
+            self.set_annotations(docs, scores, tensor=tensors)
             yield from docs

     def predict(self, docs):

@@ -113,7 +113,7 @@ class Pipe(object):
         """
         raise NotImplementedError

-    def set_annotations(self, docs, scores):
+    def set_annotations(self, docs, scores, tensors=None):
         """Modify a batch of documents, using pre-computed scores."""
         raise NotImplementedError

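The docstring quoted in the first hunk spells out the contract these changes rely on: __call__ and pipe() only orchestrate, predict() computes the scores (now together with the tensors they came from), and set_annotations() writes results onto the docs. A toy component following that contract (a simplified sketch, not spaCy's actual Pipe base class; the user_data dict is used only to keep the example self-contained):

class ToyPipe(object):
    name = 'toy'

    def __call__(self, doc):
        scores, tensors = self.predict([doc])
        self.set_annotations([doc], scores, tensors=tensors)
        return doc

    def predict(self, docs):
        tensors = [[0.0, 0.0] for _ in docs]     # placeholder per-doc features
        scores = [[0.5] for _ in docs]           # placeholder predictions
        return scores, tensors

    def set_annotations(self, docs, scores, tensors=None):
        for i, doc in enumerate(docs):
            doc.user_data['toy_scores'] = scores[i]
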
@@ -216,7 +216,7 @@ class Tensorizer(Pipe):
     name = 'tensorizer'

     @classmethod
-    def Model(cls, width=128, embed_size=4000, **cfg):
+    def Model(cls, output_size=300, input_size=384, **cfg):
         """Create a new statistical model for the class.

         width (int): Output size of the model.

@@ -224,9 +224,11 @@ class Tensorizer(Pipe):
         **cfg: Config parameters.
         RETURNS (Model): A `thinc.neural.Model` or similar instance.
         """
-        width = util.env_opt('token_vector_width', width)
-        embed_size = util.env_opt('embed_size', embed_size)
-        return Tok2Vec(width, embed_size, **cfg)
+        model = chain(
+            SELU(output_size, input_size),
+            SELU(output_size, output_size),
+            zero_init(Affine(output_size, output_size)))
+        return model

     def __init__(self, vocab, model=True, **cfg):
         """Construct a new statistical model. Weights are not allocated on

@@ -244,6 +246,7 @@ class Tensorizer(Pipe):
         """
         self.vocab = vocab
         self.model = model
+        self.input_models = []
         self.cfg = dict(cfg)
         self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
         self.cfg.setdefault('cnn_maxout_pieces', 3)

@@ -269,8 +272,8 @@ class Tensorizer(Pipe):
         """
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
-            tokvecses = self.predict(docs)
-            self.set_annotations(docs, tokvecses)
+            tensors = self.predict(docs)
+            self.set_annotations(docs, tensors)
             yield from docs

     def predict(self, docs):

@@ -279,18 +282,19 @@ class Tensorizer(Pipe):
         docs (iterable): A sequence of `Doc` objects.
         RETURNS (object): Vector representations for each token in the docs.
         """
-        tokvecs = self.model(docs)
-        return tokvecs
+        inputs = self.model.ops.flatten([doc.tensor for doc in docs])
+        outputs = self.model(inputs)
+        return self.model.ops.unflatten(outputs, [len(d) for d in docs])

-    def set_annotations(self, docs, tokvecses):
+    def set_annotations(self, docs, tensors):
         """Set the tensor attribute for a batch of documents.

         docs (iterable): A sequence of `Doc` objects.
-        tokvecs (object): Vector representation for each token in the docs.
+        tensors (object): Vector representation for each token in the docs.
         """
-        for doc, tokvecs in zip(docs, tokvecses):
-            assert tokvecs.shape[0] == len(doc)
-            doc.tensor = tokvecs
+        for doc, tensor in zip(docs, tensors):
+            assert tensor.shape[0] == len(doc)
+            doc.tensor = tensor

     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
         """Update the model.

@@ -303,11 +307,34 @@ class Tensorizer(Pipe):
         """
         if isinstance(docs, Doc):
             docs = [docs]
-        tokvecs, bp_tokvecs = self.model.begin_update(docs, drop=drop)
-        return tokvecs, bp_tokvecs
+        inputs = []
+        bp_inputs = []
+        for tok2vec in self.input_models:
+            tensor, bp_tensor = tok2vec.begin_update(docs, drop=drop)
+            inputs.append(tensor)
+            bp_inputs.append(bp_tensor)
+        inputs = self.model.ops.xp.hstack(inputs)
+        scores, bp_scores = self.model.begin_update(inputs, drop=drop)
+        loss, d_scores = self.get_loss(docs, golds, scores)
+        d_inputs = bp_scores(d_scores, sgd=sgd)
+        d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
+        for d_input, bp_input in zip(d_inputs, bp_inputs):
+            bp_input(d_input, sgd=sgd)
+        if losses is not None:
+            losses.setdefault(self.name, 0.)
+            losses[self.name] += loss
+        return loss

-    def get_loss(self, docs, golds, scores):
-        raise NotImplementedError
+    def get_loss(self, docs, golds, prediction):
+        target = []
+        i = 0
+        for doc in docs:
+            vectors = self.model.ops.xp.vstack([w.vector for w in doc])
+            target.append(vectors)
+        target = self.model.ops.xp.vstack(target)
+        d_scores = (prediction - target) / prediction.shape[0]
+        loss = (d_scores**2).sum()
+        return loss, d_scores

     def begin_training(self, gold_tuples=tuple(), pipeline=None):
         """Allocate models, pre-process training data and acquire a trainer and

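The rewritten update() concatenates the outputs of each registered input model with hstack, runs the feed-forward model on the joined array, and then splits the incoming gradient back into one slab per input model before calling the matching backprop callbacks; get_loss() is a mean-squared-error-style comparison against the pretrained word vectors. A small numpy sketch of that hstack/split bookkeeping (illustrative only; the real code goes through self.model.ops.xp so it also works on GPU arrays):

import numpy

# Two input models, each producing 4-dimensional features for 6 tokens.
a = numpy.ones((6, 4), dtype='f')
b = numpy.ones((6, 4), dtype='f')
inputs = numpy.hstack([a, b])                       # (6, 8): joint input to the tensorizer

prediction = inputs[:, :4]                          # stand-in for the model output
target = numpy.zeros_like(prediction)               # stand-in for the word-vector targets
d_scores = (prediction - target) / prediction.shape[0]
loss = (d_scores ** 2).sum()

d_inputs = numpy.hstack([d_scores, d_scores])       # pretend gradient w.r.t. the joint input
d_a, d_b = numpy.split(d_inputs, 2, axis=1)         # route one slab back to each input model
assert d_a.shape == a.shape and d_b.shape == b.shape
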
@@ -316,8 +343,13 @@ class Tensorizer(Pipe):
         gold_tuples (iterable): Gold-standard training data.
         pipeline (list): The pipeline the model is part of.
         """
+        for name, model in pipeline:
+            if getattr(model, 'tok2vec', None):
+                self.input_models.append(model.tok2vec)
         if self.model is True:
-            self.cfg['pretrained_dims'] = self.vocab.vectors_length
+            self.cfg['input_size'] = 384
+            self.cfg['output_size'] = 300
+            #self.cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model = self.Model(**self.cfg)
         link_vectors_to_models(self.vocab)

@@ -337,28 +369,37 @@ class Tagger(Pipe):
     def labels(self):
         return self.vocab.morphology.tag_names

+    @property
+    def tok2vec(self):
+        if self.model in (None, True, False):
+            return None
+        else:
+            return chain(self.model.tok2vec, flatten)
+
     def __call__(self, doc):
-        tags = self.predict([doc])
-        self.set_annotations([doc], tags)
+        tags, tokvecs = self.predict([doc])
+        self.set_annotations([doc], tags, tensors=tokvecs)
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
-            tag_ids = self.predict(docs)
-            self.set_annotations(docs, tag_ids)
+            tag_ids, tokvecs = self.predict(docs)
+            self.set_annotations(docs, tag_ids, tensors=tokvecs)
             yield from docs

     def predict(self, docs):
-        scores = self.model(docs)
-        scores = self.model.ops.flatten(scores)
-        guesses = scores.argmax(axis=1)
-        if not isinstance(guesses, numpy.ndarray):
-            guesses = guesses.get()
-        guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs])
-        return guesses
+        tokvecs = self.model.tok2vec(docs)
+        scores = self.model.softmax(tokvecs)
+        guesses = []
+        for doc_scores in scores:
+            doc_guesses = doc_scores.argmax(axis=1)
+            if not isinstance(doc_guesses, numpy.ndarray):
+                doc_guesses = doc_guesses.get()
+            guesses.append(doc_guesses)
+        return guesses, tokvecs

-    def set_annotations(self, docs, batch_tag_ids):
+    def set_annotations(self, docs, batch_tag_ids, tensors=None):
         if isinstance(docs, Doc):
             docs = [docs]
         cdef Doc doc

@@ -373,6 +414,8 @@ class Tagger(Pipe):
                 if doc.c[j].tag == 0 and doc.c[j].pos == 0:
                     vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
                 idx += 1
+            if tensors is not None:
+                doc.extend_tensor(tensors[i])
             doc.is_tagged = True

     def update(self, docs, golds, drop=0., sgd=None, losses=None):

@@ -573,7 +616,7 @@ class MultitaskObjective(Tagger):
     def labels(self, value):
         self.cfg['labels'] = value

-    def set_annotations(self, docs, dep_ids):
+    def set_annotations(self, docs, dep_ids, tensors=None):
         pass

     def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):

@@ -720,15 +763,15 @@ class TextCategorizer(Pipe):
         self.cfg['labels'] = value

     def __call__(self, doc):
-        scores = self.predict([doc])
-        self.set_annotations([doc], scores)
+        scores, tensors = self.predict([doc])
+        self.set_annotations([doc], scores, tensors=tensors)
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
-            scores = self.predict(docs)
-            self.set_annotations(docs, scores)
+            scores, tensors = self.predict(docs)
+            self.set_annotations(docs, scores, tensors=tensors)
             yield from docs

     def predict(self, docs):

@@ -736,8 +779,10 @@ class TextCategorizer(Pipe):
         scores = self.model.ops.asarray(scores)
         return scores

-    def set_annotations(self, docs, scores):
+    def set_annotations(self, docs, scores, tensors=None):
         for i, doc in enumerate(docs):
+            if tensors is not None:
+                doc.extend_tensor(tensors[i])
             for j, label in enumerate(self.labels):
                 doc.cats[label] = float(scores[i, j])

@@ -1,6 +1,7 @@
 # cython: infer_types=True
 # cython: cdivision=True
 # cython: boundscheck=False
+# cython: profile=True
 # coding: utf-8
 from __future__ import unicode_literals, print_function

@@ -322,15 +323,17 @@ cdef class Parser:
         beam_density = self.cfg.get('beam_density', 0.0)
         cdef Beam beam
         if beam_width == 1:
-            states = self.parse_batch([doc])
-            self.set_annotations([doc], states)
+            states, tokvecs = self.parse_batch([doc])
+            self.set_annotations([doc], states, tensors=tokvecs)
             return doc
         else:
-            beam = self.beam_parse([doc],
-                beam_width=beam_width, beam_density=beam_density)[0]
+            beams, tokvecs = self.beam_parse([doc],
+                                             beam_width=beam_width,
+                                             beam_density=beam_density)
+            beam = beams[0]
             output = self.moves.get_beam_annot(beam)
             state = <StateClass>beam.at(0)
-            self.set_annotations([doc], [state])
+            self.set_annotations([doc], [state], tensors=tokvecs)
             _cleanup(beam)
             return output

@@ -356,15 +359,16 @@ cdef class Parser:
             for subbatch in cytoolz.partition_all(8, by_length):
                 subbatch = list(subbatch)
                 if beam_width == 1:
-                    parse_states = self.parse_batch(subbatch)
+                    parse_states, tokvecs = self.parse_batch(subbatch)
                     beams = []
                 else:
-                    beams = self.beam_parse(subbatch, beam_width=beam_width,
-                                            beam_density=beam_density)
+                    beams, tokvecs = self.beam_parse(subbatch,
+                                                     beam_width=beam_width,
+                                                     beam_density=beam_density)
                     parse_states = []
                     for beam in beams:
                         parse_states.append(<StateClass>beam.at(0))
-                self.set_annotations(subbatch, parse_states)
+                self.set_annotations(subbatch, parse_states, tensors=tokvecs)
             yield from batch

     def parse_batch(self, docs):

@@ -411,7 +415,9 @@ cdef class Parser:
                     feat_weights, bias, hW, hb,
                     nr_class, nr_hidden, nr_feat, nr_piece)
         PyErr_CheckSignals()
-        return state_objs
+        tokvecs = self.model[0].ops.unflatten(tokvecs,
+                                              [len(doc) for doc in docs])
+        return state_objs, tokvecs

     cdef void _parseC(self, StateC* state,
             const float* feat_weights, const float* bias,

@@ -508,7 +514,9 @@ cdef class Parser:
                 beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
                 beam.check_done(_check_final_state, NULL)
             beams.append(beam)
-        return beams
+        tokvecs = self.model[0].ops.unflatten(tokvecs,
+                                              [len(doc) for doc in docs])
+        return beams, tokvecs

     def update(self, docs, golds, drop=0., sgd=None, losses=None):
         if not any(self.moves.has_gold(gold) for gold in golds):

@@ -735,18 +743,29 @@ cdef class Parser:
             c_d_scores += d_scores.shape[1]
         return d_scores

-    def set_annotations(self, docs, states):
+    def set_annotations(self, docs, states, tensors=None):
         cdef StateClass state
         cdef Doc doc
-        for state, doc in zip(states, docs):
+        for i, (state, doc) in enumerate(zip(states, docs)):
             self.moves.finalize_state(state.c)
-            for i in range(doc.length):
-                doc.c[i] = state.c._sent[i]
+            for j in range(doc.length):
+                doc.c[j] = state.c._sent[j]
+            if tensors is not None:
+                doc.extend_tensor(tensors[i])
             self.moves.finalize_doc(doc)

         for hook in self.postprocesses:
             for doc in docs:
                 hook(doc)

+    @property
+    def tok2vec(self):
+        '''Return the embedding and convolutional layer of the model.'''
+        if self.model in (None, True, False):
+            return None
+        else:
+            return self.model[0]
+
     @property
     def postprocesses(self):
         # Available for subclasses, e.g. to deprojectivize

@@ -22,35 +22,37 @@ def test_doc_lemmatization(EN):
                                          ("ring", ["ring"]),
                                          ("axes", ["axis", "axe", "ax"])])
 def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
-    assert en_lemmatizer.noun(text) == set(lemmas)
+    assert en_lemmatizer.noun(text) == lemmas


 @pytest.mark.models('en')
 @pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
                                          ("feed", ["feed"]),
                                          ("need", ["need"]),
-                                         ("ring", ["ring"]),
-                                         ("axes", ["axis", "axe", "ax"])])
+                                         ("ring", ["ring"])])
 def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
-    assert en_lemmatizer.noun(text) == set(lemmas)
+    # Cases like this are problematic -- not clear what we should do to resolve
+    # ambiguity?
+    # ("axes", ["ax", "axes", "axis"])])
+    assert en_lemmatizer.noun(text) == lemmas


 @pytest.mark.xfail
 @pytest.mark.models('en')
 def test_en_lemmatizer_base_forms(en_lemmatizer):
-    assert en_lemmatizer.noun('dive', {'number': 'sing'}) == set(['dive'])
-    assert en_lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva'])
+    assert en_lemmatizer.noun('dive', {'number': 'sing'}) == ['dive']
+    assert en_lemmatizer.noun('dive', {'number': 'plur'}) == ['diva']


 @pytest.mark.models('en')
 def test_en_lemmatizer_base_form_verb(en_lemmatizer):
-    assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
+    assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == ['see']


 @pytest.mark.models('en')
 def test_en_lemmatizer_punct(en_lemmatizer):
-    assert en_lemmatizer.punct('“') == set(['"'])
-    assert en_lemmatizer.punct('“') == set(['"'])
+    assert en_lemmatizer.punct('“') == ['"']
+    assert en_lemmatizer.punct('“') == ['"']


 @pytest.mark.models('en')

@@ -75,3 +75,11 @@ def test_en_models_probs(example):
     assert not prob0 == prob1
     assert not prob0 == prob2
     assert not prob1 == prob2
+
+
+@pytest.mark.models('en')
+def test_no_vectors_similarity(EN):
+    doc1 = EN(u'hallo')
+    doc2 = EN(u'hi')
+    assert doc1.similarity(doc2) > 0

@@ -56,7 +56,7 @@ def test_sents_1_2(parser):
     doc[1].sent_start = True
     doc[2].sent_start = True
     doc = parser(doc)
-    assert len(list(doc.sents)) == 3
+    assert len(list(doc.sents)) >= 3


 def test_sents_1_3(parser):

spacy/tests/regression/test_issue1488.py (new file): 26 lines

@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import regex as re
+from ...lang.en import English
+from ...tokenizer import Tokenizer
+
+
+def test_issue1488():
+    prefix_re = re.compile(r'''[\[\("']''')
+    suffix_re = re.compile(r'''[\]\)"']''')
+    infix_re = re.compile(r'''[-~\.]''')
+    simple_url_re = re.compile(r'''^https?://''')
+
+    def my_tokenizer(nlp):
+        return Tokenizer(nlp.vocab, {},
+                         prefix_search=prefix_re.search,
+                         suffix_search=suffix_re.search,
+                         infix_finditer=infix_re.finditer,
+                         token_match=simple_url_re.match)
+
+    nlp = English()
+    nlp.tokenizer = my_tokenizer(nlp)
+    doc = nlp("This is a test.")
+    for token in doc:
+        assert token.text

@@ -72,7 +72,17 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
     dY = model.ops.allocate((15, nO, nP))
     ids = model.ops.allocate((15, nF))
     ids[1,2] = -1
-    dY[1,2] = 1
+    dY[1] = 1
     assert model.d_pad[0, 2, 0, 0] == 0.
     model._backprop_padding(dY, ids)
     assert model.d_pad[0, 2, 0, 0] == 1.
+    model.d_pad.fill(0.)
+    ids.fill(0.)
+    dY.fill(0.)
+    ids[1,2] = -1
+    ids[1,1] = -1
+    ids[1,0] = -1
+    dY[1] = 1
+    assert model.d_pad[0, 2, 0, 0] == 0.
+    model._backprop_padding(dY, ids)
+    assert model.d_pad[0, 2, 0, 0] == 3.

@@ -10,6 +10,7 @@ import numpy.linalg
 import struct
 import dill
 import msgpack
+from thinc.neural.util import get_array_module, copy_array

 from libc.string cimport memcpy, memset
 from libc.math cimport sqrt

@@ -306,9 +307,9 @@ cdef class Doc:
         def __get__(self):
             if 'has_vector' in self.user_hooks:
                 return self.user_hooks['has_vector'](self)
-            elif any(token.has_vector for token in self):
+            elif self.vocab.vectors.data.size:
                 return True
-            elif self.tensor is not None:
+            elif self.tensor.size:
                 return True
             else:
                 return False

@@ -329,13 +330,13 @@ cdef class Doc:
                 self._vector = numpy.zeros((self.vocab.vectors_length,),
                                            dtype='f')
                 return self._vector
-            elif self.has_vector:
+            elif self.vocab.vectors.data.size > 0:
                 vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
                 for token in self.c[:self.length]:
                     vector += self.vocab.get_vector(token.lex.orth)
                 self._vector = vector / len(self)
                 return self._vector
-            elif self.tensor is not None:
+            elif self.tensor.size > 0:
                 self._vector = self.tensor.mean(axis=0)
                 return self._vector
             else:

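When no pretrained vector table is available, the branch above now falls back to averaging the per-token rows of doc.tensor to get a document vector. A quick numpy illustration of that fallback (assumed shapes: a 5-token document with 128-dimensional tensor rows):

import numpy

tensor = numpy.random.uniform(-1, 1, (5, 128)).astype('f')   # stand-in for doc.tensor
doc_vector = tensor.mean(axis=0)                             # average over tokens
assert doc_vector.shape == (128,)
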
@@ -827,6 +828,23 @@ cdef class Doc:
                       attrs[:, 2:])
         return self

+    def extend_tensor(self, tensor):
+        '''Concatenate a new tensor onto the doc.tensor object.
+
+        The doc.tensor attribute holds dense feature vectors
+        computed by the models in the pipeline. Let's say a
+        document with 30 words has a tensor with 128 dimensions
+        per word. doc.tensor.shape will be (30, 128). After
+        calling doc.extend_tensor with an array of hape (30, 64),
+        doc.tensor == (30, 192).
+        '''
+        xp = get_array_module(self.tensor)
+        if self.tensor.size == 0:
+            self.tensor.resize(tensor.shape)
+            copy_array(self.tensor, tensor)
+        else:
+            self.tensor = xp.hstack((self.tensor, tensor))
+
     def merge(self, int start_idx, int end_idx, *args, **attributes):
         """Retokenize the document, such that the span at
         `doc.text[start_idx : end_idx]` is merged into a single token. If

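The extend_tensor docstring above describes the shape arithmetic: a (30, 128) tensor extended with a (30, 64) array becomes (30, 192). A quick numpy check of that behaviour (illustrative, using numpy directly rather than get_array_module):

import numpy

tensor = numpy.zeros((30, 128), dtype='f')    # existing doc.tensor
extra = numpy.zeros((30, 64), dtype='f')      # new per-token features from a later component
tensor = numpy.hstack((tensor, extra))        # columns are concatenated per token
assert tensor.shape == (30, 192)
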
@@ -283,7 +283,12 @@ cdef class Span:
         def __get__(self):
             if 'has_vector' in self.doc.user_span_hooks:
                 return self.doc.user_span_hooks['has_vector'](self)
+            elif self.vocab.vectors.data.size > 0:
                 return any(token.has_vector for token in self)
+            elif self.doc.tensor.size > 0:
+                return True
+            else:
+                return False

     property vector:
         """A real-valued meaning representation. Defaults to an average of the

@@ -292,6 +292,8 @@ cdef class Token:
         def __get__(self):
             if 'has_vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['has_vector'](self)
+            if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
+                return True
             return self.vocab.has_vector(self.c.lex.orth)

     property vector:

@@ -303,6 +305,9 @@ cdef class Token:
         def __get__(self):
             if 'vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['vector'](self)
+            if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
+                return self.doc.tensor[self.i]
+            else:
                 return self.vocab.get_vector(self.c.lex.orth)

     property vector_norm:

@@ -11,9 +11,8 @@ if environment == "deploy"

 script(src="/assets/js/vendor/prism.min.js")

-if SECTION == "models"
+if compare_models
     script(src="/assets/js/vendor/chart.min.js")
-    script(src="/assets/js/models.js?v#{V_JS}" type="module")

 script
     if quickstart

@@ -24,15 +23,15 @@ script
     | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
     | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');

-if IS_PAGE
-    script
+    if IS_PAGE
         | ((window.gitter = {}).chat = {}).options = {
         |     useStyles: false,
         |     activationElement: '.js-gitter-button',
         |     targetElement: '.js-gitter',
         |     room: '!{SOCIAL.gitter}'
         | };

+if IS_PAGE
     script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)

@@ -48,10 +47,23 @@ if IS_PAGE
 - ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");"
 - ModelComparer = "new ModelComparer('" + MODELS_REPO + "'," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + "," + JSON.stringify(LANGUAGES) + "," + JSON.stringify(MODEL_META) + "," + JSON.stringify(default_models || false) + ");"

-//- Browsers with JS module support.
-   Will be ignored otherwise.
-
-script(type="module")
+if environment == "deploy"
+    //- DEPLOY: use compiled rollup.js and instantiate classes directly
+    script(src="/assets/js/rollup.js")
+    script
+        !=ProgressBar
+        if changelog
+            !=Changelog
+        if IS_PAGE
+            !=NavHighlighter
+            !=GitHubEmbed
+        if HAS_MODELS
+            !=ModelLoader
+        if compare_models
+            !=ModelComparer
+
+else
+    //- DEVELOPMENT: Use ES6 modules
+    script(type="module")
         | import ProgressBar from '/assets/js/progress.js';
         !=ProgressBar
         if changelog

@@ -68,19 +80,3 @@ script(type="module")
     if compare_models
         | import { ModelComparer } from '/assets/js/models.js';
         !=ModelComparer
-
-//- Browsers with no JS module support.
-   Won't be fetched or interpreted otherwise.
-
-script(nomodule src="/assets/js/rollup.js")
-script(nomodule)
-    !=ProgressBar
-    if changelog
-        !=Changelog
-    if IS_PAGE
-        !=NavHighlighter
-        !=GitHubEmbed
-    if HAS_MODELS
-        !=ModeLoader
-    if compare_models
-        !=ModelComparer

@@ -198,6 +198,7 @@ export class ModelComparer {
         this.fonts = CHART_FONTS;
         this.defaultModels = defaultModels;
         this.tpl.get('result').style.display = 'block';
+        this.tpl.get('error').style.display = 'none';
         this.fetchCompat()
             .then(compat => this.init(compat))
             .catch(this.showError.bind(this))

@@ -40,13 +40,10 @@
     },

     "MODELS": {
-        "en": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_vectors_web_lg"],
-        "de": ["de_core_news_sm", "de_core_news_md"],
-        "es": ["es_core_news_sm", "es_core_news_md", "es_vectors_web_lg"],
-        "pt": ["pt_core_news_sm"],
-        "fr": ["fr_core_news_sm", "fr_core_news_md", "fr_vectors_web_lg"],
+        "en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
+        "de": ["de_core_news_sm"],
+        "es": ["es_core_news_sm", "es_core_news_md"],
         "it": ["it_core_news_sm"],
-        "nl": ["nl_core_news_sm"],
         "xx": ["xx_ent_wiki_sm"]
     },

@@ -218,7 +218,7 @@ p
     |  If an exception consists of more than one token, the #[code ORTH] values
     |  combined always need to #[strong match the original string]. The way the
     |  original string is split up can be pretty arbitrary sometimes – for
-    |  example "gonna" is split into "gon" (lemma "go") nad "na" (lemma "to").
+    |  example "gonna" is split into "gon" (lemma "go") and "na" (lemma "to").
     |  Because of how the tokenizer works, it's currently not possible to split
     |  single-letter strings into multiple tokens.

@@ -198,11 +198,11 @@ p
     |  #[code .finditer()] methods:

 +code.
-    import re
+    import regex as re
     from spacy.tokenizer import Tokenizer

-    prefix_re = re.compile(r'''[\[\("']''')
-    suffix_re = re.compile(r'''[\]\)"']''')
+    prefix_re = re.compile(r'''^[\[\("']''')
+    suffix_re = re.compile(r'''[\]\)"']$''')
     infix_re = re.compile(r'''[-~]''')
     simple_url_re = re.compile(r'''^https?://''')

@@ -220,6 +220,17 @@ p
     |  specialize are #[code find_prefix], #[code find_suffix] and
     |  #[code find_infix].

++infobox("Important note", "⚠️")
+    |  When customising the prefix, suffix and infix handling, remember that
+    |  you're passing in #[strong functions] for spaCy to execute, e.g.
+    |  #[code prefix_re.search] – not just the regular expressions. This means
+    |  that your functions also need to define how the rules should be applied.
+    |  For example, if you're adding your own prefix rules, you need
+    |  to make sure they're only applied to characters at the
+    |  #[strong beginning of a token], e.g. by adding #[code ^]. Similarly,
+    |  suffix rules should only be applied at the #[strong end of a token],
+    |  so your expression should end with a #[code $].
+
 +h(3, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline

 p

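The infobox added above warns that custom prefix and suffix expressions should be anchored, because spaCy calls the functions you pass in (prefix_re.search and friends) and applies whatever they match. A short check of the anchoring rule, using the same regex package the updated snippet imports (illustrative only):

import regex as re

prefix_re = re.compile(r'''^[\[\("']''')   # ^ keeps prefix rules at the start of the token
suffix_re = re.compile(r'''[\]\)"']$''')   # $ keeps suffix rules at the end of the token

assert prefix_re.search('"hello') is not None   # quote at the start: treated as a prefix
assert prefix_re.search('he"llo') is None       # quote mid-token: left alone
assert suffix_re.search('hello"') is not None   # quote at the end: treated as a suffix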