Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 17:36:30 +03:00)

Commit fa544e6c9a: Merge remote-tracking branch 'upstream/develop' into indonesian

setup.py (2 changes)
@@ -28,7 +28,9 @@ MOD_NAMES = [
    'spacy.pipeline',
    'spacy.syntax.stateclass',
    'spacy.syntax._state',
    'spacy.syntax._beam_utils',
    'spacy.tokenizer',
    'spacy._cfile',
    'spacy.syntax.parser',
    'spacy.syntax.nn_parser',
    'spacy.syntax.beam_parser',

spacy/_cfile.pxd (new file, 26 lines)
@@ -0,0 +1,26 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool

cdef class CFile:
    cdef FILE* fp
    cdef bint is_open
    cdef Pool mem
    cdef int size  # For compatibility with subclass
    cdef int _capacity  # For compatibility with subclass

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *


cdef class StringCFile(CFile):
    cdef unsigned char* data

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *

spacy/_cfile.pyx (new file, 88 lines)
@@ -0,0 +1,88 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memcpy


cdef class CFile:
    def __init__(self, loc, mode, on_open_error=None):
        if isinstance(mode, unicode):
            mode_str = mode.encode('ascii')
        else:
            mode_str = mode
        if hasattr(loc, 'as_posix'):
            loc = loc.as_posix()
        self.mem = Pool()
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        self.fp = fopen(<char*>bytes_loc, mode_str)
        if self.fp == NULL:
            if on_open_error is not None:
                on_open_error()
            else:
                raise IOError("Could not open binary file %s" % bytes_loc)
        self.is_open = True

    def __dealloc__(self):
        if self.is_open:
            fclose(self.fp)

    def close(self):
        fclose(self.fp)
        self.is_open = False

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        st = fread(dest, elem_size, number, self.fp)
        if st != number:
            raise IOError

    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
        st = fwrite(src, elem_size, number, self.fp)
        if st != number:
            raise IOError

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write(sizeof(char), len(py_bytes), chars)


cdef class StringCFile:
    def __init__(self, mode, bytes data=b'', on_open_error=None):
        self.mem = Pool()
        self.is_open = 'w' in mode
        self._capacity = max(len(data), 8)
        self.size = len(data)
        self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
        for i in range(len(data)):
            self.data[i] = data[i]

    def close(self):
        self.is_open = False

    def string_data(self):
        return (self.data-self.size)[:self.size]

    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        memcpy(dest, self.data, elem_size * number)
        self.data += elem_size * number

    cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
        write_size = number * elem_size
        if (self.size + write_size) >= self._capacity:
            self._capacity = (self.size + write_size) * 2
            self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
        memcpy(&self.data[self.size], src, elem_size * number)
        self.size += write_size

    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest

    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write(sizeof(char), len(py_bytes), chars)

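A minimal pure-Python sketch (not part of the commit) of the buffer strategy StringCFile.write_from uses above: the byte buffer doubles whenever a write would overflow the current capacity, so repeated small writes stay amortised O(1). The class name and structure here are illustrative only.

class GrowableBuffer:
    """Illustrative Python analogue of StringCFile's write path."""
    def __init__(self, data=b''):
        self._capacity = max(len(data), 8)
        self.size = len(data)
        self.data = bytearray(self._capacity)
        self.data[:len(data)] = data

    def write_from(self, src):
        write_size = len(src)
        if (self.size + write_size) >= self._capacity:
            # Double past the required size, mirroring the realloc above
            self._capacity = (self.size + write_size) * 2
            new = bytearray(self._capacity)
            new[:self.size] = self.data[:self.size]
            self.data = new
        self.data[self.size:self.size + write_size] = src
        self.size += write_size

    def string_data(self):
        return bytes(self.data[:self.size])

buf = GrowableBuffer()
buf.write_from(b'spacy')
buf.write_from(b'-nightly')
assert buf.string_data() == b'spacy-nightly'
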
spacy/_ml.py (122 changed lines)
@@ -5,10 +5,12 @@ from thinc.neural._classes.hash_embed import HashEmbed
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
import random
import cytoolz

from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.static_vectors import StaticVectors
from thinc.neural._classes.batchnorm import BatchNorm
from thinc.neural._classes.batchnorm import BatchNorm as BN
from thinc.neural._classes.layernorm import LayerNorm as LN
from thinc.neural._classes.resnet import Residual
from thinc.neural import ReLu
from thinc.neural._classes.selu import SELU

@@ -19,10 +21,12 @@ from thinc.api import FeatureExtracter, with_getitem
from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
from thinc.neural._classes.attention import ParametricAttention
from thinc.linear.linear import LinearModel
from thinc.api import uniqued, wrap
from thinc.api import uniqued, wrap, flatten_add_lengths


from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
from .tokens.doc import Doc
from . import util

import numpy
import io

@@ -53,6 +57,27 @@ def _logistic(X, drop=0.):
    return Y, logistic_bwd


@layerize
def add_tuples(X, drop=0.):
    """Given inputs of sequence pairs, where each sequence is (vals, length),
    sum the values, returning a single sequence.

    If input is:
        ((vals1, length), (vals2, length))
    Output is:
        (vals1+vals2, length)

    vals are a single tensor for the whole batch.
    """
    (vals1, length1), (vals2, length2) = X
    assert length1 == length2

    def add_tuples_bwd(dY, sgd=None):
        return (dY, dY)

    return (vals1+vals2, length), add_tuples_bwd


def _zero_init(model):
    def _zero_init_impl(self, X, y):
        self.W.fill(0)

@@ -61,6 +86,7 @@ def _zero_init(model):
    model.W.fill(0.)
    return model


@layerize
def _preprocess_doc(docs, drop=0.):
    keys = [doc.to_array([LOWER]) for doc in docs]

@@ -72,7 +98,6 @@ def _preprocess_doc(docs, drop=0.):
    return (keys, vals, lengths), None


def _init_for_precomputed(W, ops):
    if (W**2).sum() != 0.:
        return

@@ -80,6 +105,7 @@ def _init_for_precomputed(W, ops):
    ops.xavier_uniform_init(reshaped)
    W[:] = reshaped.reshape(W.shape)


@describe.on_data(_set_dimensions_if_needed)
@describe.attributes(
    nI=Dimension("Input size"),

@@ -184,25 +210,36 @@ class PrecomputableMaxouts(Model):
        return Yfp, backward


def drop_layer(layer, factor=2.):
    def drop_layer_fwd(X, drop=0.):
        drop *= factor
        mask = layer.ops.get_dropout_mask((1,), drop)
        if mask is None or mask > 0:
            return layer.begin_update(X, drop=drop)
        else:
            return X, lambda dX, sgd=None: dX
    return wrap(drop_layer_fwd, layer)


def Tok2Vec(width, embed_size, preprocess=None):
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
        norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
        norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
        prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
        suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
        shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')

        embed = (norm | prefix | suffix | shape )
        embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
        tok2vec = (
            with_flatten(
                asarray(Model.ops, dtype='uint64')
                >> embed
                >> Maxout(width, width*4, pieces=3)
                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
                pad=4)
                >> uniqued(embed, column=5)
                >> drop_layer(
                    Residual(
                        (ExtractWindow(nW=1) >> BN(Maxout(width, width*3)))
                    )
                ) ** 4, pad=4
            )
        )
        if preprocess not in (False, None):
            tok2vec = preprocess >> tok2vec

@@ -297,7 +334,8 @@ def zero_init(model):


def doc2feats(cols=None):
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
    if cols is None:
        cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    def forward(docs, drop=0.):
        feats = []
        for doc in docs:

@@ -323,6 +361,37 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
    return vectors, backward


def fine_tune(embedding, combine=None):
    if combine is not None:
        raise NotImplementedError(
            "fine_tune currently only supports addition. Set combine=None")
    def fine_tune_fwd(docs_tokvecs, drop=0.):
        docs, tokvecs = docs_tokvecs
        lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')

        vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
        flat_tokvecs = embedding.ops.flatten(tokvecs)
        flat_vecs = embedding.ops.flatten(vecs)
        output = embedding.ops.unflatten(
            (model.mix[0] * flat_vecs + model.mix[1] * flat_tokvecs),
            lengths)

        def fine_tune_bwd(d_output, sgd=None):
            bp_vecs(d_output, sgd=sgd)
            flat_grad = model.ops.flatten(d_output)
            model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum()
            model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum()
            if sgd is not None:
                sgd(model._mem.weights, model._mem.gradient, key=model.id)
            return d_output
        return output, fine_tune_bwd
    model = wrap(fine_tune_fwd, embedding)
    model.mix = model._mem.add((model.id, 'mix'), (2,))
    model.mix.fill(1.)
    model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
    return model


@layerize
def flatten(seqs, drop=0.):
    if isinstance(seqs[0], numpy.ndarray):

@@ -369,6 +438,27 @@ def preprocess_doc(docs, drop=0.):
    vals = ops.allocate(keys.shape[0]) + 1
    return (keys, vals, lengths), None

def getitem(i):
    def getitem_fwd(X, drop=0.):
        return X[i], None
    return layerize(getitem_fwd)

def build_tagger_model(nr_class, token_vector_width, **cfg):
    embed_size = util.env_opt('embed_size', 7500)
    with Model.define_operators({'>>': chain, '+': add}):
        # Input: (doc, tensor) tuples
        private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())

        model = (
            fine_tune(private_tok2vec)
            >> with_flatten(
                Maxout(token_vector_width, token_vector_width)
                >> Softmax(nr_class, token_vector_width)
            )
        )
    model.nI = None
    return model


def build_text_classifier(nr_class, width=64, **cfg):
    nr_vector = cfg.get('nr_vector', 200)

@@ -383,7 +473,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
        >> _flatten_add_lengths
        >> with_getitem(0,
            uniqued(
                (embed_lower | embed_prefix | embed_suffix | embed_shape)
                (embed_lower | embed_prefix | embed_suffix | embed_shape)
                >> Maxout(width, width+(width//2)*3))
            >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
            >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))

@@ -404,7 +494,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
        >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
        >> logistic
    )


    model.lsuv = False
    return model

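The fine_tune layer above learns two scalar mixture weights: one for the freshly embedded vectors and one for the pretrained tensors passed in alongside the docs. A NumPy-only sketch of the forward mixing step (illustrative only; the names mix, flat_vecs and flat_tokvecs follow the code above):

import numpy

def mix_forward(flat_vecs, flat_tokvecs, mix):
    # Weighted sum of the private embeddings and the shared tensors;
    # mix starts as [1., 1.], matching model.mix.fill(1.) above.
    return mix[0] * flat_vecs + mix[1] * flat_tokvecs

flat_vecs = numpy.random.uniform(-1, 1, (5, 128)).astype('f')
flat_tokvecs = numpy.random.uniform(-1, 1, (5, 128)).astype('f')
mix = numpy.array([1., 1.], dtype='f')
output = mix_forward(flat_vecs, flat_tokvecs, mix)
assert output.shape == (5, 128)
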
spacy/about.py
@@ -3,7 +3,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

__title__ = 'spacy-nightly'
__version__ = '2.0.0a9'
__version__ = '2.0.0a10'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'

spacy/cli/convert.py
@@ -21,10 +21,10 @@ CONVERTERS = {
@plac.annotations(
    input_file=("input file", "positional", None, str),
    output_dir=("output directory for converted file", "positional", None, str),
    n_sents=("Number of sentences per doc", "option", "n", float),
    n_sents=("Number of sentences per doc", "option", "n", int),
    morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(cmd, input_file, output_dir, n_sents, morphology):
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions.

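The n_sents annotation now coerces the command-line value with int instead of float, so the option arrives as a usable integer. A minimal illustration of the difference (plac itself not required; the value below is a placeholder):

raw_value = "10"                    # what arrives from the command line
assert float(raw_value) == 10.0     # old annotation produced a float
assert int(raw_value) == 10         # new annotation produces an int
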
spacy/cli/train.py
@@ -91,15 +91,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
            for batch in minibatch(train_docs, size=batch_sizes):
                docs, golds = zip(*batch)
                nlp.update(docs, golds, sgd=optimizer,
                           drop=next(dropout_rates), losses=losses)
                           drop=next(dropout_rates), losses=losses,
                           update_tensors=True)
                pbar.update(sum(len(doc) for doc in docs))

        with nlp.use_params(optimizer.averages):
            util.set_env_log(False)
            epoch_model_path = output_path / ('model%d' % i)
            nlp.to_disk(epoch_model_path)
            with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
                dill.dump(nlp, file_, -1)
            nlp_loaded = lang_class(pipeline=pipeline)
            nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
            scorer = nlp_loaded.evaluate(

spacy/compat.py
@@ -46,19 +46,21 @@ is_osx = sys.platform == 'darwin'


if is_python2:
    import imp
    bytes_ = str
    unicode_ = unicode
    basestring_ = basestring
    input_ = raw_input
    json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
    json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False).decode('utf8')
    path2str = lambda path: str(path).decode('utf8')

elif is_python3:
    import importlib.util
    bytes_ = bytes
    unicode_ = str
    basestring_ = str
    input_ = input
    json_dumps = lambda data: ujson.dumps(data, indent=2)
    json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False)
    path2str = lambda path: str(path)

@@ -102,3 +104,12 @@ def normalize_string_keys(old):
    return new


def import_file(name, loc):
    loc = str(loc)
    if is_python2:
        return imp.load_source(name, loc)
    else:
        spec = importlib.util.spec_from_file_location(name, str(loc))
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return module

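A hedged usage sketch of the new import_file helper: it loads an arbitrary Python file as a module under a chosen name. The path and module name below are placeholders, not values from the commit.

from spacy.compat import import_file

# Load a plain Python file as a module, e.g. a custom language definition.
my_module = import_file('my_lang', '/tmp/my_lang.py')   # hypothetical path
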
spacy/lang/da/examples.py (new file, 18 lines)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.da.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple overvejer at købe et britisk statup for 1 milliard dollar",
    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
    "San Francisco overvejer at forbyde leverandørrobotter på fortov",
    "London er en stor by i Storbritannien"
]

spacy/lang/de/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.de.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
    "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
    "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
    "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
    "San Francisco erwägt Verbot von Lieferrobotern",
    "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
    "Wo bist du?",
    "Was ist die Hauptstadt von Deutschland?"
]

spacy/lang/en/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.en.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple is looking at buying U.K. startup for $1 billion",
    "Autonomous cars shift insurance liability toward manufacturers",
    "San Francisco considers banning sidewalk delivery robots",
    "London is a big city in the United Kingdom.",
    "Where are you?",
    "Who is the president of France?",
    "What is the capital of the United States?",
    "When was Barack Obama born?"
]

spacy/lang/es/examples.py (new file, 22 lines)
@@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.es.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
    "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
    "San Francisco analiza prohibir los robots delivery",
    "Londres es una gran ciudad del Reino Unido",
    "El gato come pescado",
    "Veo al hombre con el telescopio",
    "La araña come moscas",
    "El pingüino incuba en su nido"
]

spacy/lang/fr/examples.py (new file, 26 lines)
@@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.fr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple cherche a acheter une startup anglaise pour 1 milliard de dollard",
    "Les voitures autonomes voient leur assurances décalées vers les constructeurs",
    "San Francisco envisage d'interdire les robots coursiers",
    "Londres est une grande ville du Royaume-Uni",
    "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
    "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
    "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
    "Nouvelles attaques de Trump contre le maire de Londres",
    "Où es-tu ?",
    "Qui est le président de la France ?",
    "Où est la capitale des Etats-Unis ?",
    "Quand est né Barack Obama ?"
]

spacy/lang/he/examples.py (new file, 28 lines)
@@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.he.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
    'רה"מ הודיע כי יחרים טקס בחסותו',
    'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
    'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
    'סע לשלום, המפתחות בפנים.',
    'מלצר, פעמיים טורקי!',
    'ואהבת לרעך כמוך.',
    'היום נעשה משהו בלתי נשכח.',
    'איפה הילד?',
    'מיהו נשיא צרפת?',
    'מהי בירת ארצות הברית?',
    "איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
    'מה הייתה הדקה?',
    'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
]

spacy/lang/it/examples.py (new file, 18 lines)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.it.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
    "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
    "San Francisco prevede di bandire i robot di consegna porta a porta",
    "Londra è una grande città del Regno Unito."
]

spacy/lang/nb/examples.py (new file, 18 lines)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.nb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
    "Selvkjørende biler flytter forsikringsansvaret over på produsentene ",
    "San Francisco vurderer å forby robotbud på fortauene",
    "London er en stor by i Storbritannia."
]

spacy/lang/pl/examples.py (new file, 20 lines)
@@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.pl.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Poczuł przyjemną woń mocnej kawy.",
    "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
    "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
    "Nowy abonament pod lupą Komisji Europejskiej",
    "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
    "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
]

spacy/lang/pt/examples.py (new file, 18 lines)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.pt.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.",
    "São Francisco considera banir os robôs de entrega que andam pelas calçadas",
    "Londres é a maior cidade do Reino Unido"
]

spacy/lang/sv/examples.py (new file, 18 lines)
@@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.sv.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple överväger att köpa brittisk startup för 1 miljard dollar.",
    "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
    "San Fransisco överväger förbud mot leveransrobotar på trottoarer.",
    "London är en storstad i Storbritannien."
]

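Each of the new examples modules follows the pattern shown in its docstring. A hedged usage sketch for the English examples, assuming a tokenizer-only English pipeline is enough for the purpose:

from spacy.lang.en import English
from spacy.lang.en.examples import sentences

nlp = English()   # assumption: no trained model needed just to tokenize the examples
for doc in nlp.pipe(sentences):
    print(doc.text)
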
spacy/language.py
@@ -95,7 +95,7 @@ class BaseDefaults(object):
        meta = nlp.meta if nlp is not None else {}
        # Resolve strings, like "cnn", "lstm", etc
        pipeline = []
        for entry in cls.pipeline:
        for entry in meta.get('pipeline', []):
            if entry in disable or getattr(entry, 'name', entry) in disable:
                continue
            factory = cls.Defaults.factories[entry]

@@ -277,7 +277,8 @@ class Language(object):
    def make_doc(self, text):
        return self.tokenizer(text)

    def update(self, docs, golds, drop=0., sgd=None, losses=None):
    def update(self, docs, golds, drop=0., sgd=None, losses=None,
               update_tensors=False):
        """Update the models in the pipeline.

        docs (iterable): A batch of `Doc` objects.

@@ -304,14 +305,17 @@ class Language(object):
                grads[key] = (W, dW)
        pipes = list(self.pipeline[1:])
        random.shuffle(pipes)
        tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
        all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
        for proc in pipes:
            if not hasattr(proc, 'update'):
                continue
            tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
            d_tokvecses = proc.update((docs, tokvecses), golds,
                                      drop=drop, sgd=get_grads, losses=losses)
            if d_tokvecses is not None:
                bp_tokvecses(d_tokvecses, sgd=sgd)
            if update_tensors and d_tokvecses is not None:
                for i, d_tv in enumerate(d_tokvecses):
                    all_d_tokvecses[i] += d_tv
        bp_tokvecses(all_d_tokvecses, sgd=sgd)
        for key, (W, dW) in grads.items():
            sgd(W, dW, key=key)
        # Clear the tensor variable, to free GPU memory.

@@ -381,9 +385,18 @@ class Language(object):
        return optimizer

    def evaluate(self, docs_golds):
        docs, golds = zip(*docs_golds)
        scorer = Scorer()
        for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
        docs, golds = zip(*docs_golds)
        docs = list(docs)
        golds = list(golds)
        for pipe in self.pipeline:
            if not hasattr(pipe, 'pipe'):
                for doc in docs:
                    pipe(doc)
            else:
                docs = list(pipe.pipe(docs))
        assert len(docs) == len(golds)
        for doc, gold in zip(docs, golds):
            scorer.score(doc, gold)
            doc.tensor = None
        return scorer

@@ -417,11 +430,16 @@ class Language(object):
        except StopIteration:
            pass

    def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]):
    def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
             disable=[]):
        """Process texts as a stream, and yield `Doc` objects in order. Supports
        GIL-free multi-threading.

        texts (iterator): A sequence of texts to process.
        as_tuples (bool):
            If set to True, inputs should be a sequence of
            (text, context) tuples. Output will then be a sequence of
            (doc, context) tuples. Defaults to False.
        n_threads (int): The number of worker threads to use. If -1, OpenMP will
            decide how many to use at run time. Default is 2.
        batch_size (int): The number of texts to buffer.

@@ -433,7 +451,7 @@ class Language(object):
            >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
            >>>     assert doc.is_parsed
        """
        if tuples:
        if as_tuples:
            text_context1, text_context2 = itertools.tee(texts)
            texts = (tc[0] for tc in text_context1)
            contexts = (tc[1] for tc in text_context2)

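A hedged usage sketch of the renamed as_tuples argument on Language.pipe. Any Language subclass works; English and the example texts below are assumptions, not part of the commit:

from spacy.lang.en import English

nlp = English()
data = [("A sentence about London.", {"id": 1}),
        ("Another sentence.", {"id": 2})]
for doc, context in nlp.pipe(data, as_tuples=True):
    # Each (text, context) pair comes back as a (doc, context) pair, in order.
    print(context["id"], doc.text)
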
spacy/pipeline.pyx
@@ -42,7 +42,7 @@ from .compat import json_dumps

from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
from ._ml import build_text_classifier
from ._ml import build_text_classifier, build_tagger_model
from .parts_of_speech import X

@@ -138,7 +138,7 @@ class TokenVectorEncoder(BaseThincComponent):
    name = 'tensorizer'

    @classmethod
    def Model(cls, width=128, embed_size=7500, **cfg):
    def Model(cls, width=128, embed_size=4000, **cfg):
        """Create a new statistical model for the class.

        width (int): Output size of the model.

@@ -253,23 +253,25 @@ class NeuralTagger(BaseThincComponent):
        self.cfg = dict(cfg)

    def __call__(self, doc):
        tags = self.predict([doc.tensor])
        tags = self.predict(([doc], [doc.tensor]))
        self.set_annotations([doc], tags)
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        for docs in cytoolz.partition_all(batch_size, stream):
            docs = list(docs)
            tokvecs = [d.tensor for d in docs]
            tag_ids = self.predict(tokvecs)
            tag_ids = self.predict((docs, tokvecs))
            self.set_annotations(docs, tag_ids)
            yield from docs

    def predict(self, tokvecs):
        scores = self.model(tokvecs)
    def predict(self, docs_tokvecs):
        scores = self.model(docs_tokvecs)
        scores = self.model.ops.flatten(scores)
        guesses = scores.argmax(axis=1)
        if not isinstance(guesses, numpy.ndarray):
            guesses = guesses.get()
        tokvecs = docs_tokvecs[1]
        guesses = self.model.ops.unflatten(guesses,
                                           [tv.shape[0] for tv in tokvecs])
        return guesses

@@ -282,6 +284,8 @@ class NeuralTagger(BaseThincComponent):
        cdef Vocab vocab = self.vocab
        for i, doc in enumerate(docs):
            doc_tag_ids = batch_tag_ids[i]
            if hasattr(doc_tag_ids, 'get'):
                doc_tag_ids = doc_tag_ids.get()
            for j, tag_id in enumerate(doc_tag_ids):
                # Don't clobber preset POS tags
                if doc.c[j].tag == 0 and doc.c[j].pos == 0:

@@ -294,8 +298,7 @@ class NeuralTagger(BaseThincComponent):

        if self.model.nI is None:
            self.model.nI = tokvecs[0].shape[1]

        tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
        tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
        loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)

        d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)

@@ -346,10 +349,8 @@ class NeuralTagger(BaseThincComponent):

    @classmethod
    def Model(cls, n_tags, token_vector_width):
        return with_flatten(
            chain(Maxout(token_vector_width, token_vector_width),
                  Softmax(n_tags, token_vector_width)))
        return build_tagger_model(n_tags, token_vector_width)

    def use_params(self, params):
        with self.model.use_params(params):
            yield

@@ -432,7 +433,7 @@ class NeuralLabeller(NeuralTagger):

    @property
    def labels(self):
        return self.cfg.get('labels', {})
        return self.cfg.setdefault('labels', {})

    @labels.setter
    def labels(self, value):

@@ -455,10 +456,8 @@ class NeuralLabeller(NeuralTagger):

    @classmethod
    def Model(cls, n_tags, token_vector_width):
        return with_flatten(
            chain(Maxout(token_vector_width, token_vector_width),
                  Softmax(n_tags, token_vector_width)))
        return build_tagger_model(n_tags, token_vector_width)

    def get_loss(self, docs, golds, scores):
        scores = self.model.ops.flatten(scores)
        cdef int idx = 0

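NeuralTagger.predict now receives a (docs, tokvecs) pair and unflattens the flat argmax guesses back into one array per doc, using the per-doc lengths. A NumPy-only sketch of that unflatten step (illustrative; spaCy itself uses model.ops.unflatten):

import numpy

def unflatten(flat, lengths):
    out, start = [], 0
    for length in lengths:
        out.append(flat[start:start + length])
        start += length
    return out

flat_guesses = numpy.array([3, 1, 4, 1, 5, 9])
lengths = [2, 4]                      # e.g. [tv.shape[0] for tv in tokvecs]
per_doc = unflatten(flat_guesses, lengths)
assert [len(g) for g in per_doc] == lengths
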
spacy/strings.pyx
@@ -215,7 +215,10 @@ cdef class StringStore:
        path = util.ensure_path(path)
        with path.open('r') as file_:
            strings = ujson.load(file_)
        prev = list(self)
        self._reset_and_load(strings)
        for word in prev:
            self.add(word)
        return self

    def to_bytes(self, **exclude):

@@ -234,7 +237,10 @@ cdef class StringStore:
        RETURNS (StringStore): The `StringStore` object.
        """
        strings = ujson.loads(bytes_data)
        prev = list(self)
        self._reset_and_load(strings)
        for word in prev:
            self.add(word)
        return self

    def set_frozen(self, bint is_frozen):

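The from_disk/from_bytes changes keep strings that were already in the store alive after loading: the previous contents are listed before the reset and re-added afterwards. A hedged sketch of the behaviour this aims for, assuming the StringStore API as used elsewhere in the 2.0 alphas:

from spacy.strings import StringStore

store = StringStore([u'apple'])
data = store.to_bytes()

fresh = StringStore([u'apple'])
fresh.add(u'kiwi')            # added locally before loading
fresh.from_bytes(data)        # load resets the store, then re-adds 'kiwi'
assert u'kiwi' in fresh
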
spacy/syntax/_beam_utils.pyx (new file, 286 lines)
@@ -0,0 +1,286 @@
# cython: infer_types=True
# cython: profile=True
cimport numpy as np
import numpy
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.typedefs cimport hash_t, class_t
from thinc.extra.search cimport MaxViolation

from .transition_system cimport TransitionSystem, Transition
from .stateclass cimport StateClass
from ..gold cimport GoldParse
from ..tokens.doc cimport Doc


# These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
    dest = <StateClass>_dest
    src = <StateClass>_src
    moves = <const Transition*>_moves
    dest.clone(src)
    moves[clas].do(dest.c, moves[clas].label)


cdef int _check_final_state(void* _state, void* extra_args) except -1:
    return (<StateClass>_state).is_final()


def _cleanup(Beam beam):
    for i in range(beam.width):
        Py_XDECREF(<PyObject*>beam._states[i].content)
        Py_XDECREF(<PyObject*>beam._parents[i].content)


cdef hash_t _hash_state(void* _state, void* _) except 0:
    state = <StateClass>_state
    if state.c.is_final():
        return 1
    else:
        return state.c.hash()


cdef class ParserBeam(object):
    cdef public TransitionSystem moves
    cdef public object states
    cdef public object golds
    cdef public object beams
    cdef public object dones

    def __init__(self, TransitionSystem moves, states, golds,
                 int width, float density):
        self.moves = moves
        self.states = states
        self.golds = golds
        self.beams = []
        cdef Beam beam
        cdef StateClass state, st
        for state in states:
            beam = Beam(self.moves.n_moves, width, density)
            beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
            for i in range(beam.width):
                st = <StateClass>beam.at(i)
                st.c.offset = state.c.offset
            self.beams.append(beam)
        self.dones = [False] * len(self.beams)

    def __dealloc__(self):
        if self.beams is not None:
            for beam in self.beams:
                if beam is not None:
                    _cleanup(beam)

    @property
    def is_done(self):
        return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))

    def __getitem__(self, i):
        return self.beams[i]

    def __len__(self):
        return len(self.beams)

    def advance(self, scores, follow_gold=False):
        cdef Beam beam
        for i, beam in enumerate(self.beams):
            if beam.is_done or not scores[i].size or self.dones[i]:
                continue
            self._set_scores(beam, scores[i])
            if self.golds is not None:
                self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
            if follow_gold:
                beam.advance(_transition_state, NULL, <void*>self.moves.c)
            else:
                beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
            beam.check_done(_check_final_state, NULL)
            if beam.is_done and self.golds is not None:
                for j in range(beam.size):
                    state = <StateClass>beam.at(j)
                    if state.is_final():
                        try:
                            if self.moves.is_gold_parse(state, self.golds[i]):
                                beam._states[j].loss = 0.0
                            elif beam._states[j].loss == 0.0:
                                beam._states[j].loss = 1.0
                        except NotImplementedError:
                            break

    def _set_scores(self, Beam beam, float[:, ::1] scores):
        cdef float* c_scores = &scores[0, 0]
        cdef int nr_state = min(scores.shape[0], beam.size)
        cdef int nr_class = scores.shape[1]
        for i in range(nr_state):
            state = <StateClass>beam.at(i)
            if not state.is_final():
                for j in range(nr_class):
                    beam.scores[i][j] = c_scores[i * nr_class + j]
                self.moves.set_valid(beam.is_valid[i], state.c)
            else:
                for j in range(beam.nr_class):
                    beam.scores[i][j] = 0
                    beam.costs[i][j] = 0

    def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
        for i in range(beam.size):
            state = <StateClass>beam.at(i)
            if not state.c.is_final():
                self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
                if follow_gold:
                    for j in range(beam.nr_class):
                        if beam.costs[i][j] >= 1:
                            beam.is_valid[i][j] = 0


def get_token_ids(states, int n_tokens):
    cdef StateClass state
    cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
                                      dtype='int32', order='C')
    c_ids = <int*>ids.data
    for i, state in enumerate(states):
        if not state.is_final():
            state.c.set_context_tokens(c_ids, n_tokens)
        else:
            ids[i] = -1
        c_ids += ids.shape[1]
    return ids

nr_update = 0
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
                states, tokvecs, golds,
                state2vec, vec2scores,
                int width, float density,
                sgd=None, losses=None, drop=0.):
    global nr_update
    cdef MaxViolation violn
    nr_update += 1
    pbeam = ParserBeam(moves, states, golds,
                       width=width, density=density)
    gbeam = ParserBeam(moves, states, golds,
                       width=width, density=0.0)
    cdef StateClass state
    beam_maps = []
    backprops = []
    violns = [MaxViolation() for _ in range(len(states))]
    for t in range(max_steps):
        if pbeam.is_done and gbeam.is_done:
            break
        # The beam maps let us find the right row in the flattened scores
        # arrays for each state. States are identified by (example id, history).
        # We keep a different beam map for each step (since we'll have a flat
        # scores array for each step). The beam map will let us take the per-state
        # losses, and compute the gradient for each (step, state, class).
        beam_maps.append({})
        # Gather all states from the two beams in a list. Some states may occur
        # in both beams. To figure out which beam each state belonged to,
        # we keep two lists of indices, p_indices and g_indices
        states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
        if not states:
            break
        # Now that we have our flat list of states, feed them through the model
        token_ids = get_token_ids(states, nr_feature)
        vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
        scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)

        # Store the callbacks for the backward pass
        backprops.append((token_ids, bp_vectors, bp_scores))

        # Unpack the flat scores into lists for the two beams. The indices arrays
        # tell us which example and state the scores-row refers to.
        p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
        g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
        # Now advance the states in the beams. The gold beam is constrained to
        # follow only gold analyses.
        pbeam.advance(p_scores)
        gbeam.advance(g_scores, follow_gold=True)
        # Track the "maximum violation", to use in the update.
        for i, violn in enumerate(violns):
            violn.check_crf(pbeam[i], gbeam[i])
    histories = []
    losses = []
    for violn in violns:
        if violn.p_hist:
            histories.append(violn.p_hist + violn.g_hist)
            losses.append(violn.p_probs + violn.g_probs)
        else:
            histories.append([])
            losses.append([])
    states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses)
    return states_d_scores, backprops[:len(states_d_scores)]


def get_states(pbeams, gbeams, beam_map, nr_update):
    seen = {}
    states = []
    p_indices = []
    g_indices = []
    cdef Beam pbeam, gbeam
    assert len(pbeams) == len(gbeams)
    for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
        p_indices.append([])
        g_indices.append([])
        for i in range(pbeam.size):
            state = <StateClass>pbeam.at(i)
            if not state.is_final():
                key = tuple([eg_id] + pbeam.histories[i])
                assert key not in seen, (key, seen)
                seen[key] = len(states)
                p_indices[-1].append(len(states))
                states.append(state)
        beam_map.update(seen)
        for i in range(gbeam.size):
            state = <StateClass>gbeam.at(i)
            if not state.is_final():
                key = tuple([eg_id] + gbeam.histories[i])
                if key in seen:
                    g_indices[-1].append(seen[key])
                else:
                    g_indices[-1].append(len(states))
                    beam_map[key] = len(states)
                    states.append(state)
    p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices]
    g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices]
    return states, p_idx, g_idx


def get_gradient(nr_class, beam_maps, histories, losses):
    """
    The global model assigns a loss to each parse. The beam scores
    are additive, so the same gradient is applied to each action
    in the history. This gives the gradient of a single *action*
    for a beam state -- so we have "the gradient of loss for taking
    action i given history H."

    Histories: Each history is a list of actions
        Each candidate has a history
        Each beam has multiple candidates
        Each batch has multiple beams
    So history is a list of lists of lists of ints
    """
    nr_step = len(beam_maps)
    grads = []
    nr_step = 0
    for eg_id, hists in enumerate(histories):
        for loss, hist in zip(losses[eg_id], hists):
            if loss != 0.0 and not numpy.isnan(loss):
                nr_step = max(nr_step, len(hist))
    for i in range(nr_step):
        grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
    assert len(histories) == len(losses)
    for eg_id, hists in enumerate(histories):
        for loss, hist in zip(losses[eg_id], hists):
            if loss == 0.0 or numpy.isnan(loss):
                continue
            key = tuple([eg_id])
            # Adjust loss for length
            avg_loss = loss / len(hist)
            loss += avg_loss * (nr_step - len(hist))
            for j, clas in enumerate(hist):
                i = beam_maps[j][key]
                # In step j, at state i action clas
                # resulted in loss
                grads[j][i, clas] += loss
                key = key + tuple([clas])
    return grads

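get_gradient spreads each candidate's sequence-level loss over every action in its history, one gradient row per step, indexed through the beam maps. A much-simplified sketch of that idea for a single example and a single candidate (illustrative only, not the real indexing):

import numpy

nr_class = 4
history = [2, 0, 3]          # actions taken by one beam candidate
loss = 0.5                   # sequence-level loss for that candidate

# One gradient array per step; the same loss lands on the action taken at each step.
grads = [numpy.zeros((1, nr_class), dtype='f') for _ in history]
for step, action in enumerate(history):
    grads[step][0, action] += loss

assert all(abs(g.sum() - 0.5) < 1e-6 for g in grads)
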
spacy/syntax/_state.pxd
@@ -37,6 +37,7 @@ cdef cppclass StateC:
        this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
        this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
        this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
        this.offset = 0
        cdef int i
        for i in range(length + (PADDING * 2)):
            this._ents[i].end = -1

@@ -73,7 +74,16 @@ cdef cppclass StateC:
        free(this.shifted - PADDING)

    void set_context_tokens(int* ids, int n) nogil:
        if n == 13:
        if n == 8:
            ids[0] = this.B(0)
            ids[1] = this.B(1)
            ids[2] = this.S(0)
            ids[3] = this.S(1)
            ids[4] = this.H(this.S(0))
            ids[5] = this.L(this.B(0), 1)
            ids[6] = this.L(this.S(0), 2)
            ids[7] = this.R(this.S(0), 1)
        elif n == 13:
            ids[0] = this.B(0)
            ids[1] = this.B(1)
            ids[2] = this.S(0)

spacy/syntax/arc_eager.pyx
@@ -351,6 +351,20 @@ cdef class ArcEager(TransitionSystem):
        def __get__(self):
            return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)

    def is_gold_parse(self, StateClass state, GoldParse gold):
        predicted = set()
        truth = set()
        for i in range(gold.length):
            if gold.cand_to_gold[i] is None:
                continue
            if state.safe_get(i).dep:
                predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
            else:
                predicted.add((i, state.H(i), 'ROOT'))
            id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
            truth.add((id_, head, dep))
        return truth == predicted

    def has_gold(self, GoldParse gold, start=0, end=None):
        end = end or len(gold.heads)
        if all([tag is None for tag in gold.heads[start:end]]):

@@ -385,6 +399,7 @@ cdef class ArcEager(TransitionSystem):
        for i in range(self.n_moves):
            if self.c[i].move == move and self.c[i].label == label:
                return self.c[i]
        return Transition(clas=0, move=MISSING, label=0)

    def move_name(self, int move, attr_t label):
        label_str = self.strings[label]

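is_gold_parse compares the predicted parse against the gold annotation as sets of (child, head, label) arcs. A plain-Python sketch of that comparison (illustrative only; the real method reads the arcs from StateClass and GoldParse):

def arcs_match(predicted_arcs, gold_arcs):
    # Both sides are collections of (child, head, label) triples;
    # the parse is "gold" only if the sets are identical.
    return set(predicted_arcs) == set(gold_arcs)

gold = [(0, 1, 'nsubj'), (1, 1, 'ROOT'), (2, 1, 'dobj')]
pred = [(2, 1, 'dobj'), (0, 1, 'nsubj'), (1, 1, 'ROOT')]
assert arcs_match(pred, gold)
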
spacy/syntax/beam_parser.pyx
@@ -107,7 +107,7 @@ cdef class BeamParser(Parser):
        # The non-monotonic oracle makes it difficult to ensure final costs are
        # correct. Therefore do final correction
        for i in range(pred.size):
            if is_gold(<StateClass>pred.at(i), gold_parse, self.moves.strings):
            if self.moves.is_gold_parse(<StateClass>pred.at(i), gold_parse):
                pred._states[i].loss = 0.0
            elif pred._states[i].loss == 0.0:
                pred._states[i].loss = 1.0

@@ -213,7 +213,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
        if not pred._states[i].is_done or pred._states[i].loss == 0:
            continue
        state = <StateClass>pred.at(i)
        if is_gold(state, gold_parse, moves.strings) == True:
        if moves.is_gold_parse(state, gold_parse) == True:
            for dep in gold_parse.orig_annot:
                print(dep[1], dep[3], dep[4])
            print("Cost", pred._states[i].loss)

@@ -227,7 +227,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
        if not gold._states[i].is_done:
            continue
        state = <StateClass>gold.at(i)
        if is_gold(state, gold_parse, moves.strings) == False:
        if moves.is_gold(state, gold_parse) == False:
            print("Truth")
            for dep in gold_parse.orig_annot:
                print(dep[1], dep[3], dep[4])

@@ -237,16 +237,3 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
        raise Exception("Gold parse is not gold-standard")


def is_gold(StateClass state, GoldParse gold, StringStore strings):
    predicted = set()
    truth = set()
    for i in range(gold.length):
        if gold.cand_to_gold[i] is None:
            continue
        if state.safe_get(i).dep:
            predicted.add((i, state.H(i), strings[state.safe_get(i).dep]))
        else:
            predicted.add((i, state.H(i), 'ROOT'))
        id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
        truth.add((id_, head, dep))
    return truth == predicted

@@ -14,8 +14,4 @@ cdef class Parser:
    cdef readonly TransitionSystem moves
    cdef readonly object cfg

    cdef void _parse_step(self, StateC* state,
            const float* feat_weights,
            int nr_class, int nr_feat, int nr_piece) nogil

    #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil

@ -37,14 +37,18 @@ from preshed.maps cimport MapStruct
|
|||
from preshed.maps cimport map_get
|
||||
|
||||
from thinc.api import layerize, chain, noop, clone
|
||||
from thinc.neural import Model, Affine, ELU, ReLu, Maxout
|
||||
from thinc.neural import Model, Affine, ReLu, Maxout
|
||||
from thinc.neural._classes.batchnorm import BatchNorm as BN
|
||||
from thinc.neural._classes.selu import SELU
|
||||
from thinc.neural._classes.layernorm import LayerNorm
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.util import get_array_module
|
||||
|
||||
from .. import util
|
||||
from ..util import get_async, get_cuda_stream
|
||||
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
||||
from .._ml import Tok2Vec, doc2feats, rebatch
|
||||
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
|
||||
from .._ml import Residual, drop_layer
|
||||
from ..compat import json_dumps
|
||||
|
||||
from . import _parse_features
|
||||
|
@ -59,8 +63,10 @@ from ..structs cimport TokenC
|
|||
from ..tokens.doc cimport Doc
|
||||
from ..strings cimport StringStore
|
||||
from ..gold cimport GoldParse
|
||||
from ..attrs cimport TAG, DEP
|
||||
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
|
||||
from . import _beam_utils
|
||||
|
||||
USE_FINE_TUNE = True
|
||||
|
||||
def get_templates(*args, **kwargs):
|
||||
return []
|
||||
|
@ -232,11 +238,14 @@ cdef class Parser:
|
|||
Base class of the DependencyParser and EntityRecognizer.
|
||||
"""
|
||||
@classmethod
|
||||
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg):
|
||||
def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg):
|
||||
depth = util.env_opt('parser_hidden_depth', depth)
|
||||
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
||||
hidden_width = util.env_opt('hidden_width', hidden_width)
|
||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
||||
embed_size = util.env_opt('embed_size', 4000)
|
||||
tensors = fine_tune(Tok2Vec(token_vector_width, embed_size,
|
||||
preprocess=doc2feats()))
|
||||
if parser_maxout_pieces == 1:
|
||||
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
||||
nF=cls.nr_feature,
|
||||
|
@ -248,15 +257,10 @@ cdef class Parser:
|
|||
nI=token_vector_width)
|
||||
|
||||
with Model.use_device('cpu'):
|
||||
if depth == 0:
|
||||
upper = chain()
|
||||
upper.is_noop = True
|
||||
else:
|
||||
upper = chain(
|
||||
clone(Maxout(hidden_width), (depth-1)),
|
||||
zero_init(Affine(nr_class, drop_factor=0.0))
|
||||
)
|
||||
upper.is_noop = False
|
||||
upper = chain(
|
||||
clone(Maxout(hidden_width), (depth-1)),
|
||||
zero_init(Affine(nr_class, drop_factor=0.0))
|
||||
)
|
||||
# TODO: This is an unfortunate hack atm!
|
||||
# Used to set input dimensions in network.
|
||||
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
|
||||
|
@ -268,7 +272,7 @@ cdef class Parser:
|
|||
'hidden_width': hidden_width,
|
||||
'maxout_pieces': parser_maxout_pieces
|
||||
}
|
||||
return (lower, upper), cfg
|
||||
return (tensors, lower, upper), cfg
|
||||
|
||||
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
||||
"""
|
||||
|
@ -294,6 +298,10 @@ cdef class Parser:
|
|||
self.moves = self.TransitionSystem(self.vocab.strings, {})
|
||||
else:
|
||||
self.moves = moves
|
||||
if 'beam_width' not in cfg:
|
||||
cfg['beam_width'] = util.env_opt('beam_width', 1)
|
||||
if 'beam_density' not in cfg:
|
||||
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
|
||||
self.cfg = cfg
|
||||
if 'actions' in self.cfg:
|
||||
for action, labels in self.cfg.get('actions', {}).items():
|
||||
|
@ -316,7 +324,7 @@ cdef class Parser:
|
|||
if beam_width is None:
|
||||
beam_width = self.cfg.get('beam_width', 1)
|
||||
if beam_density is None:
|
||||
beam_density = self.cfg.get('beam_density', 0.001)
|
||||
beam_density = self.cfg.get('beam_density', 0.0)
|
||||
cdef Beam beam
|
||||
if beam_width == 1:
|
||||
states = self.parse_batch([doc], [doc.tensor])
|
||||
|
@ -332,7 +340,7 @@ cdef class Parser:
|
|||
return output
|
||||
|
||||
def pipe(self, docs, int batch_size=1000, int n_threads=2,
|
||||
beam_width=1, beam_density=0.001):
|
||||
beam_width=None, beam_density=None):
|
||||
"""
|
||||
Process a stream of documents.
|
||||
|
||||
|
@ -344,17 +352,23 @@ cdef class Parser:
|
|||
The number of threads with which to work on the buffer in parallel.
|
||||
Yields (Doc): Documents, in order.
|
||||
"""
|
||||
cdef StateClass parse_state
|
||||
if beam_width is None:
|
||||
beam_width = self.cfg.get('beam_width', 1)
|
||||
if beam_density is None:
|
||||
beam_density = self.cfg.get('beam_density', 0.0)
|
||||
cdef Doc doc
|
||||
queue = []
|
||||
cdef Beam beam
|
||||
for docs in cytoolz.partition_all(batch_size, docs):
|
||||
docs = list(docs)
|
||||
tokvecs = [d.tensor for d in docs]
|
||||
tokvecs = [doc.tensor for doc in docs]
|
||||
if beam_width == 1:
|
||||
parse_states = self.parse_batch(docs, tokvecs)
|
||||
else:
|
||||
parse_states = self.beam_parse(docs, tokvecs,
|
||||
beam_width=beam_width, beam_density=beam_density)
|
||||
beams = self.beam_parse(docs, tokvecs,
|
||||
beam_width=beam_width, beam_density=beam_density)
|
||||
parse_states = []
|
||||
for beam in beams:
|
||||
parse_states.append(<StateClass>beam.at(0))
|
||||
self.set_annotations(docs, parse_states)
|
||||
yield from docs
|
||||
|
||||
|
@ -369,8 +383,12 @@ cdef class Parser:
|
|||
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
if isinstance(tokvecses, np.ndarray):
|
||||
tokvecses = [tokvecses]
|
||||
|
||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||
if USE_FINE_TUNE:
|
||||
tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
||||
|
||||
nr_state = len(docs)
|
||||
nr_class = self.moves.n_moves
|
||||
|
@ -394,27 +412,20 @@ cdef class Parser:
|
|||
cdef np.ndarray scores
|
||||
c_token_ids = <int*>token_ids.data
|
||||
c_is_valid = <int*>is_valid.data
|
||||
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
|
||||
while not next_step.empty():
|
||||
if not has_hidden:
|
||||
for i in cython.parallel.prange(
|
||||
next_step.size(), num_threads=6, nogil=True):
|
||||
self._parse_step(next_step[i],
|
||||
feat_weights, nr_class, nr_feat, nr_piece)
|
||||
else:
|
||||
for i in range(next_step.size()):
|
||||
st = next_step[i]
|
||||
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
|
||||
self.moves.set_valid(&c_is_valid[i*nr_class], st)
|
||||
vectors = state2vec(token_ids[:next_step.size()])
|
||||
scores = vec2scores(vectors)
|
||||
c_scores = <float*>scores.data
|
||||
for i in range(next_step.size()):
|
||||
st = next_step[i]
|
||||
guess = arg_max_if_valid(
|
||||
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
|
||||
action = self.moves.c[guess]
|
||||
action.do(st, action.label)
|
||||
for i in range(next_step.size()):
|
||||
st = next_step[i]
|
||||
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
|
||||
self.moves.set_valid(&c_is_valid[i*nr_class], st)
|
||||
vectors = state2vec(token_ids[:next_step.size()])
|
||||
scores = vec2scores(vectors)
|
||||
c_scores = <float*>scores.data
|
||||
for i in range(next_step.size()):
|
||||
st = next_step[i]
|
||||
guess = arg_max_if_valid(
|
||||
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
|
||||
action = self.moves.c[guess]
|
||||
action.do(st, action.label)
|
||||
this_step, next_step = next_step, this_step
|
||||
next_step.clear()
|
||||
for st in this_step:
|
||||
|
@ -422,18 +433,22 @@ cdef class Parser:
|
|||
next_step.push_back(st)
|
||||
return states
|
||||
|
||||
def beam_parse(self, docs, tokvecses, int beam_width=8, float beam_density=0.001):
|
||||
def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001):
|
||||
cdef Beam beam
|
||||
cdef np.ndarray scores
|
||||
cdef Doc doc
|
||||
cdef int nr_class = self.moves.n_moves
|
||||
cdef StateClass stcls, output
|
||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||
if USE_FINE_TUNE:
|
||||
tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
|
||||
cuda_stream = get_cuda_stream()
|
||||
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
|
||||
cuda_stream, 0.0)
|
||||
beams = []
|
||||
cdef int offset = 0
|
||||
cdef int j = 0
|
||||
cdef int k
|
||||
for doc in docs:
|
||||
beam = Beam(nr_class, beam_width, min_density=beam_density)
|
||||
beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
|
||||
|
@ -446,44 +461,32 @@ cdef class Parser:
|
|||
states = []
|
||||
for i in range(beam.size):
|
||||
stcls = <StateClass>beam.at(i)
|
||||
states.append(stcls)
|
||||
# This way we avoid having to score finalized states
|
||||
# We do have to take care to keep indexes aligned, though
|
||||
if not stcls.is_final():
|
||||
states.append(stcls)
|
||||
token_ids = self.get_token_ids(states)
|
||||
vectors = state2vec(token_ids)
|
||||
scores = vec2scores(vectors)
|
||||
j = 0
|
||||
c_scores = <float*>scores.data
|
||||
for i in range(beam.size):
|
||||
stcls = <StateClass>beam.at(i)
|
||||
if not stcls.is_final():
|
||||
self.moves.set_valid(beam.is_valid[i], stcls.c)
|
||||
for j in range(nr_class):
|
||||
beam.scores[i][j] = scores[i, j]
|
||||
for k in range(nr_class):
|
||||
beam.scores[i][k] = c_scores[j * scores.shape[1] + k]
|
||||
j += 1
|
||||
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
|
||||
beam.check_done(_check_final_state, NULL)
|
||||
beams.append(beam)
|
||||
return beams
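Editor's note on the beam scoring loop above (not part of the patch): only unfinished states are scored, so the row index into `scores` (the running `j`) advances more slowly than the beam slot index `i`. A standalone NumPy sketch of that index alignment, with invented shapes:

```python
import numpy

nr_class = 4
is_final = [False, True, False]          # beam slots; slot 1 is a finished state
live = [i for i, done in enumerate(is_final) if not done]
scores = numpy.random.uniform(-1, 1, (len(live), nr_class)).astype('f')

beam_scores = numpy.zeros((len(is_final), nr_class), dtype='f')
j = 0                                    # row into `scores`; advances only for live states
for i, done in enumerate(is_final):
    if not done:
        beam_scores[i] = scores[j]       # same alignment as beam.scores[i][k] = c_scores[j*width + k]
        j += 1
assert j == len(live)
```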
|
||||
|
||||
cdef void _parse_step(self, StateC* state,
|
||||
const float* feat_weights,
|
||||
int nr_class, int nr_feat, int nr_piece) nogil:
|
||||
'''This only works with no hidden layers -- fast but inaccurate'''
|
||||
#for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
|
||||
# self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
|
||||
token_ids = <int*>calloc(nr_feat, sizeof(int))
|
||||
scores = <float*>calloc(nr_class * nr_piece, sizeof(float))
|
||||
is_valid = <int*>calloc(nr_class, sizeof(int))
|
||||
|
||||
state.set_context_tokens(token_ids, nr_feat)
|
||||
sum_state_features(scores,
|
||||
feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece)
|
||||
self.moves.set_valid(is_valid, state)
|
||||
guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece)
|
||||
action = self.moves.c[guess]
|
||||
action.do(state, action.label)
|
||||
|
||||
free(is_valid)
|
||||
free(scores)
|
||||
free(token_ids)
|
||||
|
||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
|
||||
return self.update_beam(docs_tokvecs, golds,
|
||||
self.cfg['beam_width'], self.cfg['beam_density'],
|
||||
drop=drop, sgd=sgd, losses=losses)
|
||||
if losses is not None and self.name not in losses:
|
||||
losses[self.name] = 0.
|
||||
docs, tokvec_lists = docs_tokvecs
|
||||
|
@ -491,6 +494,10 @@ cdef class Parser:
|
|||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||
docs = [docs]
|
||||
golds = [golds]
|
||||
if USE_FINE_TUNE:
|
||||
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
||||
my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
|
||||
tokvecs += my_tokvecs
|
||||
|
||||
cuda_stream = get_cuda_stream()
|
||||
|
||||
|
@ -517,13 +524,14 @@ cdef class Parser:
|
|||
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
|
||||
|
||||
d_scores = self.get_batch_loss(states, golds, scores)
|
||||
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
|
||||
d_scores /= len(docs)
|
||||
d_vector = bp_scores(d_scores, sgd=sgd)
|
||||
if drop != 0:
|
||||
d_vector *= mask
|
||||
|
||||
if isinstance(self.model[0].ops, CupyOps) \
|
||||
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
||||
# Move token_ids and d_vector to CPU, asynchronously
|
||||
# Move token_ids and d_vector to GPU, asynchronously
|
||||
backprops.append((
|
||||
get_async(cuda_stream, token_ids),
|
||||
get_async(cuda_stream, d_vector),
|
||||
|
@ -540,7 +548,62 @@ cdef class Parser:
|
|||
break
|
||||
self._make_updates(d_tokvecs,
|
||||
backprops, sgd, cuda_stream)
|
||||
return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
||||
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
||||
if USE_FINE_TUNE:
|
||||
bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||
return d_tokvecs
|
||||
|
||||
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
|
||||
drop=0., sgd=None, losses=None):
|
||||
if width is None:
|
||||
width = self.cfg.get('beam_width', 2)
|
||||
if density is None:
|
||||
density = self.cfg.get('beam_density', 0.0)
|
||||
if losses is not None and self.name not in losses:
|
||||
losses[self.name] = 0.
|
||||
docs, tokvecs = docs_tokvecs
|
||||
lengths = [len(d) for d in docs]
|
||||
assert min(lengths) >= 1
|
||||
tokvecs = self.model[0].ops.flatten(tokvecs)
|
||||
if USE_FINE_TUNE:
|
||||
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
|
||||
my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
|
||||
tokvecs += my_tokvecs
|
||||
|
||||
states = self.moves.init_batch(docs)
|
||||
for gold in golds:
|
||||
self.moves.preprocess_gold(gold)
|
||||
|
||||
cuda_stream = get_cuda_stream()
|
||||
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
|
||||
|
||||
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
|
||||
states, tokvecs, golds,
|
||||
state2vec, vec2scores,
|
||||
width, density,
|
||||
sgd=sgd, drop=drop, losses=losses)
|
||||
backprop_lower = []
|
||||
cdef float batch_size = len(docs)
|
||||
for i, d_scores in enumerate(states_d_scores):
|
||||
d_scores /= batch_size
|
||||
if losses is not None:
|
||||
losses[self.name] += (d_scores**2).sum()
|
||||
ids, bp_vectors, bp_scores = backprops[i]
|
||||
d_vector = bp_scores(d_scores, sgd=sgd)
|
||||
if isinstance(self.model[0].ops, CupyOps) \
|
||||
and not isinstance(ids, state2vec.ops.xp.ndarray):
|
||||
backprop_lower.append((
|
||||
get_async(cuda_stream, ids),
|
||||
get_async(cuda_stream, d_vector),
|
||||
bp_vectors))
|
||||
else:
|
||||
backprop_lower.append((ids, d_vector, bp_vectors))
|
||||
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
||||
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
|
||||
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
|
||||
if USE_FINE_TUNE:
|
||||
bp_my_tokvecs(d_tokvecs, sgd=sgd)
|
||||
return d_tokvecs
|
||||
|
||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||
"""Make a square batch, of length equal to the shortest doc. A long
|
||||
|
@@ -585,14 +648,10 @@ cdef class Parser:
        xp = get_array_module(d_tokvecs)
        for ids, d_vector, bp_vector in backprops:
            d_state_features = bp_vector(d_vector, sgd=sgd)
            active_feats = ids * (ids >= 0)
            active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1))
            if hasattr(xp, 'scatter_add'):
                xp.scatter_add(d_tokvecs,
                    ids, d_state_features * active_feats)
            else:
                xp.add.at(d_tokvecs,
                    ids, d_state_features * active_feats)
            mask = ids >= 0
            d_state_features *= mask.reshape(ids.shape + (1,))
            self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
                d_state_features)
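Editor's note (not part of the patch): the gradient step above is a standard masked scatter-add. Padded feature slots carry token id `-1`, their gradients are zeroed, and everything else is summed back into the token-vector gradient at the original indices. A standalone NumPy sketch with invented shapes:

```python
import numpy

n_states, n_feats, width = 2, 3, 4
d_tokvecs = numpy.zeros((5, width), dtype='f')            # gradient w.r.t. the token vectors
ids = numpy.array([[0, 2, -1], [1, 1, -1]])               # -1 marks padding features
d_state_features = numpy.ones((n_states, n_feats, width), dtype='f')

mask = ids >= 0
d_state_features *= mask.reshape(ids.shape + (1,))        # zero gradients of padded slots
# numpy.add.at plays the role of ops.scatter_add: repeated ids accumulate.
numpy.add.at(d_tokvecs, ids * mask, d_state_features)

assert d_tokvecs[0].sum() == width        # token 0 contributed once
assert d_tokvecs[1].sum() == 2 * width    # token 1 contributed twice
```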
|
||||
|
||||
@property
|
||||
def move_names(self):
|
||||
|
@ -603,12 +662,12 @@ cdef class Parser:
|
|||
return names
|
||||
|
||||
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
|
||||
lower, upper = self.model
|
||||
_, lower, upper = self.model
|
||||
state2vec = precompute_hiddens(batch_size, tokvecs,
|
||||
lower, stream, drop=dropout)
|
||||
return state2vec, upper
|
||||
|
||||
nr_feature = 13
|
||||
nr_feature = 8
|
||||
|
||||
def get_token_ids(self, states):
|
||||
cdef StateClass state
|
||||
|
@ -693,10 +752,12 @@ cdef class Parser:
|
|||
|
||||
def to_disk(self, path, **exclude):
|
||||
serializers = {
|
||||
'lower_model': lambda p: p.open('wb').write(
|
||||
'tok2vec_model': lambda p: p.open('wb').write(
|
||||
self.model[0].to_bytes()),
|
||||
'upper_model': lambda p: p.open('wb').write(
|
||||
'lower_model': lambda p: p.open('wb').write(
|
||||
self.model[1].to_bytes()),
|
||||
'upper_model': lambda p: p.open('wb').write(
|
||||
self.model[2].to_bytes()),
|
||||
'vocab': lambda p: self.vocab.to_disk(p),
|
||||
'moves': lambda p: self.moves.to_disk(p, strings=False),
|
||||
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
|
||||
|
@ -717,24 +778,29 @@ cdef class Parser:
|
|||
self.model, cfg = self.Model(**self.cfg)
|
||||
else:
|
||||
cfg = {}
|
||||
with (path / 'lower_model').open('rb') as file_:
|
||||
with (path / 'tok2vec_model').open('rb') as file_:
|
||||
bytes_data = file_.read()
|
||||
self.model[0].from_bytes(bytes_data)
|
||||
with (path / 'upper_model').open('rb') as file_:
|
||||
with (path / 'lower_model').open('rb') as file_:
|
||||
bytes_data = file_.read()
|
||||
self.model[1].from_bytes(bytes_data)
|
||||
with (path / 'upper_model').open('rb') as file_:
|
||||
bytes_data = file_.read()
|
||||
self.model[2].from_bytes(bytes_data)
|
||||
self.cfg.update(cfg)
|
||||
return self
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
serializers = OrderedDict((
|
||||
('lower_model', lambda: self.model[0].to_bytes()),
|
||||
('upper_model', lambda: self.model[1].to_bytes()),
|
||||
('tok2vec_model', lambda: self.model[0].to_bytes()),
|
||||
('lower_model', lambda: self.model[1].to_bytes()),
|
||||
('upper_model', lambda: self.model[2].to_bytes()),
|
||||
('vocab', lambda: self.vocab.to_bytes()),
|
||||
('moves', lambda: self.moves.to_bytes(strings=False)),
|
||||
('cfg', lambda: ujson.dumps(self.cfg))
|
||||
))
|
||||
if 'model' in exclude:
|
||||
exclude['tok2vec_model'] = True
|
||||
exclude['lower_model'] = True
|
||||
exclude['upper_model'] = True
|
||||
exclude.pop('model')
|
||||
|
@ -745,6 +811,7 @@ cdef class Parser:
|
|||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
||||
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
||||
('tok2vec_model', lambda b: None),
|
||||
('lower_model', lambda b: None),
|
||||
('upper_model', lambda b: None)
|
||||
))
|
||||
|
@ -754,10 +821,12 @@ cdef class Parser:
|
|||
self.model, cfg = self.Model(self.moves.n_moves)
|
||||
else:
|
||||
cfg = {}
|
||||
if 'tok2vec_model' in msg:
|
||||
self.model[0].from_bytes(msg['tok2vec_model'])
|
||||
if 'lower_model' in msg:
|
||||
self.model[0].from_bytes(msg['lower_model'])
|
||||
self.model[1].from_bytes(msg['lower_model'])
|
||||
if 'upper_model' in msg:
|
||||
self.model[1].from_bytes(msg['upper_model'])
|
||||
self.model[2].from_bytes(msg['upper_model'])
|
||||
self.cfg.update(cfg)
|
||||
return self
|
||||
|
||||
|
|
|
@ -99,6 +99,9 @@ cdef class TransitionSystem:
|
|||
def preprocess_gold(self, GoldParse gold):
|
||||
raise NotImplementedError
|
||||
|
||||
def is_gold_parse(self, StateClass state, GoldParse gold):
|
||||
raise NotImplementedError
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
raise NotImplementedError
|
||||
|
||||
|
@ -107,6 +110,8 @@ cdef class TransitionSystem:
|
|||
|
||||
def is_valid(self, StateClass stcls, move_name):
|
||||
action = self.lookup_transition(move_name)
|
||||
if action.move == 0:
|
||||
return False
|
||||
return action.is_valid(stcls.c, action.label)
|
||||
|
||||
cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
|
||||
|
|
|
@ -78,3 +78,16 @@ def test_predict_doc_beam(parser, tok2vec, model, doc):
|
|||
parser(doc, beam_width=32, beam_density=0.001)
|
||||
for word in doc:
|
||||
print(word.text, word.head, word.dep_)
|
||||
|
||||
|
||||
def test_update_doc_beam(parser, tok2vec, model, doc, gold):
|
||||
parser.model = model
|
||||
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
|
||||
d_tokvecs = parser.update_beam(([doc], tokvecs), [gold])
|
||||
assert d_tokvecs[0].shape == tokvecs[0].shape
|
||||
def optimize(weights, gradient, key=None):
|
||||
weights -= 0.001 * gradient
|
||||
bp_tokvecs(d_tokvecs, sgd=optimize)
|
||||
assert d_tokvecs[0].sum() == 0.
|
||||
|
||||
|
||||
|
|
87 spacy/tests/parser/test_nn_beam.py Normal file
|
@@ -0,0 +1,87 @@
from __future__ import unicode_literals
import pytest
import numpy
from thinc.api import layerize

from ...vocab import Vocab
from ...syntax.arc_eager import ArcEager
from ...tokens import Doc
from ...gold import GoldParse
from ...syntax._beam_utils import ParserBeam, update_beam
from ...syntax.stateclass import StateClass


@pytest.fixture
def vocab():
    return Vocab()


@pytest.fixture
def moves(vocab):
    aeager = ArcEager(vocab.strings, {})
    aeager.add_action(2, 'nsubj')
    aeager.add_action(3, 'dobj')
    aeager.add_action(2, 'aux')
    return aeager


@pytest.fixture
def docs(vocab):
    return [Doc(vocab, words=['Rats', 'bite', 'things'])]


@pytest.fixture
def states(docs):
    return [StateClass(doc) for doc in docs]


@pytest.fixture
def tokvecs(docs, vector_size):
    output = []
    for doc in docs:
        vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size))
        output.append(numpy.asarray(vec))
    return output


@pytest.fixture
def golds(docs):
    return [GoldParse(doc) for doc in docs]


@pytest.fixture
def batch_size(docs):
    return len(docs)


@pytest.fixture
def beam_width():
    return 4


@pytest.fixture
def vector_size():
    return 6


@pytest.fixture
def beam(moves, states, golds, beam_width):
    return ParserBeam(moves, states, golds, width=beam_width, density=0.0)


@pytest.fixture
def scores(moves, batch_size, beam_width):
    return [
        numpy.asarray(
            numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)),
            dtype='f')
        for _ in range(batch_size)]


def test_create_beam(beam):
    pass


def test_beam_advance(beam, scores):
    beam.advance(scores)


def test_beam_advance_too_few_scores(beam, scores):
    with pytest.raises(IndexError):
        beam.advance(scores[:-1])
12 spacy/tests/regression/test_issue1257.py Normal file
|
@@ -0,0 +1,12 @@
'''Test tokens compare correctly'''
from __future__ import unicode_literals

from ..util import get_doc
from ...vocab import Vocab


def test_issue1257():
    doc1 = get_doc(Vocab(), ['a', 'b', 'c'])
    doc2 = get_doc(Vocab(), ['a', 'c', 'e'])
    assert doc1[0] != doc2[0]
    assert not doc1[0] == doc2[0]
|
|
@@ -11,8 +11,8 @@ import pytest
def taggers(en_vocab):
    tagger1 = Tagger(en_vocab)
    tagger2 = Tagger(en_vocab)
    tagger1.model = tagger1.Model(None, None)
    tagger2.model = tagger2.Model(None, None)
    tagger1.model = tagger1.Model(8, 8)
    tagger2.model = tagger1.model
    return (tagger1, tagger2)


@@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
    tagger1, tagger2 = taggers
    tagger1_b = tagger1.to_bytes()
    tagger2_b = tagger2.to_bytes()
    assert tagger1_b == tagger2_b
    tagger1 = tagger1.from_bytes(tagger1_b)
    assert tagger1.to_bytes() == tagger1_b
    new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from ..util import get_doc
|
||||
from ...attrs import ORTH, LENGTH
|
||||
|
||||
import pytest
|
||||
|
||||
|
@ -89,3 +90,19 @@ def test_spans_are_hashable(en_tokenizer):
|
|||
span3 = tokens[0:2]
|
||||
assert hash(span3) == hash(span1)
|
||||
|
||||
|
||||
def test_spans_by_character(doc):
|
||||
span1 = doc[1:-2]
|
||||
span2 = doc.char_span(span1.start_char, span1.end_char, label='GPE')
|
||||
assert span1.start_char == span2.start_char
|
||||
assert span1.end_char == span2.end_char
|
||||
assert span2.label_ == 'GPE'
|
||||
|
||||
|
||||
def test_span_to_array(doc):
|
||||
span = doc[1:-2]
|
||||
arr = span.to_array([ORTH, LENGTH])
|
||||
assert arr.shape == (len(span), 2)
|
||||
assert arr[0, 0] == span[0].orth
|
||||
assert arr[0, 1] == len(span[0])
|
||||
|
||||
|
|
|
@ -79,9 +79,9 @@ def add_vecs_to_vocab(vocab, vectors):
|
|||
"""Add list of vector tuples to given vocab. All vectors need to have the
|
||||
same length. Format: [("text", [1, 2, 3])]"""
|
||||
length = len(vectors[0][1])
|
||||
vocab.resize_vectors(length)
|
||||
vocab.clear_vectors(length)
|
||||
for word, vec in vectors:
|
||||
vocab[word].vector = vec
|
||||
vocab.set_vector(word, vec)
|
||||
return vocab
|
||||
|
||||
|
||||
|
|
|
@ -14,10 +14,9 @@ def vectors():
|
|||
|
||||
@pytest.fixture()
|
||||
def vocab(en_vocab, vectors):
|
||||
#return add_vecs_to_vocab(en_vocab, vectors)
|
||||
return None
|
||||
add_vecs_to_vocab(en_vocab, vectors)
|
||||
return en_vocab
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_vectors_similarity_LL(vocab, vectors):
|
||||
[(word1, vec1), (word2, vec2)] = vectors
|
||||
lex1 = vocab[word1]
|
||||
|
@ -31,7 +30,6 @@ def test_vectors_similarity_LL(vocab, vectors):
|
|||
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_vectors_similarity_TT(vocab, vectors):
|
||||
[(word1, vec1), (word2, vec2)] = vectors
|
||||
doc = get_doc(vocab, words=[word1, word2])
|
||||
|
@ -44,21 +42,18 @@ def test_vectors_similarity_TT(vocab, vectors):
|
|||
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_vectors_similarity_TD(vocab, vectors):
|
||||
[(word1, vec1), (word2, vec2)] = vectors
|
||||
doc = get_doc(vocab, words=[word1, word2])
|
||||
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_vectors_similarity_DS(vocab, vectors):
|
||||
[(word1, vec1), (word2, vec2)] = vectors
|
||||
doc = get_doc(vocab, words=[word1, word2])
|
||||
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_vectors_similarity_TS(vocab, vectors):
|
||||
[(word1, vec1), (word2, vec2)] = vectors
|
||||
doc = get_doc(vocab, words=[word1, word2])
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from ...vectors import Vectors
|
||||
from ...tokenizer import Tokenizer
|
||||
from ..util import add_vecs_to_vocab, get_doc
|
||||
|
||||
import numpy
|
||||
import pytest
|
||||
|
@ -11,22 +13,42 @@ import pytest
|
|||
def strings():
|
||||
return ["apple", "orange"]
|
||||
|
||||
@pytest.fixture
|
||||
def vectors():
|
||||
return [
|
||||
("apple", [1, 2, 3]),
|
||||
("orange", [-1, -2, -3]),
|
||||
('and', [-1, -1, -1]),
|
||||
('juice', [5, 5, 10]),
|
||||
('pie', [7, 6.3, 8.9])]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def data():
|
||||
return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def vocab(en_vocab, vectors):
|
||||
add_vecs_to_vocab(en_vocab, vectors)
|
||||
return en_vocab
|
||||
|
||||
|
||||
def test_init_vectors_with_data(strings, data):
|
||||
v = Vectors(strings, data)
|
||||
assert v.shape == data.shape
|
||||
|
||||
def test_init_vectors_with_width(strings):
|
||||
v = Vectors(strings, 3)
|
||||
for string in strings:
|
||||
v.add(string)
|
||||
assert v.shape == (len(strings), 3)
|
||||
|
||||
|
||||
def test_get_vector(strings, data):
|
||||
v = Vectors(strings, data)
|
||||
for string in strings:
|
||||
v.add(string)
|
||||
assert list(v[strings[0]]) == list(data[0])
|
||||
assert list(v[strings[0]]) != list(data[1])
|
||||
assert list(v[strings[1]]) != list(data[0])
|
||||
|
@ -35,6 +57,8 @@ def test_get_vector(strings, data):
|
|||
def test_set_vector(strings, data):
|
||||
orig = data.copy()
|
||||
v = Vectors(strings, data)
|
||||
for string in strings:
|
||||
v.add(string)
|
||||
assert list(v[strings[0]]) == list(orig[0])
|
||||
assert list(v[strings[0]]) != list(orig[1])
|
||||
v[strings[0]] = data[1]
|
||||
|
@ -42,125 +66,111 @@ def test_set_vector(strings, data):
|
|||
assert list(v[strings[0]]) != list(orig[0])
|
||||
|
||||
|
||||
#
|
||||
#@pytest.fixture()
|
||||
#def tokenizer_v(vocab):
|
||||
# return Tokenizer(vocab, {}, None, None, None)
|
||||
#
|
||||
#
|
||||
#@pytest.mark.xfail
|
||||
#@pytest.mark.parametrize('text', ["apple and orange"])
|
||||
#def test_vectors_token_vector(tokenizer_v, vectors, text):
|
||||
# doc = tokenizer_v(text)
|
||||
# assert vectors[0] == (doc[0].text, list(doc[0].vector))
|
||||
# assert vectors[1] == (doc[2].text, list(doc[2].vector))
|
||||
#
|
||||
#
|
||||
#@pytest.mark.xfail
|
||||
#@pytest.mark.parametrize('text', ["apple", "orange"])
|
||||
#def test_vectors_lexeme_vector(vocab, text):
|
||||
# lex = vocab[text]
|
||||
# assert list(lex.vector)
|
||||
# assert lex.vector_norm
|
||||
#
|
||||
#
|
||||
#@pytest.mark.xfail
|
||||
#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
|
||||
#def test_vectors_doc_vector(vocab, text):
|
||||
# doc = get_doc(vocab, text)
|
||||
# assert list(doc.vector)
|
||||
# assert doc.vector_norm
|
||||
#
|
||||
#
|
||||
#@pytest.mark.xfail
|
||||
#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
|
||||
#def test_vectors_span_vector(vocab, text):
|
||||
# span = get_doc(vocab, text)[0:2]
|
||||
# assert list(span.vector)
|
||||
# assert span.vector_norm
|
||||
#
|
||||
#
|
||||
#@pytest.mark.xfail
|
||||
#@pytest.mark.parametrize('text', ["apple orange"])
|
||||
#def test_vectors_token_token_similarity(tokenizer_v, text):
|
||||
# doc = tokenizer_v(text)
|
||||
# assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
|
||||
# assert 0.0 < doc[0].similarity(doc[1]) < 1.0
|
||||
#
|
||||
#
|
||||
#@pytest.mark.xfail
|
||||
#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
|
||||
#def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
|
||||
# token = tokenizer_v(text1)
|
||||
# lex = vocab[text2]
|
||||
# assert token.similarity(lex) == lex.similarity(token)
|
||||
# assert 0.0 < token.similarity(lex) < 1.0
|
||||
#
|
||||
#
|
||||
#@pytest.mark.xfail
|
||||
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
#def test_vectors_token_span_similarity(vocab, text):
|
||||
# doc = get_doc(vocab, text)
|
||||
# assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
|
||||
# assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0
|
||||
#
|
||||
#
|
||||
#@pytest.mark.xfail
|
||||
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
#def test_vectors_token_doc_similarity(vocab, text):
|
||||
# doc = get_doc(vocab, text)
|
||||
# assert doc[0].similarity(doc) == doc.similarity(doc[0])
|
||||
# assert 0.0 < doc[0].similarity(doc) < 1.0
|
||||
#
|
||||
#
|
||||
#@pytest.mark.xfail
|
||||
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
#def test_vectors_lexeme_span_similarity(vocab, text):
|
||||
# doc = get_doc(vocab, text)
|
||||
# lex = vocab[text[0]]
|
||||
# assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
|
||||
# assert 0.0 < doc.similarity(doc[1:3]) < 1.0
|
||||
#
|
||||
#
|
||||
#@pytest.mark.xfail
|
||||
#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
|
||||
#def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
|
||||
# lex1 = vocab[text1]
|
||||
# lex2 = vocab[text2]
|
||||
# assert lex1.similarity(lex2) == lex2.similarity(lex1)
|
||||
# assert 0.0 < lex1.similarity(lex2) < 1.0
|
||||
#
|
||||
#
|
||||
#@pytest.mark.xfail
|
||||
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
#def test_vectors_lexeme_doc_similarity(vocab, text):
|
||||
# doc = get_doc(vocab, text)
|
||||
# lex = vocab[text[0]]
|
||||
# assert lex.similarity(doc) == doc.similarity(lex)
|
||||
# assert 0.0 < lex.similarity(doc) < 1.0
|
||||
#
|
||||
#
|
||||
#@pytest.mark.xfail
|
||||
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
#def test_vectors_span_span_similarity(vocab, text):
|
||||
# doc = get_doc(vocab, text)
|
||||
# assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
|
||||
# assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0
|
||||
#
|
||||
#
|
||||
#@pytest.mark.xfail
|
||||
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
#def test_vectors_span_doc_similarity(vocab, text):
|
||||
# doc = get_doc(vocab, text)
|
||||
# assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
|
||||
# assert 0.0 < doc[0:2].similarity(doc) < 1.0
|
||||
#
|
||||
#
|
||||
#@pytest.mark.xfail
|
||||
#@pytest.mark.parametrize('text1,text2', [
|
||||
# (["apple", "and", "apple", "pie"], ["orange", "juice"])])
|
||||
#def test_vectors_doc_doc_similarity(vocab, text1, text2):
|
||||
# doc1 = get_doc(vocab, text1)
|
||||
# doc2 = get_doc(vocab, text2)
|
||||
# assert doc1.similarity(doc2) == doc2.similarity(doc1)
|
||||
# assert 0.0 < doc1.similarity(doc2) < 1.0
|
||||
|
||||
@pytest.fixture()
|
||||
def tokenizer_v(vocab):
|
||||
return Tokenizer(vocab, {}, None, None, None)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["apple and orange"])
|
||||
def test_vectors_token_vector(tokenizer_v, vectors, text):
|
||||
doc = tokenizer_v(text)
|
||||
assert vectors[0] == (doc[0].text, list(doc[0].vector))
|
||||
assert vectors[1] == (doc[2].text, list(doc[2].vector))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["apple", "orange"])
|
||||
def test_vectors_lexeme_vector(vocab, text):
|
||||
lex = vocab[text]
|
||||
assert list(lex.vector)
|
||||
assert lex.vector_norm
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
|
||||
def test_vectors_doc_vector(vocab, text):
|
||||
doc = get_doc(vocab, text)
|
||||
assert list(doc.vector)
|
||||
assert doc.vector_norm
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
|
||||
def test_vectors_span_vector(vocab, text):
|
||||
span = get_doc(vocab, text)[0:2]
|
||||
assert list(span.vector)
|
||||
assert span.vector_norm
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["apple orange"])
|
||||
def test_vectors_token_token_similarity(tokenizer_v, text):
|
||||
doc = tokenizer_v(text)
|
||||
assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
|
||||
assert -1. < doc[0].similarity(doc[1]) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
|
||||
def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
|
||||
token = tokenizer_v(text1)
|
||||
lex = vocab[text2]
|
||||
assert token.similarity(lex) == lex.similarity(token)
|
||||
assert -1. < token.similarity(lex) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
def test_vectors_token_span_similarity(vocab, text):
|
||||
doc = get_doc(vocab, text)
|
||||
assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
|
||||
assert -1. < doc[0].similarity(doc[1:3]) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
def test_vectors_token_doc_similarity(vocab, text):
|
||||
doc = get_doc(vocab, text)
|
||||
assert doc[0].similarity(doc) == doc.similarity(doc[0])
|
||||
assert -1. < doc[0].similarity(doc) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
def test_vectors_lexeme_span_similarity(vocab, text):
|
||||
doc = get_doc(vocab, text)
|
||||
lex = vocab[text[0]]
|
||||
assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
|
||||
assert -1. < doc.similarity(doc[1:3]) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
|
||||
def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
|
||||
lex1 = vocab[text1]
|
||||
lex2 = vocab[text2]
|
||||
assert lex1.similarity(lex2) == lex2.similarity(lex1)
|
||||
assert -1. < lex1.similarity(lex2) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
def test_vectors_lexeme_doc_similarity(vocab, text):
|
||||
doc = get_doc(vocab, text)
|
||||
lex = vocab[text[0]]
|
||||
assert lex.similarity(doc) == doc.similarity(lex)
|
||||
assert -1. < lex.similarity(doc) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
def test_vectors_span_span_similarity(vocab, text):
|
||||
doc = get_doc(vocab, text)
|
||||
assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
|
||||
assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
|
||||
def test_vectors_span_doc_similarity(vocab, text):
|
||||
doc = get_doc(vocab, text)
|
||||
assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
|
||||
assert -1. < doc[0:2].similarity(doc) < 1.0
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text1,text2', [
|
||||
(["apple", "and", "apple", "pie"], ["orange", "juice"])])
|
||||
def test_vectors_doc_doc_similarity(vocab, text1, text2):
|
||||
doc1 = get_doc(vocab, text1)
|
||||
doc2 = get_doc(vocab, text2)
|
||||
assert doc1.similarity(doc2) == doc2.similarity(doc1)
|
||||
assert -1. < doc1.similarity(doc2) < 1.0
|
||||
|
|
|
@@ -238,6 +238,29 @@ cdef class Doc:
        def doc(self):
            return self

    def char_span(self, int start_idx, int end_idx, label=0, vector=None):
        """Create a `Span` object from the slice `doc.text[start : end]`.

        doc (Doc): The parent document.
        start (int): The index of the first character of the span.
        end (int): The index of the first character after the span.
        label (uint64 or string): A label to attach to the Span, e.g. for named entities.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
        RETURNS (Span): The newly constructed object.
        """
        if not isinstance(label, int):
            label = self.vocab.strings.add(label)
        cdef int start = token_by_start(self.c, self.length, start_idx)
        if start == -1:
            return None
        cdef int end = token_by_end(self.c, self.length, end_idx)
        if end == -1:
            return None
        # Currently we have the token index, we want the range-end index
        end += 1
        cdef Span span = Span(self, start, end, label=label, vector=vector)
        return span
|
||||
|
||||
def similarity(self, other):
|
||||
"""Make a semantic similarity estimate. The default estimate is cosine
|
||||
similarity using an average of word vectors.
|
||||
|
|
|
@ -15,5 +15,5 @@ cdef class Span:
|
|||
cdef public _vector
|
||||
cdef public _vector_norm
|
||||
|
||||
|
||||
cpdef int _recalculate_indices(self) except -1
|
||||
cpdef np.ndarray to_array(self, object features)
|
||||
|
|
|
@ -7,7 +7,7 @@ import numpy
|
|||
import numpy.linalg
|
||||
from libc.math cimport sqrt
|
||||
|
||||
from .doc cimport token_by_start, token_by_end
|
||||
from .doc cimport token_by_start, token_by_end, get_token_attr
|
||||
from ..structs cimport TokenC, LexemeC
|
||||
from ..typedefs cimport flags_t, attr_t, hash_t
|
||||
from ..attrs cimport attr_id_t
|
||||
|
@ -135,6 +135,29 @@ cdef class Span:
|
|||
return 0.0
|
||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
|
||||
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||
"""Given a list of M attribute IDs, export the tokens to a numpy
|
||||
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
|
||||
The values will be 32-bit integers.
|
||||
|
||||
attr_ids (list[int]): A list of attribute ID ints.
|
||||
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
|
||||
per word, and one column per attribute indicated in the input
|
||||
`attr_ids`.
|
||||
"""
|
||||
cdef int i, j
|
||||
cdef attr_id_t feature
|
||||
cdef np.ndarray[attr_t, ndim=2] output
|
||||
# Make an array from the attributes --- otherwise our inner loop is Python
|
||||
# dict iteration.
|
||||
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
|
||||
cdef int length = self.end - self.start
|
||||
output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64)
|
||||
for i in range(self.start, self.end):
|
||||
for j, feature in enumerate(attr_ids):
|
||||
output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
|
||||
return output
|
||||
|
||||
cpdef int _recalculate_indices(self) except -1:
|
||||
if self.end > self.doc.length \
|
||||
or self.doc.c[self.start].idx != self.start_char \
|
||||
|
|
|
@@ -62,18 +62,26 @@ cdef class Token:

    def __richcmp__(self, Token other, int op):
        # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
        cdef Doc my_doc = self.doc
        cdef Doc other_doc = other.doc
        my = self.idx
        their = other.idx if other is not None else None
        if op == 0:
            return my < their
        elif op == 2:
            return my == their
            if my_doc is other_doc:
                return my == their
            else:
                return False
        elif op == 4:
            return my > their
        elif op == 1:
            return my <= their
        elif op == 3:
            return my != their
            if my_doc is other_doc:
                return my != their
            else:
                return True
        elif op == 5:
            return my >= their
        else:
|
||||
|
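A hedged usage sketch of the new comparison semantics (assumes a loaded `nlp` pipeline; it mirrors the regression test added elsewhere in this commit rather than adding new behaviour):

```python
doc_a = nlp(u'green eggs')
doc_b = nlp(u'green ham')
assert doc_a[0] < doc_a[1]      # within one Doc, tokens order by character offset
assert doc_a[0] != doc_b[0]     # same text, different Doc: never equal
```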
|
|
@ -22,7 +22,7 @@ import ujson
|
|||
|
||||
from .symbols import ORTH
|
||||
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
|
||||
from .compat import copy_array, normalize_string_keys, getattr_
|
||||
from .compat import copy_array, normalize_string_keys, getattr_, import_file
|
||||
|
||||
|
||||
LANGUAGES = {}
|
||||
|
@@ -112,15 +112,13 @@ def load_model(name, **overrides):

def load_model_from_link(name, **overrides):
    """Load a model from a shortcut link, or directory in spaCy data path."""
    init_file = get_data_path() / name / '__init__.py'
    spec = importlib.util.spec_from_file_location(name, str(init_file))
    path = get_data_path() / name / '__init__.py'
    try:
        cls = importlib.util.module_from_spec(spec)
        cls = import_file(name, path)
    except AttributeError:
        raise IOError(
            "Can't load '%s'. If you're using a shortcut link, make sure it "
            "points to a valid model package (not just a data directory)." % name)
    spec.loader.exec_module(cls)
    return cls.load(**overrides)
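The removed lines hint at what the new `compat.import_file` helper wraps. A rough sketch of such a helper, assuming it mirrors the importlib calls deleted here (the real implementation lives in `spacy/compat.py` and may differ, e.g. for Python 2 support):

```python
import importlib.util

def import_file(name, loc):
    # Assumed shape of the helper: load a module from an explicit file path.
    spec = importlib.util.spec_from_file_location(name, str(loc))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
```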
|
||||
|
||||
|
||||
|
|
|
@ -1,18 +1,25 @@
|
|||
from __future__ import unicode_literals
|
||||
from libc.stdint cimport int32_t, uint64_t
|
||||
import numpy
|
||||
from collections import OrderedDict
|
||||
import msgpack
|
||||
import msgpack_numpy
|
||||
msgpack_numpy.patch()
|
||||
cimport numpy as np
|
||||
|
||||
from .typedefs cimport attr_t
|
||||
from .strings cimport StringStore
|
||||
from . import util
|
||||
from .compat import basestring_
|
||||
|
||||
|
||||
cdef class Vectors:
|
||||
'''Store, save and load word vectors.'''
|
||||
cdef public object data
|
||||
cdef readonly StringStore strings
|
||||
cdef public object key2i
|
||||
cdef public object key2row
|
||||
cdef public object keys
|
||||
cdef public int i
|
||||
|
||||
def __init__(self, strings, data_or_width):
|
||||
self.strings = StringStore()
|
||||
|
@ -21,10 +28,10 @@ cdef class Vectors:
|
|||
dtype='f')
|
||||
else:
|
||||
data = data_or_width
|
||||
self.i = 0
|
||||
self.data = data
|
||||
self.key2i = {}
|
||||
for i, string in enumerate(strings):
|
||||
self.key2i[self.strings.add(string)] = i
|
||||
self.key2row = {}
|
||||
self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')
|
||||
|
||||
def __reduce__(self):
|
||||
return (Vectors, (self.strings, self.data))
|
||||
|
@ -32,7 +39,7 @@ cdef class Vectors:
|
|||
def __getitem__(self, key):
|
||||
if isinstance(key, basestring):
|
||||
key = self.strings[key]
|
||||
i = self.key2i[key]
|
||||
i = self.key2row[key]
|
||||
if i is None:
|
||||
raise KeyError(key)
|
||||
else:
|
||||
|
@ -41,14 +48,36 @@ cdef class Vectors:
|
|||
def __setitem__(self, key, vector):
|
||||
if isinstance(key, basestring):
|
||||
key = self.strings.add(key)
|
||||
i = self.key2i[key]
|
||||
i = self.key2row[key]
|
||||
self.data[i] = vector
|
||||
|
||||
def __iter__(self):
|
||||
yield from self.data
|
||||
|
||||
def __len__(self):
|
||||
return len(self.strings)
|
||||
return self.i
|
||||
|
||||
def __contains__(self, key):
|
||||
if isinstance(key, basestring_):
|
||||
key = self.strings[key]
|
||||
return key in self.key2row
|
||||
|
||||
def add(self, key, vector=None):
|
||||
if isinstance(key, basestring_):
|
||||
key = self.strings.add(key)
|
||||
if key not in self.key2row:
|
||||
i = self.i
|
||||
if i >= self.keys.shape[0]:
|
||||
self.keys.resize((self.keys.shape[0]*2,))
|
||||
self.data.resize((self.data.shape[0]*2, self.data.shape[1]))
|
||||
self.key2row[key] = self.i
|
||||
self.keys[self.i] = key
|
||||
self.i += 1
|
||||
else:
|
||||
i = self.key2row[key]
|
||||
if vector is not None:
|
||||
self.data[i] = vector
|
||||
return i
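Editor's note (not part of the patch): the `add()` logic above is an amortized-doubling table keyed by string hash. A minimal standalone sketch of the same idea, simplified to plain integer keys instead of `StringStore` hashes:

```python
import numpy

class TinyVectorTable(object):
    """Simplified model of Vectors.add(): rows double when the table is full."""
    def __init__(self, width):
        self.data = numpy.zeros((8, width), dtype='f')
        self.keys = numpy.zeros((8,), dtype='uint64')
        self.key2row = {}
        self.i = 0

    def add(self, key, vector=None):
        if key not in self.key2row:
            if self.i >= self.keys.shape[0]:
                self.keys.resize((self.keys.shape[0] * 2,), refcheck=False)
                self.data.resize((self.data.shape[0] * 2, self.data.shape[1]),
                                 refcheck=False)
            self.key2row[key] = self.i
            self.keys[self.i] = key
            self.i += 1
        row = self.key2row[key]
        if vector is not None:
            self.data[row] = vector
        return row

table = TinyVectorTable(3)
for key in range(20):                     # forces two doublings: 8 -> 16 -> 32
    table.add(key, numpy.ones(3, dtype='f') * key)
assert table.data.shape[0] == 32 and table.key2row[19] == 19
```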
|
||||
|
||||
def items(self):
|
||||
for i, string in enumerate(self.strings):
|
||||
|
@ -61,34 +90,60 @@ cdef class Vectors:
|
|||
def most_similar(self, key):
|
||||
raise NotImplementedError
|
||||
|
||||
def to_disk(self, path):
|
||||
raise NotImplementedError
|
||||
def to_disk(self, path, **exclude):
|
||||
serializers = OrderedDict((
|
||||
('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)),
|
||||
('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)),
|
||||
))
|
||||
return util.to_disk(path, serializers, exclude)
|
||||
|
||||
def from_disk(self, path):
|
||||
raise NotImplementedError
|
||||
def from_disk(self, path, **exclude):
|
||||
def load_keys(path):
|
||||
if path.exists():
|
||||
self.keys = numpy.load(path)
|
||||
for i, key in enumerate(self.keys):
|
||||
self.keys[i] = key
|
||||
self.key2row[key] = i
|
||||
|
||||
def load_vectors(path):
|
||||
if path.exists():
|
||||
self.data = numpy.load(path)
|
||||
|
||||
serializers = OrderedDict((
|
||||
('keys', load_keys),
|
||||
('vectors', load_vectors),
|
||||
))
|
||||
util.from_disk(path, serializers, exclude)
|
||||
return self
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
def serialize_weights():
|
||||
if hasattr(self.weights, 'to_bytes'):
|
||||
return self.weights.to_bytes()
|
||||
if hasattr(self.data, 'to_bytes'):
|
||||
return self.data.to_bytes()
|
||||
else:
|
||||
return msgpack.dumps(self.weights)
|
||||
|
||||
return msgpack.dumps(self.data)
|
||||
serializers = OrderedDict((
|
||||
('strings', lambda: self.strings.to_bytes()),
|
||||
('weights', serialize_weights)
|
||||
('keys', lambda: msgpack.dumps(self.keys)),
|
||||
('vectors', serialize_weights)
|
||||
))
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def from_bytes(self, data, **exclude):
|
||||
def deserialize_weights(b):
|
||||
if hasattr(self.weights, 'from_bytes'):
|
||||
self.weights.from_bytes()
|
||||
if hasattr(self.data, 'from_bytes'):
|
||||
self.data.from_bytes()
|
||||
else:
|
||||
self.weights = msgpack.loads(b)
|
||||
self.data = msgpack.loads(b)
|
||||
|
||||
def load_keys(keys):
|
||||
self.keys.resize((len(keys),))
|
||||
for i, key in enumerate(keys):
|
||||
self.keys[i] = key
|
||||
self.key2row[key] = i
|
||||
|
||||
deserializers = OrderedDict((
|
||||
('strings', lambda b: self.strings.from_bytes(b)),
|
||||
('weights', deserialize_weights)
|
||||
('keys', lambda b: load_keys(msgpack.loads(b))),
|
||||
('vectors', deserialize_weights)
|
||||
))
|
||||
return util.from_bytes(deserializers, exclude)
|
||||
util.from_bytes(data, deserializers, exclude)
|
||||
return self
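A hedged usage sketch of the serialization round trip defined above, using the constructor and `add()` shown earlier in this diff. The API details here are read off the patch (an alpha-stage interface) and may not run as-is against any released spaCy version:

```python
import numpy
from spacy.vectors import Vectors

# Build a tiny table: 3-dimensional vectors keyed by strings.
strings = ['apple', 'orange']
data = numpy.asarray([[1., 2., 3.], [-1., -2., -3.]], dtype='f')
v = Vectors(strings, data)
for word in strings:
    v.add(word)

# Round-trip through bytes: 'keys' and 'vectors' are msgpack-packed,
# 'strings' is delegated to the StringStore.
blob = v.to_bytes()
v2 = Vectors(strings, data.shape[1]).from_bytes(blob)
assert list(v2['apple']) == [1., 2., 3.]
```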
|
||||
|
|
|
@ -19,9 +19,10 @@ from .tokens.token cimport Token
|
|||
from .attrs cimport PROB, LANG
|
||||
from .structs cimport SerializedLexemeC
|
||||
|
||||
from .compat import copy_reg, pickle
|
||||
from .compat import copy_reg, pickle, basestring_
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .attrs import intify_attrs
|
||||
from .vectors import Vectors
|
||||
from . import util
|
||||
from . import attrs
|
||||
from . import symbols
|
||||
|
@ -63,6 +64,7 @@ cdef class Vocab:
|
|||
self.strings.add(name)
|
||||
self.lex_attr_getters = lex_attr_getters
|
||||
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
|
||||
self.vectors = Vectors(self.strings, 300)
|
||||
|
||||
property lang:
|
||||
def __get__(self):
|
||||
|
@@ -242,13 +244,15 @@ cdef class Vocab:

    @property
    def vectors_length(self):
        raise NotImplementedError
        return len(self.vectors)

    def clear_vectors(self):
    def clear_vectors(self, new_dim=None):
        """Drop the current vector table. Because all vectors must be the same
        width, you have to call this to change the size of the vectors.
        """
        raise NotImplementedError
        if new_dim is None:
            new_dim = self.vectors.data.shape[1]
        self.vectors = Vectors(self.strings, new_dim)

    def get_vector(self, orth):
        """Retrieve a vector for a word in the vocabulary.
@@ -262,7 +266,9 @@ cdef class Vocab:
        RAISES: If no vectors data is loaded, ValueError is raised.
        """
        raise NotImplementedError
        if isinstance(orth, basestring_):
            orth = self.strings.add(orth)
        return self.vectors[orth]

    def set_vector(self, orth, vector):
        """Set a vector for a word in the vocabulary.
@@ -272,15 +278,19 @@ cdef class Vocab:
        RETURNS:
            None
        """
        raise NotImplementedError
        if not isinstance(orth, basestring_):
            orth = self.strings[orth]
        self.vectors.add(orth, vector=vector)

    def has_vector(self, orth):
        """Check whether a word has a vector. Returns False if no
        vectors have been loaded. Words can be looked up by string
        or int ID."""
        return False
        if isinstance(orth, basestring_):
            orth = self.strings.add(orth)
        return orth in self.vectors
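For orientation, a short sketch of how the new `Vocab` vector API composes. It assumes a bare `Vocab` instance and mirrors the methods in this hunk rather than documenting a released API:

```python
import numpy
from spacy.vocab import Vocab

vocab = Vocab()
vocab.clear_vectors(new_dim=3)                      # start a fresh 3-d vector table
vocab.set_vector(u'apple', numpy.asarray([1., 2., 3.], dtype='f'))

assert vocab.has_vector(u'apple')
assert not vocab.has_vector(u'pear')
assert list(vocab.get_vector(u'apple')) == [1., 2., 3.]
```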
|
||||
|
||||
def to_disk(self, path):
|
||||
def to_disk(self, path, **exclude):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
|
@ -292,8 +302,10 @@ cdef class Vocab:
|
|||
self.strings.to_disk(path / 'strings.json')
|
||||
with (path / 'lexemes.bin').open('wb') as file_:
|
||||
file_.write(self.lexemes_to_bytes())
|
||||
if self.vectors is not None:
|
||||
self.vectors.to_disk(path)
|
||||
|
||||
def from_disk(self, path):
|
||||
def from_disk(self, path, **exclude):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
|
@ -305,6 +317,8 @@ cdef class Vocab:
|
|||
self.strings.from_disk(path / 'strings.json')
|
||||
with (path / 'lexemes.bin').open('rb') as file_:
|
||||
self.lexemes_from_bytes(file_.read())
|
||||
if self.vectors is not None:
|
||||
self.vectors.from_disk(path, exclude='strings.json')
|
||||
return self
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
|
@ -313,9 +327,16 @@ cdef class Vocab:
|
|||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `Vocab` object.
|
||||
"""
|
||||
def deserialize_vectors():
|
||||
if self.vectors is None:
|
||||
return None
|
||||
else:
|
||||
return self.vectors.to_bytes(exclude='strings.json')
|
||||
|
||||
getters = OrderedDict((
|
||||
('strings', lambda: self.strings.to_bytes()),
|
||||
('lexemes', lambda: self.lexemes_to_bytes()),
|
||||
('vectors', deserialize_vectors)
|
||||
))
|
||||
return util.to_bytes(getters, exclude)
|
||||
|
||||
|
@ -326,9 +347,15 @@ cdef class Vocab:
|
|||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (Vocab): The `Vocab` object.
|
||||
"""
|
||||
def serialize_vectors(b):
|
||||
if self.vectors is None:
|
||||
return None
|
||||
else:
|
||||
return self.vectors.from_bytes(b, exclude='strings')
|
||||
setters = OrderedDict((
|
||||
('strings', lambda b: self.strings.from_bytes(b)),
|
||||
('lexemes', lambda b: self.lexemes_from_bytes(b)),
|
||||
('vectors', lambda b: serialize_vectors(b))
|
||||
))
|
||||
util.from_bytes(bytes_data, setters, exclude)
|
||||
return self
|
||||
|
|
|
@ -112,6 +112,10 @@
|
|||
.u-nowrap
|
||||
white-space: nowrap
|
||||
|
||||
.u-break.u-break
|
||||
word-wrap: break-word
|
||||
white-space: initial
|
||||
|
||||
.u-no-border
|
||||
border: none
|
||||
|
||||
|
|
|
@ -140,6 +140,43 @@ p Get the number of tokens in the document.
|
|||
+cell int
|
||||
+cell The number of tokens in the document.
|
||||
|
||||
+h(2, "char_span") Doc.char_span
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Create a #[code Span] object from the slice #[code doc.text[start : end]].
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York')
|
||||
span = doc.char_span(7, 15, label=u'GPE')
|
||||
assert span.text == 'New York'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start]
|
||||
+cell int
|
||||
+cell The index of the first character of the span.
|
||||
|
||||
+row
|
||||
+cell #[code end]
|
||||
+cell int
|
||||
+cell The index of the first character after the span.
|
||||
|
||||
+row
|
||||
+cell #[code label]
|
||||
+cell uint64 / unicode
|
||||
+cell A label to attach to the Span, e.g. for named entities.
|
||||
|
||||
+row
|
||||
+cell #[code vector]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A meaning representation of the span.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Span]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "similarity") Doc.similarity
|
||||
+tag method
|
||||
+tag-model("vectors")
|
||||
|
@ -211,12 +248,12 @@ p
|
|||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_ids]
|
||||
+cell ints
|
||||
+cell list
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell
|
||||
| The exported attributes as a 2D numpy array, with one row per
|
||||
| token and one column per attribute.
|
||||
|
@ -245,7 +282,7 @@ p
|
|||
|
||||
+row
|
||||
+cell #[code array]
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell The attribute values to load.
|
||||
|
||||
+footrow
|
||||
|
@ -509,7 +546,7 @@ p
|
|||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the document's semantics.
|
||||
|
||||
+h(2, "vector_norm") Doc.vector_norm
|
||||
|
|
|
@ -111,6 +111,14 @@ p
|
|||
+cell -
|
||||
+cell A sequence of unicode objects.
|
||||
|
||||
+row
|
||||
+cell #[code as_tuples]
|
||||
+cell bool
|
||||
+cell
|
||||
| If set to #[code True], inputs should be a sequence of
|
||||
| #[code (text, context)] tuples. Output will then be a sequence of
|
||||
| #[code (doc, context)] tuples. Defaults to #[code False].
|
||||
|
||||
+row
|
||||
+cell #[code n_threads]
|
||||
+cell int
|
||||
|
|
|
@ -129,7 +129,7 @@ p A real-valued meaning representation.
|
|||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the lexeme's semantics.
|
||||
|
||||
+h(2, "vector_norm") Lexeme.vector_norm
|
||||
|
|
|
@ -37,7 +37,7 @@ p Create a Span object from the #[code slice doc[start : end]].
|
|||
|
||||
+row
|
||||
+cell #[code vector]
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A meaning representation of the span.
|
||||
|
||||
+footrow
|
||||
|
@ -145,11 +145,47 @@ p
|
|||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "to_array") Span.to_array
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Given a list of #[code M] attribute IDs, export the tokens to a numpy
|
||||
| #[code ndarray] of shape #[code (N, M)], where #[code N] is the length of
|
||||
| the document. The values will be 32-bit integers.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
span = doc[2:3]
|
||||
# All strings mapped to integers, for easy export to numpy
|
||||
np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_ids]
|
||||
+cell list
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code.u-break numpy.ndarray[long, ndim=2]]
|
||||
+cell
|
||||
| A feature matrix, with one row per word, and one column per
|
||||
| attribute indicated in the input #[code attr_ids].
|
||||
|
||||
+h(2, "merge") Span.merge
|
||||
+tag method
|
||||
|
||||
p Retokenize the document, such that the span is merged into a single token.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
span = doc[2:3]
|
||||
span.merge()
|
||||
assert len(doc) == 6
|
||||
assert doc[2].text == 'New York'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **attributes]
|
||||
|
@ -270,7 +306,7 @@ p
|
|||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the span's semantics.
|
||||
|
||||
+h(2, "vector_norm") Span.vector_norm
|
||||
|
|
|
@ -250,7 +250,7 @@ p A real-valued meaning representation.
|
|||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the token's semantics.
|
||||
|
||||
+h(2, "vector_norm") Span.vector_norm
|
||||
|
|