Merge remote-tracking branch 'origin/develop' into develop-irish

Commit 1ee75ae337 by Jim O'Regan, 2017-09-11 08:40:11 +01:00
108 changed files with 43967 additions and 458 deletions

.gitignore (vendored, 1 line changed)

@ -40,7 +40,6 @@ venv/
# Distribution / packaging
env/
bin/
build/
develop-eggs/
dist/


@ -14,8 +14,7 @@ os:
env:
- VIA=compile LC_ALL=en_US.ascii
- VIA=compile
# - VIA=sdist
#- VIA=pypi_nightly
install:
- "./travis.sh"
@ -23,7 +22,7 @@ install:
script:
- "pip install pytest pytest-timeout"
- if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
- if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi
- if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
- if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
notifications:


@ -1,3 +1,4 @@
recursive-include include *.h
include LICENSE
include README.rst
include bin/spacy


@ -229,7 +229,7 @@ Compile from source
The other way to install spaCy is to clone its
`GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
source. That is the common way if you want to make changes to the code base.
You'll need to make sure that you have a development enviroment consisting of a
You'll need to make sure that you have a development environment consisting of a
Python distribution including header files, a compiler,
`pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.

bin/spacy (new file, 1 line)

@ -0,0 +1 @@
python -m spacy "$@"


@ -28,7 +28,9 @@ MOD_NAMES = [
'spacy.pipeline',
'spacy.syntax.stateclass',
'spacy.syntax._state',
'spacy.syntax._beam_utils',
'spacy.tokenizer',
'spacy._cfile',
'spacy.syntax.parser',
'spacy.syntax.nn_parser',
'spacy.syntax.beam_parser',
@ -187,6 +189,7 @@ def setup_package():
url=about['__uri__'],
license=about['__license__'],
ext_modules=ext_modules,
scripts=['bin/spacy'],
install_requires=[
'numpy>=1.7',
'murmurhash>=0.28,<0.29',


@ -3,15 +3,23 @@ from __future__ import print_function
# NB! This breaks in plac on Python 2!!
#from __future__ import unicode_literals
if __name__ == '__main__':
import plac
import sys
from spacy.cli import download, link, info, package, train, convert
from spacy.cli import download, link, info, package, train, convert, model
from spacy.cli import profile
from spacy.util import prints
commands = {'download': download, 'link': link, 'info': info, 'train': train,
'convert': convert, 'package': package}
commands = {
'download': download,
'link': link,
'info': info,
'train': train,
'convert': convert,
'package': package,
'model': model,
'profile': profile,
}
if len(sys.argv) == 1:
prints(', '.join(commands), title="Available commands", exits=1)
command = sys.argv.pop(1)
@ -19,5 +27,7 @@ if __name__ == '__main__':
if command in commands:
plac.call(commands[command])
else:
prints("Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command, exits=1)
prints(
"Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command,
exits=1)
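The dispatcher above pops the subcommand name from sys.argv and hands the matching function to plac, which maps the remaining command-line arguments onto that function's signature. A minimal standalone sketch of the same pattern (not spaCy's actual code; it only assumes plac is installed):

    import sys
    import plac

    def hello(name):
        """Print a greeting."""
        print("Hello, %s" % name)

    def bye(name):
        """Print a farewell."""
        print("Bye, %s" % name)

    commands = {'hello': hello, 'bye': bye}

    if __name__ == '__main__':
        if len(sys.argv) == 1:
            print("Available commands: %s" % ', '.join(commands))
            sys.exit(1)
        command = sys.argv.pop(1)
        if command in commands:
            # plac.call inspects the function's signature and maps the
            # remaining sys.argv entries onto its parameters.
            plac.call(commands[command])
        else:
            print("Unknown command: %s" % command)
            sys.exit(1)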

spacy/_cfile.pxd (new file, 26 lines)

@ -0,0 +1,26 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool
cdef class CFile:
cdef FILE* fp
cdef bint is_open
cdef Pool mem
cdef int size # For compatibility with subclass
cdef int _capacity # For compatibility with subclass
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
cdef class StringCFile(CFile):
cdef unsigned char* data
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *

spacy/_cfile.pyx (new file, 88 lines)

@ -0,0 +1,88 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memcpy
cdef class CFile:
def __init__(self, loc, mode, on_open_error=None):
if isinstance(mode, unicode):
mode_str = mode.encode('ascii')
else:
mode_str = mode
if hasattr(loc, 'as_posix'):
loc = loc.as_posix()
self.mem = Pool()
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self.fp = fopen(<char*>bytes_loc, mode_str)
if self.fp == NULL:
if on_open_error is not None:
on_open_error()
else:
raise IOError("Could not open binary file %s" % bytes_loc)
self.is_open = True
def __dealloc__(self):
if self.is_open:
fclose(self.fp)
def close(self):
fclose(self.fp)
self.is_open = False
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
st = fread(dest, elem_size, number, self.fp)
if st != number:
raise IOError
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
st = fwrite(src, elem_size, number, self.fp)
if st != number:
raise IOError
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
cdef void* dest = mem.alloc(number, elem_size)
self.read_into(dest, number, elem_size)
return dest
def write_unicode(self, unicode value):
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)
cdef class StringCFile:
def __init__(self, mode, bytes data=b'', on_open_error=None):
self.mem = Pool()
self.is_open = 'w' in mode
self._capacity = max(len(data), 8)
self.size = len(data)
self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
for i in range(len(data)):
self.data[i] = data[i]
def close(self):
self.is_open = False
def string_data(self):
return (self.data-self.size)[:self.size]
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
memcpy(dest, self.data, elem_size * number)
self.data += elem_size * number
cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
write_size = number * elem_size
if (self.size + write_size) >= self._capacity:
self._capacity = (self.size + write_size) * 2
self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
memcpy(&self.data[self.size], src, elem_size * number)
self.size += write_size
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
cdef void* dest = mem.alloc(number, elem_size)
self.read_into(dest, number, elem_size)
return dest
def write_unicode(self, unicode value):
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)
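StringCFile.write_from grows its backing buffer by doubling whenever a write would overflow _capacity, which keeps repeated appends amortised O(1). A plain-Python sketch of the same growth strategy (illustrative only, not part of spaCy):

    class GrowableBuffer(object):
        """A byte buffer that doubles its capacity when a write would overflow."""
        def __init__(self):
            self._capacity = 8
            self._data = bytearray(self._capacity)
            self.size = 0

        def write(self, payload):
            needed = self.size + len(payload)
            if needed >= self._capacity:
                # Grow to twice the required size, mirroring write_from above.
                self._capacity = needed * 2
                new_data = bytearray(self._capacity)
                new_data[:self.size] = self._data[:self.size]
                self._data = new_data
            self._data[self.size:needed] = payload
            self.size = needed

        def getvalue(self):
            return bytes(self._data[:self.size])

    buf = GrowableBuffer()
    buf.write(b'hello ')
    buf.write(b'world')
    assert buf.getvalue() == b'hello world'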


@ -9,7 +9,7 @@ import cytoolz
from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.static_vectors import StaticVectors
from thinc.neural._classes.batchnorm import BatchNorm
from thinc.neural._classes.batchnorm import BatchNorm as BN
from thinc.neural._classes.layernorm import LayerNorm as LN
from thinc.neural._classes.resnet import Residual
from thinc.neural import ReLu
@ -23,8 +23,10 @@ from thinc.neural._classes.attention import ParametricAttention
from thinc.linear.linear import LinearModel
from thinc.api import uniqued, wrap, flatten_add_lengths
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
from .tokens.doc import Doc
from . import util
import numpy
import io
@ -208,25 +210,39 @@ class PrecomputableMaxouts(Model):
return Yfp, backward
def drop_layer(layer, factor=2.):
def drop_layer_fwd(X, drop=0.):
if drop <= 0.:
return layer.begin_update(X, drop=drop)
else:
coinflip = layer.ops.xp.random.random()
if (coinflip / factor) >= drop:
return layer.begin_update(X, drop=drop)
else:
return X, lambda dX, sgd=None: dX
model = wrap(drop_layer_fwd, layer)
model.predict = layer
return model
def Tok2Vec(width, embed_size, preprocess=None):
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
embed = (norm | prefix | suffix | shape )
embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
tok2vec = (
with_flatten(
asarray(Model.ops, dtype='uint64')
>> uniqued(embed, column=5)
>> LN(Maxout(width, width*4, pieces=3))
>> Residual(ExtractWindow(nW=1) >> SELU(width, width*3))
>> Residual(ExtractWindow(nW=1) >> SELU(width, width*3))
>> Residual(ExtractWindow(nW=1) >> SELU(width, width*3))
>> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)),
pad=4)
>> Residual(
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
) ** 4, pad=4
)
)
if preprocess not in (False, None):
tok2vec = preprocess >> tok2vec
@ -321,6 +337,7 @@ def zero_init(model):
def doc2feats(cols=None):
if cols is None:
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
def forward(docs, drop=0.):
feats = []
@ -353,20 +370,37 @@ def fine_tune(embedding, combine=None):
"fine_tune currently only supports addition. Set combine=None")
def fine_tune_fwd(docs_tokvecs, drop=0.):
docs, tokvecs = docs_tokvecs
lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
flat_tokvecs = embedding.ops.flatten(tokvecs)
flat_vecs = embedding.ops.flatten(vecs)
output = embedding.ops.unflatten(
embedding.ops.flatten(tokvecs)
+ embedding.ops.flatten(vecs),
lengths)
(model.mix[0] * flat_tokvecs + model.mix[1] * flat_vecs), lengths)
def fine_tune_bwd(d_output, sgd=None):
bp_vecs(d_output, sgd=sgd)
return d_output
flat_grad = model.ops.flatten(d_output)
model.d_mix[0] += flat_tokvecs.dot(flat_grad.T).sum()
model.d_mix[1] += flat_vecs.dot(flat_grad.T).sum()
bp_vecs([d_o * model.mix[1] for d_o in d_output], sgd=sgd)
if sgd is not None:
sgd(model._mem.weights, model._mem.gradient, key=model.id)
return [d_o * model.mix[0] for d_o in d_output]
return output, fine_tune_bwd
def fine_tune_predict(docs_tokvecs):
docs, tokvecs = docs_tokvecs
vecs = embedding(docs)
return [model.mix[0]*tv+model.mix[1]*v
for tv, v in zip(tokvecs, vecs)]
model = wrap(fine_tune_fwd, embedding)
model.mix = model._mem.add((model.id, 'mix'), (2,))
model.mix.fill(0.5)
model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
model.predict = fine_tune_predict
return model
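fine_tune mixes the shared tensor (tokvecs) with the output of a private embedding, weighted by two learned scalars that start at 0.5 each; during backprop each stream receives the output gradient scaled by its own weight. A NumPy sketch of the forward mixing and the gradients it implies (the shapes and width are assumptions, not spaCy's API):

    import numpy as np

    mix = np.array([0.5, 0.5])            # learned scalars, initialised as in fine_tune
    tokvecs = np.random.rand(7, 128)      # shared tensor for a 7-token doc, assumed width 128
    vecs = np.random.rand(7, 128)         # output of the private embedding

    output = mix[0] * tokvecs + mix[1] * vecs     # forward pass, as in fine_tune_fwd

    d_output = np.random.rand(7, 128)     # upstream gradient
    d_tokvecs = d_output * mix[0]         # gradient routed back to the shared tensor
    d_vecs = d_output * mix[1]            # gradient routed into the private embedding
    d_mix = np.array([(tokvecs * d_output).sum(),   # how much the loss wants more of tokvecs
                      (vecs * d_output).sum()])     # how much it wants more of the private vecs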
@ -422,9 +456,10 @@ def getitem(i):
return layerize(getitem_fwd)
def build_tagger_model(nr_class, token_vector_width, **cfg):
embed_size = util.env_opt('embed_size', 7500)
with Model.define_operators({'>>': chain, '+': add}):
# Input: (doc, tensor) tuples
private_tok2vec = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())
private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())
model = (
fine_tune(private_tok2vec)
@ -437,30 +472,103 @@ def build_tagger_model(nr_class, token_vector_width, **cfg):
return model
@layerize
def SpacyVectors(docs, drop=0.):
xp = get_array_module(docs[0].vocab.vectors.data)
width = docs[0].vocab.vectors.data.shape[1]
batch = []
for doc in docs:
indices = numpy.zeros((len(doc),), dtype='i')
for i, word in enumerate(doc):
if word.orth in doc.vocab.vectors.key2row:
indices[i] = doc.vocab.vectors.key2row[word.orth]
else:
indices[i] = 0
vectors = doc.vocab.vectors.data[indices]
batch.append(vectors)
return batch, None
def foreach(layer, drop_factor=1.0):
'''Map a layer across elements in a list'''
def foreach_fwd(Xs, drop=0.):
drop *= drop_factor
ys = []
backprops = []
for X in Xs:
y, bp_y = layer.begin_update(X, drop=drop)
ys.append(y)
backprops.append(bp_y)
def foreach_bwd(d_ys, sgd=None):
d_Xs = []
for d_y, bp_y in zip(d_ys, backprops):
if bp_y is not None:
d_Xs.append(bp_y(d_y, sgd=sgd))
else:
d_Xs.append(None)
return d_Xs
return ys, foreach_bwd
model = wrap(foreach_fwd, layer)
return model
def build_text_classifier(nr_class, width=64, **cfg):
nr_vector = cfg.get('nr_vector', 200)
with Model.define_operators({'>>': chain, '+': add, '|': concatenate, '**': clone}):
embed_lower = HashEmbed(width, nr_vector, column=1)
embed_prefix = HashEmbed(width//2, nr_vector, column=2)
embed_suffix = HashEmbed(width//2, nr_vector, column=3)
embed_shape = HashEmbed(width//2, nr_vector, column=4)
nr_vector = cfg.get('nr_vector', 5000)
with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
'**': clone}):
if cfg.get('low_data'):
model = (
SpacyVectors
>> flatten_add_lengths
>> with_getitem(0,
Affine(width, 300)
)
>> ParametricAttention(width)
>> Pooling(sum_pool)
>> Residual(ReLu(width, width)) ** 2
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
>> logistic
)
return model
lower = HashEmbed(width, nr_vector, column=1)
prefix = HashEmbed(width//2, nr_vector, column=2)
suffix = HashEmbed(width//2, nr_vector, column=3)
shape = HashEmbed(width//2, nr_vector, column=4)
trained_vectors = (
FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
>> with_flatten(
uniqued(
(lower | prefix | suffix | shape)
>> LN(Maxout(width, width+(width//2)*3)),
column=0
)
)
)
static_vectors = (
SpacyVectors
>> with_flatten(Affine(width, 300))
)
cnn_model = (
FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE])
>> _flatten_add_lengths
>> with_getitem(0,
uniqued(
(embed_lower | embed_prefix | embed_suffix | embed_shape)
>> Maxout(width, width+(width//2)*3))
>> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
>> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
>> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
# TODO Make concatenate support lists
concatenate_lists(trained_vectors, static_vectors)
>> with_flatten(
LN(Maxout(width, width*2))
>> Residual(
(ExtractWindow(nW=1) >> zero_init(Maxout(width, width*3)))
) ** 2, pad=2
)
>> ParametricAttention(width,)
>> flatten_add_lengths
>> ParametricAttention(width)
>> Pooling(sum_pool)
>> ReLu(width, width)
>> Residual(zero_init(Maxout(width, width)))
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
)
linear_model = (
_preprocess_doc
>> LinearModel(nr_class, drop_factor=0.)
@ -475,3 +583,35 @@ def build_text_classifier(nr_class, width=64, **cfg):
model.lsuv = False
return model
@layerize
def flatten(seqs, drop=0.):
ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=0)
X = ops.flatten(seqs, pad=0)
return X, finish_update
def concatenate_lists(*layers, **kwargs): # pragma: no cover
'''Compose two or more models `f`, `g`, etc, such that their outputs are
concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
'''
if not layers:
return noop()
drop_factor = kwargs.get('drop_factor', 1.0)
ops = layers[0].ops
layers = [chain(layer, flatten) for layer in layers]
concat = concatenate(*layers)
def concatenate_lists_fwd(Xs, drop=0.):
drop *= drop_factor
lengths = ops.asarray([len(X) for X in Xs], dtype='i')
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
ys = ops.unflatten(flat_y, lengths)
def concatenate_lists_bwd(d_ys, sgd=None):
return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
return ys, concatenate_lists_bwd
model = wrap(concatenate_lists_fwd, concat)
return model
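concatenate_lists runs several list-to-list layers over the same docs and hstacks their per-token outputs, so a doc that gets width-4 features from one layer and width-2 features from another comes out with 6 columns per token. A sketch with plain functions standing in for thinc layers (the widths are arbitrary):

    import numpy as np

    def layer_a(tokens):
        # Stand-in for a layer producing 4 features per token.
        return np.ones((len(tokens), 4))

    def layer_b(tokens):
        # Stand-in for a layer producing 2 features per token.
        return np.zeros((len(tokens), 2))

    def concat_lists(*layers):
        def forward(docs):
            return [np.hstack([layer(doc) for layer in layers]) for doc in docs]
        return forward

    model = concat_lists(layer_a, layer_b)
    outputs = model([['a', 'b', 'c'], ['d', 'e']])
    assert [out.shape for out in outputs] == [(3, 6), (2, 6)]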


@ -3,7 +3,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy-nightly'
__version__ = '2.0.0a7'
__version__ = '2.0.0a13'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'


@ -2,5 +2,7 @@ from .download import download
from .info import info
from .link import link
from .package import package
from .profile import profile
from .train import train
from .convert import convert
from .model import model


@ -21,10 +21,10 @@ CONVERTERS = {
@plac.annotations(
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", float),
n_sents=("Number of sentences per doc", "option", "n", int),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(cmd, input_file, output_dir, n_sents, morphology):
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
"""
Convert files into JSON format for use with train command and other
experiment management functions.


@ -8,7 +8,7 @@ import subprocess
import sys
from .link import link
from ..util import prints
from ..util import prints, get_package_path
from .. import about
@ -24,15 +24,20 @@ def download(cmd, model, direct=False):
with version.
"""
if direct:
download_model('{m}/{m}.tar.gz'.format(m=model))
dl = download_model('{m}/{m}.tar.gz'.format(m=model))
else:
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
model_name = shortcuts.get(model, model)
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
if dl == 0:
try:
link(None, model_name, model, force=True)
# Get package path here because link uses
# pip.get_installed_distributions() to check if model is a package,
# which fails if model was just installed via subprocess
package_path = get_package_path(model_name)
link(None, model_name, model, force=True, model_path=package_path)
except:
# Dirty, but since spacy.download and the auto-linking is mostly
# a convenience wrapper, it's best to show a success message and
@ -73,6 +78,6 @@ def get_version(model, comp):
def download_model(filename):
download_url = about.__download_url__ + '/' + filename
subprocess.call([sys.executable, '-m',
return subprocess.call([sys.executable, '-m',
'pip', 'install', '--no-cache-dir', download_url],
env=os.environ.copy())
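The change above makes download_model return pip's exit status, so the shortcut link is only attempted after a successful install. A minimal sketch of the pattern (the URL is hypothetical):

    import os
    import subprocess
    import sys

    def install(url):
        # Returns pip's exit status: 0 on success, non-zero on failure.
        return subprocess.call(
            [sys.executable, '-m', 'pip', 'install', '--no-cache-dir', url],
            env=os.environ.copy())

    if install('https://example.com/some-model.tar.gz') == 0:   # hypothetical URL
        print('installed; safe to create the shortcut link')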


@ -14,7 +14,7 @@ from .. import util
link_name=("name of shortcut link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
)
def link(cmd, origin, link_name, force=False):
def link(cmd, origin, link_name, force=False, model_path=None):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
@ -23,7 +23,7 @@ def link(cmd, origin, link_name, force=False):
if util.is_package(origin):
model_path = util.get_package_path(origin)
else:
model_path = Path(origin)
model_path = Path(origin) if model_path is None else Path(model_path)
if not model_path.exists():
prints("The data should be located in %s" % path2str(model_path),
title="Can't locate model data", exits=1)

spacy/cli/model.py (new file, 137 lines)

@ -0,0 +1,137 @@
# coding: utf8
from __future__ import unicode_literals
import bz2
import gzip
import math
from ast import literal_eval
from pathlib import Path
import numpy as np
import spacy
from preshed.counter import PreshCounter
from .. import util
from ..compat import fix_text
def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data,
min_doc_freq=5, min_word_freq=200):
model_path = Path(model_dir)
freqs_path = Path(freqs_data)
clusters_path = Path(clusters_data) if clusters_data else None
vectors_path = Path(vectors_data) if vectors_data else None
check_dirs(freqs_path, clusters_path, vectors_path)
vocab = util.get_lang_class(lang).Defaults.create_vocab()
nlp = spacy.blank(lang)
vocab = nlp.vocab
probs, oov_prob = read_probs(
freqs_path, min_doc_freq=int(min_doc_freq), min_freq=int(min_word_freq))
clusters = read_clusters(clusters_path) if clusters_path else {}
populate_vocab(vocab, clusters, probs, oov_prob)
add_vectors(vocab, vectors_path)
create_model(model_path, nlp)
def add_vectors(vocab, vectors_path):
with bz2.BZ2File(vectors_path.as_posix()) as f:
num_words, dim = next(f).split()
vocab.clear_vectors(int(dim))
for line in f:
word_w_vector = line.decode("utf8").strip().split(" ")
word = word_w_vector[0]
vector = np.array([float(val) for val in word_w_vector[1:]])
if word in vocab:
vocab.set_vector(word, vector)
def create_model(model_path, model):
if not model_path.exists():
model_path.mkdir()
model.to_disk(model_path.as_posix())
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
counts = PreshCounter()
total = 0
freqs_file = check_unzip(freqs_path)
for i, line in enumerate(freqs_file):
freq, doc_freq, key = line.rstrip().split('\t', 2)
freq = int(freq)
counts.inc(i + 1, freq)
total += freq
counts.smooth()
log_total = math.log(total)
freqs_file = check_unzip(freqs_path)
probs = {}
for line in freqs_file:
freq, doc_freq, key = line.rstrip().split('\t', 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(
key) < max_length:
word = literal_eval(key)
smooth_count = counts.smoother(int(freq))
probs[word] = math.log(smooth_count) - log_total
oov_prob = math.log(counts.smoother(0)) - log_total
return probs, oov_prob
def read_clusters(clusters_path):
clusters = {}
with clusters_path.open() as f:
for line in f:
try:
cluster, word, freq = line.split()
word = fix_text(word)
except ValueError:
continue
# If the clusterer has only seen the word a few times, its
# cluster is unreliable.
if int(freq) >= 3:
clusters[word] = cluster
else:
clusters[word] = '0'
# Expand clusters with re-casing
for word, cluster in list(clusters.items()):
if word.lower() not in clusters:
clusters[word.lower()] = cluster
if word.title() not in clusters:
clusters[word.title()] = cluster
if word.upper() not in clusters:
clusters[word.upper()] = cluster
return clusters
def populate_vocab(vocab, clusters, probs, oov_prob):
for word, prob in reversed(
sorted(list(probs.items()), key=lambda item: item[1])):
lexeme = vocab[word]
lexeme.prob = prob
lexeme.is_oov = False
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
lexeme.cluster = int(clusters[word][::-1], 2)
else:
lexeme.cluster = 0
def check_unzip(file_path):
file_path_str = file_path.as_posix()
if file_path_str.endswith('gz'):
return gzip.open(file_path_str)
else:
return file_path.open()
def check_dirs(freqs_data, clusters_data, vectors_data):
if not freqs_data.is_file():
util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
if clusters_data and not clusters_data.is_file():
util.sys_exit(
clusters_data.as_posix(), title="No Brown clusters file found")
if vectors_data and not vectors_data.is_file():
util.sys_exit(
vectors_data.as_posix(), title="No word vectors file found")
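populate_vocab stores each Brown cluster as an integer with the bit string reversed, so the first branches of the cluster path sit in the low-order bits and can be read off with a mask such as & 15. A worked example with a hypothetical cluster string:

    cluster = '10110100'                 # hypothetical Brown cluster path
    as_int = int(cluster[::-1], 2)       # reverse, then parse base 2
    assert as_int == 0b00101101 == 45
    # The first four branches of the path now occupy the low four bits
    # (bit 0 is the first branch, bit 1 the second, and so on):
    assert as_int & 15 == 0b1101 == 13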


@ -15,10 +15,11 @@ from .. import about
@plac.annotations(
input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str),
meta=("path to meta.json", "option", "m", str),
meta_path=("path to meta.json", "option", "m", str),
create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
)
def package(cmd, input_dir, output_dir, meta=None, force=False):
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
@ -26,7 +27,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
"""
input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta)
meta_path = util.ensure_path(meta_path)
if not input_path or not input_path.exists():
prints(input_path, title="Model directory not found", exits=1)
if not output_path or not output_path.exists():
@ -38,7 +39,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
template_manifest = get_template('MANIFEST.in')
template_init = get_template('xx_model_name/__init__.py')
meta_path = meta_path or input_path / 'meta.json'
if meta_path.is_file():
if not create_meta and meta_path.is_file():
prints(meta_path, title="Reading meta.json from file")
meta = util.read_json(meta_path)
else:
@ -100,7 +101,7 @@ def generate_meta():
def generate_pipeline():
prints("If set to 'True', the default pipeline is used. If set to 'False', "
"the pipeline will be disabled. Components should be specified as a "
"comma-separated list of component names, e.g. vectorizer, tagger, "
"comma-separated list of component names, e.g. tensorizer, tagger, "
"parser, ner. For more information, see the docs on processing pipelines.",
title="Enter your model's pipeline components")
pipeline = util.get_raw_input("Pipeline components", True)

spacy/cli/profile.py (new file, 45 lines)

@ -0,0 +1,45 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import plac
from pathlib import Path
import ujson
import cProfile
import pstats
import spacy
import sys
import tqdm
import cytoolz
def read_inputs(loc):
if loc is None:
file_ = sys.stdin
file_ = (line.encode('utf8') for line in file_)
else:
file_ = Path(loc).open()
for line in file_:
data = ujson.loads(line)
text = data['text']
yield text
@plac.annotations(
lang=("model/language", "positional", None, str),
inputs=("Location of input file", "positional", None, read_inputs)
)
def profile(cmd, lang, inputs=None):
"""
Profile a spaCy pipeline, to find out which functions take the most time.
"""
nlp = spacy.load(lang)
texts = list(cytoolz.take(10000, inputs))
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
s.strip_dirs().sort_stats("time").print_stats()
def parse_texts(nlp, texts):
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):
pass
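read_inputs expects JSON Lines with a 'text' field, read either from a file argument or from stdin, and the profiler parses up to 10,000 of those texts before printing cProfile stats sorted by time. A hedged usage sketch (the file name is illustrative):

    import ujson

    # Write a small JSONL file in the format read_inputs expects.
    with open('inputs.jsonl', 'w') as f:
        for text in ['This is a sentence.', 'So is this one.']:
            f.write(ujson.dumps({'text': text}) + '\n')

    # Then profile a pipeline over it, for example:
    #   python -m spacy profile en inputs.jsonl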


@ -32,10 +32,12 @@ from ..compat import json_dumps
resume=("Whether to resume training", "flag", "R", bool),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool)
no_entities=("Don't train NER", "flag", "N", bool),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
)
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False):
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
gold_preproc=False):
"""
Train a model. Expects data in spaCy's JSON format.
"""
@ -70,8 +72,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
util.env_opt('batch_compound', 1.001))
if resume:
prints(output_path / 'model19.pickle', title="Resuming training")
nlp = dill.load((output_path / 'model19.pickle').open('rb'))
prints(output_path / 'model9.pickle', title="Resuming training")
nlp = dill.load((output_path / 'model9.pickle').open('rb'))
else:
nlp = lang_class(pipeline=pipeline)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
@ -85,28 +87,26 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
if resume:
i += 20
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
train_docs = corpus.train_docs(nlp, projectivize=True,
gold_preproc=False, max_length=0)
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
gold_preproc=gold_preproc, max_length=0)
losses = {}
for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses,
update_tensors=True)
update_shared=True)
pbar.update(sum(len(doc) for doc in docs))
with nlp.use_params(optimizer.averages):
util.set_env_log(False)
epoch_model_path = output_path / ('model%d' % i)
nlp.to_disk(epoch_model_path)
with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
dill.dump(nlp, file_, -1)
nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
scorer = nlp_loaded.evaluate(
corpus.dev_docs(
nlp_loaded,
gold_preproc=False))
gold_preproc=gold_preproc))
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_:
file_.write(json_dumps(scorer.scores))
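The batch sizes fed to minibatch come from a compounding schedule (see the batch_compound setting above): each batch is slightly larger than the last until a ceiling is reached. A standalone sketch of such a generator (the numbers are assumptions, not spaCy's defaults):

    def compounding(start, stop, compound):
        """Yield start, then start*compound, and so on, capped at stop."""
        value = start
        while True:
            yield value
            value = min(value * compound, stop)

    sizes = compounding(1., 32., 1.001)
    first = [int(next(sizes)) for _ in range(5)]
    assert first == [1, 1, 1, 1, 1]      # growth at 1.001 per batch is very gradual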


@ -46,19 +46,21 @@ is_osx = sys.platform == 'darwin'
if is_python2:
import imp
bytes_ = str
unicode_ = unicode
basestring_ = basestring
input_ = raw_input
json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False).decode('utf8')
path2str = lambda path: str(path).decode('utf8')
elif is_python3:
import importlib.util
bytes_ = bytes
unicode_ = str
basestring_ = str
input_ = input
json_dumps = lambda data: ujson.dumps(data, indent=2)
json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False)
path2str = lambda path: str(path)
@ -102,3 +104,12 @@ def normalize_string_keys(old):
return new
def import_file(name, loc):
loc = str(loc)
if is_python2:
return imp.load_source(name, loc)
else:
spec = importlib.util.spec_from_file_location(name, str(loc))
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module
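import_file gives a single call for loading a Python module from an arbitrary path on both Python 2 (imp) and Python 3 (importlib), which is what spaCy needs to import a model package directly from disk. A small usage check, assuming the function above is importable:

    import os
    import tempfile

    # Write a throwaway module to disk and load it by path.
    tmp = tempfile.NamedTemporaryFile('w', suffix='.py', delete=False)
    tmp.write('answer = 42\n')
    tmp.close()
    module = import_file('throwaway_module', tmp.name)
    assert module.answer == 42
    os.remove(tmp.name)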


@ -15,7 +15,7 @@ def depr_model_download(lang):
lang (unicode): Language shortcut, 'en' or 'de'.
"""
prints("The spacy.%s.download command is now deprecated. Please use "
"python -m spacy download [model name or shortcut] instead. For "
"spacy download [model name or shortcut] instead. For "
"more info, see the documentation:" % lang,
about.__docs_models__,
"Downloading default '%s' model now..." % lang,


@ -60,7 +60,7 @@ GLOSSARY = {
'JJR': 'adjective, comparative',
'JJS': 'adjective, superlative',
'LS': 'list item marker',
'MD': 'verb, modal auxillary',
'MD': 'verb, modal auxiliary',
'NIL': 'missing tag',
'NN': 'noun, singular or mass',
'NNP': 'noun, proper singular',
@ -91,7 +91,7 @@ GLOSSARY = {
'NFP': 'superfluous punctuation',
'GW': 'additional word in multi-word expression',
'XX': 'unknown',
'BES': 'auxillary "be"',
'BES': 'auxiliary "be"',
'HVS': 'forms of "have"',


@ -9,6 +9,7 @@ cdef struct GoldParseC:
int* tags
int* heads
int* has_dep
int* sent_start
attr_t* labels
int** brackets
Transition* ner


@ -406,11 +406,11 @@ cdef class GoldParse:
if tags is None:
tags = [None for _ in doc]
if heads is None:
heads = [token.i for token in doc]
heads = [None for token in doc]
if deps is None:
deps = [None for _ in doc]
if entities is None:
entities = ['-' for _ in doc]
entities = [None for _ in doc]
elif len(entities) == 0:
entities = ['O' for _ in doc]
elif not isinstance(entities[0], basestring):
@ -426,6 +426,7 @@ cdef class GoldParse:
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.cats = list(cats)
@ -482,6 +483,10 @@ cdef class GoldParse:
"""
return not nonproj.is_nonproj_tree(self.heads)
@property
def sent_starts(self):
return [self.c.sent_start[i] for i in range(self.length)]
def biluo_tags_from_offsets(doc, entities, missing='O'):
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out


@ -27,7 +27,7 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased)
_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
'TB T G M K')
'TB T G M K %')
_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'

spacy/lang/da/examples.py (new file, 18 lines)

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.da.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple overvejer at købe et britisk startup for 1 milliard dollar",
"Selvkørende biler flytter forsikringsansvaret over på producenterne",
"San Francisco overvejer at forbyde leverandørrobotter på fortov",
"London er en stor by i Storbritannien"
]

spacy/lang/de/examples.py (new file, 22 lines)

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.de.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
"Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
"Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
"Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
"San Francisco erwägt Verbot von Lieferrobotern",
"Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
"Wo bist du?",
"Was ist die Hauptstadt von Deutschland?"
]

spacy/lang/en/examples.py (new file, 22 lines)

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.en.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple is looking at buying U.K. startup for $1 billion",
"Autonomous cars shift insurance liability toward manufacturers",
"San Francisco considers banning sidewalk delivery robots",
"London is a big city in the United Kingdom.",
"Where are you?",
"Who is the president of France?",
"What is the capital of the United States?",
"When was Barack Obama born?"
]


@ -59,7 +59,8 @@ MORPH_RULES = {
"VBP": {
"are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
"'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
"'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
"am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
},
"VBD": {


@ -232,7 +232,10 @@ for verb_data in [
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be", NORM: "was"},
{ORTH: "were", LEMMA: "be", NORM: "were"}]:
{ORTH: "were", LEMMA: "be", NORM: "were"},
{ORTH: "have", NORM: "have"},
{ORTH: "has", LEMMA: "have", NORM: "has"},
{ORTH: "dare", NORM: "dare"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:

spacy/lang/es/examples.py (new file, 22 lines)

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.es.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
"Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
"San Francisco analiza prohibir los robots delivery",
"Londres es una gran ciudad del Reino Unido",
"El gato come pescado",
"Veo al hombre con el telescopio",
"La araña come moscas",
"El pingüino incuba en su nido"
]

spacy/lang/fr/examples.py (new file, 26 lines)

@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.fr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple cherche a acheter une startup anglaise pour 1 milliard de dollard",
"Les voitures autonomes voient leur assurances décalées vers les constructeurs",
"San Francisco envisage d'interdire les robots coursiers",
"Londres est une grande ville du Royaume-Uni",
"L'Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d'Europe",
"Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
"La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
"Nouvelles attaques de Trump contre le maire de Londres",
"Où es-tu ?",
"Qui est le président de la France ?",
"Où est la capitale des Etats-Unis ?",
"Quand est né Barack Obama ?"
]

spacy/lang/he/examples.py (new file, 28 lines)

@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.he.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
'רה"מ הודיע כי יחרים טקס בחסותו',
'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
'סע לשלום, המפתחות בפנים.',
'מלצר, פעמיים טורקי!',
'ואהבת לרעך כמוך.',
'היום נעשה משהו בלתי נשכח.',
'איפה הילד?',
'מיהו נשיא צרפת?',
'מהי בירת ארצות הברית?',
"איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
'מה הייתה הדקה?',
'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
]

spacy/lang/id/__init__.py (new file, 42 lines)

@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .lemmatizer import LOOKUP
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
class IndonesianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'id'
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Indonesian(Language):
lang = 'id'
Defaults = IndonesianDefaults
__all__ = ['Indonesian']

File diff suppressed because it is too large.

spacy/lang/id/examples.py (new file, 22 lines)

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.en.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Al Qaidah mengklaim bom mobil yang menewaskan 60 Orang di Mali",
"Abu Sayyaf mengeksekusi sandera warga Filipina",
"Penyaluran pupuk berasal dari lima lokasi yakni Bontang, Kalimantan Timur, Surabaya, Banyuwangi, Semarang, dan Makassar.",
"PT Pupuk Kaltim telah menyalurkan 274.707 ton pupuk bersubsidi ke wilayah penyaluran di 14 provinsi.",
"Jakarta adalah kota besar yang nyaris tidak pernah tidur.",
"Kamu ada di mana semalam?",
"Siapa yang membeli makanan ringan tersebut?",
"Siapa presiden pertama Republik Indonesia?"
]

spacy/lang/id/lemmatizer.py (new file, 36883 lines; diff suppressed because it is too large)


@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
'gajillion', 'bazillion',
'nol', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
'delapan', 'sembilan', 'sepuluh', 'sebelas', 'duabelas', 'tigabelas',
'empatbelas', 'limabelas', 'enambelas', 'tujuhbelas', 'delapanbelas',
'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
'noniliun', 'desiliun',
]
def like_num(text):
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True
if text.count('/') == 1:
num, denom = text.split('/')
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
return True
if text.count('-') == 1:
_, num = text.split('-')
if num.isdigit() or num in _num_words:
return True
return False
LEX_ATTRS = {
LIKE_NUM: like_num
}
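like_num accepts plain digits (with '.' and ',' separators stripped), simple fractions, the English and Indonesian number words listed above, and hyphenated forms whose second half is numeric. A few illustrative checks, assuming the function above:

    assert like_num('3/4')               # simple fraction
    assert like_num('1.000.000')         # separators are stripped first
    assert like_num('tiga')              # Indonesian for 'three'
    assert like_num('ke-5')              # hyphenated form with a numeric tail
    assert not like_num('kucing')        # 'cat' is not number-like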


@ -0,0 +1,17 @@
# coding: utf8
from __future__ import unicode_literals
_exc = {
"Rp": "$",
"IDR": "$",
"RMB": "$",
"USD": "$",
"AUD": "$",
"GBP": "$",
}
NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
NORM_EXCEPTIONS[string] = norm
NORM_EXCEPTIONS[string.title()] = norm


@ -0,0 +1,53 @@
# coding: utf8
from __future__ import unicode_literals
from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ..char_classes import merge_chars, split_chars, _currency, _units
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
_units = (_units + 's bit Gbps Mbps mbps Kbps kbps ƒ ppi px '
'Hz kHz MHz GHz mAh '
'ratus rb ribu ribuan '
'juta jt jutaan mill?iar million bil[l]?iun bilyun billion '
)
_currency = (_currency + r' USD Rp IDR RMB SGD S\$')
_months = ('Januari Februari Maret April Mei Juni Juli Agustus September '
'Oktober November Desember January February March May June '
'July August October December Jan Feb Mar Jun Jul Aug Sept '
'Oct Okt Nov Des ')
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)
HTML_PREFIX = r'<(b|strong|i|em|p|span|div|br)\s?/>|<a([^>]+)>'
HTML_SUFFIX = r'</(b|strong|i|em|p|span|div|a)>'
MONTHS = merge_chars(_months)
LIST_CURRENCY = split_chars(_currency)
TOKENIZER_PREFIXES.remove('#') # hashtag
_prefixes = TOKENIZER_PREFIXES + LIST_CURRENCY + [HTML_PREFIX] + ['/', '']
_suffixes = TOKENIZER_SUFFIXES + [r'\-[Nn]ya', '-[KkMm]u', '[—-]'] + [
r'(?<={c})(?:[0-9]+)'.format(c=CURRENCY),
r'(?<=[0-9])(?:{u})'.format(u=UNITS),
r'(?<=[0-9])%',
r'(?<=[0-9{a}]{h})(?:[\.,:-])'.format(a=ALPHA, h=HTML_SUFFIX),
r'(?<=[0-9{a}])(?:{h})'.format(a=ALPHA, h=HTML_SUFFIX),
]
_infixes = TOKENIZER_INFIXES + [
r'(?<=[0-9])[\\/](?=[0-9%-])',
r'(?<=[0-9])%(?=[{a}0-9/])'.format(a=ALPHA),
r'(?<={u})[\/-](?=[0-9])'.format(u=UNITS),
r'(?<={m})[\/-](?=[0-9])'.format(m=MONTHS),
r'(?<=[0-9\)][\.,])"(?=[0-9])',
r'(?<=[{a}\)][\.,\'])["—](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])-(?=[0-9])'.format(a=ALPHA),
r'(?<=[0-9])-(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])[\/-](?={c}{a})'.format(a=ALPHA, c=CURRENCY),
]
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes
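spaCy compiles these lists into single prefix, suffix, and infix regexes (via util.compile_suffix_regex and friends), so an entry such as r'(?<=[0-9])%' is what splits '%' off '100%'. A standalone check of two suffix patterns with the re module (illustrative only; the unit alternation is abbreviated):

    import re

    assert re.search(r'(?<=[0-9])%', '100%').group() == '%'
    # With UNITS merged into an alternation like (?:km|kg|...), the unit
    # suffix pattern peels 'km' off '5km':
    assert re.search(r'(?<=[0-9])(?:km|kg)', '5km').group() == 'km'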

spacy/lang/id/stop_words.py (new file, 763 lines)

@ -0,0 +1,763 @@
# coding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
ada
adalah
adanya
adapun
agak
agaknya
agar
akan
akankah
akhir
akhiri
akhirnya
aku
akulah
amat
amatlah
anda
andalah
antar
antara
antaranya
apa
apaan
apabila
apakah
apalagi
apatah
artinya
asal
asalkan
atas
atau
ataukah
ataupun
awal
awalnya
bagai
bagaikan
bagaimana
bagaimanakah
bagaimanapun
bagi
bagian
bahkan
bahwa
bahwasanya
baik
bakal
bakalan
balik
banyak
bapak
baru
bawah
beberapa
begini
beginian
beginikah
beginilah
begitu
begitukah
begitulah
begitupun
bekerja
belakang
belakangan
belum
belumlah
benar
benarkah
benarlah
berada
berakhir
berakhirlah
berakhirnya
berapa
berapakah
berapalah
berapapun
berarti
berawal
berbagai
berdatangan
beri
berikan
berikut
berikutnya
berjumlah
berkali-kali
berkata
berkehendak
berkeinginan
berkenaan
berlainan
berlalu
berlangsung
berlebihan
bermacam
bermacam-macam
bermaksud
bermula
bersama
bersama-sama
bersiap
bersiap-siap
bertanya
bertanya-tanya
berturut
berturut-turut
bertutur
berujar
berupa
besar
betul
betulkah
biasa
biasanya
bila
bilakah
bisa
bisakah
boleh
bolehkah
bolehlah
buat
bukan
bukankah
bukanlah
bukannya
bulan
bung
cara
caranya
cukup
cukupkah
cukuplah
cuma
dahulu
dalam
dan
dapat
dari
daripada
datang
dekat
demi
demikian
demikianlah
dengan
depan
di
dia
diakhiri
diakhirinya
dialah
diantara
diantaranya
diberi
diberikan
diberikannya
dibuat
dibuatnya
didapat
didatangkan
digunakan
diibaratkan
diibaratkannya
diingat
diingatkan
diinginkan
dijawab
dijelaskan
dijelaskannya
dikarenakan
dikatakan
dikatakannya
dikerjakan
diketahui
diketahuinya
dikira
dilakukan
dilalui
dilihat
dimaksud
dimaksudkan
dimaksudkannya
dimaksudnya
diminta
dimintai
dimisalkan
dimulai
dimulailah
dimulainya
dimungkinkan
dini
dipastikan
diperbuat
diperbuatnya
dipergunakan
diperkirakan
diperlihatkan
diperlukan
diperlukannya
dipersoalkan
dipertanyakan
dipunyai
diri
dirinya
disampaikan
disebut
disebutkan
disebutkannya
disini
disinilah
ditambahkan
ditandaskan
ditanya
ditanyai
ditanyakan
ditegaskan
ditujukan
ditunjuk
ditunjuki
ditunjukkan
ditunjukkannya
ditunjuknya
dituturkan
dituturkannya
diucapkan
diucapkannya
diungkapkan
dong
dua
dulu
empat
enggak
enggaknya
entah
entahlah
guna
gunakan
hal
hampir
hanya
hanyalah
hari
harus
haruslah
harusnya
hendak
hendaklah
hendaknya
hingga
ia
ialah
ibarat
ibaratkan
ibaratnya
ibu
ikut
ingat
ingat-ingat
ingin
inginkah
inginkan
ini
inikah
inilah
itu
itukah
itulah
jadi
jadilah
jadinya
jangan
jangankan
janganlah
jauh
jawab
jawaban
jawabnya
jelas
jelaskan
jelaslah
jelasnya
jika
jikalau
juga
jumlah
jumlahnya
justru
kala
kalau
kalaulah
kalaupun
kalian
kami
kamilah
kamu
kamulah
kan
kapan
kapankah
kapanpun
karena
karenanya
kasus
kata
katakan
katakanlah
katanya
ke
keadaan
kebetulan
kecil
kedua
keduanya
keinginan
kelamaan
kelihatan
kelihatannya
kelima
keluar
kembali
kemudian
kemungkinan
kemungkinannya
kenapa
kepada
kepadanya
kesampaian
keseluruhan
keseluruhannya
keterlaluan
ketika
khususnya
kini
kinilah
kira
kira-kira
kiranya
kita
kitalah
kok
kurang
lagi
lagian
lah
lain
lainnya
lalu
lama
lamanya
lanjut
lanjutnya
lebih
lewat
lima
luar
macam
maka
makanya
makin
malah
malahan
mampu
mampukah
mana
manakala
manalagi
masa
masalah
masalahnya
masih
masihkah
masing
masing-masing
mau
maupun
melainkan
melakukan
melalui
melihat
melihatnya
memang
memastikan
memberi
memberikan
membuat
memerlukan
memihak
meminta
memintakan
memisalkan
memperbuat
mempergunakan
memperkirakan
memperlihatkan
mempersiapkan
mempersoalkan
mempertanyakan
mempunyai
memulai
memungkinkan
menaiki
menambahkan
menandaskan
menanti
menanti-nanti
menantikan
menanya
menanyai
menanyakan
mendapat
mendapatkan
mendatang
mendatangi
mendatangkan
menegaskan
mengakhiri
mengapa
mengatakan
mengatakannya
mengenai
mengerjakan
mengetahui
menggunakan
menghendaki
mengibaratkan
mengibaratkannya
mengingat
mengingatkan
menginginkan
mengira
mengucapkan
mengucapkannya
mengungkapkan
menjadi
menjawab
menjelaskan
menuju
menunjuk
menunjuki
menunjukkan
menunjuknya
menurut
menuturkan
menyampaikan
menyangkut
menyatakan
menyebutkan
menyeluruh
menyiapkan
merasa
mereka
merekalah
merupakan
meski
meskipun
meyakini
meyakinkan
minta
mirip
misal
misalkan
misalnya
mula
mulai
mulailah
mulanya
mungkin
mungkinkah
nah
naik
namun
nanti
nantinya
nyaris
nyatanya
oleh
olehnya
pada
padahal
padanya
pak
paling
panjang
pantas
para
pasti
pastilah
penting
pentingnya
per
percuma
perlu
perlukah
perlunya
pernah
persoalan
pertama
pertama-tama
pertanyaan
pertanyakan
pihak
pihaknya
pukul
pula
pun
punya
rasa
rasanya
rata
rupanya
saat
saatnya
saja
sajalah
saling
sama
sama-sama
sambil
sampai
sampai-sampai
sampaikan
sana
sangat
sangatlah
satu
saya
sayalah
se
sebab
sebabnya
sebagai
sebagaimana
sebagainya
sebagian
sebaik
sebaik-baiknya
sebaiknya
sebaliknya
sebanyak
sebegini
sebegitu
sebelum
sebelumnya
sebenarnya
seberapa
sebesar
sebetulnya
sebisanya
sebuah
sebut
sebutlah
sebutnya
secara
secukupnya
sedang
sedangkan
sedemikian
sedikit
sedikitnya
seenaknya
segala
segalanya
segera
seharusnya
sehingga
seingat
sejak
sejauh
sejenak
sejumlah
sekadar
sekadarnya
sekali
sekali-kali
sekalian
sekaligus
sekalipun
sekarang
sekarang
sekecil
seketika
sekiranya
sekitar
sekitarnya
sekurang-kurangnya
sekurangnya
sela
selain
selaku
selalu
selama
selama-lamanya
selamanya
selanjutnya
seluruh
seluruhnya
semacam
semakin
semampu
semampunya
semasa
semasih
semata
semata-mata
semaunya
sementara
semisal
semisalnya
sempat
semua
semuanya
semula
sendiri
sendirian
sendirinya
seolah
seolah-olah
seorang
sepanjang
sepantasnya
sepantasnyalah
seperlunya
seperti
sepertinya
sepihak
sering
seringnya
serta
serupa
sesaat
sesama
sesampai
sesegera
sesekali
seseorang
sesuatu
sesuatunya
sesudah
sesudahnya
setelah
setempat
setengah
seterusnya
setiap
setiba
setibanya
setidak-tidaknya
setidaknya
setinggi
seusai
sewaktu
siap
siapa
siapakah
siapapun
sini
sinilah
soal
soalnya
suatu
sudah
sudahkah
sudahlah
supaya
tadi
tadinya
tahu
tahun
tak
tambah
tambahnya
tampak
tampaknya
tandas
tandasnya
tanpa
tanya
tanyakan
tanyanya
tapi
tegas
tegasnya
telah
tempat
tengah
tentang
tentu
tentulah
tentunya
tepat
terakhir
terasa
terbanyak
terdahulu
terdapat
terdiri
terhadap
terhadapnya
teringat
teringat-ingat
terjadi
terjadilah
terjadinya
terkira
terlalu
terlebih
terlihat
termasuk
ternyata
tersampaikan
tersebut
tersebutlah
tertentu
tertuju
terus
terutama
tetap
tetapi
tiap
tiba
tiba-tiba
tidak
tidakkah
tidaklah
tiga
tinggi
toh
tunjuk
turut
tutur
tuturnya
ucap
ucapnya
ujar
ujarnya
umum
umumnya
ungkap
ungkapnya
untuk
usah
usai
waduh
wah
wahai
waktu
waktunya
walau
walaupun
wong
yaitu
yakin
yakni
yang
""".split())


@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
def noun_chunks(obj):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
SYNTAX_ITERATORS = {
'noun_chunks': noun_chunks
}


@ -0,0 +1,50 @@
# coding: utf8
from __future__ import unicode_literals
import regex as re
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
from ..tokenizer_exceptions import URL_PATTERN
from ...symbols import ORTH
_exc = {}
for orth in ID_BASE_EXCEPTIONS:
_exc[orth] = [{ORTH: orth}]
orth_title = orth.title()
_exc[orth_title] = [{ORTH: orth_title}]
orth_caps = orth.upper()
_exc[orth_caps] = [{ORTH: orth_caps}]
orth_lower = orth.lower()
_exc[orth_lower] = [{ORTH: orth_lower}]
if '-' in orth:
orth_title = '-'.join([part.title() for part in orth.split('-')])
_exc[orth_title] = [{ORTH: orth_title}]
orth_caps = '-'.join([part.upper() for part in orth.split('-')])
_exc[orth_caps] = [{ORTH: orth_caps}]
for orth in [
"'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
"E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
"Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
"Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs.",
"B.A.", "B.Ch.E.", "B.Sc.", "Dr.", "Dra.", "Drs.", "Hj.", "Ka.", "Kp.",
"M.Ag.", "M.Hum.", "M.Kes,", "M.Kom.", "M.M.", "M.P.", "M.Pd.", "M.Sc.",
"M.Si.", "M.Sn.", "M.T.", "M.Th.", "No.", "Pjs.", "Plt.", "R.A.", "S.Ag.",
"S.E.", "S.H.", "S.Hut.", "S.K.M.", "S.Kedg.", "S.Kedh.", "S.Kom.",
"S.Pd.", "S.Pol.", "S.Psi.", "S.S.", "S.Sos.", "S.T.", "S.Tekp.", "S.Th.",
"a.l.", "a.n.", "a.s.", "b.d.", "d.a.", "d.l.", "d/h", "dkk.", "dll.",
"dr.", "drh.", "ds.", "dsb.", "dst.", "faks.", "fax.", "hlm.", "i/o",
"n.b.", "p.p." "pjs.", "s.d.", "tel.", "u.p.",
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)

spacy/lang/it/examples.py (new file, 18 lines)

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.it.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
"Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
"San Francisco prevede di bandire i robot di consegna porta a porta",
"Londra è una grande città del Regno Unito."
]


@ -137,6 +137,7 @@ LEX_ATTRS = {
attrs.IS_UPPER: lambda string: string.isupper(),
attrs.IS_STOP: lambda string: False,
attrs.IS_OOV: lambda string: True,
attrs.PROB: lambda string: -20.,
attrs.LIKE_EMAIL: like_email,
attrs.LIKE_NUM: like_num,
attrs.IS_PUNCT: is_punct,

spacy/lang/nb/examples.py (new file, 18 lines)

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.nb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
"Selvkjørende biler flytter forsikringsansvaret over på produsentene ",
"San Francisco vurderer å forby robotbud på fortauene",
"London er en stor by i Storbritannia."
]

spacy/lang/pl/examples.py (new file, 20 lines)

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.pl.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Poczuł przyjemną woń mocnej kawy.",
"Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
"Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
"Nowy abonament pod lupą Komisji Europejskiej",
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
]

spacy/lang/pt/examples.py (new file, 18 lines)

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.pt.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes.",
"São Francisco considera banir os robôs de entrega que andam pelas calçadas",
"Londres é a maior cidade do Reino Unido"
]

spacy/lang/sv/examples.py (new file, 18 lines)

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.sv.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple överväger att köpa brittisk startup för 1 miljard dollar.",
"Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
"San Fransisco överväger förbud mot leveransrobotar på trottoarer.",
"London är en storstad i Storbritannien."
]


@ -95,7 +95,7 @@ class BaseDefaults(object):
meta = nlp.meta if nlp is not None else {}
# Resolve strings, like "cnn", "lstm", etc
pipeline = []
for entry in cls.pipeline:
for entry in meta.get('pipeline', []):
if entry in disable or getattr(entry, 'name', entry) in disable:
continue
factory = cls.Defaults.factories[entry]
@ -200,6 +200,7 @@ class Language(object):
else:
flat_list.append(pipe)
self.pipeline = flat_list
self._optimizer = None
@property
def meta(self):
@ -278,7 +279,7 @@ class Language(object):
return self.tokenizer(text)
def update(self, docs, golds, drop=0., sgd=None, losses=None,
update_tensors=False):
update_shared=False):
"""Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects.
@ -298,6 +299,10 @@ class Language(object):
"Got: %d, %d" % (len(docs), len(golds)))
if len(docs) == 0:
return
if sgd is None:
if self._optimizer is None:
self._optimizer = Adam(Model.ops, 0.001)
sgd = self._optimizer
tok2vec = self.pipeline[0]
feats = tok2vec.doc2feats(docs)
grads = {}
@ -305,14 +310,18 @@ class Language(object):
grads[key] = (W, dW)
pipes = list(self.pipeline[1:])
random.shuffle(pipes)
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
for proc in pipes:
if not hasattr(proc, 'update'):
continue
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
d_tokvecses = proc.update((docs, tokvecses), golds,
drop=drop, sgd=get_grads, losses=losses)
if update_tensors and d_tokvecses is not None:
bp_tokvecses(d_tokvecses, sgd=sgd)
if update_shared and d_tokvecses is not None:
for i, d_tv in enumerate(d_tokvecses):
all_d_tokvecses[i] += d_tv
if update_shared and bp_tokvecses is not None:
bp_tokvecses(all_d_tokvecses, sgd=sgd)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
# Clear the tensor variable, to free GPU memory.
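A hedged sketch of how this signature might be driven from a training loop; the sample text, tags and the `nlp` object are assumptions for illustration, not part of this change:
# Minimal, hypothetical call into Language.update() as defined above.
# Assumes `nlp` is a loaded Language whose pipeline[0] is the shared tok2vec.
from spacy.gold import GoldParse

losses = {}
for raw_text, annots in [(u"I like cats", {'tags': ['PRP', 'VBP', 'NNS']})]:
    doc = nlp.make_doc(raw_text)
    gold = GoldParse(doc, **annots)
    # sgd=None falls back to the internal Adam optimizer created above;
    # update_shared=True also backpropagates the summed d_tokvecses
    # through the shared tok2vec model.
    nlp.update([doc], [gold], drop=0.2, sgd=None, losses=losses,
               update_shared=True)
print(losses)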
@ -375,11 +384,11 @@ class Language(object):
eps = util.env_opt('optimizer_eps', 1e-08)
L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
optimizer.max_grad_norm = max_grad_norm
optimizer.device = device
return optimizer
self._optimizer.max_grad_norm = max_grad_norm
self._optimizer.device = device
return self._optimizer
def evaluate(self, docs_golds):
scorer = Scorer()
@ -427,11 +436,16 @@ class Language(object):
except StopIteration:
pass
def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]):
def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
disable=[]):
"""Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading.
texts (iterator): A sequence of texts to process.
as_tuples (bool):
If set to True, inputs should be a sequence of
(text, context) tuples. Output will then be a sequence of
(doc, context) tuples. Defaults to False.
n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer.
@ -443,7 +457,7 @@ class Language(object):
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
>>> assert doc.is_parsed
"""
if tuples:
if as_tuples:
text_context1, text_context2 = itertools.tee(texts)
texts = (tc[0] for tc in text_context1)
contexts = (tc[1] for tc in text_context2)
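As a usage sketch of the new `as_tuples` flag; the sample texts, the context dicts and the `nlp` object are made-up assumptions:
# Stream (text, context) pairs and get (doc, context) pairs back in order.
data = [(u"London is a big city.", {'id': 1}),
        (u"San Francisco is foggy.", {'id': 2})]
for doc, context in nlp.pipe(data, as_tuples=True, batch_size=50, n_threads=2):
    print(context['id'], doc.is_parsed)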

View File

@ -25,6 +25,7 @@ class Lemmatizer(object):
elif univ_pos == PUNCT:
univ_pos = 'punct'
# See Issue #435 for example of where this logic is required.
print("Check base form", string)
if self.is_base_form(univ_pos, morphology):
return set([string.lower()])
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
@ -38,12 +39,20 @@ class Lemmatizer(object):
avoid lemmatization entirely.
"""
morphology = {} if morphology is None else morphology
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
others = [key for key in morphology
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
true_morph_key = morphology.get('morph', 0)
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
return True
elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
morphology.get('Tense') == 'pres' and \
morphology.get('Number') is None and \
not others):
return True
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
return True
elif VerbForm_inf in morphology:
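To make the new check concrete, a hedged sketch of what `is_base_form` now returns for two morphology dicts; the import path and lemmatizer setup are assumptions based on the English defaults used in the tests:
from spacy.lang.en import English   # assumed import path for the English defaults

lemmatizer = English.Defaults.create_lemmatizer()
# A finite, present-tense verb with no Number feature (roughly a VBP tag)
# is treated as already being in its base form:
assert lemmatizer.is_base_form('verb', {'VerbForm': 'fin', 'Tense': 'pres'})
# A singular noun is likewise returned unchanged:
assert lemmatizer.is_base_form('noun', {'Number': 'sing'})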

View File

@ -171,6 +171,8 @@ cdef class Lexeme:
property rank:
def __get__(self):
return self.c.id
def __set__(self, value):
self.c.id = value
property sentiment:
def __get__(self):

View File

@ -46,6 +46,43 @@ from ._ml import build_text_classifier, build_tagger_model
from .parts_of_speech import X
class SentenceSegmenter(object):
'''A simple spaCy hook, to allow custom sentence boundary detection logic
(that doesn't require the dependency parse).
To change the sentence boundary detection strategy, pass a generator
function `strategy` on initialization, or assign a new strategy to
the .strategy attribute.
Sentence detection strategies should be generators that take `Doc` objects
and yield `Span` objects for each sentence.
'''
name = 'sbd'
def __init__(self, vocab, strategy=None):
self.vocab = vocab
if strategy is None or strategy == 'on_punct':
strategy = self.split_on_punct
self.strategy = strategy
def __call__(self, doc):
doc.user_hooks['sents'] = self.strategy
@staticmethod
def split_on_punct(doc):
start = 0
seen_period = False
for i, word in enumerate(doc):
if seen_period and not word.is_punct:
yield doc[start : word.i]
start = word.i
seen_period = False
elif word.text in ['.', '!', '?']:
seen_period = True
if start < len(doc):
yield doc[start : len(doc)]
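An illustrative sketch of plugging in a custom strategy; the newline splitter, the sample text and the `nlp` object are assumptions, and the class is assumed to be importable from spacy.pipeline alongside the other components in this file:
# Hypothetical strategy: treat newline tokens as sentence boundaries.
from spacy.pipeline import SentenceSegmenter

def split_on_newlines(doc):
    start = 0
    for i, word in enumerate(doc):
        if word.text.count('\n'):
            yield doc[start : i + 1]
            start = i + 1
    if start < len(doc):
        yield doc[start : len(doc)]

sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
doc = nlp(u"First line.\nSecond line.")
sbd(doc)                      # installs the hook on this doc
sentences = list(doc.sents)   # iteration now uses split_on_newlines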
class BaseThincComponent(object):
name = None
@ -91,16 +128,21 @@ class BaseThincComponent(object):
def to_bytes(self, **exclude):
serialize = OrderedDict((
('cfg', lambda: json_dumps(self.cfg)),
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes())
))
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
def load_model(b):
if self.model is True:
self.model = self.Model()
self.model = self.Model(**self.cfg)
self.model.from_bytes(b)
deserialize = OrderedDict((
('model', lambda b: self.model.from_bytes(b)),
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
('model', load_model),
('vocab', lambda b: self.vocab.from_bytes(b))
))
util.from_bytes(bytes_data, deserialize, exclude)
@ -108,19 +150,22 @@ class BaseThincComponent(object):
def to_disk(self, path, **exclude):
serialize = OrderedDict((
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('vocab', lambda p: self.vocab.to_disk(p)),
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
('vocab', lambda p: self.vocab.to_disk(p))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
def load_model(p):
if self.model is True:
self.model = self.Model()
self.model = self.Model(**self.cfg)
self.model.from_bytes(p.open('rb').read())
deserialize = OrderedDict((
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
('model', load_model),
('vocab', lambda p: self.vocab.from_disk(p)),
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
))
util.from_disk(path, deserialize, exclude)
return self
@ -138,7 +183,7 @@ class TokenVectorEncoder(BaseThincComponent):
name = 'tensorizer'
@classmethod
def Model(cls, width=128, embed_size=7500, **cfg):
def Model(cls, width=128, embed_size=4000, **cfg):
"""Create a new statistical model for the class.
width (int): Output size of the model.
@ -284,6 +329,8 @@ class NeuralTagger(BaseThincComponent):
cdef Vocab vocab = self.vocab
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, 'get'):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber preset POS tags
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
@ -292,6 +339,8 @@ class NeuralTagger(BaseThincComponent):
doc.is_tagged = True
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvecs = docs_tokvecs
if self.model.nI is None:
@ -300,6 +349,8 @@ class NeuralTagger(BaseThincComponent):
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
if losses is not None:
losses[self.name] += loss
return d_tokvecs
def get_loss(self, docs, golds, scores):
@ -366,7 +417,8 @@ class NeuralTagger(BaseThincComponent):
def from_bytes(self, bytes_data, **exclude):
def load_model(b):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width', 128)
token_vector_width = util.env_opt('token_vector_width',
self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(b)
@ -400,7 +452,8 @@ class NeuralTagger(BaseThincComponent):
def from_disk(self, path, **exclude):
def load_model(p):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width', 128)
token_vector_width = util.env_opt('token_vector_width',
self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(p.open('rb').read())
@ -595,12 +648,13 @@ class TextCategorizer(BaseThincComponent):
return mean_square_error, d_scores
def begin_training(self, gold_tuples=tuple(), pipeline=None):
if pipeline:
if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
token_vector_width = pipeline[0].model.nO
else:
token_vector_width = 64
if self.model is True:
self.model = self.Model(len(self.labels), token_vector_width)
self.model = self.Model(len(self.labels), token_vector_width,
**self.cfg)
cdef class EntityRecognizer(LinearParser):

View File

@ -215,7 +215,10 @@ cdef class StringStore:
path = util.ensure_path(path)
with path.open('r') as file_:
strings = ujson.load(file_)
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
return self
def to_bytes(self, **exclude):
@ -234,7 +237,10 @@ cdef class StringStore:
RETURNS (StringStore): The `StringStore` object.
"""
strings = ujson.loads(bytes_data)
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
return self
def set_frozen(self, bint is_frozen):

View File

@ -0,0 +1,286 @@
# cython: infer_types=True
# cython: profile=True
cimport numpy as np
import numpy
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.typedefs cimport hash_t, class_t
from thinc.extra.search cimport MaxViolation
from .transition_system cimport TransitionSystem, Transition
from .stateclass cimport StateClass
from ..gold cimport GoldParse
from ..tokens.doc cimport Doc
# These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateClass>_dest
src = <StateClass>_src
moves = <const Transition*>_moves
dest.clone(src)
moves[clas].do(dest.c, moves[clas].label)
cdef int _check_final_state(void* _state, void* extra_args) except -1:
return (<StateClass>_state).is_final()
def _cleanup(Beam beam):
for i in range(beam.width):
Py_XDECREF(<PyObject*>beam._states[i].content)
Py_XDECREF(<PyObject*>beam._parents[i].content)
cdef hash_t _hash_state(void* _state, void* _) except 0:
state = <StateClass>_state
if state.c.is_final():
return 1
else:
return state.c.hash()
cdef class ParserBeam(object):
cdef public TransitionSystem moves
cdef public object states
cdef public object golds
cdef public object beams
cdef public object dones
def __init__(self, TransitionSystem moves, states, golds,
int width, float density):
self.moves = moves
self.states = states
self.golds = golds
self.beams = []
cdef Beam beam
cdef StateClass state, st
for state in states:
beam = Beam(self.moves.n_moves, width, density)
beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
for i in range(beam.width):
st = <StateClass>beam.at(i)
st.c.offset = state.c.offset
self.beams.append(beam)
self.dones = [False] * len(self.beams)
def __dealloc__(self):
if self.beams is not None:
for beam in self.beams:
if beam is not None:
_cleanup(beam)
@property
def is_done(self):
return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
def __getitem__(self, i):
return self.beams[i]
def __len__(self):
return len(self.beams)
def advance(self, scores, follow_gold=False):
cdef Beam beam
for i, beam in enumerate(self.beams):
if beam.is_done or not scores[i].size or self.dones[i]:
continue
self._set_scores(beam, scores[i])
if self.golds is not None:
self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
if follow_gold:
beam.advance(_transition_state, NULL, <void*>self.moves.c)
else:
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
if beam.is_done and self.golds is not None:
for j in range(beam.size):
state = <StateClass>beam.at(j)
if state.is_final():
try:
if self.moves.is_gold_parse(state, self.golds[i]):
beam._states[j].loss = 0.0
elif beam._states[j].loss == 0.0:
beam._states[j].loss = 1.0
except NotImplementedError:
break
def _set_scores(self, Beam beam, float[:, ::1] scores):
cdef float* c_scores = &scores[0, 0]
cdef int nr_state = min(scores.shape[0], beam.size)
cdef int nr_class = scores.shape[1]
for i in range(nr_state):
state = <StateClass>beam.at(i)
if not state.is_final():
for j in range(nr_class):
beam.scores[i][j] = c_scores[i * nr_class + j]
self.moves.set_valid(beam.is_valid[i], state.c)
else:
for j in range(beam.nr_class):
beam.scores[i][j] = 0
beam.costs[i][j] = 0
def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
for i in range(beam.size):
state = <StateClass>beam.at(i)
if not state.c.is_final():
self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
if follow_gold:
for j in range(beam.nr_class):
if beam.costs[i][j] >= 1:
beam.is_valid[i][j] = 0
def get_token_ids(states, int n_tokens):
cdef StateClass state
cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
dtype='int32', order='C')
c_ids = <int*>ids.data
for i, state in enumerate(states):
if not state.is_final():
state.c.set_context_tokens(c_ids, n_tokens)
else:
ids[i] = -1
c_ids += ids.shape[1]
return ids
nr_update = 0
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
states, tokvecs, golds,
state2vec, vec2scores,
int width, float density,
sgd=None, losses=None, drop=0.):
global nr_update
cdef MaxViolation violn
nr_update += 1
pbeam = ParserBeam(moves, states, golds,
width=width, density=density)
gbeam = ParserBeam(moves, states, golds,
width=width, density=0.0)
cdef StateClass state
beam_maps = []
backprops = []
violns = [MaxViolation() for _ in range(len(states))]
for t in range(max_steps):
if pbeam.is_done and gbeam.is_done:
break
# The beam maps let us find the right row in the flattened scores
# arrays for each state. States are identified by (example id, history).
# We keep a different beam map for each step (since we'll have a flat
# scores array for each step). The beam map will let us take the per-state
# losses, and compute the gradient for each (step, state, class).
beam_maps.append({})
# Gather all states from the two beams in a list. Some states may occur
# in both beams. To figure out which beam each state belonged to,
# we keep two lists of indices, p_indices and g_indices
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
if not states:
break
# Now that we have our flat list of states, feed them through the model
token_ids = get_token_ids(states, nr_feature)
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
# Store the callbacks for the backward pass
backprops.append((token_ids, bp_vectors, bp_scores))
# Unpack the flat scores into lists for the two beams. The indices arrays
# tell us which example and state the scores-row refers to.
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
# Now advance the states in the beams. The gold beam is constrained to
# follow only gold analyses.
pbeam.advance(p_scores)
gbeam.advance(g_scores, follow_gold=True)
# Track the "maximum violation", to use in the update.
for i, violn in enumerate(violns):
violn.check_crf(pbeam[i], gbeam[i])
histories = []
losses = []
for violn in violns:
if violn.p_hist:
histories.append(violn.p_hist + violn.g_hist)
losses.append(violn.p_probs + violn.g_probs)
else:
histories.append([])
losses.append([])
states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses)
return states_d_scores, backprops[:len(states_d_scores)]
def get_states(pbeams, gbeams, beam_map, nr_update):
seen = {}
states = []
p_indices = []
g_indices = []
cdef Beam pbeam, gbeam
assert len(pbeams) == len(gbeams)
for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
p_indices.append([])
g_indices.append([])
for i in range(pbeam.size):
state = <StateClass>pbeam.at(i)
if not state.is_final():
key = tuple([eg_id] + pbeam.histories[i])
assert key not in seen, (key, seen)
seen[key] = len(states)
p_indices[-1].append(len(states))
states.append(state)
beam_map.update(seen)
for i in range(gbeam.size):
state = <StateClass>gbeam.at(i)
if not state.is_final():
key = tuple([eg_id] + gbeam.histories[i])
if key in seen:
g_indices[-1].append(seen[key])
else:
g_indices[-1].append(len(states))
beam_map[key] = len(states)
states.append(state)
p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices]
g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices]
return states, p_idx, g_idx
def get_gradient(nr_class, beam_maps, histories, losses):
"""
The global model assigns a loss to each parse. The beam scores
are additive, so the same gradient is applied to each action
in the history. This gives the gradient of a single *action*
for a beam state -- so we have "the gradient of loss for taking
action i given history H."
Histories: Each history is a list of actions
Each candidate has a history
Each beam has multiple candidates
Each batch has multiple beams
So history is list of lists of lists of ints
"""
nr_step = len(beam_maps)
grads = []
nr_step = 0
for eg_id, hists in enumerate(histories):
for loss, hist in zip(losses[eg_id], hists):
if loss != 0.0 and not numpy.isnan(loss):
nr_step = max(nr_step, len(hist))
for i in range(nr_step):
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
assert len(histories) == len(losses)
for eg_id, hists in enumerate(histories):
for loss, hist in zip(losses[eg_id], hists):
if loss == 0.0 or numpy.isnan(loss):
continue
key = tuple([eg_id])
# Adjust loss for length
avg_loss = loss / len(hist)
loss += avg_loss * (nr_step - len(hist))
for j, clas in enumerate(hist):
i = beam_maps[j][key]
# In step j, at state i action clas
# resulted in loss
grads[j][i, clas] += loss
key = key + tuple([clas])
return grads
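A small worked example of this bookkeeping, assuming `get_gradient` is importable from the module like `update_beam`; the class ids, histories and losses are made up (one example, one candidate, two steps, three classes):
from spacy.syntax._beam_utils import get_gradient   # assumed module-level import

beam_maps = [{(0,): 0}, {(0, 1): 0}]   # row index per (example, history) at each step
histories = [[[1, 2]]]                 # one candidate that took action 1, then action 2
losses = [[1.0]]                       # that candidate's loss after max-violation
grads = get_gradient(3, beam_maps, histories, losses)
# The whole parse loss is applied to every action along the history:
assert grads[0][0, 1] == 1.0           # step 0, state row 0, action 1
assert grads[1][0, 2] == 1.0           # step 1, state row 0, action 2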

View File

@ -37,6 +37,7 @@ cdef cppclass StateC:
this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
this.offset = 0
cdef int i
for i in range(length + (PADDING * 2)):
this._ents[i].end = -1
@ -73,7 +74,16 @@ cdef cppclass StateC:
free(this.shifted - PADDING)
void set_context_tokens(int* ids, int n) nogil:
if n == 13:
if n == 8:
ids[0] = this.B(0)
ids[1] = this.B(1)
ids[2] = this.S(0)
ids[3] = this.S(1)
ids[4] = this.H(this.S(0))
ids[5] = this.L(this.B(0), 1)
ids[6] = this.L(this.S(0), 2)
ids[7] = this.R(this.S(0), 1)
elif n == 13:
ids[0] = this.B(0)
ids[1] = this.B(1)
ids[2] = this.S(0)

View File

@ -20,7 +20,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
from ..lexeme cimport Lexeme
from ..structs cimport TokenC
@ -286,7 +286,7 @@ cdef class Break:
return 0
cdef int _get_root(int word, const GoldParseC* gold) nogil:
while gold.heads[word] != word and not gold.has_dep[word] and word >= 0:
while gold.heads[word] != word and gold.has_dep[word] and word >= 0:
word = gold.heads[word]
if not gold.has_dep[word]:
return -1
@ -351,6 +351,20 @@ cdef class ArcEager(TransitionSystem):
def __get__(self):
return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)
def is_gold_parse(self, StateClass state, GoldParse gold):
predicted = set()
truth = set()
for i in range(gold.length):
if gold.cand_to_gold[i] is None:
continue
if state.safe_get(i).dep:
predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
else:
predicted.add((i, state.H(i), 'ROOT'))
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
truth.add((id_, head, dep))
return truth == predicted
def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.heads)
if all([tag is None for tag in gold.heads[start:end]]):
@ -502,9 +516,11 @@ cdef class ArcEager(TransitionSystem):
"before training and after parsing. Either pass make_projective=True "
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
else:
print(gold.orig_annot)
print(gold.words)
print(gold.heads)
print(gold.labels)
print(gold.sent_starts)
raise ValueError(
"Could not find a gold-standard action to supervise the dependency "
"parser.\n"

View File

@ -107,7 +107,7 @@ cdef class BeamParser(Parser):
# The non-monotonic oracle makes it difficult to ensure final costs are
# correct. Therefore do final correction
for i in range(pred.size):
if is_gold(<StateClass>pred.at(i), gold_parse, self.moves.strings):
if self.moves.is_gold_parse(<StateClass>pred.at(i), gold_parse):
pred._states[i].loss = 0.0
elif pred._states[i].loss == 0.0:
pred._states[i].loss = 1.0
@ -213,7 +213,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
if not pred._states[i].is_done or pred._states[i].loss == 0:
continue
state = <StateClass>pred.at(i)
if is_gold(state, gold_parse, moves.strings) == True:
if moves.is_gold_parse(state, gold_parse) == True:
for dep in gold_parse.orig_annot:
print(dep[1], dep[3], dep[4])
print("Cost", pred._states[i].loss)
@ -227,7 +227,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
if not gold._states[i].is_done:
continue
state = <StateClass>gold.at(i)
if is_gold(state, gold_parse, moves.strings) == False:
if moves.is_gold_parse(state, gold_parse) == False:
print("Truth")
for dep in gold_parse.orig_annot:
print(dep[1], dep[3], dep[4])
@ -237,16 +237,3 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
raise Exception("Gold parse is not gold-standard")
def is_gold(StateClass state, GoldParse gold, StringStore strings):
predicted = set()
truth = set()
for i in range(gold.length):
if gold.cand_to_gold[i] is None:
continue
if state.safe_get(i).dep:
predicted.add((i, state.H(i), strings[state.safe_get(i).dep]))
else:
predicted.add((i, state.H(i), 'ROOT'))
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
truth.add((id_, head, dep))
return truth == predicted

View File

@ -113,7 +113,7 @@ cdef class BiluoPushDown(TransitionSystem):
def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.ner)
if all([tag == '-' for tag in gold.ner[start:end]]):
if all([tag in ('-', None) for tag in gold.ner[start:end]]):
return False
else:
return True

View File

@ -14,4 +14,8 @@ cdef class Parser:
cdef readonly TransitionSystem moves
cdef readonly object cfg
cdef void _parse_step(self, StateC* state,
const float* feat_weights,
int nr_class, int nr_feat, int nr_piece) nogil
#cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil

View File

@ -36,15 +36,19 @@ from murmurhash.mrmr cimport hash64
from preshed.maps cimport MapStruct
from preshed.maps cimport map_get
from thinc.api import layerize, chain, noop, clone
from thinc.neural import Model, Affine, ELU, ReLu, Maxout
from thinc.api import layerize, chain, noop, clone, with_flatten
from thinc.neural import Model, Affine, ReLu, Maxout
from thinc.neural._classes.batchnorm import BatchNorm as BN
from thinc.neural._classes.selu import SELU
from thinc.neural._classes.layernorm import LayerNorm
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
from .. import util
from ..util import get_async, get_cuda_stream
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
from .._ml import Tok2Vec, doc2feats, rebatch
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
from .._ml import Residual, drop_layer
from ..compat import json_dumps
from . import _parse_features
@ -59,8 +63,10 @@ from ..structs cimport TokenC
from ..tokens.doc cimport Doc
from ..strings cimport StringStore
from ..gold cimport GoldParse
from ..attrs cimport TAG, DEP
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
from . import _beam_utils
USE_FINE_TUNE = True
def get_templates(*args, **kwargs):
return []
@ -232,12 +238,14 @@ cdef class Parser:
Base class of the DependencyParser and EntityRecognizer.
"""
@classmethod
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg):
def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg):
depth = util.env_opt('parser_hidden_depth', depth)
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
hidden_width = util.env_opt('hidden_width', hidden_width)
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
tensors = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())
embed_size = util.env_opt('embed_size', 4000)
tensors = fine_tune(Tok2Vec(token_vector_width, embed_size,
preprocess=doc2feats()))
if parser_maxout_pieces == 1:
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature,
@ -249,10 +257,15 @@ cdef class Parser:
nI=token_vector_width)
with Model.use_device('cpu'):
if depth == 0:
upper = chain()
upper.is_noop = True
else:
upper = chain(
clone(Maxout(hidden_width), (depth-1)),
zero_init(Affine(nr_class, drop_factor=0.0))
)
upper.is_noop = False
# TODO: This is an unfortunate hack atm!
# Used to set input dimensions in network.
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
@ -290,6 +303,10 @@ cdef class Parser:
self.moves = self.TransitionSystem(self.vocab.strings, {})
else:
self.moves = moves
if 'beam_width' not in cfg:
cfg['beam_width'] = util.env_opt('beam_width', 1)
if 'beam_density' not in cfg:
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
self.cfg = cfg
if 'actions' in self.cfg:
for action, labels in self.cfg.get('actions', {}).items():
@ -312,7 +329,7 @@ cdef class Parser:
if beam_width is None:
beam_width = self.cfg.get('beam_width', 1)
if beam_density is None:
beam_density = self.cfg.get('beam_density', 0.001)
beam_density = self.cfg.get('beam_density', 0.0)
cdef Beam beam
if beam_width == 1:
states = self.parse_batch([doc], [doc.tensor])
@ -328,7 +345,7 @@ cdef class Parser:
return output
def pipe(self, docs, int batch_size=1000, int n_threads=2,
beam_width=1, beam_density=0.001):
beam_width=None, beam_density=None):
"""
Process a stream of documents.
@ -340,15 +357,23 @@ cdef class Parser:
The number of threads with which to work on the buffer in parallel.
Yields (Doc): Documents, in order.
"""
if beam_width is None:
beam_width = self.cfg.get('beam_width', 1)
if beam_density is None:
beam_density = self.cfg.get('beam_density', 0.0)
cdef Doc doc
cdef Beam beam
for docs in cytoolz.partition_all(batch_size, docs):
docs = list(docs)
tokvecs = [doc.tensor for doc in docs]
if beam_width == 1:
parse_states = self.parse_batch(docs, tokvecs)
else:
parse_states = self.beam_parse(docs, tokvecs,
beams = self.beam_parse(docs, tokvecs,
beam_width=beam_width, beam_density=beam_density)
parse_states = []
for beam in beams:
parse_states.append(<StateClass>beam.at(0))
self.set_annotations(docs, parse_states)
yield from docs
@ -367,7 +392,8 @@ cdef class Parser:
tokvecses = [tokvecses]
tokvecs = self.model[0].ops.flatten(tokvecses)
tokvecs += self.model[0].ops.flatten(self.model[0](docs))
if USE_FINE_TUNE:
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
nr_state = len(docs)
nr_class = self.moves.n_moves
@ -391,7 +417,14 @@ cdef class Parser:
cdef np.ndarray scores
c_token_ids = <int*>token_ids.data
c_is_valid = <int*>is_valid.data
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
while not next_step.empty():
if not has_hidden:
for i in cython.parallel.prange(
next_step.size(), num_threads=6, nogil=True):
self._parse_step(next_step[i],
feat_weights, nr_class, nr_feat, nr_piece)
else:
for i in range(next_step.size()):
st = next_step[i]
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
@ -412,19 +445,22 @@ cdef class Parser:
next_step.push_back(st)
return states
def beam_parse(self, docs, tokvecses, int beam_width=8, float beam_density=0.001):
def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001):
cdef Beam beam
cdef np.ndarray scores
cdef Doc doc
cdef int nr_class = self.moves.n_moves
cdef StateClass stcls, output
tokvecs = self.model[0].ops.flatten(tokvecses)
tokvecs += self.model[0].ops.flatten(self.model[0](docs))
if USE_FINE_TUNE:
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
cuda_stream, 0.0)
beams = []
cdef int offset = 0
cdef int j = 0
cdef int k
for doc in docs:
beam = Beam(nr_class, beam_width, min_density=beam_density)
beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
@ -437,22 +473,56 @@ cdef class Parser:
states = []
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
# This way we avoid having to score finalized states
# We do have to take care to keep indexes aligned, though
if not stcls.is_final():
states.append(stcls)
token_ids = self.get_token_ids(states)
vectors = state2vec(token_ids)
scores = vec2scores(vectors)
j = 0
c_scores = <float*>scores.data
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
if not stcls.is_final():
self.moves.set_valid(beam.is_valid[i], stcls.c)
for j in range(nr_class):
beam.scores[i][j] = scores[i, j]
for k in range(nr_class):
beam.scores[i][k] = c_scores[j * scores.shape[1] + k]
j += 1
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
beams.append(beam)
return beams
cdef void _parse_step(self, StateC* state,
const float* feat_weights,
int nr_class, int nr_feat, int nr_piece) nogil:
'''This only works with no hidden layers -- fast but inaccurate'''
#for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
# self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
token_ids = <int*>calloc(nr_feat, sizeof(int))
scores = <float*>calloc(nr_class * nr_piece, sizeof(float))
is_valid = <int*>calloc(nr_class, sizeof(int))
state.set_context_tokens(token_ids, nr_feat)
sum_state_features(scores,
feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece)
self.moves.set_valid(is_valid, state)
guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece)
action = self.moves.c[guess]
action.do(state, action.label)
free(is_valid)
free(scores)
free(token_ids)
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):
return None
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
return self.update_beam(docs_tokvecs, golds,
self.cfg['beam_width'], self.cfg['beam_density'],
drop=drop, sgd=sgd, losses=losses)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvec_lists = docs_tokvecs
@ -460,9 +530,9 @@ cdef class Parser:
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs]
golds = [golds]
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs, drop=0.)
my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
tokvecs += my_tokvecs
if USE_FINE_TUNE:
tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
tokvecs = self.model[0].ops.flatten(tokvecs)
cuda_stream = get_cuda_stream()
@ -489,13 +559,14 @@ cdef class Parser:
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
d_scores = self.get_batch_loss(states, golds, scores)
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
d_scores /= len(docs)
d_vector = bp_scores(d_scores, sgd=sgd)
if drop != 0:
d_vector *= mask
if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to CPU, asynchronously
# Move token_ids and d_vector to GPU, asynchronously
backprops.append((
get_async(cuda_stream, token_ids),
get_async(cuda_stream, d_vector),
@ -513,7 +584,63 @@ cdef class Parser:
self._make_updates(d_tokvecs,
backprops, sgd, cuda_stream)
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
#bp_my_tokvecs(d_tokvecs, sgd=sgd)
if USE_FINE_TUNE:
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):
return None
if not golds:
return None
if width is None:
width = self.cfg.get('beam_width', 2)
if density is None:
density = self.cfg.get('beam_density', 0.0)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvecs = docs_tokvecs
lengths = [len(d) for d in docs]
assert min(lengths) >= 1
tokvecs = self.model[0].ops.flatten(tokvecs)
if USE_FINE_TUNE:
tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
tokvecs = self.model[0].ops.flatten(tokvecs)
states = self.moves.init_batch(docs)
for gold in golds:
self.moves.preprocess_gold(gold)
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
states, tokvecs, golds,
state2vec, vec2scores,
width, density,
sgd=sgd, drop=drop, losses=losses)
backprop_lower = []
cdef float batch_size = len(docs)
for i, d_scores in enumerate(states_d_scores):
d_scores /= batch_size
if losses is not None:
losses[self.name] += (d_scores**2).sum()
ids, bp_vectors, bp_scores = backprops[i]
d_vector = bp_scores(d_scores, sgd=sgd)
if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(ids, state2vec.ops.xp.ndarray):
backprop_lower.append((
get_async(cuda_stream, ids),
get_async(cuda_stream, d_vector),
bp_vectors))
else:
backprop_lower.append((ids, d_vector, bp_vectors))
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
if USE_FINE_TUNE:
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs
def _init_gold_batch(self, whole_docs, whole_golds):
@ -559,14 +686,10 @@ cdef class Parser:
xp = get_array_module(d_tokvecs)
for ids, d_vector, bp_vector in backprops:
d_state_features = bp_vector(d_vector, sgd=sgd)
active_feats = ids * (ids >= 0)
active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1))
if hasattr(xp, 'scatter_add'):
xp.scatter_add(d_tokvecs,
ids, d_state_features * active_feats)
else:
xp.add.at(d_tokvecs,
ids, d_state_features * active_feats)
mask = ids >= 0
d_state_features *= mask.reshape(ids.shape + (1,))
self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
d_state_features)
@property
def move_names(self):
@ -582,7 +705,7 @@ cdef class Parser:
lower, stream, drop=dropout)
return state2vec, upper
nr_feature = 13
nr_feature = 8
def get_token_ids(self, states):
cdef StateClass state

View File

@ -99,6 +99,9 @@ cdef class TransitionSystem:
def preprocess_gold(self, GoldParse gold):
raise NotImplementedError
def is_gold_parse(self, StateClass state, GoldParse gold):
raise NotImplementedError
cdef Transition lookup_transition(self, object name) except *:
raise NotImplementedError
@ -145,7 +148,7 @@ cdef class TransitionSystem:
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, int):
if not isinstance(label_name, (int, long)):
label_id = self.strings.add(label_name)
else:
label_id = label_name

View File

@ -11,9 +11,9 @@ from ..strings import StringStore
from .. import util
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
'nl', 'pl', 'pt', 'sv', 'xx']
_models = {'en': ['en_depent_web_sm', 'en_core_web_md'],
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
_models = {'en': ['en_core_web_sm'],
'de': ['de_core_news_md'],
'fr': ['fr_depvec_web_lg'],
'xx': ['xx_ent_web_md']}
@ -86,6 +86,9 @@ def hu_tokenizer():
def fi_tokenizer():
return util.get_lang_class('fi').Defaults.create_tokenizer()
@pytest.fixture
def id_tokenizer():
return util.get_lang_class('id').Defaults.create_tokenizer()
@pytest.fixture
def sv_tokenizer():

View File

@ -2,12 +2,18 @@
from __future__ import unicode_literals
import pytest
from ....tokens.doc import Doc
@pytest.fixture
def en_lemmatizer(EN):
return EN.Defaults.create_lemmatizer()
@pytest.mark.models('en')
def test_doc_lemmatization(EN):
doc = Doc(EN.vocab, words=['bleed'])
doc[0].tag_ = 'VBP'
assert doc[0].lemma_ == 'bleed'
@pytest.mark.models('en')
@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
@ -19,6 +25,16 @@ def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
assert en_lemmatizer.noun(text) == set(lemmas)
@pytest.mark.models('en')
@pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
("feed", ["feed"]),
("need", ["need"]),
("ring", ["ring"]),
("axes", ["axis", "axe", "ax"])])
def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
assert en_lemmatizer.noun(text) == set(lemmas)
@pytest.mark.xfail
@pytest.mark.models('en')
def test_en_lemmatizer_base_forms(en_lemmatizer):

View File

@ -25,7 +25,6 @@ def test_tag_names(EN):
doc = EN(text, disable=['parser'])
assert type(doc[2].pos) == int
assert isinstance(doc[2].pos_, six.text_type)
assert type(doc[2].dep) == int
assert isinstance(doc[2].dep_, six.text_type)
assert doc[2].tag_ == u'NNS'

View File

View File

@ -0,0 +1,115 @@
# coding: utf-8
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["(Ma'arif)"])
def test_tokenizer_splits_no_special(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Ma'arif"])
def test_tokenizer_splits_no_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text', ["(Ma'arif"])
def test_tokenizer_splits_prefix_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["Ma'arif)"])
def test_tokenizer_splits_suffix_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(Ma'arif)"])
def test_tokenizer_splits_even_wrap(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(Ma'arif?)"])
def test_tokenizer_splits_uneven_wrap(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text,length', [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
def test_tokenizer_splits_prefix_interact(id_tokenizer, text, length):
tokens = id_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text', ["S.Kom.)"])
def test_tokenizer_splits_suffix_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(S.Kom.)"])
def test_tokenizer_splits_even_wrap_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(S.Kom.?)"])
def test_tokenizer_splits_uneven_wrap_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text,length', [("gara-gara", 1), ("Jokowi-Ahok", 3), ("Sukarno-Hatta", 3)])
def test_tokenizer_splits_hyphens(id_tokenizer, text, length):
tokens = id_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_tokenizer_splits_numeric_range(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["ini.Budi", "Halo.Bandung"])
def test_tokenizer_splits_period_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Halo,Bandung", "satu,dua"])
def test_tokenizer_splits_comma_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
assert tokens[0].text == text.split(",")[0]
assert tokens[1].text == ","
assert tokens[2].text == text.split(",")[1]
@pytest.mark.parametrize('text', ["halo...Bandung", "dia...pergi"])
def test_tokenizer_splits_ellipsis_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
def test_tokenizer_splits_double_hyphen_infix(id_tokenizer):
tokens = id_tokenizer("Arsene Wenger--manajer Arsenal--melakukan konferensi pers.")
assert len(tokens) == 10
assert tokens[0].text == "Arsene"
assert tokens[1].text == "Wenger"
assert tokens[2].text == "--"
assert tokens[3].text == "manajer"
assert tokens[4].text == "Arsenal"
assert tokens[5].text == "--"
assert tokens[6].text == "melakukan"
assert tokens[7].text == "konferensi"
assert tokens[8].text == "pers"
assert tokens[9].text == "."

View File

@ -78,3 +78,16 @@ def test_predict_doc_beam(parser, tok2vec, model, doc):
parser(doc, beam_width=32, beam_density=0.001)
for word in doc:
print(word.text, word.head, word.dep_)
def test_update_doc_beam(parser, tok2vec, model, doc, gold):
parser.model = model
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
d_tokvecs = parser.update_beam(([doc], tokvecs), [gold])
assert d_tokvecs[0].shape == tokvecs[0].shape
def optimize(weights, gradient, key=None):
weights -= 0.001 * gradient
bp_tokvecs(d_tokvecs, sgd=optimize)
assert d_tokvecs[0].sum() == 0.

View File

@ -0,0 +1,87 @@
from __future__ import unicode_literals
import pytest
import numpy
from thinc.api import layerize
from ...vocab import Vocab
from ...syntax.arc_eager import ArcEager
from ...tokens import Doc
from ...gold import GoldParse
from ...syntax._beam_utils import ParserBeam, update_beam
from ...syntax.stateclass import StateClass
@pytest.fixture
def vocab():
return Vocab()
@pytest.fixture
def moves(vocab):
aeager = ArcEager(vocab.strings, {})
aeager.add_action(2, 'nsubj')
aeager.add_action(3, 'dobj')
aeager.add_action(2, 'aux')
return aeager
@pytest.fixture
def docs(vocab):
return [Doc(vocab, words=['Rats', 'bite', 'things'])]
@pytest.fixture
def states(docs):
return [StateClass(doc) for doc in docs]
@pytest.fixture
def tokvecs(docs, vector_size):
output = []
for doc in docs:
vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size))
output.append(numpy.asarray(vec))
return output
@pytest.fixture
def golds(docs):
return [GoldParse(doc) for doc in docs]
@pytest.fixture
def batch_size(docs):
return len(docs)
@pytest.fixture
def beam_width():
return 4
@pytest.fixture
def vector_size():
return 6
@pytest.fixture
def beam(moves, states, golds, beam_width):
return ParserBeam(moves, states, golds, width=beam_width, density=0.0)
@pytest.fixture
def scores(moves, batch_size, beam_width):
return [
numpy.asarray(
numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)),
dtype='f')
for _ in range(batch_size)]
def test_create_beam(beam):
pass
def test_beam_advance(beam, scores):
beam.advance(scores)
def test_beam_advance_too_few_scores(beam, scores):
with pytest.raises(IndexError):
beam.advance(scores[:-1])

View File

@ -0,0 +1,12 @@
'''Test tokens compare correctly'''
from __future__ import unicode_literals
from ..util import get_doc
from ...vocab import Vocab
def test_issue1257():
doc1 = get_doc(Vocab(), ['a', 'b', 'c'])
doc2 = get_doc(Vocab(), ['a', 'c', 'e'])
assert doc1[0] != doc2[0]
assert not doc1[0] == doc2[0]

View File

@ -0,0 +1,8 @@
import pytest
@pytest.mark.models('en')
def test_issue1305(EN):
'''Test lemmatization of English VBZ'''
assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work'])
doc = EN(u'This app works well')
assert doc[2].lemma_ == 'work'

View File

@ -13,7 +13,10 @@ def test_issue429(EN):
return None
spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
for ent_id, label, span in spans:
span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label])
span.merge(
tag=('NNP' if label else span.root.tag_),
lemma=span.text,
label='PERSON')
doc = EN('a')
matcher = Matcher(EN.vocab)

View File

@ -11,8 +11,8 @@ import pytest
def taggers(en_vocab):
tagger1 = Tagger(en_vocab)
tagger2 = Tagger(en_vocab)
tagger1.model = tagger1.Model(None, None)
tagger2.model = tagger2.Model(None, None)
tagger1.model = tagger1.Model(8, 8)
tagger2.model = tagger1.model
return (tagger1, tagger2)
@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
tagger1, tagger2 = taggers
tagger1_b = tagger1.to_bytes()
tagger2_b = tagger2.to_bytes()
assert tagger1_b == tagger2_b
tagger1 = tagger1.from_bytes(tagger1_b)
assert tagger1.to_bytes() == tagger1_b
new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ..util import get_doc
from ...attrs import ORTH, LENGTH
import pytest
@ -89,3 +90,19 @@ def test_spans_are_hashable(en_tokenizer):
span3 = tokens[0:2]
assert hash(span3) == hash(span1)
def test_spans_by_character(doc):
span1 = doc[1:-2]
span2 = doc.char_span(span1.start_char, span1.end_char, label='GPE')
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == 'GPE'
def test_span_to_array(doc):
span = doc[1:-2]
arr = span.to_array([ORTH, LENGTH])
assert arr.shape == (len(span), 2)
assert arr[0, 0] == span[0].orth
assert arr[0, 1] == len(span[0])

View File

@ -79,9 +79,9 @@ def add_vecs_to_vocab(vocab, vectors):
"""Add list of vector tuples to given vocab. All vectors need to have the
same length. Format: [("text", [1, 2, 3])]"""
length = len(vectors[0][1])
vocab.resize_vectors(length)
vocab.clear_vectors(length)
for word, vec in vectors:
vocab[word].vector = vec
vocab.set_vector(word, vec)
return vocab

View File

@ -14,10 +14,9 @@ def vectors():
@pytest.fixture()
def vocab(en_vocab, vectors):
#return add_vecs_to_vocab(en_vocab, vectors)
return None
add_vecs_to_vocab(en_vocab, vectors)
return en_vocab
@pytest.mark.xfail
def test_vectors_similarity_LL(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
lex1 = vocab[word1]
@ -31,7 +30,6 @@ def test_vectors_similarity_LL(vocab, vectors):
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
@pytest.mark.xfail
def test_vectors_similarity_TT(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])
@ -44,21 +42,18 @@ def test_vectors_similarity_TT(vocab, vectors):
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
@pytest.mark.xfail
def test_vectors_similarity_TD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
@pytest.mark.xfail
def test_vectors_similarity_DS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
@pytest.mark.xfail
def test_vectors_similarity_TS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])

View File

@ -2,6 +2,8 @@
from __future__ import unicode_literals
from ...vectors import Vectors
from ...tokenizer import Tokenizer
from ..util import add_vecs_to_vocab, get_doc
import numpy
import pytest
@ -11,22 +13,42 @@ import pytest
def strings():
return ["apple", "orange"]
@pytest.fixture
def vectors():
return [
("apple", [1, 2, 3]),
("orange", [-1, -2, -3]),
('and', [-1, -1, -1]),
('juice', [5, 5, 10]),
('pie', [7, 6.3, 8.9])]
@pytest.fixture
def data():
return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')
@pytest.fixture()
def vocab(en_vocab, vectors):
add_vecs_to_vocab(en_vocab, vectors)
return en_vocab
def test_init_vectors_with_data(strings, data):
v = Vectors(strings, data)
assert v.shape == data.shape
def test_init_vectors_with_width(strings):
v = Vectors(strings, 3)
for string in strings:
v.add(string)
assert v.shape == (len(strings), 3)
def test_get_vector(strings, data):
v = Vectors(strings, data)
for string in strings:
v.add(string)
assert list(v[strings[0]]) == list(data[0])
assert list(v[strings[0]]) != list(data[1])
assert list(v[strings[1]]) != list(data[0])
@ -35,6 +57,8 @@ def test_get_vector(strings, data):
def test_set_vector(strings, data):
orig = data.copy()
v = Vectors(strings, data)
for string in strings:
v.add(string)
assert list(v[strings[0]]) == list(orig[0])
assert list(v[strings[0]]) != list(orig[1])
v[strings[0]] = data[1]
@ -42,125 +66,111 @@ def test_set_vector(strings, data):
assert list(v[strings[0]]) != list(orig[0])
#
#@pytest.fixture()
#def tokenizer_v(vocab):
# return Tokenizer(vocab, {}, None, None, None)
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', ["apple and orange"])
#def test_vectors_token_vector(tokenizer_v, vectors, text):
# doc = tokenizer_v(text)
# assert vectors[0] == (doc[0].text, list(doc[0].vector))
# assert vectors[1] == (doc[2].text, list(doc[2].vector))
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', ["apple", "orange"])
#def test_vectors_lexeme_vector(vocab, text):
# lex = vocab[text]
# assert list(lex.vector)
# assert lex.vector_norm
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
#def test_vectors_doc_vector(vocab, text):
# doc = get_doc(vocab, text)
# assert list(doc.vector)
# assert doc.vector_norm
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
#def test_vectors_span_vector(vocab, text):
# span = get_doc(vocab, text)[0:2]
# assert list(span.vector)
# assert span.vector_norm
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', ["apple orange"])
#def test_vectors_token_token_similarity(tokenizer_v, text):
# doc = tokenizer_v(text)
# assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
# assert 0.0 < doc[0].similarity(doc[1]) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
#def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
# token = tokenizer_v(text1)
# lex = vocab[text2]
# assert token.similarity(lex) == lex.similarity(token)
# assert 0.0 < token.similarity(lex) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_token_span_similarity(vocab, text):
# doc = get_doc(vocab, text)
# assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
# assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_token_doc_similarity(vocab, text):
# doc = get_doc(vocab, text)
# assert doc[0].similarity(doc) == doc.similarity(doc[0])
# assert 0.0 < doc[0].similarity(doc) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_lexeme_span_similarity(vocab, text):
# doc = get_doc(vocab, text)
# lex = vocab[text[0]]
# assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
# assert 0.0 < doc.similarity(doc[1:3]) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
#def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
# lex1 = vocab[text1]
# lex2 = vocab[text2]
# assert lex1.similarity(lex2) == lex2.similarity(lex1)
# assert 0.0 < lex1.similarity(lex2) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_lexeme_doc_similarity(vocab, text):
# doc = get_doc(vocab, text)
# lex = vocab[text[0]]
# assert lex.similarity(doc) == doc.similarity(lex)
# assert 0.0 < lex.similarity(doc) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_span_span_similarity(vocab, text):
# doc = get_doc(vocab, text)
# assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
# assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_span_doc_similarity(vocab, text):
# doc = get_doc(vocab, text)
# assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
# assert 0.0 < doc[0:2].similarity(doc) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text1,text2', [
# (["apple", "and", "apple", "pie"], ["orange", "juice"])])
#def test_vectors_doc_doc_similarity(vocab, text1, text2):
# doc1 = get_doc(vocab, text1)
# doc2 = get_doc(vocab, text2)
# assert doc1.similarity(doc2) == doc2.similarity(doc1)
# assert 0.0 < doc1.similarity(doc2) < 1.0
@pytest.fixture()
def tokenizer_v(vocab):
return Tokenizer(vocab, {}, None, None, None)
@pytest.mark.parametrize('text', ["apple and orange"])
def test_vectors_token_vector(tokenizer_v, vectors, text):
doc = tokenizer_v(text)
assert vectors[0] == (doc[0].text, list(doc[0].vector))
assert vectors[1] == (doc[2].text, list(doc[2].vector))
@pytest.mark.parametrize('text', ["apple", "orange"])
def test_vectors_lexeme_vector(vocab, text):
lex = vocab[text]
assert list(lex.vector)
assert lex.vector_norm
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
def test_vectors_doc_vector(vocab, text):
doc = get_doc(vocab, text)
assert list(doc.vector)
assert doc.vector_norm
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
def test_vectors_span_vector(vocab, text):
span = get_doc(vocab, text)[0:2]
assert list(span.vector)
assert span.vector_norm
@pytest.mark.parametrize('text', ["apple orange"])
def test_vectors_token_token_similarity(tokenizer_v, text):
doc = tokenizer_v(text)
assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
assert -1. < doc[0].similarity(doc[1]) < 1.0
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
token = tokenizer_v(text1)
lex = vocab[text2]
assert token.similarity(lex) == lex.similarity(token)
assert -1. < token.similarity(lex) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_token_span_similarity(vocab, text):
doc = get_doc(vocab, text)
assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
assert -1. < doc[0].similarity(doc[1:3]) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_token_doc_similarity(vocab, text):
doc = get_doc(vocab, text)
assert doc[0].similarity(doc) == doc.similarity(doc[0])
assert -1. < doc[0].similarity(doc) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_lexeme_span_similarity(vocab, text):
doc = get_doc(vocab, text)
lex = vocab[text[0]]
assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
assert -1. < doc.similarity(doc[1:3]) < 1.0
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
lex1 = vocab[text1]
lex2 = vocab[text2]
assert lex1.similarity(lex2) == lex2.similarity(lex1)
assert -1. < lex1.similarity(lex2) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_lexeme_doc_similarity(vocab, text):
doc = get_doc(vocab, text)
lex = vocab[text[0]]
assert lex.similarity(doc) == doc.similarity(lex)
assert -1. < lex.similarity(doc) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_span_span_similarity(vocab, text):
doc = get_doc(vocab, text)
assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_span_doc_similarity(vocab, text):
doc = get_doc(vocab, text)
assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
assert -1. < doc[0:2].similarity(doc) < 1.0
@pytest.mark.parametrize('text1,text2', [
(["apple", "and", "apple", "pie"], ["orange", "juice"])])
def test_vectors_doc_doc_similarity(vocab, text1, text2):
doc1 = get_doc(vocab, text1)
doc2 = get_doc(vocab, text2)
assert doc1.similarity(doc2) == doc2.similarity(doc1)
assert -1. < doc1.similarity(doc2) < 1.0
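The assertions above follow from two properties of cosine similarity, the default estimate behind similarity(): it is symmetric in its arguments, and it is bounded. A standalone numpy sketch of those properties (an illustration, not part of the test suite):

import numpy

def cosine(u, v):
    # Cosine similarity: symmetric and bounded by [-1, 1].
    norm = numpy.linalg.norm(u) * numpy.linalg.norm(v)
    return 0.0 if norm == 0 else numpy.dot(u, v) / norm

u = numpy.asarray([1., 2., 3.], dtype='f')
v = numpy.asarray([3., 2., 1.], dtype='f')
assert cosine(u, v) == cosine(v, u)    # symmetry, as the tests assert
assert -1.0 < cosine(u, v) < 1.0       # strictly inside (-1, 1) for non-parallel vectors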

View File

@ -238,6 +238,29 @@ cdef class Doc:
def doc(self):
return self
def char_span(self, int start_idx, int end_idx, label=0, vector=None):
"""Create a `Span` object from the slice `doc.text[start : end]`.
doc (Doc): The parent document.
start (int): The index of the first character of the span.
end (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
RETURNS (Span): The newly constructed object.
"""
if not isinstance(label, int):
label = self.vocab.strings.add(label)
cdef int start = token_by_start(self.c, self.length, start_idx)
if start == -1:
return None
cdef int end = token_by_end(self.c, self.length, end_idx)
if end == -1:
return None
# Currently we have the token index, we want the range-end index
end += 1
cdef Span span = Span(self, start, end, label=label, vector=vector)
return span
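A minimal usage sketch of char_span() as added above (hypothetical session; assumes an English model is installed, e.g. via "spacy download en"):

import spacy

nlp = spacy.load('en')
doc = nlp(u'I like New York')
span = doc.char_span(7, 15, label=u'GPE')   # character offsets into doc.text
assert span.text == 'New York'
# Offsets that don't line up with token boundaries return None:
assert doc.char_span(8, 15) is None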
def similarity(self, other):
"""Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.
@ -280,8 +303,14 @@ cdef class Doc:
return self.user_hooks['vector'](self)
if self._vector is not None:
return self._vector
elif self.has_vector and len(self):
self._vector = sum(t.vector for t in self) / len(self)
elif not len(self):
self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
return self._vector
elif self.has_vector:
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
for token in self.c[:self.length]:
vector += self.vocab.get_vector(token.lex.orth)
self._vector = vector / len(self)
return self._vector
elif self.tensor is not None:
self._vector = self.tensor.mean(axis=0)
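The new fallback above averages the token vectors, and returns a zero vector for an empty Doc. A standalone sketch of that computation in plain numpy, separate from the Cython internals:

import numpy

def average_vector(token_vectors, width):
    # Empty input gets a zero vector; otherwise average the token vectors.
    if not token_vectors:
        return numpy.zeros((width,), dtype='f')
    total = numpy.zeros((width,), dtype='f')
    for vector in token_vectors:
        total += vector
    return total / len(token_vectors)

vectors = [numpy.ones((4,), dtype='f'), numpy.zeros((4,), dtype='f')]
assert list(average_vector(vectors, 4)) == [0.5, 0.5, 0.5, 0.5]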

View File

@ -15,5 +15,5 @@ cdef class Span:
cdef public _vector
cdef public _vector_norm
cpdef int _recalculate_indices(self) except -1
cpdef np.ndarray to_array(self, object features)

View File

@ -7,7 +7,7 @@ import numpy
import numpy.linalg
from libc.math cimport sqrt
from .doc cimport token_by_start, token_by_end
from .doc cimport token_by_start, token_by_end, get_token_attr
from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t, hash_t
from ..attrs cimport attr_id_t
@ -135,6 +135,29 @@ cdef class Span:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
The values will be 32-bit integers.
attr_ids (list[int]): A list of attribute ID ints.
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
per word, and one column per attribute indicated in the input
`attr_ids`.
"""
cdef int i, j
cdef attr_id_t feature
cdef np.ndarray[attr_t, ndim=2] output
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
cdef int length = self.end - self.start
output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64)
for i in range(self.start, self.end):
for j, feature in enumerate(attr_ids):
output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
return output
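A usage sketch for the new Span.to_array() (hypothetical; assumes an English model is installed):

import spacy
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA

nlp = spacy.load('en')
doc = nlp(u'I like New York in Autumn.')
span = doc[2:4]
# One row per token in the span, one column per requested attribute ID.
np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
assert np_array.shape == (len(span), 4)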
cpdef int _recalculate_indices(self) except -1:
if self.end > self.doc.length \
or self.doc.c[self.start].idx != self.start_char \

View File

@ -62,18 +62,26 @@ cdef class Token:
def __richcmp__(self, Token other, int op):
# http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
cdef Doc my_doc = self.doc
cdef Doc other_doc = other.doc
my = self.idx
their = other.idx if other is not None else None
if op == 0:
return my < their
elif op == 2:
if my_doc is other_doc:
return my == their
else:
return False
elif op == 4:
return my > their
elif op == 1:
return my <= their
elif op == 3:
if my_doc is other_doc:
return my != their
else:
return True
elif op == 5:
return my >= their
else:

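The rich comparison above only treats tokens as equal when they come from the same Doc. A quick sketch of that behaviour (hypothetical; assumes an English model is installed):

import spacy

nlp = spacy.load('en')
doc_a = nlp(u'hello world')
doc_b = nlp(u'hello world')
assert doc_a[0] == doc_a[0]    # same Doc, same offset
assert doc_a[0] != doc_b[0]    # same text and offset, but a different Doc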
View File

@ -22,7 +22,7 @@ import ujson
from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
from .compat import copy_array, normalize_string_keys, getattr_
from .compat import copy_array, normalize_string_keys, getattr_, import_file
LANGUAGES = {}
@ -112,15 +112,13 @@ def load_model(name, **overrides):
def load_model_from_link(name, **overrides):
"""Load a model from a shortcut link, or directory in spaCy data path."""
init_file = get_data_path() / name / '__init__.py'
spec = importlib.util.spec_from_file_location(name, init_file)
path = get_data_path() / name / '__init__.py'
try:
cls = importlib.util.module_from_spec(spec)
cls = import_file(name, path)
except AttributeError:
raise IOError(
    "Can't load '%s'. If you're using a shortcut link, make sure it "
    "points to a valid model package (not just a data directory)." % name)
spec.loader.exec_module(cls)
return cls.load(**overrides)
@ -171,8 +169,8 @@ def get_model_meta(path):
raise IOError("Could not read meta.json from %s" % meta_path)
meta = read_json(meta_path)
for setting in ['lang', 'name', 'version']:
if setting not in meta:
raise ValueError('No %s setting found in model meta.json' % setting)
if setting not in meta or not meta[setting]:
raise ValueError("No valid '%s' setting found in model meta.json" % setting)
return meta

View File

@ -1,18 +1,25 @@
from __future__ import unicode_literals
from libc.stdint cimport int32_t, uint64_t
import numpy
from collections import OrderedDict
import msgpack
import msgpack_numpy
msgpack_numpy.patch()
cimport numpy as np
from .typedefs cimport attr_t
from .strings cimport StringStore
from . import util
from .compat import basestring_
cdef class Vectors:
'''Store, save and load word vectors.'''
cdef public object data
cdef readonly StringStore strings
cdef public object key2i
cdef public object key2row
cdef public object keys
cdef public int i
def __init__(self, strings, data_or_width):
self.strings = StringStore()
@ -21,10 +28,10 @@ cdef class Vectors:
dtype='f')
else:
data = data_or_width
self.i = 0
self.data = data
self.key2i = {}
for i, string in enumerate(strings):
self.key2i[self.strings.add(string)] = i
self.key2row = {}
self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')
def __reduce__(self):
return (Vectors, (self.strings, self.data))
@ -32,7 +39,7 @@ cdef class Vectors:
def __getitem__(self, key):
if isinstance(key, basestring):
key = self.strings[key]
i = self.key2i[key]
i = self.key2row[key]
if i is None:
raise KeyError(key)
else:
@ -41,14 +48,36 @@ cdef class Vectors:
def __setitem__(self, key, vector):
if isinstance(key, basestring):
key = self.strings.add(key)
i = self.key2i[key]
i = self.key2row[key]
self.data[i] = vector
def __iter__(self):
yield from self.data
def __len__(self):
return len(self.strings)
return self.i
def __contains__(self, key):
if isinstance(key, basestring_):
key = self.strings[key]
return key in self.key2row
def add(self, key, vector=None):
if isinstance(key, basestring_):
key = self.strings.add(key)
if key not in self.key2row:
i = self.i
if i >= self.keys.shape[0]:
self.keys.resize((self.keys.shape[0]*2,))
self.data.resize((self.data.shape[0]*2, self.data.shape[1]))
self.key2row[key] = self.i
self.keys[self.i] = key
self.i += 1
else:
i = self.key2row[key]
if vector is not None:
self.data[i] = vector
return i
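A standalone sketch of the bookkeeping in add() above (the key-to-row mapping, plus doubling the table when it fills up), written as a plain Python class for illustration rather than the spaCy Vectors type itself:

import numpy

class TinyVectors(object):
    def __init__(self, width, capacity=2):
        self.data = numpy.zeros((capacity, width), dtype='f')
        self.keys = numpy.zeros((capacity,), dtype='uint64')
        self.key2row = {}
        self.i = 0

    def add(self, key, vector=None):
        if key not in self.key2row:
            if self.i >= self.keys.shape[0]:
                # Double the table when it is full; the diff resizes in place,
                # numpy.resize keeps the sketch simple (extra rows are unused).
                self.keys = numpy.resize(self.keys, (self.keys.shape[0] * 2,))
                self.data = numpy.resize(
                    self.data, (self.data.shape[0] * 2, self.data.shape[1]))
            self.key2row[key] = self.i
            self.keys[self.i] = key
            self.i += 1
        row = self.key2row[key]
        if vector is not None:
            self.data[row] = vector
        return row

table = TinyVectors(width=3)
row = table.add(101, numpy.asarray([1., 2., 3.], dtype='f'))
assert table.key2row[101] == row
assert list(table.data[row]) == [1.0, 2.0, 3.0]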
def items(self):
for i, string in enumerate(self.strings):
@ -61,34 +90,87 @@ cdef class Vectors:
def most_similar(self, key):
raise NotImplementedError
def to_disk(self, path):
raise NotImplementedError
def from_glove(self, path):
'''Load GloVe vectors from a directory. Assumes binary format,
that the vocab is in a vocab.txt, and that vectors are named
vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
By default GloVe outputs 64-bit vectors.'''
path = util.ensure_path(path)
for name in path.iterdir():
if name.parts[-1].startswith('vectors'):
_, dims, dtype, _2 = name.parts[-1].split('.')
self.width = int(dims)
break
else:
raise IOError("Expected file named e.g. vectors.128.f.bin")
bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims,
dtype=dtype)
with bin_loc.open('rb') as file_:
self.data = numpy.fromfile(file_, dtype='float64')
self.data = numpy.ascontiguousarray(self.data, dtype='float32')
n = 0
with (path / 'vocab.txt').open('r') as file_:
for line in file_:
self.add(line.strip())
n += 1
if (self.data.size % self.width) == 0:
    self.data = self.data.reshape((self.data.size // self.width, self.width))
def from_disk(self, path):
raise NotImplementedError
def to_disk(self, path, **exclude):
serializers = OrderedDict((
('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)),
('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)),
))
return util.to_disk(path, serializers, exclude)
def from_disk(self, path, **exclude):
def load_keys(path):
if path.exists():
self.keys = numpy.load(path)
for i, key in enumerate(self.keys):
self.keys[i] = key
self.key2row[key] = i
def load_vectors(path):
if path.exists():
self.data = numpy.load(path)
serializers = OrderedDict((
('keys', load_keys),
('vectors', load_vectors),
))
util.from_disk(path, serializers, exclude)
return self
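The serializers above write the vector table and the key array as plain numpy files. A rough sketch of that round trip using numpy directly (hypothetical paths, not the Vectors API):

import os
import tempfile
import numpy

tmp = tempfile.mkdtemp()
data = numpy.arange(6, dtype='f').reshape((2, 3))
keys = numpy.asarray([11, 22], dtype='uint64')

# numpy.save appends '.npy' when the target path has no extension.
numpy.save(os.path.join(tmp, 'vectors'), data, allow_pickle=False)
numpy.save(os.path.join(tmp, 'keys'), keys, allow_pickle=False)

loaded_data = numpy.load(os.path.join(tmp, 'vectors.npy'))
loaded_keys = numpy.load(os.path.join(tmp, 'keys.npy'))
assert loaded_data.shape == (2, 3)
assert list(loaded_keys) == [11, 22]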
def to_bytes(self, **exclude):
def serialize_weights():
if hasattr(self.weights, 'to_bytes'):
return self.weights.to_bytes()
if hasattr(self.data, 'to_bytes'):
return self.data.to_bytes()
else:
return msgpack.dumps(self.weights)
return msgpack.dumps(self.data)
serializers = OrderedDict((
('strings', lambda: self.strings.to_bytes()),
('weights', serialize_weights)
('keys', lambda: msgpack.dumps(self.keys)),
('vectors', serialize_weights)
))
return util.to_bytes(serializers, exclude)
def from_bytes(self, data, **exclude):
def deserialize_weights(b):
if hasattr(self.weights, 'from_bytes'):
self.weights.from_bytes()
if hasattr(self.data, 'from_bytes'):
self.data.from_bytes()
else:
self.weights = msgpack.loads(b)
self.data = msgpack.loads(b)
def load_keys(keys):
self.keys.resize((len(keys),))
for i, key in enumerate(keys):
self.keys[i] = key
self.key2row[key] = i
deserializers = OrderedDict((
('strings', lambda b: self.strings.from_bytes(b)),
('weights', deserialize_weights)
('keys', lambda b: load_keys(msgpack.loads(b))),
('vectors', deserialize_weights)
))
return util.from_bytes(deserializers, exclude)
util.from_bytes(data, deserializers, exclude)
return self
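The bytes serialization above leans on msgpack, with msgpack_numpy patched in so numpy arrays survive the round trip. A small illustration of that mechanism on its own:

import msgpack
import msgpack_numpy
import numpy

msgpack_numpy.patch()   # teach msgpack to pack and unpack numpy arrays

keys = numpy.asarray([11, 22, 33], dtype='uint64')
blob = msgpack.dumps(keys)
restored = msgpack.loads(blob)
assert list(restored) == [11, 22, 33]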

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import bz2
import ujson
import re
import numpy
from libc.string cimport memset, memcpy
from libc.stdint cimport int32_t
@ -19,9 +20,10 @@ from .tokens.token cimport Token
from .attrs cimport PROB, LANG
from .structs cimport SerializedLexemeC
from .compat import copy_reg, pickle
from .compat import copy_reg, pickle, basestring_
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .vectors import Vectors
from . import util
from . import attrs
from . import symbols
@ -63,6 +65,7 @@ cdef class Vocab:
self.strings.add(name)
self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.vectors = Vectors(self.strings, 300)
property lang:
def __get__(self):
@ -242,13 +245,15 @@ cdef class Vocab:
@property
def vectors_length(self):
raise NotImplementedError
return self.vectors.data.shape[1]
def clear_vectors(self):
def clear_vectors(self, new_dim=None):
"""Drop the current vector table. Because all vectors must be the same
width, you have to call this to change the size of the vectors.
"""
raise NotImplementedError
if new_dim is None:
new_dim = self.vectors.data.shape[1]
self.vectors = Vectors(self.strings, new_dim)
def get_vector(self, orth):
"""Retrieve a vector for a word in the vocabulary.
@ -262,7 +267,12 @@ cdef class Vocab:
RAISES: If no vectors data is loaded, ValueError is raised.
"""
raise NotImplementedError
if isinstance(orth, basestring_):
orth = self.strings.add(orth)
if orth in self.vectors.key2row:
return self.vectors[orth]
else:
return numpy.zeros((self.vectors_length,), dtype='f')
def set_vector(self, orth, vector):
"""Set a vector for a word in the vocabulary.
@ -272,15 +282,19 @@ cdef class Vocab:
RETURNS:
None
"""
raise NotImplementedError
if not isinstance(orth, basestring_):
orth = self.strings[orth]
self.vectors.add(orth, vector=vector)
def has_vector(self, orth):
"""Check whether a word has a vector. Returns False if no
vectors have been loaded. Words can be looked up by string
or int ID."""
return False
if isinstance(orth, basestring_):
orth = self.strings.add(orth)
return orth in self.vectors
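Together, get_vector(), set_vector() and has_vector() give the vocabulary a small vector API. A hedged usage sketch (assumes an English model is installed; without loaded vectors, lookups fall back to a zero vector as shown above):

import spacy

nlp = spacy.load('en')
vocab = nlp.vocab

if vocab.has_vector(u'apple'):
    vec = vocab.get_vector(u'apple')
    assert vec.shape == (vocab.vectors_length,)
else:
    # Unknown or vector-less entries return zeros rather than raising.
    assert not vocab.get_vector(u'apple').any()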
def to_disk(self, path):
def to_disk(self, path, **exclude):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
@ -292,8 +306,10 @@ cdef class Vocab:
self.strings.to_disk(path / 'strings.json')
with (path / 'lexemes.bin').open('wb') as file_:
file_.write(self.lexemes_to_bytes())
if self.vectors is not None:
self.vectors.to_disk(path)
def from_disk(self, path):
def from_disk(self, path, **exclude):
"""Loads state from a directory. Modifies the object in place and
returns it.
@ -305,6 +321,8 @@ cdef class Vocab:
self.strings.from_disk(path / 'strings.json')
with (path / 'lexemes.bin').open('rb') as file_:
self.lexemes_from_bytes(file_.read())
if self.vectors is not None:
self.vectors.from_disk(path, exclude='strings.json')
return self
def to_bytes(self, **exclude):
@ -313,9 +331,16 @@ cdef class Vocab:
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Vocab` object.
"""
def serialize_vectors():
if self.vectors is None:
return None
else:
return self.vectors.to_bytes(exclude='strings.json')
getters = OrderedDict((
('strings', lambda: self.strings.to_bytes()),
('lexemes', lambda: self.lexemes_to_bytes()),
('vectors', serialize_vectors)
))
return util.to_bytes(getters, exclude)
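The to_bytes()/from_bytes() pair above follows the usual pattern in this codebase: an OrderedDict of named getters (or setters), with exclude used to skip sections by name. A standalone sketch of that pattern (illustrative only, not the actual util.to_bytes implementation):

from collections import OrderedDict

def serialize_sections(getters, exclude=()):
    # Call each named getter unless the caller excluded that section.
    return OrderedDict((name, getter())
                       for name, getter in getters.items()
                       if name not in exclude)

getters = OrderedDict((
    ('strings', lambda: b'...string table...'),
    ('lexemes', lambda: b'...lexeme data...'),
))
sections = serialize_sections(getters, exclude=['lexemes'])
assert list(sections) == ['strings']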
@ -326,9 +351,15 @@ cdef class Vocab:
**exclude: Named attributes to prevent from being loaded.
RETURNS (Vocab): The `Vocab` object.
"""
def deserialize_vectors(b):
if self.vectors is None:
return None
else:
return self.vectors.from_bytes(b, exclude='strings')
setters = OrderedDict((
('strings', lambda b: self.strings.from_bytes(b)),
('lexemes', lambda b: self.lexemes_from_bytes(b)),
('vectors', lambda b: deserialize_vectors(b))
))
util.from_bytes(bytes_data, setters, exclude)
return self

View File

@ -2,9 +2,8 @@
if [ "${VIA}" == "pypi" ]; then
rm -rf *
pip install spacy
python -m spacy.en.download
python -m spacy.de.download
pip install spacy-nightly
python -m spacy download en
fi
if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then

View File

@ -103,20 +103,20 @@ mixin button(url, trusted, ...style)
label - [string] aside title (optional or false for no label)
language - [string] language for syntax highlighting (default: "python")
supports basic relevant languages available for PrismJS
icon - [string] icon to display next to code block, mostly used for old/new
prompt - [string] prompt or icon to display next to code block (mostly used for old/new)
height - [integer] optional height to clip code block to
mixin code(label, language, icon, height)
mixin code(label, language, prompt, height)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
if label
h4.u-text-label.u-text-label--dark=label
- var icon = (prompt == 'accept' || prompt == 'reject')
if icon
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
.c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
+icon(icon, 18)
code.c-code-block__content
code.c-code-block__content(data-prompt=icon ? null : prompt)
block

View File

@ -112,6 +112,10 @@
.u-nowrap
white-space: nowrap
.u-break.u-break
word-wrap: break-word
white-space: initial
.u-no-border
border: none

View File

@ -35,6 +35,13 @@
font: normal normal 1.1rem/#{2} $font-code
padding: 1em 2em
&[data-prompt]:before,
content: attr(data-prompt)
margin-right: 0.65em
display: inline-block
vertical-align: middle
opacity: 0.5
//- Inline code

View File

@ -21,7 +21,7 @@ p
+pos-row("$", "SYM", "SymType=currency", "symbol, currency")
+pos-row("ADD", "X", "", "email")
+pos-row("AFX", "ADJ", "Hyph=yes", "affix")
+pos-row("BES", "VERB", "", 'auxillary "be"')
+pos-row("BES", "VERB", "", 'auxiliary "be"')
+pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating")
+pos-row("CD", "NUM", "NumType=card", "cardinal number")
+pos-row("DT", "DET", "", "determiner")
@ -35,7 +35,7 @@ p
+pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative")
+pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative")
+pos-row("LS", "PUNCT", "NumType=ord", "list item marker")
+pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxillary")
+pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxiliary")
+pos-row("NFP", "PUNCT", "", "superfluous punctuation")
+pos-row("NIL", "", "", "missing tag")
+pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass")

View File

@ -5,16 +5,7 @@ include ../../_includes/_mixins
p
| As of v1.7.0, spaCy comes with new command line helpers to download and
| link models and show useful debugging information. For a list of available
| commands, type #[code python -m spacy]. To make the command even more
| convenient, we recommend
| #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") creating an alias]
| mapping #[code python -m spacy] to #[code spacy].
+aside("Why python -m?")
| The problem with a global entry point is that it's resolved by looking up
| entries in your #[code PATH] environment variable. This can give you
| unexpected results, like executing the wrong spaCy installation.
| #[code python -m] prevents fallbacks to system modules.
| commands, type #[code spacy --help].
+infobox("⚠️ Deprecation note")
| As of spaCy 2.0, the #[code model] command to initialise a model data
@ -33,8 +24,8 @@ p
| Direct downloads don't perform any compatibility checks and require the
| model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]).
+code(false, "bash").
python -m spacy download [model] [--direct]
+code(false, "bash", "$").
spacy download [model] [--direct]
+table(["Argument", "Type", "Description"])
+row
@ -80,8 +71,8 @@ p
| or use the #[+api("cli#package") #[code package]] command to create a
| model package.
+code(false, "bash").
python -m spacy link [origin] [link_name] [--force]
+code(false, "bash", "$").
spacy link [origin] [link_name] [--force]
+table(["Argument", "Type", "Description"])
+row
@ -112,8 +103,8 @@ p
| markup to copy-paste into #[+a(gh("spacy") + "/issues") GitHub issues].
+code(false, "bash").
python -m spacy info [--markdown]
python -m spacy info [model] [--markdown]
spacy info [--markdown]
spacy info [model] [--markdown]
+table(["Argument", "Type", "Description"])
+row
@ -139,8 +130,8 @@ p
| functions. The right converter is chosen based on the file extension of
| the input file. Currently only supports #[code .conllu].
+code(false, "bash").
python -m spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
+code(false, "bash", "$").
spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
+table(["Argument", "Type", "Description"])
+row
@ -174,8 +165,8 @@ p
| Train a model. Expects data in spaCy's
| #[+a("/docs/api/annotation#json-input") JSON format].
+code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
+code(false, "bash", "$").
spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
+table(["Argument", "Type", "Description"])
+row
@ -345,8 +336,8 @@ p
| sure you're always using the latest versions. This means you need to be
| connected to the internet to use this command.
+code(false, "bash").
python -m spacy package [input_dir] [output_dir] [--meta] [--force]
+code(false, "bash", "$").
spacy package [input_dir] [output_dir] [--meta] [--force]
+table(["Argument", "Type", "Description"])
+row
@ -360,10 +351,17 @@ p
+cell Directory to create package folder in.
+row
+cell #[code meta]
+cell #[code --meta-path], #[code -m]
+cell option
+cell Path to meta.json file (optional).
+row
+cell #[code --create-meta], #[code -c]
+cell flag
+cell
| Create a meta.json file on the command line, even if one already
| exists in the directory.
+row
+cell #[code --force], #[code -f]
+cell flag

View File

@ -140,6 +140,43 @@ p Get the number of tokens in the document.
+cell int
+cell The number of tokens in the document.
+h(2, "char_span") Doc.char_span
+tag method
+tag-new(2)
p Create a #[code Span] object from the slice #[code doc.text[start : end]].
+aside-code("Example").
doc = nlp(u'I like New York')
span = doc.char_span(7, 15, label=u'GPE')
assert span.text == 'New York'
+table(["Name", "Type", "Description"])
+row
+cell #[code start]
+cell int
+cell The index of the first character of the span.
+row
+cell #[code end]
+cell int
+cell The index of the first character after the span.
+row
+cell #[code label]
+cell uint64 / unicode
+cell A label to attach to the Span, e.g. for named entities.
+row
+cell #[code vector]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell A meaning representation of the span.
+footrow
+cell returns
+cell #[code Span]
+cell The newly constructed object.
+h(2, "similarity") Doc.similarity
+tag method
+tag-model("vectors")
@ -211,12 +248,12 @@ p
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell list
+cell A list of attribute ID ints.
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
+cell
| The exported attributes as a 2D numpy array, with one row per
| token and one column per attribute.
@ -245,7 +282,7 @@ p
+row
+cell #[code array]
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
+cell The attribute values to load.
+footrow
@ -509,7 +546,7 @@ p
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the document's semantics.
+h(2, "vector_norm") Doc.vector_norm

View File

@ -8,9 +8,9 @@ p
+aside-code("Download language models", "bash").
python -m spacy download en
python -m spacy download de
python -m spacy download fr
spacy download en
spacy download de
spacy download fr
+table([ "Language", "Token", "SBD", "Lemma", "POS", "NER", "Dep", "Vector", "Sentiment"])
+row

View File

@ -111,6 +111,14 @@ p
+cell -
+cell A sequence of unicode objects.
+row
+cell #[code as_tuples]
+cell bool
+cell
| If set to #[code True], inputs should be a sequence of
| #[code (text, context)] tuples. Output will then be a sequence of
| #[code (doc, context)] tuples. Defaults to #[code False].
+row
+cell #[code n_threads]
+cell int

View File

@ -129,7 +129,7 @@ p A real-valued meaning representation.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the lexeme's semantics.
+h(2, "vector_norm") Lexeme.vector_norm

View File

@ -37,7 +37,7 @@ p Create a Span object from the #[code slice doc[start : end]].
+row
+cell #[code vector]
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell A meaning representation of the span.
+footrow
@ -145,11 +145,47 @@ p
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "to_array") Span.to_array
+tag method
+tag-new(2)
p
| Given a list of #[code M] attribute IDs, export the tokens to a numpy
| #[code ndarray] of shape #[code (N, M)], where #[code N] is the length of
| the document. The values will be 32-bit integers.
+aside-code("Example").
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
doc = nlp(u'I like New York in Autumn.')
span = doc[2:3]
# All strings mapped to integers, for easy export to numpy
np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell list
+cell A list of attribute ID ints.
+footrow
+cell returns
+cell #[code.u-break numpy.ndarray[long, ndim=2]]
+cell
| A feature matrix, with one row per word, and one column per
| attribute indicated in the input #[code attr_ids].
+h(2, "merge") Span.merge
+tag method
p Retokenize the document, such that the span is merged into a single token.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
span = doc[2:3]
span.merge()
assert len(doc) == 6
assert doc[2].text == 'New York'
+table(["Name", "Type", "Description"])
+row
+cell #[code **attributes]
@ -169,7 +205,7 @@ p Retokenize the document, such that the span is merged into a single token.
p
| The token within the span that's highest in the parse tree. If there's a
| tie, the earlist is prefered.
| tie, the earliest is preferred.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
@ -270,7 +306,7 @@ p
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the span's semantics.
+h(2, "vector_norm") Span.vector_norm

View File

@ -250,7 +250,7 @@ p A real-valued meaning representation.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the token's semantics.
+h(2, "vector_norm") Token.vector_norm

View File

@ -205,7 +205,7 @@ p
+infobox("Why lazy-loading?")
| Some languages contain large volumes of custom data, like lemmatizer
| loopup tables, or complex regular expression that are expensive to
| lookup tables, or complex regular expressions that are expensive to
| compute. As of spaCy v2.0, #[code Language] classes are not imported on
| initialisation and are only loaded when you import them directly, or load
| a model that requires a language to be loaded. To lazy-load languages in
@ -789,4 +789,4 @@ p
| model use spaCy's #[+api("cli#train") #[code train]] command:
+code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]
spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities]

View File

@ -39,7 +39,7 @@ p
+h(2, "special-cases") Adding special case tokenization rules
p
| Most domains have at least some idiosyncracies that require custom
| Most domains have at least some idiosyncrasies that require custom
| tokenization rules. These could be certain expressions, or
| abbreviations only used in this specific field.

View File

@ -32,10 +32,10 @@ p
+qs({package: 'source'}) pip install -r requirements.txt
+qs({package: 'source'}) pip install -e .
+qs({model: 'en'}) python -m spacy download en
+qs({model: 'de'}) python -m spacy download de
+qs({model: 'fr'}) python -m spacy download fr
+qs({model: 'es'}) python -m spacy download es
+qs({model: 'en'}) spacy download en
+qs({model: 'de'}) spacy download de
+qs({model: 'fr'}) spacy download fr
+qs({model: 'es'}) spacy download es
+h(2, "installation") Installation instructions
@ -52,7 +52,7 @@ p Using pip, spaCy releases are currently only available as source packages.
| and available models, see the #[+a("/docs/usage/models") docs on models].
+code.o-no-block.
python -m spacy download en
spacy download en
&gt;&gt;&gt; import spacy
&gt;&gt;&gt; nlp = spacy.load('en')
@ -109,7 +109,7 @@ p
| The other way to install spaCy is to clone its
| #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
| the common way if you want to make changes to the code base. You'll need to
| make sure that you have a development enviroment consisting of a Python
| make sure that you have a development environment consisting of a Python
| distribution including header files, a compiler,
| #[+a("https://pip.pypa.io/en/latest/installing/") pip],
| #[+a("https://virtualenv.pypa.io/") virtualenv] and
@ -312,7 +312,9 @@ p
| This error may occur when running the #[code spacy] command from the
| command line. spaCy does not currently add an entry to our #[code PATH]
| environment variable, as this can lead to unexpected results, especially
| when using #[code virtualenv]. Run the command with #[code python -m],
| when using #[code virtualenv]. Instead, spaCy adds an auto-alias that
| maps #[code spacy] to #[code python -m spacy]. If this is not working as
| expected, run the command with #[code python -m] yourself,
| for example #[code python -m spacy download en]. For more info on this,
| see #[+api("cli#download") download].

View File

@ -10,8 +10,8 @@ p
+h(2, "models") Install models and process text
+code(false, "bash").
python -m spacy download en
python -m spacy download de
spacy download en
spacy download de
+code.
import spacy

Some files were not shown because too many files have changed in this diff