Revert noise-level back to default 0.0

Matthew Honnibal 2017-09-06 04:58:33 -05:00
commit 167f6a8938
130 changed files with 67009 additions and 17315 deletions

1
.gitignore vendored

@ -40,7 +40,6 @@ venv/
# Distribution / packaging
env/
bin/
build/
develop-eggs/
dist/


@ -14,8 +14,7 @@ os:
env:
- VIA=compile LC_ALL=en_US.ascii
- VIA=compile
# - VIA=sdist
#- VIA=pypi_nightly
install:
- "./travis.sh"
@ -23,7 +22,7 @@ install:
script:
- "pip install pytest pytest-timeout"
- if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
- if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi
- if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
- if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
notifications:


@ -1,3 +1,4 @@
recursive-include include *.h
include LICENSE
include README.rst
include bin/spacy


@ -229,7 +229,7 @@ Compile from source
The other way to install spaCy is to clone its
`GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
source. That is the common way if you want to make changes to the code base.
You'll need to make sure that you have a development enviroment consisting of a
You'll need to make sure that you have a development environment consisting of a
Python distribution including header files, a compiler,
`pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.

1
bin/spacy Normal file

@ -0,0 +1 @@
python -m spacy "$@"


@ -0,0 +1,109 @@
from __future__ import unicode_literals
import plac
import random
import tqdm
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
import thinc.extra.datasets
import spacy.lang.en
from spacy.gold import GoldParse, minibatch
from spacy.util import compounding
from spacy.pipeline import TextCategorizer
def train_textcat(tokenizer, textcat,
train_texts, train_cats, dev_texts, dev_cats,
n_iter=20):
'''
Train the TextCategorizer without associated pipeline.
'''
textcat.begin_training()
optimizer = Adam(NumpyOps(), 0.001)
train_docs = [tokenizer(text) for text in train_texts]
train_gold = [GoldParse(doc, cats=cats) for doc, cats in
zip(train_docs, train_cats)]
train_data = list(zip(train_docs, train_gold))  # materialise, so it can be re-iterated each epoch
batch_sizes = compounding(4., 128., 1.001)
for i in range(n_iter):
losses = {}
batches = tqdm.tqdm(train_data, leave=False) # Progress bar
for batch in minibatch(batches, size=batch_sizes):
docs, golds = zip(*batch)
textcat.update((docs, None), golds, sgd=optimizer, drop=0.2,
losses=losses)
with textcat.model.use_params(optimizer.averages):
scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
yield losses['textcat'], scores
def evaluate(tokenizer, textcat, texts, cats):
docs = (tokenizer(text) for text in texts)
tp = 1e-8 # True positives
fp = 1e-8 # False positives
fn = 1e-8 # False negatives
tn = 1e-8 # True negatives
for i, doc in enumerate(textcat.pipe(docs)):
gold = cats[i]
for label, score in doc.cats.items():
if score >= 0.5 and label in gold:
tp += 1.
elif score >= 0.5 and label not in gold:
fp += 1.
elif score < 0.5 and label not in gold:
tn += 1
if score < 0.5 and label in gold:
fn += 1
precis = tp / (tp + fp)
recall = tp / (tp + fn)
fscore = 2 * (precis * recall) / (precis + recall)
return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
def load_data():
# Partition off part of the train data --- avoid running experiments
# against test.
train_data, _ = thinc.extra.datasets.imdb()
random.shuffle(train_data)
texts, labels = zip(*train_data)
cats = [(['POSITIVE'] if y else []) for y in labels]
split = int(len(train_data) * 0.8)
train_texts = texts[:split]
train_cats = cats[:split]
dev_texts = texts[split:]
dev_cats = cats[split:]
return (train_texts, train_cats), (dev_texts, dev_cats)
def main(model_loc=None):
nlp = spacy.lang.en.English()
tokenizer = nlp.tokenizer
textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
print("Load IMDB data")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()
print("Itn.\tLoss\tP\tR\tF")
progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
for i, (loss, scores) in enumerate(train_textcat(tokenizer, textcat,
train_texts, train_cats,
dev_texts, dev_cats, n_iter=20)):
print(progress.format(i=i, loss=loss, **scores))
# How to save, load and use
nlp.pipeline.append(textcat)
if model_loc is not None:
nlp.to_disk(model_loc)
nlp = spacy.load(model_loc)
doc = nlp(u'This movie sucked!')
print(doc.cats)
if __name__ == '__main__':
plac.call(main)


@ -3,7 +3,7 @@ pathlib
numpy>=1.7
cymem>=1.30,<1.32
preshed>=1.0.0,<2.0.0
thinc>=6.7.3,<6.8.0
thinc>=6.8.0,<6.9.0
murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6
six


@ -28,7 +28,9 @@ MOD_NAMES = [
'spacy.pipeline',
'spacy.syntax.stateclass',
'spacy.syntax._state',
'spacy.syntax._beam_utils',
'spacy.tokenizer',
'spacy._cfile',
'spacy.syntax.parser',
'spacy.syntax.nn_parser',
'spacy.syntax.beam_parser',
@ -187,12 +189,13 @@ def setup_package():
url=about['__uri__'],
license=about['__license__'],
ext_modules=ext_modules,
scripts=['bin/spacy'],
install_requires=[
'numpy>=1.7',
'murmurhash>=0.28,<0.29',
'cymem>=1.30,<1.32',
'preshed>=1.0.0,<2.0.0',
'thinc>=6.7.3,<6.8.0',
'thinc>=6.8.0,<6.9.0',
'plac<1.0.0,>=0.9.6',
'pip>=9.0.0,<10.0.0',
'six',


@ -13,5 +13,10 @@ def load(name, **overrides):
return util.load_model(name, **overrides)
def blank(name, **kwargs):
LangClass = util.get_lang_class(name)
return LangClass(**kwargs)
def info(model=None, markdown=False):
return cli_info(None, model, markdown)


@ -3,15 +3,23 @@ from __future__ import print_function
# NB! This breaks in plac on Python 2!!
#from __future__ import unicode_literals
if __name__ == '__main__':
import plac
import sys
from spacy.cli import download, link, info, package, train, convert
from spacy.cli import download, link, info, package, train, convert, model
from spacy.cli import profile
from spacy.util import prints
commands = {'download': download, 'link': link, 'info': info, 'train': train,
'convert': convert, 'package': package}
commands = {
'download': download,
'link': link,
'info': info,
'train': train,
'convert': convert,
'package': package,
'model': model,
'profile': profile,
}
if len(sys.argv) == 1:
prints(', '.join(commands), title="Available commands", exits=1)
command = sys.argv.pop(1)
@ -19,5 +27,7 @@ if __name__ == '__main__':
if command in commands:
plac.call(commands[command])
else:
prints("Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command, exits=1)
prints(
"Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command,
exits=1)

26
spacy/_cfile.pxd Normal file

@ -0,0 +1,26 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool
cdef class CFile:
cdef FILE* fp
cdef bint is_open
cdef Pool mem
cdef int size # For compatibility with subclass
cdef int _capacity # For compatibility with subclass
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
cdef class StringCFile(CFile):
cdef unsigned char* data
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *

88
spacy/_cfile.pyx Normal file

@ -0,0 +1,88 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memcpy
cdef class CFile:
def __init__(self, loc, mode, on_open_error=None):
if isinstance(mode, unicode):
mode_str = mode.encode('ascii')
else:
mode_str = mode
if hasattr(loc, 'as_posix'):
loc = loc.as_posix()
self.mem = Pool()
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self.fp = fopen(<char*>bytes_loc, mode_str)
if self.fp == NULL:
if on_open_error is not None:
on_open_error()
else:
raise IOError("Could not open binary file %s" % bytes_loc)
self.is_open = True
def __dealloc__(self):
if self.is_open:
fclose(self.fp)
def close(self):
fclose(self.fp)
self.is_open = False
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
st = fread(dest, elem_size, number, self.fp)
if st != number:
raise IOError
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
st = fwrite(src, elem_size, number, self.fp)
if st != number:
raise IOError
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
cdef void* dest = mem.alloc(number, elem_size)
self.read_into(dest, number, elem_size)
return dest
def write_unicode(self, unicode value):
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)
cdef class StringCFile:
def __init__(self, mode, bytes data=b'', on_open_error=None):
self.mem = Pool()
self.is_open = 'w' in mode
self._capacity = max(len(data), 8)
self.size = len(data)
self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
for i in range(len(data)):
self.data[i] = data[i]
def close(self):
self.is_open = False
def string_data(self):
return (self.data-self.size)[:self.size]
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
memcpy(dest, self.data, elem_size * number)
self.data += elem_size * number
cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
write_size = number * elem_size
if (self.size + write_size) >= self._capacity:
self._capacity = (self.size + write_size) * 2
self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
memcpy(&self.data[self.size], src, elem_size * number)
self.size += write_size
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
cdef void* dest = mem.alloc(number, elem_size)
self.read_into(dest, number, elem_size)
return dest
def write_unicode(self, unicode value):
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)


@ -3,23 +3,101 @@ from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
import random
import cytoolz
from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.static_vectors import StaticVectors
from thinc.neural._classes.batchnorm import BatchNorm
from thinc.neural._classes.batchnorm import BatchNorm as BN
from thinc.neural._classes.layernorm import LayerNorm as LN
from thinc.neural._classes.resnet import Residual
from thinc.neural import ReLu
from thinc.neural._classes.selu import SELU
from thinc import describe
from thinc.describe import Dimension, Synapses, Biases, Gradient
from thinc.neural._classes.affine import _set_dimensions_if_needed
from thinc.api import FeatureExtracter, with_getitem
from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
from thinc.neural._classes.attention import ParametricAttention
from thinc.linear.linear import LinearModel
from thinc.api import uniqued, wrap, flatten_add_lengths
from .attrs import ID, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
from .tokens.doc import Doc
from . import util
import numpy
import io
@layerize
def _flatten_add_lengths(seqs, pad=0, drop=0.):
ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=pad)
X = ops.flatten(seqs, pad=pad)
return (X, lengths), finish_update
@layerize
def _logistic(X, drop=0.):
xp = get_array_module(X)
if not isinstance(X, xp.ndarray):
X = xp.asarray(X)
# Clip to range (-10, 10)
X = xp.minimum(X, 10., X)
X = xp.maximum(X, -10., X)
Y = 1. / (1. + xp.exp(-X))
def logistic_bwd(dY, sgd=None):
dX = dY * (Y * (1-Y))
return dX
return Y, logistic_bwd
@layerize
def add_tuples(X, drop=0.):
"""Give inputs of sequence pairs, where each sequence is (vals, length),
sum the values, returning a single sequence.
If input is:
((vals1, length), (vals2, length)
Output is:
(vals1+vals2, length)
vals are a single tensor for the whole batch.
"""
(vals1, length1), (vals2, length2) = X
assert length1 == length2
def add_tuples_bwd(dY, sgd=None):
return (dY, dY)
return (vals1+vals2, length1), add_tuples_bwd
def _zero_init(model):
def _zero_init_impl(self, X, y):
self.W.fill(0)
model.on_data_hooks.append(_zero_init_impl)
if model.W is not None:
model.W.fill(0.)
return model
@layerize
def _preprocess_doc(docs, drop=0.):
keys = [doc.to_array([LOWER]) for doc in docs]
keys = [a[:, 0] for a in keys]
ops = Model.ops
lengths = ops.asarray([arr.shape[0] for arr in keys])
keys = ops.xp.concatenate(keys)
vals = ops.allocate(keys.shape[0]) + 1
return (keys, vals, lengths), None
def _init_for_precomputed(W, ops):
if (W**2).sum() != 0.:
return
@ -27,6 +105,7 @@ def _init_for_precomputed(W, ops):
ops.xavier_uniform_init(reshaped)
W[:] = reshaped.reshape(W.shape)
@describe.on_data(_set_dimensions_if_needed)
@describe.attributes(
nI=Dimension("Input size"),
@ -130,25 +209,42 @@ class PrecomputableMaxouts(Model):
return dXf
return Yfp, backward
def drop_layer(layer, factor=2.):
def drop_layer_fwd(X, drop=0.):
if drop <= 0.:
return layer.begin_update(X, drop=drop)
else:
coinflip = layer.ops.xp.random.random()
if (coinflip / factor) >= drop:
return layer.begin_update(X, drop=drop)
else:
return X, lambda dX, sgd=None: dX
model = wrap(drop_layer_fwd, layer)
model.predict = layer
return model
def Tok2Vec(width, embed_size, preprocess=None):
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')
embed = (norm | prefix | suffix | shape )
embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
tok2vec = (
with_flatten(
asarray(Model.ops, dtype='uint64')
>> embed
>> Maxout(width, width*4, pieces=3)
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
pad=4)
>> uniqued(embed, column=5)
>> drop_layer(
Residual(
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
)
) ** 4, pad=4
)
)
if preprocess not in (False, None):
tok2vec = preprocess >> tok2vec
@ -243,7 +339,8 @@ def zero_init(model):
def doc2feats(cols=None):
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
if cols is None:
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
def forward(docs, drop=0.):
feats = []
for doc in docs:
@ -269,6 +366,45 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
return vectors, backward
def fine_tune(embedding, combine=None):
if combine is not None:
raise NotImplementedError(
"fine_tune currently only supports addition. Set combine=None")
def fine_tune_fwd(docs_tokvecs, drop=0.):
docs, tokvecs = docs_tokvecs
lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
flat_tokvecs = embedding.ops.flatten(tokvecs)
flat_vecs = embedding.ops.flatten(vecs)
output = embedding.ops.unflatten(
(model.mix[0] * flat_tokvecs + model.mix[1] * flat_vecs), lengths)
def fine_tune_bwd(d_output, sgd=None):
flat_grad = model.ops.flatten(d_output)
model.d_mix[0] += flat_tokvecs.dot(flat_grad.T).sum()
model.d_mix[1] += flat_vecs.dot(flat_grad.T).sum()
bp_vecs([d_o * model.mix[1] for d_o in d_output], sgd=sgd)
if sgd is not None:
sgd(model._mem.weights, model._mem.gradient, key=model.id)
return [d_o * model.mix[0] for d_o in d_output]
return output, fine_tune_bwd
def fine_tune_predict(docs_tokvecs):
docs, tokvecs = docs_tokvecs
vecs = embedding(docs)
return [model.mix[0]*tv+model.mix[1]*v
for tv, v in zip(tokvecs, vecs)]
model = wrap(fine_tune_fwd, embedding)
model.mix = model._mem.add((model.id, 'mix'), (2,))
model.mix.fill(0.5)
model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
model.predict = fine_tune_predict
return model
@layerize
def flatten(seqs, drop=0.):
if isinstance(seqs[0], numpy.ndarray):
@ -282,3 +418,201 @@ def flatten(seqs, drop=0.):
return ops.unflatten(d_X, lengths)
X = ops.xp.vstack(seqs)
return X, finish_update
@layerize
def logistic(X, drop=0.):
xp = get_array_module(X)
if not isinstance(X, xp.ndarray):
X = xp.asarray(X)
# Clip to range (-10, 10)
X = xp.minimum(X, 10., X)
X = xp.maximum(X, -10., X)
Y = 1. / (1. + xp.exp(-X))
def logistic_bwd(dY, sgd=None):
dX = dY * (Y * (1-Y))
return dX
return Y, logistic_bwd
def zero_init(model):
def _zero_init_impl(self, X, y):
self.W.fill(0)
model.on_data_hooks.append(_zero_init_impl)
return model
@layerize
def preprocess_doc(docs, drop=0.):
keys = [doc.to_array([LOWER]) for doc in docs]
keys = [a[:, 0] for a in keys]
ops = Model.ops
lengths = ops.asarray([arr.shape[0] for arr in keys])
keys = ops.xp.concatenate(keys)
vals = ops.allocate(keys.shape[0]) + 1
return (keys, vals, lengths), None
def getitem(i):
def getitem_fwd(X, drop=0.):
return X[i], None
return layerize(getitem_fwd)
def build_tagger_model(nr_class, token_vector_width, **cfg):
embed_size = util.env_opt('embed_size', 7500)
with Model.define_operators({'>>': chain, '+': add}):
# Input: (doc, tensor) tuples
private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats())
model = (
fine_tune(private_tok2vec)
>> with_flatten(
Maxout(token_vector_width, token_vector_width)
>> Softmax(nr_class, token_vector_width)
)
)
model.nI = None
return model
@layerize
def SpacyVectors(docs, drop=0.):
xp = get_array_module(docs[0].vocab.vectors.data)
width = docs[0].vocab.vectors.data.shape[1]
batch = []
for doc in docs:
indices = numpy.zeros((len(doc),), dtype='i')
for i, word in enumerate(doc):
if word.orth in doc.vocab.vectors.key2row:
indices[i] = doc.vocab.vectors.key2row[word.orth]
else:
indices[i] = 0
vectors = doc.vocab.vectors.data[indices]
batch.append(vectors)
return batch, None
def foreach(layer, drop_factor=1.0):
'''Map a layer across elements in a list'''
def foreach_fwd(Xs, drop=0.):
drop *= drop_factor
ys = []
backprops = []
for X in Xs:
y, bp_y = layer.begin_update(X, drop=drop)
ys.append(y)
backprops.append(bp_y)
def foreach_bwd(d_ys, sgd=None):
d_Xs = []
for d_y, bp_y in zip(d_ys, backprops):
if bp_y is not None and d_y is not None:
d_Xs.append(bp_y(d_y, sgd=sgd))
else:
d_Xs.append(None)
return d_Xs
return ys, foreach_bwd
model = wrap(foreach_fwd, layer)
return model
def build_text_classifier(nr_class, width=64, **cfg):
nr_vector = cfg.get('nr_vector', 5000)
with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
'**': clone}):
if cfg.get('low_data'):
model = (
SpacyVectors
>> flatten_add_lengths
>> with_getitem(0,
Affine(width, 300)
)
>> ParametricAttention(width)
>> Pooling(sum_pool)
>> Residual(ReLu(width, width)) ** 2
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
>> logistic
)
return model
lower = HashEmbed(width, nr_vector, column=1)
prefix = HashEmbed(width//2, nr_vector, column=2)
suffix = HashEmbed(width//2, nr_vector, column=3)
shape = HashEmbed(width//2, nr_vector, column=4)
trained_vectors = (
FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
>> with_flatten(
uniqued(
(lower | prefix | suffix | shape)
>> LN(Maxout(width, width+(width//2)*3)),
column=0
)
)
)
static_vectors = (
SpacyVectors
>> with_flatten(Affine(width, 300))
)
cnn_model = (
# TODO Make concatenate support lists
concatenate_lists(trained_vectors, static_vectors)
>> with_flatten(
LN(Maxout(width, width*2))
>> Residual(
(ExtractWindow(nW=1) >> zero_init(Maxout(width, width*3)))
) ** 2, pad=2
)
>> flatten_add_lengths
>> ParametricAttention(width)
>> Pooling(sum_pool)
>> Residual(zero_init(Maxout(width, width)))
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
)
linear_model = (
_preprocess_doc
>> LinearModel(nr_class, drop_factor=0.)
)
model = (
(linear_model | cnn_model)
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
>> logistic
)
model.lsuv = False
return model
@layerize
def flatten(seqs, drop=0.):
ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=0)
X = ops.flatten(seqs, pad=0)
return X, finish_update
def concatenate_lists(*layers, **kwargs): # pragma: no cover
'''Compose two or more models `f`, `g`, etc, such that their outputs are
concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
'''
if not layers:
return noop()
drop_factor = kwargs.get('drop_factor', 1.0)
ops = layers[0].ops
layers = [chain(layer, flatten) for layer in layers]
concat = concatenate(*layers)
def concatenate_lists_fwd(Xs, drop=0.):
drop *= drop_factor
lengths = ops.asarray([len(X) for X in Xs], dtype='i')
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
ys = ops.unflatten(flat_y, lengths)
def concatenate_lists_bwd(d_ys, sgd=None):
return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
return ys, concatenate_lists_bwd
model = wrap(concatenate_lists_fwd, concat)
return model


@ -3,7 +3,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy-nightly'
__version__ = '2.0.0a1'
__version__ = '2.0.0a13'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'


@ -2,5 +2,7 @@ from .download import download
from .info import info
from .link import link
from .package import package
from .profile import profile
from .train import train
from .convert import convert
from .model import model


@ -21,10 +21,10 @@ CONVERTERS = {
@plac.annotations(
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", float),
n_sents=("Number of sentences per doc", "option", "n", int),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(cmd, input_file, output_dir, n_sents, morphology):
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
"""
Convert files into JSON format for use with train command and other
experiment management functions.


@ -73,10 +73,10 @@ def generate_sentence(sent):
tokens = []
for i, id in enumerate(id_):
token = {}
token["orth"] = word[id]
token["tag"] = tag[id]
token["head"] = head[id] - i
token["dep"] = dep[id]
token["orth"] = word[i]
token["tag"] = tag[i]
token["head"] = head[i] - id
token["dep"] = dep[i]
tokens.append(token)
sentence["tokens"] = tokens
return sentence


@ -8,7 +8,7 @@ import subprocess
import sys
from .link import link
from ..util import prints
from ..util import prints, get_package_path
from .. import about
@ -24,24 +24,29 @@ def download(cmd, model, direct=False):
with version.
"""
if direct:
download_model('{m}/{m}.tar.gz'.format(m=model))
dl = download_model('{m}/{m}.tar.gz'.format(m=model))
else:
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
model_name = shortcuts.get(model, model)
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
try:
link(None, model_name, model, force=True)
except:
# Dirty, but since spacy.download and the auto-linking is mostly
# a convenience wrapper, it's best to show a success message and
# loading instructions, even if linking fails.
prints("Creating a shortcut link for 'en' didn't work (maybe you "
"don't have admin permissions?), but you can still load "
"the model via its full package name:",
"nlp = spacy.load('%s')" % model_name,
title="Download successful")
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
if dl == 0:
try:
# Get package path here because link uses
# pip.get_installed_distributions() to check if model is a package,
# which fails if model was just installed via subprocess
package_path = get_package_path(model_name)
link(None, model_name, model, force=True, model_path=package_path)
except:
# Dirty, but since spacy.download and the auto-linking is mostly
# a convenience wrapper, it's best to show a success message and
# loading instructions, even if linking fails.
prints("Creating a shortcut link for 'en' didn't work (maybe you "
"don't have admin permissions?), but you can still load "
"the model via its full package name:",
"nlp = spacy.load('%s')" % model_name,
title="Download successful")
def get_json(url, desc):
@ -73,6 +78,6 @@ def get_version(model, comp):
def download_model(filename):
download_url = about.__download_url__ + '/' + filename
subprocess.call([sys.executable, '-m',
return subprocess.call([sys.executable, '-m',
'pip', 'install', '--no-cache-dir', download_url],
env=os.environ.copy())


@ -14,7 +14,7 @@ from .. import util
link_name=("name of shortuct link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
)
def link(cmd, origin, link_name, force=False):
def link(cmd, origin, link_name, force=False, model_path=None):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
@ -23,7 +23,7 @@ def link(cmd, origin, link_name, force=False):
if util.is_package(origin):
model_path = util.get_package_path(origin)
else:
model_path = Path(origin)
model_path = Path(origin) if model_path is None else Path(model_path)
if not model_path.exists():
prints("The data should be located in %s" % path2str(model_path),
title="Can't locate model data", exits=1)

137
spacy/cli/model.py Normal file

@ -0,0 +1,137 @@
# coding: utf8
from __future__ import unicode_literals
import bz2
import gzip
import math
from ast import literal_eval
from pathlib import Path
import numpy as np
import spacy
from preshed.counter import PreshCounter
from .. import util
from ..compat import fix_text
def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data,
min_doc_freq=5, min_word_freq=200):
model_path = Path(model_dir)
freqs_path = Path(freqs_data)
clusters_path = Path(clusters_data) if clusters_data else None
vectors_path = Path(vectors_data) if vectors_data else None
check_dirs(freqs_path, clusters_path, vectors_path)
vocab = util.get_lang_class(lang).Defaults.create_vocab()
nlp = spacy.blank(lang)
vocab = nlp.vocab
probs, oov_prob = read_probs(
freqs_path, min_doc_freq=int(min_doc_freq), min_freq=int(min_word_freq))
clusters = read_clusters(clusters_path) if clusters_path else {}
populate_vocab(vocab, clusters, probs, oov_prob)
add_vectors(vocab, vectors_path)
create_model(model_path, nlp)
def add_vectors(vocab, vectors_path):
with bz2.BZ2File(vectors_path.as_posix()) as f:
num_words, dim = next(f).split()
vocab.clear_vectors(int(dim))
for line in f:
word_w_vector = line.decode("utf8").strip().split(" ")
word = word_w_vector[0]
vector = np.array([float(val) for val in word_w_vector[1:]])
if word in vocab:
vocab.set_vector(word, vector)
def create_model(model_path, model):
if not model_path.exists():
model_path.mkdir()
model.to_disk(model_path.as_posix())
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
counts = PreshCounter()
total = 0
freqs_file = check_unzip(freqs_path)
for i, line in enumerate(freqs_file):
freq, doc_freq, key = line.rstrip().split('\t', 2)
freq = int(freq)
counts.inc(i + 1, freq)
total += freq
counts.smooth()
log_total = math.log(total)
freqs_file = check_unzip(freqs_path)
probs = {}
for line in freqs_file:
freq, doc_freq, key = line.rstrip().split('\t', 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(
key) < max_length:
word = literal_eval(key)
smooth_count = counts.smoother(int(freq))
probs[word] = math.log(smooth_count) - log_total
oov_prob = math.log(counts.smoother(0)) - log_total
return probs, oov_prob
def read_clusters(clusters_path):
clusters = {}
with clusters_path.open() as f:
for line in f:
try:
cluster, word, freq = line.split()
word = fix_text(word)
except ValueError:
continue
# If the clusterer has only seen the word a few times, its
# cluster is unreliable.
if int(freq) >= 3:
clusters[word] = cluster
else:
clusters[word] = '0'
# Expand clusters with re-casing
for word, cluster in list(clusters.items()):
if word.lower() not in clusters:
clusters[word.lower()] = cluster
if word.title() not in clusters:
clusters[word.title()] = cluster
if word.upper() not in clusters:
clusters[word.upper()] = cluster
return clusters
def populate_vocab(vocab, clusters, probs, oov_prob):
for word, prob in reversed(
sorted(list(probs.items()), key=lambda item: item[1])):
lexeme = vocab[word]
lexeme.prob = prob
lexeme.is_oov = False
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
lexeme.cluster = int(clusters[word][::-1], 2)
else:
lexeme.cluster = 0
def check_unzip(file_path):
file_path_str = file_path.as_posix()
if file_path_str.endswith('gz'):
return gzip.open(file_path_str)
else:
return file_path.open()
def check_dirs(freqs_data, clusters_data, vectors_data):
if not freqs_data.is_file():
util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
if clusters_data and not clusters_data.is_file():
util.sys_exit(
clusters_data.as_posix(), title="No Brown clusters file found")
if vectors_data and not vectors_data.is_file():
util.sys_exit(
vectors_data.as_posix(), title="No word vectors file found")
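For reference, a minimal sketch of inputs in the formats that `read_probs`, `read_clusters` and `add_vectors` above appear to parse; the file names and values here are made up.

import bz2

# Frequencies: "<freq>\t<doc freq>\t<word as a Python literal>" per line.
with open('freqs.txt', 'w') as f_:
    f_.write("1200\t300\t'the'\n")
    f_.write("450\t220\t'cat'\n")

# Brown clusters: "<bit-string cluster> <word> <freq>" per line; words seen
# fewer than 3 times are mapped to cluster '0' by read_clusters().
with open('clusters.txt', 'w') as f_:
    f_.write("1110 the 1200\n")
    f_.write("0101 cat 450\n")

# Vectors: bz2-compressed, a "<num words> <dim>" header, then "<word> <floats>".
with bz2.BZ2File('vectors.bz2', 'w') as f_:
    f_.write(b"2 3\n")
    f_.write(b"the 0.1 0.2 0.3\n")
    f_.write(b"cat 0.4 0.5 0.6\n")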


@ -15,10 +15,11 @@ from .. import about
@plac.annotations(
input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str),
meta=("path to meta.json", "option", "m", str),
meta_path=("path to meta.json", "option", "m", str),
create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
)
def package(cmd, input_dir, output_dir, meta=None, force=False):
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
@ -26,7 +27,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
"""
input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta)
meta_path = util.ensure_path(meta_path)
if not input_path or not input_path.exists():
prints(input_path, title="Model directory not found", exits=1)
if not output_path or not output_path.exists():
@ -38,7 +39,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
template_manifest = get_template('MANIFEST.in')
template_init = get_template('xx_model_name/__init__.py')
meta_path = meta_path or input_path / 'meta.json'
if meta_path.is_file():
if not create_meta and meta_path.is_file():
prints(meta_path, title="Reading meta.json from file")
meta = util.read_json(meta_path)
else:
@ -100,7 +101,7 @@ def generate_meta():
def generate_pipeline():
prints("If set to 'True', the default pipeline is used. If set to 'False', "
"the pipeline will be disabled. Components should be specified as a "
"comma-separated list of component names, e.g. vectorizer, tagger, "
"comma-separated list of component names, e.g. tensorizer, tagger, "
"parser, ner. For more information, see the docs on processing pipelines.",
title="Enter your model's pipeline components")
pipeline = util.get_raw_input("Pipeline components", True)

45
spacy/cli/profile.py Normal file

@ -0,0 +1,45 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import plac
from pathlib import Path
import ujson
import cProfile
import pstats
import spacy
import sys
import tqdm
import cytoolz
def read_inputs(loc):
if loc is None:
file_ = sys.stdin
file_ = (line.encode('utf8') for line in file_)
else:
file_ = Path(loc).open()
for line in file_:
data = ujson.loads(line)
text = data['text']
yield text
@plac.annotations(
lang=("model/language", "positional", None, str),
inputs=("Location of input file", "positional", None, read_inputs)
)
def profile(cmd, lang, inputs=None):
"""
Profile a spaCy pipeline, to find out which functions take the most time.
"""
nlp = spacy.load(lang)
texts = list(cytoolz.take(10000, inputs))
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
s.strip_dirs().sort_stats("time").print_stats()
def parse_texts(nlp, texts):
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):
pass
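A hedged sketch of the newline-delimited JSON input that `read_inputs` above expects, one object per line with a "text" field (the file name is made up):

import ujson

with open('inputs.jsonl', 'w') as f_:
    for text in ("This is a sentence.", "This is another one."):
        f_.write(ujson.dumps({'text': text}) + '\n')

# The profiler can then be pointed at the file, e.g.:
#     python -m spacy profile en inputs.jsonl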


@ -32,10 +32,12 @@ from ..compat import json_dumps
resume=("Whether to resume training", "flag", "R", bool),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool)
no_entities=("Don't train NER", "flag", "N", bool),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
)
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False):
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
gold_preproc=False):
"""
Train a model. Expects data in spaCy's JSON format.
"""
@ -69,7 +71,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
util.env_opt('batch_to', 64),
util.env_opt('batch_compound', 1.001))
gold_preproc = util.env_opt('gold_preproc', False)
noise_level = util.env_opt('noise_level', 0.25)
noise_level = util.env_opt('noise_level', 0.0)
if resume:
prints(output_path / 'model19.pickle', title="Resuming training")
@ -95,15 +97,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses)
drop=next(dropout_rates), losses=losses,
update_shared=True)
pbar.update(sum(len(doc) for doc in docs))
with nlp.use_params(optimizer.averages):
util.set_env_log(False)
epoch_model_path = output_path / ('model%d' % i)
nlp.to_disk(epoch_model_path)
with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
dill.dump(nlp, file_, -1)
nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
scorer = nlp_loaded.evaluate(
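Since `noise_level` is read through `util.env_opt`, the old value can presumably still be restored per run with an environment variable; a minimal sketch, assuming the `SPACY_`-prefixed lookup that `env_opt` uses:

import os
os.environ['SPACY_NOISE_LEVEL'] = '0.25'  # override the new 0.0 default for this process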


@ -5,6 +5,7 @@ import six
import ftfy
import sys
import ujson
import itertools
from thinc.neural.util import copy_array
@ -35,6 +36,7 @@ CudaStream = CudaStream
cupy = cupy
fix_text = ftfy.fix_text
copy_array = copy_array
izip = getattr(itertools, 'izip', zip)
is_python2 = six.PY2
is_python3 = six.PY3
@ -44,21 +46,31 @@ is_osx = sys.platform == 'darwin'
if is_python2:
import imp
bytes_ = str
unicode_ = unicode
basestring_ = basestring
input_ = raw_input
json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False).decode('utf8')
path2str = lambda path: str(path).decode('utf8')
elif is_python3:
import importlib.util
bytes_ = bytes
unicode_ = str
basestring_ = str
input_ = input
json_dumps = lambda data: ujson.dumps(data, indent=2)
json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False)
path2str = lambda path: str(path)
def b_to_str(b_str):
if is_python2:
return b_str
# important: if no encoding is set, string becomes "b'...'"
return str(b_str, encoding='utf8')
def getattr_(obj, name, *default):
if is_python3 and isinstance(name, bytes):
name = name.decode('utf8')
@ -92,3 +104,12 @@ def normalize_string_keys(old):
return new
def import_file(name, loc):
loc = str(loc)
if is_python2:
return imp.load_source(name, loc)
else:
spec = importlib.util.spec_from_file_location(name, str(loc))
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module


@ -15,7 +15,7 @@ def depr_model_download(lang):
lang (unicode): Language shortcut, 'en' or 'de'.
"""
prints("The spacy.%s.download command is now deprecated. Please use "
"python -m spacy download [model name or shortcut] instead. For "
"spacy download [model name or shortcut] instead. For "
"more info, see the documentation:" % lang,
about.__docs_models__,
"Downloading default '%s' model now..." % lang,


@ -3,6 +3,7 @@ from __future__ import unicode_literals
from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc
from ..compat import b_to_str
from ..util import prints, is_in_jupyter
@ -65,7 +66,9 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
def app(environ, start_response):
start_response('200 OK', [('Content-type', 'text/html; charset=utf-8')])
# headers and status need to be bytes in Python 2, see #1227
headers = [(b_to_str(b'Content-type'), b_to_str(b'text/html; charset=utf-8'))]
start_response(b_to_str(b'200 OK'), headers)
res = _html['parsed'].encode(encoding='utf-8')
return [res]


@ -60,7 +60,7 @@ GLOSSARY = {
'JJR': 'adjective, comparative',
'JJS': 'adjective, superlative',
'LS': 'list item marker',
'MD': 'verb, modal auxillary',
'MD': 'verb, modal auxiliary',
'NIL': 'missing tag',
'NN': 'noun, singular or mass',
'NNP': 'noun, proper singular',
@ -91,7 +91,7 @@ GLOSSARY = {
'NFP': 'superfluous punctuation',
'GW': 'additional word in multi-word expression',
'XX': 'unknown',
'BES': 'auxillary "be"',
'BES': 'auxiliary "be"',
'HVS': 'forms of "have"',


@ -9,6 +9,7 @@ cdef struct GoldParseC:
int* tags
int* heads
int* has_dep
int* sent_start
attr_t* labels
int** brackets
Transition* ner
@ -29,6 +30,7 @@ cdef class GoldParse:
cdef public list ner
cdef public list ents
cdef public dict brackets
cdef public object cats
cdef readonly list cand_to_gold
cdef readonly list gold_to_cand


@ -381,7 +381,8 @@ cdef class GoldParse:
make_projective=make_projective)
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
deps=None, entities=None, make_projective=False):
deps=None, entities=None, make_projective=False,
cats=tuple()):
"""Create a GoldParse.
doc (Doc): The document the annotations refer to.
@ -392,6 +393,12 @@ cdef class GoldParse:
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
cats (iterable): A sequence of labels for text classification. Each
label may be a string or an int, or a `(start_char, end_char, label)`
tuple, indicating that the label is applied to only part of the
document (usually a sentence). Unlike entity annotations, label
annotations can overlap, i.e. a single word can be covered by
multiple labelled spans.
RETURNS (GoldParse): The newly constructed object.
"""
if words is None:
@ -399,11 +406,11 @@ cdef class GoldParse:
if tags is None:
tags = [None for _ in doc]
if heads is None:
heads = [token.i for token in doc]
heads = [None for token in doc]
if deps is None:
deps = [None for _ in doc]
if entities is None:
entities = ['-' for _ in doc]
entities = [None for _ in doc]
elif len(entities) == 0:
entities = ['O' for _ in doc]
elif not isinstance(entities[0], basestring):
@ -419,8 +426,10 @@ cdef class GoldParse:
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.cats = list(cats)
self.words = [None] * len(doc)
self.tags = [None] * len(doc)
self.heads = [None] * len(doc)
@ -474,8 +483,12 @@ cdef class GoldParse:
"""
return not nonproj.is_nonproj_tree(self.heads)
@property
def sent_starts(self):
return [self.c.sent_start[i] for i in range(self.length)]
def biluo_tags_from_offsets(doc, entities):
def biluo_tags_from_offsets(doc, entities, missing='O'):
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
scheme (BILUO).
@ -527,7 +540,7 @@ def biluo_tags_from_offsets(doc, entities):
if i in entity_chars:
break
else:
biluo[token.i] = 'O'
biluo[token.i] = missing
return biluo
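A brief usage sketch of the new `cats` argument and the `missing` parameter shown above, assuming a loaded English pipeline `nlp` (the text and offsets are illustrative):

from spacy.gold import GoldParse, biluo_tags_from_offsets

doc = nlp(u'London is big')
gold = GoldParse(doc, cats=['POSITIVE'])    # text classification labels
tags = biluo_tags_from_offsets(doc, [(0, 6, 'GPE')], missing=None)
# tokens outside the given spans get the configurable missing tag:
# tags == ['U-GPE', None, None]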


@ -27,7 +27,7 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased)
_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
'TB T G M K')
'TB T G M K %')
_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'

18
spacy/lang/da/examples.py Normal file

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.da.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple overvejer at købe et britisk startup for 1 milliard dollar",
"Selvkørende biler flytter forsikringsansvaret over på producenterne",
"San Francisco overvejer at forbyde leverandørrobotter på fortov",
"London er en stor by i Storbritannien"
]

22
spacy/lang/de/examples.py Normal file

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.de.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
"Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
"Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
"Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
"San Francisco erwägt Verbot von Lieferrobotern",
"Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
"Wo bist du?",
"Was ist die Hauptstadt von Deutschland?"
]

22
spacy/lang/en/examples.py Normal file

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.en.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple is looking at buying U.K. startup for $1 billion",
"Autonomous cars shift insurance liability toward manufacturers",
"San Francisco considers banning sidewalk delivery robots",
"London is a big city in the United Kingdom.",
"Where are you?",
"Who is the president of France?",
"What is the capital of the United States?",
"When was Barack Obama born?"
]


@ -59,7 +59,8 @@ MORPH_RULES = {
"VBP": {
"are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
"'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
"'re": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
"am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
},
"VBD": {


@ -232,7 +232,10 @@ for verb_data in [
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be", NORM: "was"},
{ORTH: "were", LEMMA: "be", NORM: "were"}]:
{ORTH: "were", LEMMA: "be", NORM: "were"},
{ORTH: "have", NORM: "have"},
{ORTH: "has", LEMMA: "have", NORM: "has"},
{ORTH: "dare", NORM: "dare"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:

22
spacy/lang/es/examples.py Normal file

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.es.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
"Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
"San Francisco analiza prohibir los robots delivery",
"Londres es una gran ciudad del Reino Unido",
"El gato come pescado",
"Veo al hombre con el telescopio",
"La araña come moscas",
"El pingüino incuba en su nido"
]


@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@ -24,6 +25,7 @@ class FrenchDefaults(Language.Defaults):
infixes = tuple(TOKENIZER_INFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
token_match = TOKEN_MATCH
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):

File diff suppressed because it is too large

26
spacy/lang/fr/examples.py Normal file

@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.fr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple cherche à acheter une startup anglaise pour 1 milliard de dollars",
"Les voitures autonomes voient leur assurances décalées vers les constructeurs",
"San Francisco envisage d'interdire les robots coursiers",
"Londres est une grande ville du Royaume-Uni",
"L'Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d'Europe",
"Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
"La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
"Nouvelles attaques de Trump contre le maire de Londres",
"Où es-tu ?",
"Qui est le président de la France ?",
"Où est la capitale des Etats-Unis ?",
"Quand est né Barack Obama ?"
]


@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
def noun_chunks(obj):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
SYNTAX_ITERATORS = {
'noun_chunks': noun_chunks
}
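For context, a hedged sketch of how a registered `noun_chunks` iterator is consumed through `Doc.noun_chunks`, assuming a French pipeline with a parser is loaded as `nlp`:

doc = nlp(u"Londres est une grande ville du Royaume-Uni")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.dep_)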

28
spacy/lang/he/examples.py Normal file

@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.he.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
'רה"מ הודיע כי יחרים טקס בחסותו',
'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
'סע לשלום, המפתחות בפנים.',
'מלצר, פעמיים טורקי!',
'ואהבת לרעך כמוך.',
'היום נעשה משהו בלתי נשכח.',
'איפה הילד?',
'מיהו נשיא צרפת?',
'מהי בירת ארצות הברית?',
"איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
'מה הייתה הדקה?',
'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
]

42
spacy/lang/id/__init__.py Normal file

@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .lemmatizer import LOOKUP
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
class IndonesianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'id'
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Indonesian(Language):
lang = 'id'
Defaults = IndonesianDefaults
__all__ = ['Indonesian']

File diff suppressed because it is too large

22
spacy/lang/id/examples.py Normal file

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.id.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Al Qaidah mengklaim bom mobil yang menewaskan 60 Orang di Mali",
"Abu Sayyaf mengeksekusi sandera warga Filipina",
"Penyaluran pupuk berasal dari lima lokasi yakni Bontang, Kalimantan Timur, Surabaya, Banyuwangi, Semarang, dan Makassar.",
"PT Pupuk Kaltim telah menyalurkan 274.707 ton pupuk bersubsidi ke wilayah penyaluran di 14 provinsi.",
"Jakarta adalah kota besar yang nyaris tidak pernah tidur.",
"Kamu ada di mana semalam?",
"Siapa yang membeli makanan ringan tersebut?",
"Siapa presiden pertama Republik Indonesia?"
]

36883
spacy/lang/id/lemmatizer.py Normal file

File diff suppressed because it is too large


@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty',
'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety',
'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion',
'gajillion', 'bazillion',
'nol', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
'delapan', 'sembilan', 'sepuluh', 'sebelas', 'duabelas', 'tigabelas',
'empatbelas', 'limabelas', 'enambelas', 'tujuhbelas', 'delapanbelas',
'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
'noniliun', 'desiliun',
]
def like_num(text):
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True
if text.count('/') == 1:
num, denom = text.split('/')
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
return True
if text.count('-') == 1:
_, num = text.split('-')
if num.isdigit() or num in _num_words:
return True
return False
LEX_ATTRS = {
LIKE_NUM: like_num
}
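A few illustrative checks of `like_num` as defined above, assumed to run in the same module:

assert like_num('10.000')      # digits once separators are stripped
assert like_num('2/3')         # simple fractions
assert like_num('seribu')      # Indonesian number word
assert like_num('ke-7')        # hyphenated ordinal-style forms
assert not like_num('kucing')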


@ -0,0 +1,17 @@
# coding: utf8
from __future__ import unicode_literals
_exc = {
"Rp": "$",
"IDR": "$",
"RMB": "$",
"USD": "$",
"AUD": "$",
"GBP": "$",
}
NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
NORM_EXCEPTIONS[string] = norm
NORM_EXCEPTIONS[string.title()] = norm


@ -0,0 +1,53 @@
# coding: utf8
from __future__ import unicode_literals
from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ..char_classes import merge_chars, split_chars, _currency, _units
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
_units = (_units + 's bit Gbps Mbps mbps Kbps kbps ƒ ppi px '
'Hz kHz MHz GHz mAh '
'ratus rb ribu ribuan '
'juta jt jutaan mill?iar million bil[l]?iun bilyun billion '
)
_currency = (_currency + r' USD Rp IDR RMB SGD S\$')
_months = ('Januari Februari Maret April Mei Juni Juli Agustus September '
'Oktober November Desember January February March May June '
'July August October December Jan Feb Mar Jun Jul Aug Sept '
'Oct Okt Nov Des ')
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)
HTML_PREFIX = r'<(b|strong|i|em|p|span|div|br)\s?/>|<a([^>]+)>'
HTML_SUFFIX = r'</(b|strong|i|em|p|span|div|a)>'
MONTHS = merge_chars(_months)
LIST_CURRENCY = split_chars(_currency)
TOKENIZER_PREFIXES.remove('#') # hashtag
_prefixes = TOKENIZER_PREFIXES + LIST_CURRENCY + [HTML_PREFIX] + ['/', '']
_suffixes = TOKENIZER_SUFFIXES + [r'\-[Nn]ya', '-[KkMm]u', '[—-]'] + [
r'(?<={c})(?:[0-9]+)'.format(c=CURRENCY),
r'(?<=[0-9])(?:{u})'.format(u=UNITS),
r'(?<=[0-9])%',
r'(?<=[0-9{a}]{h})(?:[\.,:-])'.format(a=ALPHA, h=HTML_SUFFIX),
r'(?<=[0-9{a}])(?:{h})'.format(a=ALPHA, h=HTML_SUFFIX),
]
_infixes = TOKENIZER_INFIXES + [
r'(?<=[0-9])[\\/](?=[0-9%-])',
r'(?<=[0-9])%(?=[{a}0-9/])'.format(a=ALPHA),
r'(?<={u})[\/-](?=[0-9])'.format(u=UNITS),
r'(?<={m})[\/-](?=[0-9])'.format(m=MONTHS),
r'(?<=[0-9\)][\.,])"(?=[0-9])',
r'(?<=[{a}\)][\.,\'])["—](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])-(?=[0-9])'.format(a=ALPHA),
r'(?<=[0-9])-(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])[\/-](?={c}{a})'.format(a=ALPHA, c=CURRENCY),
]
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes

763
spacy/lang/id/stop_words.py Normal file

@ -0,0 +1,763 @@
# coding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
ada
adalah
adanya
adapun
agak
agaknya
agar
akan
akankah
akhir
akhiri
akhirnya
aku
akulah
amat
amatlah
anda
andalah
antar
antara
antaranya
apa
apaan
apabila
apakah
apalagi
apatah
artinya
asal
asalkan
atas
atau
ataukah
ataupun
awal
awalnya
bagai
bagaikan
bagaimana
bagaimanakah
bagaimanapun
bagi
bagian
bahkan
bahwa
bahwasanya
baik
bakal
bakalan
balik
banyak
bapak
baru
bawah
beberapa
begini
beginian
beginikah
beginilah
begitu
begitukah
begitulah
begitupun
bekerja
belakang
belakangan
belum
belumlah
benar
benarkah
benarlah
berada
berakhir
berakhirlah
berakhirnya
berapa
berapakah
berapalah
berapapun
berarti
berawal
berbagai
berdatangan
beri
berikan
berikut
berikutnya
berjumlah
berkali-kali
berkata
berkehendak
berkeinginan
berkenaan
berlainan
berlalu
berlangsung
berlebihan
bermacam
bermacam-macam
bermaksud
bermula
bersama
bersama-sama
bersiap
bersiap-siap
bertanya
bertanya-tanya
berturut
berturut-turut
bertutur
berujar
berupa
besar
betul
betulkah
biasa
biasanya
bila
bilakah
bisa
bisakah
boleh
bolehkah
bolehlah
buat
bukan
bukankah
bukanlah
bukannya
bulan
bung
cara
caranya
cukup
cukupkah
cukuplah
cuma
dahulu
dalam
dan
dapat
dari
daripada
datang
dekat
demi
demikian
demikianlah
dengan
depan
di
dia
diakhiri
diakhirinya
dialah
diantara
diantaranya
diberi
diberikan
diberikannya
dibuat
dibuatnya
didapat
didatangkan
digunakan
diibaratkan
diibaratkannya
diingat
diingatkan
diinginkan
dijawab
dijelaskan
dijelaskannya
dikarenakan
dikatakan
dikatakannya
dikerjakan
diketahui
diketahuinya
dikira
dilakukan
dilalui
dilihat
dimaksud
dimaksudkan
dimaksudkannya
dimaksudnya
diminta
dimintai
dimisalkan
dimulai
dimulailah
dimulainya
dimungkinkan
dini
dipastikan
diperbuat
diperbuatnya
dipergunakan
diperkirakan
diperlihatkan
diperlukan
diperlukannya
dipersoalkan
dipertanyakan
dipunyai
diri
dirinya
disampaikan
disebut
disebutkan
disebutkannya
disini
disinilah
ditambahkan
ditandaskan
ditanya
ditanyai
ditanyakan
ditegaskan
ditujukan
ditunjuk
ditunjuki
ditunjukkan
ditunjukkannya
ditunjuknya
dituturkan
dituturkannya
diucapkan
diucapkannya
diungkapkan
dong
dua
dulu
empat
enggak
enggaknya
entah
entahlah
guna
gunakan
hal
hampir
hanya
hanyalah
hari
harus
haruslah
harusnya
hendak
hendaklah
hendaknya
hingga
ia
ialah
ibarat
ibaratkan
ibaratnya
ibu
ikut
ingat
ingat-ingat
ingin
inginkah
inginkan
ini
inikah
inilah
itu
itukah
itulah
jadi
jadilah
jadinya
jangan
jangankan
janganlah
jauh
jawab
jawaban
jawabnya
jelas
jelaskan
jelaslah
jelasnya
jika
jikalau
juga
jumlah
jumlahnya
justru
kala
kalau
kalaulah
kalaupun
kalian
kami
kamilah
kamu
kamulah
kan
kapan
kapankah
kapanpun
karena
karenanya
kasus
kata
katakan
katakanlah
katanya
ke
keadaan
kebetulan
kecil
kedua
keduanya
keinginan
kelamaan
kelihatan
kelihatannya
kelima
keluar
kembali
kemudian
kemungkinan
kemungkinannya
kenapa
kepada
kepadanya
kesampaian
keseluruhan
keseluruhannya
keterlaluan
ketika
khususnya
kini
kinilah
kira
kira-kira
kiranya
kita
kitalah
kok
kurang
lagi
lagian
lah
lain
lainnya
lalu
lama
lamanya
lanjut
lanjutnya
lebih
lewat
lima
luar
macam
maka
makanya
makin
malah
malahan
mampu
mampukah
mana
manakala
manalagi
masa
masalah
masalahnya
masih
masihkah
masing
masing-masing
mau
maupun
melainkan
melakukan
melalui
melihat
melihatnya
memang
memastikan
memberi
memberikan
membuat
memerlukan
memihak
meminta
memintakan
memisalkan
memperbuat
mempergunakan
memperkirakan
memperlihatkan
mempersiapkan
mempersoalkan
mempertanyakan
mempunyai
memulai
memungkinkan
menaiki
menambahkan
menandaskan
menanti
menanti-nanti
menantikan
menanya
menanyai
menanyakan
mendapat
mendapatkan
mendatang
mendatangi
mendatangkan
menegaskan
mengakhiri
mengapa
mengatakan
mengatakannya
mengenai
mengerjakan
mengetahui
menggunakan
menghendaki
mengibaratkan
mengibaratkannya
mengingat
mengingatkan
menginginkan
mengira
mengucapkan
mengucapkannya
mengungkapkan
menjadi
menjawab
menjelaskan
menuju
menunjuk
menunjuki
menunjukkan
menunjuknya
menurut
menuturkan
menyampaikan
menyangkut
menyatakan
menyebutkan
menyeluruh
menyiapkan
merasa
mereka
merekalah
merupakan
meski
meskipun
meyakini
meyakinkan
minta
mirip
misal
misalkan
misalnya
mula
mulai
mulailah
mulanya
mungkin
mungkinkah
nah
naik
namun
nanti
nantinya
nyaris
nyatanya
oleh
olehnya
pada
padahal
padanya
pak
paling
panjang
pantas
para
pasti
pastilah
penting
pentingnya
per
percuma
perlu
perlukah
perlunya
pernah
persoalan
pertama
pertama-tama
pertanyaan
pertanyakan
pihak
pihaknya
pukul
pula
pun
punya
rasa
rasanya
rata
rupanya
saat
saatnya
saja
sajalah
saling
sama
sama-sama
sambil
sampai
sampai-sampai
sampaikan
sana
sangat
sangatlah
satu
saya
sayalah
se
sebab
sebabnya
sebagai
sebagaimana
sebagainya
sebagian
sebaik
sebaik-baiknya
sebaiknya
sebaliknya
sebanyak
sebegini
sebegitu
sebelum
sebelumnya
sebenarnya
seberapa
sebesar
sebetulnya
sebisanya
sebuah
sebut
sebutlah
sebutnya
secara
secukupnya
sedang
sedangkan
sedemikian
sedikit
sedikitnya
seenaknya
segala
segalanya
segera
seharusnya
sehingga
seingat
sejak
sejauh
sejenak
sejumlah
sekadar
sekadarnya
sekali
sekali-kali
sekalian
sekaligus
sekalipun
sekarang
sekecil
seketika
sekiranya
sekitar
sekitarnya
sekurang-kurangnya
sekurangnya
sela
selain
selaku
selalu
selama
selama-lamanya
selamanya
selanjutnya
seluruh
seluruhnya
semacam
semakin
semampu
semampunya
semasa
semasih
semata
semata-mata
semaunya
sementara
semisal
semisalnya
sempat
semua
semuanya
semula
sendiri
sendirian
sendirinya
seolah
seolah-olah
seorang
sepanjang
sepantasnya
sepantasnyalah
seperlunya
seperti
sepertinya
sepihak
sering
seringnya
serta
serupa
sesaat
sesama
sesampai
sesegera
sesekali
seseorang
sesuatu
sesuatunya
sesudah
sesudahnya
setelah
setempat
setengah
seterusnya
setiap
setiba
setibanya
setidak-tidaknya
setidaknya
setinggi
seusai
sewaktu
siap
siapa
siapakah
siapapun
sini
sinilah
soal
soalnya
suatu
sudah
sudahkah
sudahlah
supaya
tadi
tadinya
tahu
tahun
tak
tambah
tambahnya
tampak
tampaknya
tandas
tandasnya
tanpa
tanya
tanyakan
tanyanya
tapi
tegas
tegasnya
telah
tempat
tengah
tentang
tentu
tentulah
tentunya
tepat
terakhir
terasa
terbanyak
terdahulu
terdapat
terdiri
terhadap
terhadapnya
teringat
teringat-ingat
terjadi
terjadilah
terjadinya
terkira
terlalu
terlebih
terlihat
termasuk
ternyata
tersampaikan
tersebut
tersebutlah
tertentu
tertuju
terus
terutama
tetap
tetapi
tiap
tiba
tiba-tiba
tidak
tidakkah
tidaklah
tiga
tinggi
toh
tunjuk
turut
tutur
tuturnya
ucap
ucapnya
ujar
ujarnya
umum
umumnya
ungkap
ungkapnya
untuk
usah
usai
waduh
wah
wahai
waktu
waktunya
walau
walaupun
wong
yaitu
yakin
yakni
yang
""".split())

View File

@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
def noun_chunks(obj):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
SYNTAX_ITERATORS = {
'noun_chunks': noun_chunks
}
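Once registered via SYNTAX_ITERATORS, the generator above backs `Doc.noun_chunks`. A hedged usage sketch, assuming a dependency-parsed Indonesian Doc is available (the model name below is hypothetical; no Indonesian parser ships with spaCy at this point):

# Hypothetical model name; any parsed Doc for a language that defines
# SYNTAX_ITERATORS['noun_chunks'] behaves the same way.
import spacy

nlp = spacy.load('id_model')
doc = nlp('Budi membeli sebuah mobil baru di Jakarta.')
for chunk in doc.noun_chunks:       # backed by noun_chunks() above
    print(chunk.text, chunk.root.dep_)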

View File

@ -0,0 +1,50 @@
# coding: utf8
from __future__ import unicode_literals
import regex as re
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
from ..tokenizer_exceptions import URL_PATTERN
from ...symbols import ORTH
_exc = {}
for orth in ID_BASE_EXCEPTIONS:
_exc[orth] = [{ORTH: orth}]
orth_title = orth.title()
_exc[orth_title] = [{ORTH: orth_title}]
orth_caps = orth.upper()
_exc[orth_caps] = [{ORTH: orth_caps}]
orth_lower = orth.lower()
_exc[orth_lower] = [{ORTH: orth_lower}]
if '-' in orth:
orth_title = '-'.join([part.title() for part in orth.split('-')])
_exc[orth_title] = [{ORTH: orth_title}]
orth_caps = '-'.join([part.upper() for part in orth.split('-')])
_exc[orth_caps] = [{ORTH: orth_caps}]
for orth in [
"'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
"E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
"Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
"Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs.",
"B.A.", "B.Ch.E.", "B.Sc.", "Dr.", "Dra.", "Drs.", "Hj.", "Ka.", "Kp.",
"M.Ag.", "M.Hum.", "M.Kes,", "M.Kom.", "M.M.", "M.P.", "M.Pd.", "M.Sc.",
"M.Si.", "M.Sn.", "M.T.", "M.Th.", "No.", "Pjs.", "Plt.", "R.A.", "S.Ag.",
"S.E.", "S.H.", "S.Hut.", "S.K.M.", "S.Kedg.", "S.Kedh.", "S.Kom.",
"S.Pd.", "S.Pol.", "S.Psi.", "S.S.", "S.Sos.", "S.T.", "S.Tekp.", "S.Th.",
"a.l.", "a.n.", "a.s.", "b.d.", "d.a.", "d.l.", "d/h", "dkk.", "dll.",
"dr.", "drh.", "ds.", "dsb.", "dst.", "faks.", "fax.", "hlm.", "i/o",
"n.b.", "p.p." "pjs.", "s.d.", "tel.", "u.p.",
]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)
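The casing loop above registers each base exception in its original, Title and UPPER forms, so abbreviations such as "dll." survive tokenization in any casing. A small self-contained sketch of what ends up in the dict (string keys stand in for the ORTH symbol used in the real code):

# Illustration only: 'ORTH' is written as a plain string here, whereas the
# code above uses the ORTH attribute ID imported from the symbols module.
variants = {}
for orth in ['dll.']:
    for form in (orth, orth.title(), orth.upper(), orth.lower()):
        variants[form] = [{'ORTH': form}]
print(sorted(variants))   # ['DLL.', 'Dll.', 'dll.']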

18
spacy/lang/it/examples.py Normal file
View File

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.it.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
"Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
"San Francisco prevede di bandire i robot di consegna porta a porta",
"Londra è una grande città del Regno Unito."
]

View File

@ -137,6 +137,7 @@ LEX_ATTRS = {
attrs.IS_UPPER: lambda string: string.isupper(),
attrs.IS_STOP: lambda string: False,
attrs.IS_OOV: lambda string: True,
attrs.PROB: lambda string: -20.,
attrs.LIKE_EMAIL: like_email,
attrs.LIKE_NUM: like_num,
attrs.IS_PUNCT: is_punct,

18
spacy/lang/nb/examples.py Normal file
View File

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.nb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
"Selvkjørende biler flytter forsikringsansvaret over på produsentene ",
"San Francisco vurderer å forby robotbud på fortauene",
"London er en stor by i Storbritannia."
]

View File

@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
@ -15,7 +16,7 @@ class PolishDefaults(Language.Defaults):
lex_attr_getters[LANG] = lambda text: 'pl'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)

20
spacy/lang/pl/examples.py Normal file
View File

@ -0,0 +1,20 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.pl.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Poczuł przyjemną woń mocnej kawy.",
"Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
"Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
"Nowy abonament pod lupą Komisji Europejskiej",
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
]

View File

@ -0,0 +1,23 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import ORTH, LEMMA, POS, ADJ, ADV, NOUN
_exc = {}
for exc_data in [
{ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
{ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
{ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]
for orth in [
"w.", "r."]:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = dict(_exc)

18
spacy/lang/pt/examples.py Normal file
View File

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.pt.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes."
"São Francisco considera banir os robôs de entrega que andam pelas calçadas",
"Londres é a maior cidade do Reino Unido"
]

18
spacy/lang/sv/examples.py Normal file
View File

@ -0,0 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.sv.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple överväger att köpa brittisk startup för 1 miljard dollar.",
"Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
"San Fransisco överväger förbud mot leveransrobotar på trottoarer.".
"London är en storstad i Storbritannien."
]

View File

@ -15,6 +15,7 @@ class Chinese(Language):
raise ImportError("The Chinese tokenizer requires the Jieba library: "
"https://github.com/fxsjy/jieba")
words = list(jieba.cut(text, cut_all=True))
words = [x for x in words if x]
return Doc(self.vocab, words=words, spaces=[False]*len(words))
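A hedged note on the filter added above: with `cut_all=True`, jieba's full mode can yield empty strings (typically around punctuation), and those must not be passed to `Doc` as words. A minimal sketch, assuming the jieba package is installed:

# Sketch only: the exact segments depend on the jieba version and dictionary.
import jieba

words = list(jieba.cut('我爱北京。', cut_all=True))
print(words)                      # may contain '' entries
words = [x for x in words if x]   # same filter as in the tokenizer above
print(words)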

View File

@ -10,6 +10,7 @@ from thinc.neural.optimizers import Adam, SGD
import random
import ujson
from collections import OrderedDict
import itertools
from .tokenizer import Tokenizer
from .vocab import Vocab
@ -22,8 +23,10 @@ from .pipeline import NeuralDependencyParser, EntityRecognizer
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
from .pipeline import NeuralLabeller
from .pipeline import SimilarityHook
from .pipeline import TextCategorizer
from . import about
from .compat import json_dumps
from .compat import json_dumps, izip
from .attrs import IS_STOP
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
@ -92,7 +95,7 @@ class BaseDefaults(object):
meta = nlp.meta if nlp is not None else {}
# Resolve strings, like "cnn", "lstm", etc
pipeline = []
for entry in cls.pipeline:
for entry in meta.get('pipeline', []):
if entry in disable or getattr(entry, 'name', entry) in disable:
continue
factory = cls.Defaults.factories[entry]
@ -107,6 +110,8 @@ class BaseDefaults(object):
NeuralDependencyParser(nlp.vocab, **cfg),
nonproj.deprojectivize],
'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)],
'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)],
# Temporary compatibility -- delete after pivot
'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
@ -115,7 +120,6 @@ class BaseDefaults(object):
nonproj.deprojectivize,
],
'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)]
}
token_match = TOKEN_MATCH
@ -147,8 +151,8 @@ class Language(object):
Defaults = BaseDefaults
lang = None
def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={},
disable=tuple(), **kwargs):
def __init__(self, vocab=True, make_doc=True, pipeline=None,
meta={}, disable=tuple(), **kwargs):
"""Initialise a Language object.
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
@ -165,7 +169,7 @@ class Language(object):
models to add model meta data.
RETURNS (Language): The newly constructed object.
"""
self.meta = dict(meta)
self._meta = dict(meta)
if vocab is True:
factory = self.Defaults.create_vocab
vocab = factory(self, **meta.get('vocab', {}))
@ -196,6 +200,29 @@ class Language(object):
else:
flat_list.append(pipe)
self.pipeline = flat_list
self._optimizer = None
@property
def meta(self):
self._meta.setdefault('lang', self.vocab.lang)
self._meta.setdefault('name', '')
self._meta.setdefault('version', '0.0.0')
self._meta.setdefault('spacy_version', about.__version__)
self._meta.setdefault('description', '')
self._meta.setdefault('author', '')
self._meta.setdefault('email', '')
self._meta.setdefault('url', '')
self._meta.setdefault('license', '')
pipeline = []
for component in self.pipeline:
if hasattr(component, 'name'):
pipeline.append(component.name)
self._meta['pipeline'] = pipeline
return self._meta
@meta.setter
def meta(self, value):
self._meta = value
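A hedged sketch of what the lazily built `meta` property above returns for a freshly constructed pipeline (the defaults come from the `setdefault` calls; 'pipeline' lists the `.name` of every component):

# Sketch under the assumption that a bare English() starts with no
# statistical components; values shown are the defaults set above.
from spacy.lang.en import English

nlp = English()
print(nlp.meta['lang'])        # 'en', taken from the vocab
print(nlp.meta['version'])     # '0.0.0'
print(nlp.meta['pipeline'])    # names of the components currently loaded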
# Conveniences to access pipeline components
@property
@ -218,7 +245,7 @@ class Language(object):
def matcher(self):
return self.get_component('matcher')
def get_component(self, name):
def get_component(self, name):
if self.pipeline in (True, None):
return None
for proc in self.pipeline:
@ -251,7 +278,8 @@ class Language(object):
def make_doc(self, text):
return self.tokenizer(text)
def update(self, docs, golds, drop=0., sgd=None, losses=None):
def update(self, docs, golds, drop=0., sgd=None, losses=None,
update_shared=False):
"""Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects.
@ -266,6 +294,15 @@ class Language(object):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
"""
if len(docs) != len(golds):
raise IndexError("Update expects same number of docs and golds. "
"Got: %d, %d" % (len(docs), len(golds)))
if len(docs) == 0:
return
if sgd is None:
if self._optimizer is None:
self._optimizer = Adam(Model.ops, 0.001)
sgd = self._optimizer
tok2vec = self.pipeline[0]
feats = tok2vec.doc2feats(docs)
grads = {}
@ -273,14 +310,18 @@ class Language(object):
grads[key] = (W, dW)
pipes = list(self.pipeline[1:])
random.shuffle(pipes)
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
for proc in pipes:
if not hasattr(proc, 'update'):
continue
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
d_tokvecses = proc.update((docs, tokvecses), golds,
drop=drop, sgd=get_grads, losses=losses)
if d_tokvecses is not None:
bp_tokvecses(d_tokvecses, sgd=sgd)
if update_shared and d_tokvecses is not None:
for i, d_tv in enumerate(d_tokvecses):
all_d_tokvecses[i] += d_tv
if update_shared and bp_tokvecses is not None:
bp_tokvecses(all_d_tokvecses, sgd=sgd)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
# Clear the tensor variable, to free GPU memory.
@ -343,16 +384,25 @@ class Language(object):
eps = util.env_opt('optimizer_eps', 1e-08)
L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
optimizer.max_grad_norm = max_grad_norm
optimizer.device = device
return optimizer
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
self._optimizer.max_grad_norm = max_grad_norm
self._optimizer.device = device
return self._optimizer
def evaluate(self, docs_golds):
docs, golds = zip(*docs_golds)
scorer = Scorer()
for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
docs, golds = zip(*docs_golds)
docs = list(docs)
golds = list(golds)
for pipe in self.pipeline:
if not hasattr(pipe, 'pipe'):
for doc in docs:
pipe(doc)
else:
docs = list(pipe.pipe(docs))
assert len(docs) == len(golds)
for doc, gold in zip(docs, golds):
scorer.score(doc, gold)
doc.tensor = None
return scorer
@ -386,11 +436,16 @@ class Language(object):
except StopIteration:
pass
def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
disable=[]):
"""Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading.
texts (iterator): A sequence of texts to process.
as_tuples (bool):
If set to True, inputs should be a sequence of
(text, context) tuples. Output will then be a sequence of
(doc, context) tuples. Defaults to False.
n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer.
@ -402,8 +457,16 @@ class Language(object):
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
>>> assert doc.is_parsed
"""
if as_tuples:
text_context1, text_context2 = itertools.tee(texts)
texts = (tc[0] for tc in text_context1)
contexts = (tc[1] for tc in text_context2)
docs = self.pipe(texts, n_threads=n_threads, batch_size=batch_size,
disable=disable)
for doc, context in izip(docs, contexts):
yield (doc, context)
return
docs = (self.make_doc(text) for text in texts)
docs = texts
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disable:

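A hedged usage sketch for the `as_tuples` branch added to `pipe()` above: stream (text, context) pairs through the pipeline and receive (doc, context) pairs back in the same order. The model name and ids below are illustrative only.

# Illustrative only: any installed model works; 'en_core_web_sm' is just an
# example name, and the contexts can be arbitrary Python objects.
import spacy

nlp = spacy.load('en_core_web_sm')
data = [('A first document.', {'id': 1}),
        ('A second document.', {'id': 2})]
for doc, context in nlp.pipe(data, as_tuples=True):
    print(context['id'], doc)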
View File

@ -44,6 +44,11 @@ class Lemmatizer(object):
return True
elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
morphology.get('Tense') == 'pres'):
return True
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
return True
elif VerbForm_inf in morphology:

View File

@ -171,6 +171,8 @@ cdef class Lexeme:
property rank:
def __get__(self):
return self.c.id
def __set__(self, value):
self.c.id = value
property sentiment:
def __get__(self):

View File

@ -42,15 +42,148 @@ from .compat import json_dumps
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
from ._ml import build_text_classifier, build_tagger_model
from .parts_of_speech import X
class TokenVectorEncoder(object):
class SentenceSegmenter(object):
'''A simple spaCy hook, to allow custom sentence boundary detection logic
(that doesn't require the dependency parse).
To change the sentence boundary detection strategy, pass a generator
function `strategy` on initialization, or assign a new strategy to
the .strategy attribute.
Sentence detection strategies should be generators that take `Doc` objects
and yield `Span` objects for each sentence.
'''
name = 'sbd'
def __init__(self, vocab, strategy=None):
self.vocab = vocab
if strategy is None or strategy == 'on_punct':
strategy = self.split_on_punct
self.strategy = strategy
def __call__(self, doc):
doc.user_hooks['sents'] = self.strategy
@staticmethod
def split_on_punct(doc):
start = 0
seen_period = False
for i, word in enumerate(doc):
if seen_period and not word.is_punct:
yield doc[start : word.i]
start = word.i
seen_period = False
elif word.text in ['.', '!', '?']:
seen_period = True
if start < len(doc):
yield doc[start : len(doc)]
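A hedged usage sketch for the SentenceSegmenter hook above, assuming `Doc.sents` consults `user_hooks['sents']` before requiring a dependency parse:

# Sketch only: uses the default split_on_punct strategy on a tokenized Doc.
from spacy.lang.en import English
from spacy.pipeline import SentenceSegmenter

nlp = English()
segmenter = SentenceSegmenter(nlp.vocab)
doc = nlp.make_doc('This is one sentence. This is another! And a third?')
segmenter(doc)   # installs the strategy as doc.user_hooks['sents']
print([sent.text for sent in doc.sents])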
class BaseThincComponent(object):
name = None
@classmethod
def Model(cls, *shape, **kwargs):
raise NotImplementedError
def __init__(self, vocab, model=True, **cfg):
raise NotImplementedError
def __call__(self, doc):
scores = self.predict([doc])
self.set_annotations([doc], scores)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)
self.set_annotations(docs, scores)
yield from docs
def predict(self, docs):
raise NotImplementedError
def set_annotations(self, docs, scores):
raise NotImplementedError
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
raise NotImplementedError
def get_loss(self, docs, golds, scores):
raise NotImplementedError
def begin_training(self, gold_tuples=tuple(), pipeline=None):
token_vector_width = pipeline[0].model.nO
if self.model is True:
self.model = self.Model(1, token_vector_width)
def use_params(self, params):
with self.model.use_params(params):
yield
def to_bytes(self, **exclude):
serialize = OrderedDict((
('cfg', lambda: json_dumps(self.cfg)),
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes())
))
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
def load_model(b):
if self.model is True:
self.model = self.Model(**self.cfg)
self.model.from_bytes(b)
deserialize = OrderedDict((
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
('model', load_model),
('vocab', lambda b: self.vocab.from_bytes(b))
))
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
serialize = OrderedDict((
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('vocab', lambda p: self.vocab.to_disk(p))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
def load_model(p):
if self.model is True:
self.model = self.Model(**self.cfg)
self.model.from_bytes(p.open('rb').read())
deserialize = OrderedDict((
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
('model', load_model),
('vocab', lambda p: self.vocab.from_disk(p)),
))
util.from_disk(path, deserialize, exclude)
return self
def _load_cfg(path):
if path.exists():
return ujson.load(path.open())
else:
return {}
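BaseThincComponent above standardizes `__call__`, `pipe()` and the to/from bytes/disk helpers, so a new component mainly has to supply `predict()` and `set_annotations()` (plus a `Model` and `get_loss()` if it is trainable). A hedged skeleton of a made-up, inference-only component:

# Illustrative component only: the name, the toy scores and the user_data key
# are all made up; a real component would wrap a thinc model.
import numpy
from spacy.pipeline import BaseThincComponent


class DocLengthScorer(BaseThincComponent):
    name = 'doc_length_scorer'

    def __init__(self, vocab, model=True, **cfg):
        self.vocab = vocab
        self.model = model
        self.cfg = dict(cfg)

    def predict(self, docs):
        # Toy scores: a single column proportional to document length.
        return numpy.asarray([[len(doc)] for doc in docs], dtype='f')

    def set_annotations(self, docs, scores):
        for doc, score in zip(docs, scores):
            doc.user_data['doc_length'] = float(score[0])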
class TokenVectorEncoder(BaseThincComponent):
"""Assign position-sensitive vectors to tokens, using a CNN or RNN."""
name = 'tensorizer'
@classmethod
def Model(cls, width=128, embed_size=7500, **cfg):
def Model(cls, width=128, embed_size=4000, **cfg):
"""Create a new statistical model for the class.
width (int): Output size of the model.
@ -79,6 +212,7 @@ class TokenVectorEncoder(object):
self.vocab = vocab
self.doc2feats = doc2feats()
self.model = model
self.cfg = dict(cfg)
def __call__(self, doc):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
@ -144,7 +278,7 @@ class TokenVectorEncoder(object):
# TODO: implement
raise NotImplementedError
def begin_training(self, gold_tuples, pipeline=None):
def begin_training(self, gold_tuples=tuple(), pipeline=None):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer.
@ -155,74 +289,34 @@ class TokenVectorEncoder(object):
if self.model is True:
self.model = self.Model()
def use_params(self, params):
"""Replace weights of models in the pipeline with those provided in the
params dictionary.
params (dict): A dictionary of parameters keyed by model ID.
"""
with self.model.use_params(params):
yield
def to_bytes(self, **exclude):
serialize = OrderedDict((
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes())
))
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
if self.model is True:
self.model = self.Model()
deserialize = OrderedDict((
('model', lambda b: self.model.from_bytes(b)),
('vocab', lambda b: self.vocab.from_bytes(b))
))
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
serialize = OrderedDict((
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('vocab', lambda p: self.vocab.to_disk(p))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
if self.model is True:
self.model = self.Model()
deserialize = OrderedDict((
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
('vocab', lambda p: self.vocab.from_disk(p))
))
util.from_disk(path, deserialize, exclude)
return self
class NeuralTagger(object):
class NeuralTagger(BaseThincComponent):
name = 'tagger'
def __init__(self, vocab, model=True):
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
def __call__(self, doc):
tags = self.predict([doc.tensor])
tags = self.predict(([doc], [doc.tensor]))
self.set_annotations([doc], tags)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tokvecs = [d.tensor for d in docs]
tag_ids = self.predict(tokvecs)
tag_ids = self.predict((docs, tokvecs))
self.set_annotations(docs, tag_ids)
yield from docs
def predict(self, tokvecs):
scores = self.model(tokvecs)
def predict(self, docs_tokvecs):
scores = self.model(docs_tokvecs)
scores = self.model.ops.flatten(scores)
guesses = scores.argmax(axis=1)
if not isinstance(guesses, numpy.ndarray):
guesses = guesses.get()
tokvecs = docs_tokvecs[1]
guesses = self.model.ops.unflatten(guesses,
[tv.shape[0] for tv in tokvecs])
return guesses
@ -235,6 +329,8 @@ class NeuralTagger(object):
cdef Vocab vocab = self.vocab
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, 'get'):
doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber preset POS tags
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
@ -243,16 +339,18 @@ class NeuralTagger(object):
doc.is_tagged = True
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvecs = docs_tokvecs
if self.model.nI is None:
self.model.nI = tokvecs[0].shape[1]
tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
if losses is not None:
losses[self.name] += loss
return d_tokvecs
def get_loss(self, docs, golds, scores):
@ -276,7 +374,7 @@ class NeuralTagger(object):
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
def begin_training(self, gold_tuples, pipeline=None):
def begin_training(self, gold_tuples=tuple(), pipeline=None):
orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = {}
for raw_text, annots_brackets in gold_tuples:
@ -300,10 +398,8 @@ class NeuralTagger(object):
@classmethod
def Model(cls, n_tags, token_vector_width):
return with_flatten(
chain(Maxout(token_vector_width, token_vector_width),
Softmax(n_tags, token_vector_width)))
return build_tagger_model(n_tags, token_vector_width)
def use_params(self, params):
with self.model.use_params(params):
yield
@ -321,7 +417,8 @@ class NeuralTagger(object):
def from_bytes(self, bytes_data, **exclude):
def load_model(b):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width', 128)
token_vector_width = util.env_opt('token_vector_width',
self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(b)
@ -348,13 +445,15 @@ class NeuralTagger(object):
use_bin_type=True,
encoding='utf8'))),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
def load_model(p):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width', 128)
token_vector_width = util.env_opt('token_vector_width',
self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(p.open('rb').read())
@ -370,6 +469,7 @@ class NeuralTagger(object):
('vocab', lambda p: self.vocab.from_disk(p)),
('tag_map', load_tag_map),
('model', load_model),
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
))
util.from_disk(path, deserialize, exclude)
return self
@ -377,15 +477,23 @@ class NeuralTagger(object):
class NeuralLabeller(NeuralTagger):
name = 'nn_labeller'
def __init__(self, vocab, model=True):
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.labels = {}
self.cfg = dict(cfg)
@property
def labels(self):
return self.cfg.setdefault('labels', {})
@labels.setter
def labels(self, value):
self.cfg['labels'] = value
def set_annotations(self, docs, dep_ids):
pass
def begin_training(self, gold_tuples, pipeline=None):
def begin_training(self, gold_tuples=tuple(), pipeline=None):
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
for raw_text, annots_brackets in gold_tuples:
for annots, brackets in annots_brackets:
@ -399,10 +507,8 @@ class NeuralLabeller(NeuralTagger):
@classmethod
def Model(cls, n_tags, token_vector_width):
return with_flatten(
chain(Maxout(token_vector_width, token_vector_width),
Softmax(n_tags, token_vector_width)))
return build_tagger_model(n_tags, token_vector_width)
def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores)
cdef int idx = 0
@ -423,7 +529,7 @@ class NeuralLabeller(NeuralTagger):
return float(loss), d_scores
class SimilarityHook(object):
class SimilarityHook(BaseThincComponent):
"""
Experimental
@ -439,9 +545,10 @@ class SimilarityHook(object):
Where W is a vector of dimension weights, initialized to 1.
"""
name = 'similarity'
def __init__(self, vocab, model=True):
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
@classmethod
def Model(cls, length):
@ -467,7 +574,7 @@ class SimilarityHook(object):
return d_tensor1s, d_tensor2s
def begin_training(self, _, pipeline=None):
def begin_training(self, _=tuple(), pipeline=None):
"""
Allocate model, using width from tensorizer in pipeline.
@ -477,48 +584,77 @@ class SimilarityHook(object):
if self.model is True:
self.model = self.Model(pipeline[0].model.nO)
def use_params(self, params):
"""Replace weights of models in the pipeline with those provided in the
params dictionary.
params (dict): A dictionary of parameters keyed by model ID.
"""
with self.model.use_params(params):
yield
class TextCategorizer(BaseThincComponent):
name = 'textcat'
def to_bytes(self, **exclude):
serialize = OrderedDict((
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes())
))
return util.to_bytes(serialize, exclude)
@classmethod
def Model(cls, nr_class=1, width=64, **cfg):
return build_text_classifier(nr_class, width, **cfg)
def from_bytes(self, bytes_data, **exclude):
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
@property
def labels(self):
return self.cfg.get('labels', ['LABEL'])
@labels.setter
def labels(self, value):
self.cfg['labels'] = value
def __call__(self, doc):
scores = self.predict([doc])
self.set_annotations([doc], scores)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)
self.set_annotations(docs, scores)
yield from docs
def predict(self, docs):
scores = self.model(docs)
scores = self.model.ops.asarray(scores)
return scores
def set_annotations(self, docs, scores):
for i, doc in enumerate(docs):
for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j])
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
docs, tensors = docs_tensors
scores, bp_scores = self.model.begin_update(docs, drop=drop)
loss, d_scores = self.get_loss(docs, golds, scores)
d_tensors = bp_scores(d_scores, sgd=sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
return d_tensors
def get_loss(self, docs, golds, scores):
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
for i, gold in enumerate(golds):
for j, label in enumerate(self.labels):
truths[i, j] = label in gold.cats
truths = self.model.ops.asarray(truths)
d_scores = (scores-truths) / scores.shape[0]
mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
return mean_square_error, d_scores
def begin_training(self, gold_tuples=tuple(), pipeline=None):
if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
token_vector_width = pipeline[0].model.nO
else:
token_vector_width = 64
if self.model is True:
self.model = self.Model()
deserialize = OrderedDict((
('model', lambda b: self.model.from_bytes(b)),
('vocab', lambda b: self.vocab.from_bytes(b))
))
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
serialize = OrderedDict((
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('vocab', lambda p: self.vocab.to_disk(p))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
if self.model is True:
self.model = self.Model()
deserialize = OrderedDict((
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
('vocab', lambda p: self.vocab.from_disk(p))
))
util.from_disk(path, deserialize, exclude)
return self
self.model = self.Model(len(self.labels), token_vector_width,
**self.cfg)
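A hedged numeric sketch of `TextCategorizer.get_loss()` above, with two documents and two labels; the gradient is the per-example difference and the loss is the mean summed squared error:

# Worked example with made-up scores; mirrors the arithmetic in get_loss().
import numpy

scores = numpy.asarray([[0.9, 0.2],
                        [0.4, 0.8]], dtype='f')
truths = numpy.asarray([[1.0, 0.0],
                        [0.0, 1.0]], dtype='f')
d_scores = (scores - truths) / scores.shape[0]
mean_square_error = ((scores - truths) ** 2).sum(axis=1).mean()
print(d_scores)            # [[-0.05  0.1 ] [ 0.2  -0.1 ]]
print(mean_square_error)   # (0.05 + 0.20) / 2 = 0.125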
cdef class EntityRecognizer(LinearParser):
@ -569,6 +705,14 @@ cdef class NeuralEntityRecognizer(NeuralParser):
nr_feature = 6
def predict_confidences(self, docs):
tensors = [d.tensor for d in docs]
samples = []
for i in range(10):
states = self.parse_batch(docs, tensors, drop=0.3)
for state in states:
samples.append(self._get_entities(state))
def __reduce__(self):
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)

View File

@ -215,7 +215,10 @@ cdef class StringStore:
path = util.ensure_path(path)
with path.open('r') as file_:
strings = ujson.load(file_)
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
return self
def to_bytes(self, **exclude):
@ -234,7 +237,10 @@ cdef class StringStore:
RETURNS (StringStore): The `StringStore` object.
"""
strings = ujson.loads(bytes_data)
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
return self
def set_frozen(self, bint is_frozen):

View File

@ -0,0 +1,286 @@
# cython: infer_types=True
# cython: profile=True
cimport numpy as np
import numpy
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.typedefs cimport hash_t, class_t
from thinc.extra.search cimport MaxViolation
from .transition_system cimport TransitionSystem, Transition
from .stateclass cimport StateClass
from ..gold cimport GoldParse
from ..tokens.doc cimport Doc
# These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateClass>_dest
src = <StateClass>_src
moves = <const Transition*>_moves
dest.clone(src)
moves[clas].do(dest.c, moves[clas].label)
cdef int _check_final_state(void* _state, void* extra_args) except -1:
return (<StateClass>_state).is_final()
def _cleanup(Beam beam):
for i in range(beam.width):
Py_XDECREF(<PyObject*>beam._states[i].content)
Py_XDECREF(<PyObject*>beam._parents[i].content)
cdef hash_t _hash_state(void* _state, void* _) except 0:
state = <StateClass>_state
if state.c.is_final():
return 1
else:
return state.c.hash()
cdef class ParserBeam(object):
cdef public TransitionSystem moves
cdef public object states
cdef public object golds
cdef public object beams
cdef public object dones
def __init__(self, TransitionSystem moves, states, golds,
int width, float density):
self.moves = moves
self.states = states
self.golds = golds
self.beams = []
cdef Beam beam
cdef StateClass state, st
for state in states:
beam = Beam(self.moves.n_moves, width, density)
beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
for i in range(beam.width):
st = <StateClass>beam.at(i)
st.c.offset = state.c.offset
self.beams.append(beam)
self.dones = [False] * len(self.beams)
def __dealloc__(self):
if self.beams is not None:
for beam in self.beams:
if beam is not None:
_cleanup(beam)
@property
def is_done(self):
return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
def __getitem__(self, i):
return self.beams[i]
def __len__(self):
return len(self.beams)
def advance(self, scores, follow_gold=False):
cdef Beam beam
for i, beam in enumerate(self.beams):
if beam.is_done or not scores[i].size or self.dones[i]:
continue
self._set_scores(beam, scores[i])
if self.golds is not None:
self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
if follow_gold:
beam.advance(_transition_state, NULL, <void*>self.moves.c)
else:
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
if beam.is_done and self.golds is not None:
for j in range(beam.size):
state = <StateClass>beam.at(j)
if state.is_final():
try:
if self.moves.is_gold_parse(state, self.golds[i]):
beam._states[j].loss = 0.0
elif beam._states[j].loss == 0.0:
beam._states[j].loss = 1.0
except NotImplementedError:
break
def _set_scores(self, Beam beam, float[:, ::1] scores):
cdef float* c_scores = &scores[0, 0]
cdef int nr_state = min(scores.shape[0], beam.size)
cdef int nr_class = scores.shape[1]
for i in range(nr_state):
state = <StateClass>beam.at(i)
if not state.is_final():
for j in range(nr_class):
beam.scores[i][j] = c_scores[i * nr_class + j]
self.moves.set_valid(beam.is_valid[i], state.c)
else:
for j in range(beam.nr_class):
beam.scores[i][j] = 0
beam.costs[i][j] = 0
def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False):
for i in range(beam.size):
state = <StateClass>beam.at(i)
if not state.c.is_final():
self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
if follow_gold:
for j in range(beam.nr_class):
if beam.costs[i][j] >= 1:
beam.is_valid[i][j] = 0
def get_token_ids(states, int n_tokens):
cdef StateClass state
cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
dtype='int32', order='C')
c_ids = <int*>ids.data
for i, state in enumerate(states):
if not state.is_final():
state.c.set_context_tokens(c_ids, n_tokens)
else:
ids[i] = -1
c_ids += ids.shape[1]
return ids
nr_update = 0
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
states, tokvecs, golds,
state2vec, vec2scores,
int width, float density,
sgd=None, losses=None, drop=0.):
global nr_update
cdef MaxViolation violn
nr_update += 1
pbeam = ParserBeam(moves, states, golds,
width=width, density=density)
gbeam = ParserBeam(moves, states, golds,
width=width, density=0.0)
cdef StateClass state
beam_maps = []
backprops = []
violns = [MaxViolation() for _ in range(len(states))]
for t in range(max_steps):
if pbeam.is_done and gbeam.is_done:
break
# The beam maps let us find the right row in the flattened scores
# arrays for each state. States are identified by (example id, history).
# We keep a different beam map for each step (since we'll have a flat
# scores array for each step). The beam map will let us take the per-state
# losses, and compute the gradient for each (step, state, class).
beam_maps.append({})
# Gather all states from the two beams in a list. Some states may occur
# in both beams. To figure out which beam each state belonged to,
# we keep two lists of indices, p_indices and g_indices
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
if not states:
break
# Now that we have our flat list of states, feed them through the model
token_ids = get_token_ids(states, nr_feature)
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
# Store the callbacks for the backward pass
backprops.append((token_ids, bp_vectors, bp_scores))
# Unpack the flat scores into lists for the two beams. The indices arrays
# tell us which example and state the scores-row refers to.
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
# Now advance the states in the beams. The gold beam is constrained
# to follow only gold analyses.
pbeam.advance(p_scores)
gbeam.advance(g_scores, follow_gold=True)
# Track the "maximum violation", to use in the update.
for i, violn in enumerate(violns):
violn.check_crf(pbeam[i], gbeam[i])
histories = []
losses = []
for violn in violns:
if violn.p_hist:
histories.append(violn.p_hist + violn.g_hist)
losses.append(violn.p_probs + violn.g_probs)
else:
histories.append([])
losses.append([])
states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses)
return states_d_scores, backprops[:len(states_d_scores)]
def get_states(pbeams, gbeams, beam_map, nr_update):
seen = {}
states = []
p_indices = []
g_indices = []
cdef Beam pbeam, gbeam
assert len(pbeams) == len(gbeams)
for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
p_indices.append([])
g_indices.append([])
for i in range(pbeam.size):
state = <StateClass>pbeam.at(i)
if not state.is_final():
key = tuple([eg_id] + pbeam.histories[i])
assert key not in seen, (key, seen)
seen[key] = len(states)
p_indices[-1].append(len(states))
states.append(state)
beam_map.update(seen)
for i in range(gbeam.size):
state = <StateClass>gbeam.at(i)
if not state.is_final():
key = tuple([eg_id] + gbeam.histories[i])
if key in seen:
g_indices[-1].append(seen[key])
else:
g_indices[-1].append(len(states))
beam_map[key] = len(states)
states.append(state)
p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices]
g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices]
return states, p_idx, g_idx
def get_gradient(nr_class, beam_maps, histories, losses):
"""
The global model assigns a loss to each parse. The beam scores
are additive, so the same gradient is applied to each action
in the history. This gives the gradient of a single *action*
for a beam state -- so we have "the gradient of loss for taking
action i given history H."
Histories: Each history is a list of actions
Each candidate has a history
Each beam has multiple candidates
Each batch has multiple beams
So history is list of lists of lists of ints
"""
nr_step = len(beam_maps)
grads = []
nr_step = 0
for eg_id, hists in enumerate(histories):
for loss, hist in zip(losses[eg_id], hists):
if loss != 0.0 and not numpy.isnan(loss):
nr_step = max(nr_step, len(hist))
for i in range(nr_step):
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
assert len(histories) == len(losses)
for eg_id, hists in enumerate(histories):
for loss, hist in zip(losses[eg_id], hists):
if loss == 0.0 or numpy.isnan(loss):
continue
key = tuple([eg_id])
# Adjust loss for length
avg_loss = loss / len(hist)
loss += avg_loss * (nr_step - len(hist))
for j, clas in enumerate(hist):
i = beam_maps[j][key]
# In step j, at state i action clas
# resulted in loss
grads[j][i, clas] += loss
key = key + tuple([clas])
return grads
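A hedged sketch of the data layout `get_gradient()` above expects: one list of candidate action histories per example, with a parallel list of losses (everything below is made up for illustration):

# Illustration only: two training examples, the first with two beam
# candidates and the second with one; losses line up with the histories.
histories = [
    [[0, 3, 3], [0, 2, 3]],   # example 0: two candidate action sequences
    [[1, 1]],                 # example 1: one candidate
]
losses = [
    [0.75, 0.0],              # only the first candidate carries a loss
    [0.5],
]
# get_gradient(nr_class, beam_maps, histories, losses) then spreads each
# loss over every (step, state, action) triple the candidate took.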

View File

@ -37,6 +37,7 @@ cdef cppclass StateC:
this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
this.offset = 0
cdef int i
for i in range(length + (PADDING * 2)):
this._ents[i].end = -1
@ -73,7 +74,16 @@ cdef cppclass StateC:
free(this.shifted - PADDING)
void set_context_tokens(int* ids, int n) nogil:
if n == 13:
if n == 8:
ids[0] = this.B(0)
ids[1] = this.B(1)
ids[2] = this.S(0)
ids[3] = this.S(1)
ids[4] = this.H(this.S(0))
ids[5] = this.L(this.B(0), 1)
ids[6] = this.L(this.S(0), 2)
ids[7] = this.R(this.S(0), 1)
elif n == 13:
ids[0] = this.B(0)
ids[1] = this.B(1)
ids[2] = this.S(0)

View File

@ -10,6 +10,8 @@ from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from cymem.cymem cimport Pool
from collections import OrderedDict
from thinc.extra.search cimport Beam
import numpy
from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token
@ -18,7 +20,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
from ..lexeme cimport Lexeme
from ..structs cimport TokenC
@ -284,7 +286,7 @@ cdef class Break:
return 0
cdef int _get_root(int word, const GoldParseC* gold) nogil:
while gold.heads[word] != word and not gold.has_dep[word] and word >= 0:
while gold.heads[word] != word and gold.has_dep[word] and word >= 0:
word = gold.heads[word]
if not gold.has_dep[word]:
return -1
@ -349,6 +351,20 @@ cdef class ArcEager(TransitionSystem):
def __get__(self):
return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)
def is_gold_parse(self, StateClass state, GoldParse gold):
predicted = set()
truth = set()
for i in range(gold.length):
if gold.cand_to_gold[i] is None:
continue
if state.safe_get(i).dep:
predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
else:
predicted.add((i, state.H(i), 'ROOT'))
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
truth.add((id_, head, dep))
return truth == predicted
def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.heads)
if all([tag is None for tag in gold.heads[start:end]]):
@ -360,7 +376,7 @@ cdef class ArcEager(TransitionSystem):
if not self.has_gold(gold):
return None
for i in range(gold.length):
if gold.heads[i] is None: # Missing values
if gold.heads[i] is None or gold.labels[i] is None: # Missing values
gold.c.heads[i] = i
gold.c.has_dep[i] = False
else:
@ -383,6 +399,7 @@ cdef class ArcEager(TransitionSystem):
for i in range(self.n_moves):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
return Transition(clas=0, move=MISSING, label=0)
def move_name(self, int move, attr_t label):
label_str = self.strings[label]
@ -499,9 +516,11 @@ cdef class ArcEager(TransitionSystem):
"before training and after parsing. Either pass make_projective=True "
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
else:
print(gold.orig_annot)
print(gold.words)
print(gold.heads)
print(gold.labels)
print(gold.sent_starts)
raise ValueError(
"Could not find a gold-standard action to supervise the dependency "
"parser.\n"
@ -510,3 +529,23 @@ cdef class ArcEager(TransitionSystem):
"State at failure:\n"
"%s" % (self.n_moves, stcls.print_state(gold.words)))
assert n_gold >= 1
def get_beam_annot(self, Beam beam):
length = (<StateClass>beam.at(0)).c.length
heads = [{} for _ in range(length)]
deps = [{} for _ in range(length)]
probs = beam.probs
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
self.finalize_state(stcls.c)
if stcls.is_final():
prob = probs[i]
for j in range(stcls.c.length):
head = j + stcls.c._sent[j].head
dep = stcls.c._sent[j].dep
heads[j].setdefault(head, 0.0)
heads[j][head] += prob
deps[j].setdefault(dep, 0.0)
deps[j][dep] += prob
return heads, deps

View File

@ -107,7 +107,7 @@ cdef class BeamParser(Parser):
# The non-monotonic oracle makes it difficult to ensure final costs are
# correct. Therefore do final correction
for i in range(pred.size):
if is_gold(<StateClass>pred.at(i), gold_parse, self.moves.strings):
if self.moves.is_gold_parse(<StateClass>pred.at(i), gold_parse):
pred._states[i].loss = 0.0
elif pred._states[i].loss == 0.0:
pred._states[i].loss = 1.0
@ -213,7 +213,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
if not pred._states[i].is_done or pred._states[i].loss == 0:
continue
state = <StateClass>pred.at(i)
if is_gold(state, gold_parse, moves.strings) == True:
if moves.is_gold_parse(state, gold_parse) == True:
for dep in gold_parse.orig_annot:
print(dep[1], dep[3], dep[4])
print("Cost", pred._states[i].loss)
@ -227,7 +227,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
if not gold._states[i].is_done:
continue
state = <StateClass>gold.at(i)
if is_gold(state, gold_parse, moves.strings) == False:
if moves.is_gold_parse(state, gold_parse) == False:
print("Truth")
for dep in gold_parse.orig_annot:
print(dep[1], dep[3], dep[4])
@ -237,16 +237,3 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio
raise Exception("Gold parse is not gold-standard")
def is_gold(StateClass state, GoldParse gold, StringStore strings):
predicted = set()
truth = set()
for i in range(gold.length):
if gold.cand_to_gold[i] is None:
continue
if state.safe_get(i).dep:
predicted.add((i, state.H(i), strings[state.safe_get(i).dep]))
else:
predicted.add((i, state.H(i), 'ROOT'))
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
truth.add((id_, head, dep))
return truth == predicted

View File

@ -110,5 +110,35 @@ def es_noun_chunks(obj):
token = next_token(token)
def french_noun_chunks(obj):
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
'es': es_noun_chunks}
'es': es_noun_chunks, 'fr': french_noun_chunks}

View File

@ -2,7 +2,10 @@
from __future__ import unicode_literals
from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam
from collections import OrderedDict
import numpy
from thinc.neural.ops import NumpyOps
from .stateclass cimport StateClass
from ._state cimport StateC
@ -110,7 +113,7 @@ cdef class BiluoPushDown(TransitionSystem):
def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.ner)
if all([tag == '-' for tag in gold.ner[start:end]]):
if all([tag in ('-', None) for tag in gold.ner[start:end]]):
return False
else:
return True
@ -122,11 +125,46 @@ cdef class BiluoPushDown(TransitionSystem):
gold.c.ner[i] = self.lookup_transition(gold.ner[i])
return gold
def get_beam_annot(self, Beam beam):
entities = {}
probs = beam.probs
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
if stcls.is_final():
self.finalize_state(stcls.c)
prob = probs[i]
for j in range(stcls.c._e_i):
start = stcls.c._ents[j].start
end = stcls.c._ents[j].end
label = stcls.c._ents[j].label
entities.setdefault((start, end, label), 0.0)
entities[(start, end, label)] += prob
return entities
def get_beam_parses(self, Beam beam):
parses = []
probs = beam.probs
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
if stcls.is_final():
self.finalize_state(stcls.c)
prob = probs[i]
parse = []
for j in range(stcls.c._e_i):
start = stcls.c._ents[j].start
end = stcls.c._ents[j].end
label = stcls.c._ents[j].label
parse.append((start, end, self.strings[label]))
parses.append((prob, parse))
return parses
cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name == None:
move_str = 'M'
label = 0
elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0)
elif '-' in name:
move_str, label_str = name.split('-', 1)
# Hacky way to denote 'not this entity'
@ -308,6 +346,9 @@ cdef class In:
elif g_act == UNIT:
# I, Gold U --> True iff next tag == O
return next_act != OUT
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return 0
else:
return 1
@ -350,6 +391,9 @@ cdef class Last:
elif g_act == UNIT:
# L, Gold U --> True
return 0
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return 0
else:
return 1
@ -418,7 +462,9 @@ cdef class Out:
cdef int g_act = gold.ner[s.B(0)].move
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING or g_act == ISNT:
if g_act == ISNT and g_tag == 0:
return 1
elif g_act == MISSING or g_act == ISNT:
return 0
elif g_act == BEGIN:
# O, Gold B --> False

View File

@ -29,21 +29,26 @@ from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
from thinc.extra.eg cimport Example
from thinc.extra.search cimport Beam
from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
from preshed.maps cimport MapStruct
from preshed.maps cimport map_get
from thinc.api import layerize, chain, noop, clone
from thinc.neural import Model, Affine, ELU, ReLu, Maxout
from thinc.api import layerize, chain, noop, clone, with_flatten
from thinc.neural import Model, Affine, ReLu, Maxout
from thinc.neural._classes.batchnorm import BatchNorm as BN
from thinc.neural._classes.selu import SELU
from thinc.neural._classes.layernorm import LayerNorm
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
from .. import util
from ..util import get_async, get_cuda_stream
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
from .._ml import Tok2Vec, doc2feats, rebatch
from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
from .._ml import Residual, drop_layer
from ..compat import json_dumps
from . import _parse_features
@ -58,8 +63,10 @@ from ..structs cimport TokenC
from ..tokens.doc cimport Doc
from ..strings cimport StringStore
from ..gold cimport GoldParse
from ..attrs cimport TAG, DEP
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
from . import _beam_utils
USE_FINE_TUNE = True
def get_templates(*args, **kwargs):
return []
@ -110,7 +117,6 @@ cdef class precompute_hiddens:
self.nO = cached.shape[2]
self.nP = getattr(lower_model, 'nP', 1)
self.ops = lower_model.ops
self._features = numpy.zeros((batch_size, self.nO*self.nP), dtype='f')
self._is_synchronized = False
self._cuda_stream = cuda_stream
self._cached = cached
@ -127,13 +133,12 @@ cdef class precompute_hiddens:
return self.begin_update(X)[0]
def begin_update(self, token_ids, drop=0.):
self._features.fill(0)
cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
# This is tricky, but (assuming GPU available);
# - Input to forward on CPU
# - Output from forward on CPU
# - Input to backward on GPU!
# - Output from backward on GPU
cdef np.ndarray state_vector = self._features[:len(token_ids)]
bp_hiddens = self._bp_hiddens
feat_weights = self.get_feat_weights()
@ -233,11 +238,14 @@ cdef class Parser:
Base class of the DependencyParser and EntityRecognizer.
"""
@classmethod
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg):
def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg):
depth = util.env_opt('parser_hidden_depth', depth)
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
hidden_width = util.env_opt('hidden_width', hidden_width)
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
embed_size = util.env_opt('embed_size', 4000)
tensors = fine_tune(Tok2Vec(token_vector_width, embed_size,
preprocess=doc2feats()))
if parser_maxout_pieces == 1:
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature,
@ -269,7 +277,7 @@ cdef class Parser:
'hidden_width': hidden_width,
'maxout_pieces': parser_maxout_pieces
}
return (lower, upper), cfg
return (tensors, lower, upper), cfg
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
"""
@ -295,6 +303,10 @@ cdef class Parser:
self.moves = self.TransitionSystem(self.vocab.strings, {})
else:
self.moves = moves
if 'beam_width' not in cfg:
cfg['beam_width'] = util.env_opt('beam_width', 1)
if 'beam_density' not in cfg:
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
self.cfg = cfg
if 'actions' in self.cfg:
for action, labels in self.cfg.get('actions', {}).items():
@ -305,7 +317,7 @@ cdef class Parser:
def __reduce__(self):
return (Parser, (self.vocab, self.moves, self.model), None, None)
def __call__(self, Doc doc):
def __call__(self, Doc doc, beam_width=None, beam_density=None):
"""
Apply the parser or entity recognizer, setting the annotations onto the Doc object.
@ -314,11 +326,26 @@ cdef class Parser:
Returns:
None
"""
states = self.parse_batch([doc], [doc.tensor])
self.set_annotations([doc], states)
return doc
if beam_width is None:
beam_width = self.cfg.get('beam_width', 1)
if beam_density is None:
beam_density = self.cfg.get('beam_density', 0.0)
cdef Beam beam
if beam_width == 1:
states = self.parse_batch([doc], [doc.tensor])
self.set_annotations([doc], states)
return doc
else:
beam = self.beam_parse([doc], [doc.tensor],
beam_width=beam_width, beam_density=beam_density)[0]
output = self.moves.get_beam_annot(beam)
state = <StateClass>beam.at(0)
self.set_annotations([doc], [state])
_cleanup(beam)
return output
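# Usage sketch for the beam_width branch above, mirroring the test_beam_parse
# test added later in this diff (assumes an installed 'en_core_web_sm' model;
# the keyword value is illustrative):
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Australia is a country', disable=['ner'])
ents = nlp.entity(doc, beam_width=2)   # beam_width > 1 routes through beam_parse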
def pipe(self, docs, int batch_size=1000, int n_threads=2):
def pipe(self, docs, int batch_size=1000, int n_threads=2,
beam_width=None, beam_density=None):
"""
Process a stream of documents.
@ -330,13 +357,23 @@ cdef class Parser:
The number of threads with which to work on the buffer in parallel.
Yields (Doc): Documents, in order.
"""
cdef StateClass parse_state
if beam_width is None:
beam_width = self.cfg.get('beam_width', 1)
if beam_density is None:
beam_density = self.cfg.get('beam_density', 0.0)
cdef Doc doc
queue = []
cdef Beam beam
for docs in cytoolz.partition_all(batch_size, docs):
docs = list(docs)
tokvecs = [d.tensor for d in docs]
parse_states = self.parse_batch(docs, tokvecs)
tokvecs = [doc.tensor for doc in docs]
if beam_width == 1:
parse_states = self.parse_batch(docs, tokvecs)
else:
beams = self.beam_parse(docs, tokvecs,
beam_width=beam_width, beam_density=beam_density)
parse_states = []
for beam in beams:
parse_states.append(<StateClass>beam.at(0))
self.set_annotations(docs, parse_states)
yield from docs
@ -351,8 +388,13 @@ cdef class Parser:
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
if isinstance(docs, Doc):
docs = [docs]
if isinstance(tokvecses, np.ndarray):
tokvecses = [tokvecses]
tokvecs = self.model[0].ops.flatten(tokvecses)
if USE_FINE_TUNE:
# TODO: This is incorrect! Unhack when training next model
tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
nr_state = len(docs)
nr_class = self.moves.n_moves
@ -404,6 +446,55 @@ cdef class Parser:
next_step.push_back(st)
return states
def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001):
cdef Beam beam
cdef np.ndarray scores
cdef Doc doc
cdef int nr_class = self.moves.n_moves
cdef StateClass stcls, output
tokvecs = self.model[0].ops.flatten(tokvecses)
if USE_FINE_TUNE:
tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
cuda_stream, 0.0)
beams = []
cdef int offset = 0
cdef int j = 0
cdef int k
for doc in docs:
beam = Beam(nr_class, beam_width, min_density=beam_density)
beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
for i in range(beam.width):
stcls = <StateClass>beam.at(i)
stcls.c.offset = offset
offset += len(doc)
beam.check_done(_check_final_state, NULL)
while not beam.is_done:
states = []
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
# This way we avoid having to score finalized states
# We do have to take care to keep indexes aligned, though
if not stcls.is_final():
states.append(stcls)
token_ids = self.get_token_ids(states)
vectors = state2vec(token_ids)
scores = vec2scores(vectors)
j = 0
c_scores = <float*>scores.data
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
if not stcls.is_final():
self.moves.set_valid(beam.is_valid[i], stcls.c)
for k in range(nr_class):
beam.scores[i][k] = c_scores[j * scores.shape[1] + k]
j += 1
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
beams.append(beam)
return beams
cdef void _parse_step(self, StateC* state,
const float* feat_weights,
int nr_class, int nr_feat, int nr_piece) nogil:
@ -427,6 +518,12 @@ cdef class Parser:
free(token_ids)
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):
return None
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
return self.update_beam(docs_tokvecs, golds,
self.cfg['beam_width'], self.cfg['beam_density'],
drop=drop, sgd=sgd, losses=losses)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvec_lists = docs_tokvecs
@ -434,6 +531,9 @@ cdef class Parser:
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs]
golds = [golds]
if USE_FINE_TUNE:
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
tokvecs += self.model[0].ops.flatten(my_tokvecs)
cuda_stream = get_cuda_stream()
@ -460,13 +560,14 @@ cdef class Parser:
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
d_scores = self.get_batch_loss(states, golds, scores)
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
d_scores /= len(docs)
d_vector = bp_scores(d_scores, sgd=sgd)
if drop != 0:
d_vector *= mask
if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to CPU, asynchronously
# Move token_ids and d_vector to GPU, asynchronously
backprops.append((
get_async(cuda_stream, token_ids),
get_async(cuda_stream, d_vector),
@ -483,7 +584,65 @@ cdef class Parser:
break
self._make_updates(d_tokvecs,
backprops, sgd, cuda_stream)
return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
if USE_FINE_TUNE:
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs
def update_beam(self, docs_tokvecs, golds, width=None, density=None,
drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):
return None
if not golds:
return None
if width is None:
width = self.cfg.get('beam_width', 2)
if density is None:
density = self.cfg.get('beam_density', 0.0)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvecs = docs_tokvecs
lengths = [len(d) for d in docs]
assert min(lengths) >= 1
tokvecs = self.model[0].ops.flatten(tokvecs)
if USE_FINE_TUNE:
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
tokvecs += self.model[0].ops.flatten(my_tokvecs)
states = self.moves.init_batch(docs)
for gold in golds:
self.moves.preprocess_gold(gold)
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
states, tokvecs, golds,
state2vec, vec2scores,
width, density,
sgd=sgd, drop=drop, losses=losses)
backprop_lower = []
cdef float batch_size = len(docs)
for i, d_scores in enumerate(states_d_scores):
d_scores /= batch_size
if losses is not None:
losses[self.name] += (d_scores**2).sum()
ids, bp_vectors, bp_scores = backprops[i]
d_vector = bp_scores(d_scores, sgd=sgd)
if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(ids, state2vec.ops.xp.ndarray):
backprop_lower.append((
get_async(cuda_stream, ids),
get_async(cuda_stream, d_vector),
bp_vectors))
else:
backprop_lower.append((ids, d_vector, bp_vectors))
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
if USE_FINE_TUNE:
d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs
def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long
@ -528,14 +687,10 @@ cdef class Parser:
xp = get_array_module(d_tokvecs)
for ids, d_vector, bp_vector in backprops:
d_state_features = bp_vector(d_vector, sgd=sgd)
active_feats = ids * (ids >= 0)
active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1))
if hasattr(xp, 'scatter_add'):
xp.scatter_add(d_tokvecs,
ids, d_state_features * active_feats)
else:
xp.add.at(d_tokvecs,
ids, d_state_features * active_feats)
mask = ids >= 0
d_state_features *= mask.reshape(ids.shape + (1,))
self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
d_state_features)
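# A NumPy-only sketch of the masked scatter-add just above, with toy shapes; the
# diff routes the same pattern through self.model[0].ops so it can also cover GPU arrays:
import numpy
d_tokvecs = numpy.zeros((5, 3), dtype='f')            # one row per token in the batch
ids = numpy.array([[0, 2, -1], [3, -1, -1]])          # feature token ids, -1 = padding
d_state_features = numpy.ones((2, 3, 3), dtype='f')   # gradient per state, feature, dim
mask = ids >= 0
d_state_features *= mask.reshape(ids.shape + (1,))    # zero the gradients of padded slots
numpy.add.at(d_tokvecs, ids * mask, d_state_features) # padded slots add zeros to row 0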
@property
def move_names(self):
@ -546,7 +701,7 @@ cdef class Parser:
return names
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
lower, upper = self.model
_, lower, upper = self.model
state2vec = precompute_hiddens(batch_size, tokvecs,
lower, stream, drop=dropout)
return state2vec, upper
@ -560,7 +715,8 @@ cdef class Parser:
dtype='i', order='C')
c_ids = <int*>ids.data
for i, state in enumerate(states):
state.c.set_context_tokens(c_ids, n_tokens)
if not state.is_final():
state.c.set_context_tokens(c_ids, n_tokens)
c_ids += ids.shape[1]
return ids
@ -635,10 +791,12 @@ cdef class Parser:
def to_disk(self, path, **exclude):
serializers = {
'lower_model': lambda p: p.open('wb').write(
'tok2vec_model': lambda p: p.open('wb').write(
self.model[0].to_bytes()),
'upper_model': lambda p: p.open('wb').write(
'lower_model': lambda p: p.open('wb').write(
self.model[1].to_bytes()),
'upper_model': lambda p: p.open('wb').write(
self.model[2].to_bytes()),
'vocab': lambda p: self.vocab.to_disk(p),
'moves': lambda p: self.moves.to_disk(p, strings=False),
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
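# With the extra tok2vec component, a parser directory written by to_disk now
# holds three weight files alongside the existing data (sketch of the layout;
# the directory name is illustrative):
#
#     parser/tok2vec_model
#     parser/lower_model
#     parser/upper_model
#     parser/vocab
#     parser/moves
#     parser/cfg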
@ -659,24 +817,29 @@ cdef class Parser:
self.model, cfg = self.Model(**self.cfg)
else:
cfg = {}
with (path / 'lower_model').open('rb') as file_:
with (path / 'tok2vec_model').open('rb') as file_:
bytes_data = file_.read()
self.model[0].from_bytes(bytes_data)
with (path / 'upper_model').open('rb') as file_:
with (path / 'lower_model').open('rb') as file_:
bytes_data = file_.read()
self.model[1].from_bytes(bytes_data)
with (path / 'upper_model').open('rb') as file_:
bytes_data = file_.read()
self.model[2].from_bytes(bytes_data)
self.cfg.update(cfg)
return self
def to_bytes(self, **exclude):
serializers = OrderedDict((
('lower_model', lambda: self.model[0].to_bytes()),
('upper_model', lambda: self.model[1].to_bytes()),
('tok2vec_model', lambda: self.model[0].to_bytes()),
('lower_model', lambda: self.model[1].to_bytes()),
('upper_model', lambda: self.model[2].to_bytes()),
('vocab', lambda: self.vocab.to_bytes()),
('moves', lambda: self.moves.to_bytes(strings=False)),
('cfg', lambda: ujson.dumps(self.cfg))
))
if 'model' in exclude:
exclude['tok2vec_model'] = True
exclude['lower_model'] = True
exclude['upper_model'] = True
exclude.pop('model')
@ -687,6 +850,7 @@ cdef class Parser:
('vocab', lambda b: self.vocab.from_bytes(b)),
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
('tok2vec_model', lambda b: None),
('lower_model', lambda b: None),
('upper_model', lambda b: None)
))
@ -696,10 +860,12 @@ cdef class Parser:
self.model, cfg = self.Model(self.moves.n_moves)
else:
cfg = {}
if 'tok2vec_model' in msg:
self.model[0].from_bytes(msg['tok2vec_model'])
if 'lower_model' in msg:
self.model[0].from_bytes(msg['lower_model'])
self.model[1].from_bytes(msg['lower_model'])
if 'upper_model' in msg:
self.model[1].from_bytes(msg['upper_model'])
self.model[2].from_bytes(msg['upper_model'])
self.cfg.update(cfg)
return self
@ -762,3 +928,30 @@ cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actio
mode = i
score = scores[i]
return mode
# These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateClass>_dest
src = <StateClass>_src
moves = <const Transition*>_moves
dest.clone(src)
moves[clas].do(dest.c, moves[clas].label)
cdef int _check_final_state(void* _state, void* extra_args) except -1:
return (<StateClass>_state).is_final()
def _cleanup(Beam beam):
for i in range(beam.width):
Py_XDECREF(<PyObject*>beam._states[i].content)
Py_XDECREF(<PyObject*>beam._parents[i].content)
cdef hash_t _hash_state(void* _state, void* _) except 0:
state = <StateClass>_state
if state.c.is_final():
return 1
else:
return state.c.hash()

View File

@ -99,6 +99,9 @@ cdef class TransitionSystem:
def preprocess_gold(self, GoldParse gold):
raise NotImplementedError
def is_gold_parse(self, StateClass state, GoldParse gold):
raise NotImplementedError
cdef Transition lookup_transition(self, object name) except *:
raise NotImplementedError
@ -107,6 +110,8 @@ cdef class TransitionSystem:
def is_valid(self, StateClass stcls, move_name):
action = self.lookup_transition(move_name)
if action.move == 0:
return False
return action.is_valid(stcls.c, action.label)
cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
@ -137,6 +142,10 @@ cdef class TransitionSystem:
"the entity recognizer\n"
"The transition system has %d actions." % (self.n_moves))
def get_class_name(self, int clas):
act = self.c[clas]
return self.move_name(act.move, act.label)
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, int):

View File

@ -11,9 +11,9 @@ from ..strings import StringStore
from .. import util
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
'nl', 'pl', 'pt', 'sv', 'xx']
_models = {'en': ['en_depent_web_sm', 'en_core_web_md'],
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id',
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
_models = {'en': ['en_core_web_sm'],
'de': ['de_core_news_md'],
'fr': ['fr_depvec_web_lg'],
'xx': ['xx_ent_web_md']}
@ -86,6 +86,9 @@ def hu_tokenizer():
def fi_tokenizer():
return util.get_lang_class('fi').Defaults.create_tokenizer()
@pytest.fixture
def id_tokenizer():
return util.get_lang_class('id').Defaults.create_tokenizer()
@pytest.fixture
def sv_tokenizer():

View File

@ -2,12 +2,18 @@
from __future__ import unicode_literals
import pytest
from ....tokens.doc import Doc
@pytest.fixture
def en_lemmatizer(EN):
return EN.Defaults.create_lemmatizer()
@pytest.mark.models('en')
def test_doc_lemmatization(EN):
doc = Doc(EN.vocab, words=['bleed'])
doc[0].tag_ = 'VBP'
assert doc[0].lemma_ == 'bleed'
@pytest.mark.models('en')
@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
@ -19,6 +25,16 @@ def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
assert en_lemmatizer.noun(text) == set(lemmas)
@pytest.mark.models('en')
@pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
("feed", ["feed"]),
("need", ["need"]),
("ring", ["ring"]),
("axes", ["axis", "axe", "ax"])])
def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
assert en_lemmatizer.noun(text) == set(lemmas)
@pytest.mark.xfail
@pytest.mark.models('en')
def test_en_lemmatizer_base_forms(en_lemmatizer):

View File

@ -25,7 +25,6 @@ def test_tag_names(EN):
doc = EN(text, disable=['parser'])
assert type(doc[2].pos) == int
assert isinstance(doc[2].pos_, six.text_type)
assert type(doc[2].dep) == int
assert isinstance(doc[2].dep_, six.text_type)
assert doc[2].tag_ == u'NNS'

View File

View File

@ -0,0 +1,115 @@
# coding: utf-8
"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["(Ma'arif)"])
def test_tokenizer_splits_no_special(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Ma'arif"])
def test_tokenizer_splits_no_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text', ["(Ma'arif"])
def test_tokenizer_splits_prefix_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["Ma'arif)"])
def test_tokenizer_splits_suffix_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(Ma'arif)"])
def test_tokenizer_splits_even_wrap(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(Ma'arif?)"])
def test_tokenizer_splits_uneven_wrap(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text,length', [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
def test_tokenizer_splits_prefix_interact(id_tokenizer, text, length):
tokens = id_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text', ["S.Kom.)"])
def test_tokenizer_splits_suffix_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(S.Kom.)"])
def test_tokenizer_splits_even_wrap_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(S.Kom.?)"])
def test_tokenizer_splits_uneven_wrap_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text,length', [("gara-gara", 1), ("Jokowi-Ahok", 3), ("Sukarno-Hatta", 3)])
def test_tokenizer_splits_hyphens(id_tokenizer, text, length):
tokens = id_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_tokenizer_splits_numeric_range(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["ini.Budi", "Halo.Bandung"])
def test_tokenizer_splits_period_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Halo,Bandung", "satu,dua"])
def test_tokenizer_splits_comma_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
assert tokens[0].text == text.split(",")[0]
assert tokens[1].text == ","
assert tokens[2].text == text.split(",")[1]
@pytest.mark.parametrize('text', ["halo...Bandung", "dia...pergi"])
def test_tokenizer_splits_ellipsis_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
def test_tokenizer_splits_double_hyphen_infix(id_tokenizer):
tokens = id_tokenizer("Arsene Wenger--manajer Arsenal--melakukan konferensi pers.")
assert len(tokens) == 10
assert tokens[0].text == "Arsene"
assert tokens[1].text == "Wenger"
assert tokens[2].text == "--"
assert tokens[3].text == "manajer"
assert tokens[4].text == "Arsenal"
assert tokens[5].text == "--"
assert tokens[6].text == "melakukan"
assert tokens[7].text == "konferensi"
assert tokens[8].text == "pers"
assert tokens[9].text == "."

View File

@ -0,0 +1,10 @@
import spacy
import pytest
@pytest.mark.models
def test_beam_parse():
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Australia is a country', disable=['ner'])
ents = nlp.entity(doc, beam_width=2)
print(ents)

View File

@ -0,0 +1,73 @@
from __future__ import unicode_literals
import pytest
from ...vocab import Vocab
from ...syntax.ner import BiluoPushDown
from ...gold import GoldParse
from ...tokens import Doc
@pytest.fixture
def vocab():
return Vocab()
@pytest.fixture
def doc(vocab):
return Doc(vocab, words=['Casey', 'went', 'to', 'New', 'York', '.'])
@pytest.fixture
def entity_annots(doc):
casey = doc[0:1]
ny = doc[3:5]
return [(casey.start_char, casey.end_char, 'PERSON'),
(ny.start_char, ny.end_char, 'GPE')]
@pytest.fixture
def entity_types(entity_annots):
return sorted(set([label for (s, e, label) in entity_annots]))
@pytest.fixture
def tsys(vocab, entity_types):
actions = BiluoPushDown.get_actions(entity_types=entity_types)
return BiluoPushDown(vocab.strings, actions)
def test_get_oracle_moves(tsys, doc, entity_annots):
gold = GoldParse(doc, entities=entity_annots)
tsys.preprocess_gold(gold)
act_classes = tsys.get_oracle_sequence(doc, gold)
names = [tsys.get_class_name(act) for act in act_classes]
assert names == ['U-PERSON', 'O', 'O', 'B-GPE', 'L-GPE', 'O']
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
entity_annots = [(s, e, '!' + label) for s, e, label in entity_annots]
gold = GoldParse(doc, entities=entity_annots)
for i, tag in enumerate(gold.ner):
if tag == 'L-!GPE':
gold.ner[i] = '-'
tsys.preprocess_gold(gold)
act_classes = tsys.get_oracle_sequence(doc, gold)
names = [tsys.get_class_name(act) for act in act_classes]
def test_get_oracle_moves_negative_entities2(tsys, vocab):
doc = Doc(vocab, words=['A', 'B', 'C', 'D'])
gold = GoldParse(doc, entities=[])
gold.ner = ['B-!PERSON', 'L-!PERSON', 'B-!PERSON', 'L-!PERSON']
tsys.preprocess_gold(gold)
act_classes = tsys.get_oracle_sequence(doc, gold)
names = [tsys.get_class_name(act) for act in act_classes]
def test_get_oracle_moves_negative_O(tsys, vocab):
doc = Doc(vocab, words=['A', 'B', 'C', 'D'])
gold = GoldParse(doc, entities=[])
gold.ner = ['O', '!O', 'O', '!O']
tsys.preprocess_gold(gold)
act_classes = tsys.get_oracle_sequence(doc, gold)
names = [tsys.get_class_name(act) for act in act_classes]

View File

@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
from thinc.neural import Model
from mock import Mock
import pytest
import numpy
@ -36,7 +35,7 @@ def parser(vocab, arc_eager):
@pytest.fixture
def model(arc_eager, tok2vec):
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]
@pytest.fixture
def doc(vocab):
@ -45,29 +44,50 @@ def doc(vocab):
@pytest.fixture
def gold(doc):
return GoldParse(doc, heads=[1, 1, 1], deps=['L', 'ROOT', 'R'])
def test_can_init_nn_parser(parser):
assert parser.model is None
def test_build_model(parser):
parser.model = Parser.Model(parser.moves.n_moves)
parser.model = Parser.Model(parser.moves.n_moves)[0]
assert parser.model is not None
@pytest.mark.xfail
def test_predict_doc(parser, tok2vec, model, doc):
doc.tensor = tok2vec([doc])
doc.tensor = tok2vec([doc])[0]
parser.model = model
parser(doc)
@pytest.mark.xfail
def test_update_doc(parser, tok2vec, model, doc, gold):
parser.model = model
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
d_tokvecs = parser.update((doc, tokvecs), gold)
assert d_tokvecs.shape == tokvecs.shape
d_tokvecs = parser.update(([doc], tokvecs), [gold])
assert d_tokvecs[0].shape == tokvecs[0].shape
def optimize(weights, gradient, key=None):
weights -= 0.001 * gradient
bp_tokvecs(d_tokvecs, sgd=optimize)
assert d_tokvecs.sum() == 0.
assert d_tokvecs[0].sum() == 0.
def test_predict_doc_beam(parser, tok2vec, model, doc):
doc.tensor = tok2vec([doc])[0]
parser.model = model
parser(doc, beam_width=32, beam_density=0.001)
for word in doc:
print(word.text, word.head, word.dep_)
def test_update_doc_beam(parser, tok2vec, model, doc, gold):
parser.model = model
tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
d_tokvecs = parser.update_beam(([doc], tokvecs), [gold])
assert d_tokvecs[0].shape == tokvecs[0].shape
def optimize(weights, gradient, key=None):
weights -= 0.001 * gradient
bp_tokvecs(d_tokvecs, sgd=optimize)
assert d_tokvecs[0].sum() == 0.

View File

@ -0,0 +1,87 @@
from __future__ import unicode_literals
import pytest
import numpy
from thinc.api import layerize
from ...vocab import Vocab
from ...syntax.arc_eager import ArcEager
from ...tokens import Doc
from ...gold import GoldParse
from ...syntax._beam_utils import ParserBeam, update_beam
from ...syntax.stateclass import StateClass
@pytest.fixture
def vocab():
return Vocab()
@pytest.fixture
def moves(vocab):
aeager = ArcEager(vocab.strings, {})
aeager.add_action(2, 'nsubj')
aeager.add_action(3, 'dobj')
aeager.add_action(2, 'aux')
return aeager
@pytest.fixture
def docs(vocab):
return [Doc(vocab, words=['Rats', 'bite', 'things'])]
@pytest.fixture
def states(docs):
return [StateClass(doc) for doc in docs]
@pytest.fixture
def tokvecs(docs, vector_size):
output = []
for doc in docs:
vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size))
output.append(numpy.asarray(vec))
return output
@pytest.fixture
def golds(docs):
return [GoldParse(doc) for doc in docs]
@pytest.fixture
def batch_size(docs):
return len(docs)
@pytest.fixture
def beam_width():
return 4
@pytest.fixture
def vector_size():
return 6
@pytest.fixture
def beam(moves, states, golds, beam_width):
return ParserBeam(moves, states, golds, width=beam_width, density=0.0)
@pytest.fixture
def scores(moves, batch_size, beam_width):
return [
numpy.asarray(
numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)),
dtype='f')
for _ in range(batch_size)]
def test_create_beam(beam):
pass
def test_beam_advance(beam, scores):
beam.advance(scores)
def test_beam_advance_too_few_scores(beam, scores):
with pytest.raises(IndexError):
beam.advance(scores[:-1])

View File

@ -0,0 +1,12 @@
'''Test tokens compare correctly'''
from __future__ import unicode_literals
from ..util import get_doc
from ...vocab import Vocab
def test_issue1257():
doc1 = get_doc(Vocab(), ['a', 'b', 'c'])
doc2 = get_doc(Vocab(), ['a', 'c', 'e'])
assert doc1[0] != doc2[0]
assert not doc1[0] == doc2[0]

View File

@ -11,8 +11,8 @@ import pytest
def taggers(en_vocab):
tagger1 = Tagger(en_vocab)
tagger2 = Tagger(en_vocab)
tagger1.model = tagger1.Model(None, None)
tagger2.model = tagger2.Model(None, None)
tagger1.model = tagger1.Model(8, 8)
tagger2.model = tagger1.model
return (tagger1, tagger2)
@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
tagger1, tagger2 = taggers
tagger1_b = tagger1.to_bytes()
tagger2_b = tagger2.to_bytes()
assert tagger1_b == tagger2_b
tagger1 = tagger1.from_bytes(tagger1_b)
assert tagger1.to_bytes() == tagger1_b
new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
from ..util import get_doc
from ...attrs import ORTH, LENGTH
import pytest
@ -89,3 +90,19 @@ def test_spans_are_hashable(en_tokenizer):
span3 = tokens[0:2]
assert hash(span3) == hash(span1)
def test_spans_by_character(doc):
span1 = doc[1:-2]
span2 = doc.char_span(span1.start_char, span1.end_char, label='GPE')
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == 'GPE'
def test_span_to_array(doc):
span = doc[1:-2]
arr = span.to_array([ORTH, LENGTH])
assert arr.shape == (len(span), 2)
assert arr[0, 0] == span[0].orth
assert arr[0, 1] == len(span[0])

View File

@ -79,9 +79,9 @@ def add_vecs_to_vocab(vocab, vectors):
"""Add list of vector tuples to given vocab. All vectors need to have the
same length. Format: [("text", [1, 2, 3])]"""
length = len(vectors[0][1])
vocab.resize_vectors(length)
vocab.clear_vectors(length)
for word, vec in vectors:
vocab[word].vector = vec
vocab.set_vector(word, vec)
return vocab

View File

@ -14,10 +14,9 @@ def vectors():
@pytest.fixture()
def vocab(en_vocab, vectors):
#return add_vecs_to_vocab(en_vocab, vectors)
return None
add_vecs_to_vocab(en_vocab, vectors)
return en_vocab
@pytest.mark.xfail
def test_vectors_similarity_LL(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
lex1 = vocab[word1]
@ -31,7 +30,6 @@ def test_vectors_similarity_LL(vocab, vectors):
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
@pytest.mark.xfail
def test_vectors_similarity_TT(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])
@ -44,21 +42,18 @@ def test_vectors_similarity_TT(vocab, vectors):
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
@pytest.mark.xfail
def test_vectors_similarity_TD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
@pytest.mark.xfail
def test_vectors_similarity_DS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
@pytest.mark.xfail
def test_vectors_similarity_TS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = get_doc(vocab, words=[word1, word2])

View File

@ -2,6 +2,8 @@
from __future__ import unicode_literals
from ...vectors import Vectors
from ...tokenizer import Tokenizer
from ..util import add_vecs_to_vocab, get_doc
import numpy
import pytest
@ -11,22 +13,42 @@ import pytest
def strings():
return ["apple", "orange"]
@pytest.fixture
def vectors():
return [
("apple", [1, 2, 3]),
("orange", [-1, -2, -3]),
('and', [-1, -1, -1]),
('juice', [5, 5, 10]),
('pie', [7, 6.3, 8.9])]
@pytest.fixture
def data():
return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')
@pytest.fixture()
def vocab(en_vocab, vectors):
add_vecs_to_vocab(en_vocab, vectors)
return en_vocab
def test_init_vectors_with_data(strings, data):
v = Vectors(strings, data)
assert v.shape == data.shape
def test_init_vectors_with_width(strings):
v = Vectors(strings, 3)
for string in strings:
v.add(string)
assert v.shape == (len(strings), 3)
def test_get_vector(strings, data):
v = Vectors(strings, data)
for string in strings:
v.add(string)
assert list(v[strings[0]]) == list(data[0])
assert list(v[strings[0]]) != list(data[1])
assert list(v[strings[1]]) != list(data[0])
@ -35,6 +57,8 @@ def test_get_vector(strings, data):
def test_set_vector(strings, data):
orig = data.copy()
v = Vectors(strings, data)
for string in strings:
v.add(string)
assert list(v[strings[0]]) == list(orig[0])
assert list(v[strings[0]]) != list(orig[1])
v[strings[0]] = data[1]
@ -42,125 +66,111 @@ def test_set_vector(strings, data):
assert list(v[strings[0]]) != list(orig[0])
#
#@pytest.fixture()
#def tokenizer_v(vocab):
# return Tokenizer(vocab, {}, None, None, None)
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', ["apple and orange"])
#def test_vectors_token_vector(tokenizer_v, vectors, text):
# doc = tokenizer_v(text)
# assert vectors[0] == (doc[0].text, list(doc[0].vector))
# assert vectors[1] == (doc[2].text, list(doc[2].vector))
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', ["apple", "orange"])
#def test_vectors_lexeme_vector(vocab, text):
# lex = vocab[text]
# assert list(lex.vector)
# assert lex.vector_norm
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
#def test_vectors_doc_vector(vocab, text):
# doc = get_doc(vocab, text)
# assert list(doc.vector)
# assert doc.vector_norm
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
#def test_vectors_span_vector(vocab, text):
# span = get_doc(vocab, text)[0:2]
# assert list(span.vector)
# assert span.vector_norm
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', ["apple orange"])
#def test_vectors_token_token_similarity(tokenizer_v, text):
# doc = tokenizer_v(text)
# assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
# assert 0.0 < doc[0].similarity(doc[1]) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
#def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
# token = tokenizer_v(text1)
# lex = vocab[text2]
# assert token.similarity(lex) == lex.similarity(token)
# assert 0.0 < token.similarity(lex) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_token_span_similarity(vocab, text):
# doc = get_doc(vocab, text)
# assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
# assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_token_doc_similarity(vocab, text):
# doc = get_doc(vocab, text)
# assert doc[0].similarity(doc) == doc.similarity(doc[0])
# assert 0.0 < doc[0].similarity(doc) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_lexeme_span_similarity(vocab, text):
# doc = get_doc(vocab, text)
# lex = vocab[text[0]]
# assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
# assert 0.0 < doc.similarity(doc[1:3]) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
#def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
# lex1 = vocab[text1]
# lex2 = vocab[text2]
# assert lex1.similarity(lex2) == lex2.similarity(lex1)
# assert 0.0 < lex1.similarity(lex2) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_lexeme_doc_similarity(vocab, text):
# doc = get_doc(vocab, text)
# lex = vocab[text[0]]
# assert lex.similarity(doc) == doc.similarity(lex)
# assert 0.0 < lex.similarity(doc) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_span_span_similarity(vocab, text):
# doc = get_doc(vocab, text)
# assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
# assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
#def test_vectors_span_doc_similarity(vocab, text):
# doc = get_doc(vocab, text)
# assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
# assert 0.0 < doc[0:2].similarity(doc) < 1.0
#
#
#@pytest.mark.xfail
#@pytest.mark.parametrize('text1,text2', [
# (["apple", "and", "apple", "pie"], ["orange", "juice"])])
#def test_vectors_doc_doc_similarity(vocab, text1, text2):
# doc1 = get_doc(vocab, text1)
# doc2 = get_doc(vocab, text2)
# assert doc1.similarity(doc2) == doc2.similarity(doc1)
# assert 0.0 < doc1.similarity(doc2) < 1.0
@pytest.fixture()
def tokenizer_v(vocab):
return Tokenizer(vocab, {}, None, None, None)
@pytest.mark.parametrize('text', ["apple and orange"])
def test_vectors_token_vector(tokenizer_v, vectors, text):
doc = tokenizer_v(text)
assert vectors[0] == (doc[0].text, list(doc[0].vector))
assert vectors[1] == (doc[2].text, list(doc[2].vector))
@pytest.mark.parametrize('text', ["apple", "orange"])
def test_vectors_lexeme_vector(vocab, text):
lex = vocab[text]
assert list(lex.vector)
assert lex.vector_norm
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
def test_vectors_doc_vector(vocab, text):
doc = get_doc(vocab, text)
assert list(doc.vector)
assert doc.vector_norm
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
def test_vectors_span_vector(vocab, text):
span = get_doc(vocab, text)[0:2]
assert list(span.vector)
assert span.vector_norm
@pytest.mark.parametrize('text', ["apple orange"])
def test_vectors_token_token_similarity(tokenizer_v, text):
doc = tokenizer_v(text)
assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
assert -1. < doc[0].similarity(doc[1]) < 1.0
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
token = tokenizer_v(text1)
lex = vocab[text2]
assert token.similarity(lex) == lex.similarity(token)
assert -1. < token.similarity(lex) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_token_span_similarity(vocab, text):
doc = get_doc(vocab, text)
assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
assert -1. < doc[0].similarity(doc[1:3]) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_token_doc_similarity(vocab, text):
doc = get_doc(vocab, text)
assert doc[0].similarity(doc) == doc.similarity(doc[0])
assert -1. < doc[0].similarity(doc) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_lexeme_span_similarity(vocab, text):
doc = get_doc(vocab, text)
lex = vocab[text[0]]
assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
assert -1. < doc.similarity(doc[1:3]) < 1.0
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
lex1 = vocab[text1]
lex2 = vocab[text2]
assert lex1.similarity(lex2) == lex2.similarity(lex1)
assert -1. < lex1.similarity(lex2) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_lexeme_doc_similarity(vocab, text):
doc = get_doc(vocab, text)
lex = vocab[text[0]]
assert lex.similarity(doc) == doc.similarity(lex)
assert -1. < lex.similarity(doc) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_span_span_similarity(vocab, text):
doc = get_doc(vocab, text)
assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
def test_vectors_span_doc_similarity(vocab, text):
doc = get_doc(vocab, text)
assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
assert -1. < doc[0:2].similarity(doc) < 1.0
@pytest.mark.parametrize('text1,text2', [
(["apple", "and", "apple", "pie"], ["orange", "juice"])])
def test_vectors_doc_doc_similarity(vocab, text1, text2):
doc1 = get_doc(vocab, text1)
doc2 = get_doc(vocab, text2)
assert doc1.similarity(doc2) == doc2.similarity(doc1)
assert -1. < doc1.similarity(doc2) < 1.0

View File

@ -33,6 +33,7 @@ cdef class Doc:
cdef public object _vector_norm
cdef public object tensor
cdef public object cats
cdef public object user_data
cdef TokenC* c

View File

@ -117,6 +117,7 @@ cdef class Doc:
self.is_tagged = False
self.is_parsed = False
self.sentiment = 0.0
self.cats = {}
self.user_hooks = {}
self.user_token_hooks = {}
self.user_span_hooks = {}
@ -237,6 +238,29 @@ cdef class Doc:
def doc(self):
return self
def char_span(self, int start_idx, int end_idx, label=0, vector=None):
"""Create a `Span` object from the slice `doc.text[start : end]`.
doc (Doc): The parent document.
start (int): The index of the first character of the span.
end (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
RETURNS (Span): The newly constructed object.
"""
if not isinstance(label, int):
label = self.vocab.strings.add(label)
cdef int start = token_by_start(self.c, self.length, start_idx)
if start == -1:
return None
cdef int end = token_by_end(self.c, self.length, end_idx)
if end == -1:
return None
# Currently we have the token index, we want the range-end index
end += 1
cdef Span span = Span(self, start, end, label=label, vector=vector)
return span
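# Usage sketch for char_span (the words and offsets are illustrative; compare
# the test_spans_by_character test added earlier in this diff):
from spacy.vocab import Vocab
from spacy.tokens import Doc
doc = Doc(Vocab(), words=['New', 'York', 'is', 'big'])
span = doc.char_span(0, 8, label='GPE')   # doc.text[0:8] == 'New York'
assert span.text == 'New York'
# Offsets that don't align with token boundaries return None instead of a Span.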
def similarity(self, other):
"""Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.
@ -279,8 +303,14 @@ cdef class Doc:
return self.user_hooks['vector'](self)
if self._vector is not None:
return self._vector
elif self.has_vector and len(self):
self._vector = sum(t.vector for t in self) / len(self)
elif not len(self):
self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
return self._vector
elif self.has_vector:
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
for token in self.c[:self.length]:
vector += self.vocab.get_vector(token.lex.orth)
self._vector = vector / len(self)
return self._vector
elif self.tensor is not None:
self._vector = self.tensor.mean(axis=0)

View File

@ -15,5 +15,5 @@ cdef class Span:
cdef public _vector
cdef public _vector_norm
cpdef int _recalculate_indices(self) except -1
cpdef np.ndarray to_array(self, object features)

View File

@ -7,7 +7,7 @@ import numpy
import numpy.linalg
from libc.math cimport sqrt
from .doc cimport token_by_start, token_by_end
from .doc cimport token_by_start, token_by_end, get_token_attr
from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t, hash_t
from ..attrs cimport attr_id_t
@ -135,6 +135,29 @@ cdef class Span:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape `(N, M)`, where `N` is the length of the span.
The values will be 64-bit unsigned integers.
attr_ids (list[int]): A list of attribute ID ints.
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
per word, and one column per attribute indicated in the input
`attr_ids`.
"""
cdef int i, j
cdef attr_id_t feature
cdef np.ndarray[attr_t, ndim=2] output
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
cdef int length = self.end - self.start
output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64)
for i in range(self.start, self.end):
for j, feature in enumerate(attr_ids):
output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
return output
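# Usage sketch matching the test_span_to_array test added earlier in this diff:
from spacy.attrs import ORTH, LENGTH
from spacy.vocab import Vocab
from spacy.tokens import Doc
doc = Doc(Vocab(), words=['give', 'it', 'back', 'please'])
arr = doc[1:3].to_array([ORTH, LENGTH])   # one row per token in the span
assert arr.shape == (2, 2)
assert arr[0, 1] == len(doc[1])           # LENGTH column holds the token length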
cpdef int _recalculate_indices(self) except -1:
if self.end > self.doc.length \
or self.doc.c[self.start].idx != self.start_char \

View File

@ -62,18 +62,26 @@ cdef class Token:
def __richcmp__(self, Token other, int op):
# http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
cdef Doc my_doc = self.doc
cdef Doc other_doc = other.doc
my = self.idx
their = other.idx if other is not None else None
if op == 0:
return my < their
elif op == 2:
return my == their
if my_doc is other_doc:
return my == their
else:
return False
elif op == 4:
return my > their
elif op == 1:
return my <= their
elif op == 3:
return my != their
if my_doc is other_doc:
return my != their
else:
return True
elif op == 5:
return my >= their
else:

View File

@ -22,7 +22,7 @@ import ujson
from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
from .compat import copy_array, normalize_string_keys, getattr_
from .compat import copy_array, normalize_string_keys, getattr_, import_file
LANGUAGES = {}
@ -112,15 +112,13 @@ def load_model(name, **overrides):
def load_model_from_link(name, **overrides):
"""Load a model from a shortcut link, or directory in spaCy data path."""
init_file = get_data_path() / name / '__init__.py'
spec = importlib.util.spec_from_file_location(name, init_file)
path = get_data_path() / name / '__init__.py'
try:
cls = importlib.util.module_from_spec(spec)
cls = import_file(name, path)
except AttributeError:
raise IOError(
"Cant' load '%s'. If you're using a shortcut link, make sure it "
"points to a valid model package (not just a data directory)." % name)
spec.loader.exec_module(cls)
return cls.load(**overrides)
@ -171,8 +169,8 @@ def get_model_meta(path):
raise IOError("Could not read meta.json from %s" % meta_path)
meta = read_json(meta_path)
for setting in ['lang', 'name', 'version']:
if setting not in meta:
raise ValueError('No %s setting found in model meta.json' % setting)
if setting not in meta or not meta[setting]:
raise ValueError("No valid '%s' setting found in model meta.json" % setting)
return meta

View File

@ -1,18 +1,25 @@
from __future__ import unicode_literals
from libc.stdint cimport int32_t, uint64_t
import numpy
from collections import OrderedDict
import msgpack
import msgpack_numpy
msgpack_numpy.patch()
cimport numpy as np
from .typedefs cimport attr_t
from .strings cimport StringStore
from . import util
from .compat import basestring_
cdef class Vectors:
'''Store, save and load word vectors.'''
cdef public object data
cdef readonly StringStore strings
cdef public object key2i
cdef public object key2row
cdef public object keys
cdef public int i
def __init__(self, strings, data_or_width):
self.strings = StringStore()
@ -21,10 +28,10 @@ cdef class Vectors:
dtype='f')
else:
data = data_or_width
self.i = 0
self.data = data
self.key2i = {}
for i, string in enumerate(strings):
self.key2i[self.strings.add(string)] = i
self.key2row = {}
self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')
def __reduce__(self):
return (Vectors, (self.strings, self.data))
@ -32,7 +39,7 @@ cdef class Vectors:
def __getitem__(self, key):
if isinstance(key, basestring):
key = self.strings[key]
i = self.key2i[key]
i = self.key2row[key]
if i is None:
raise KeyError(key)
else:
@ -41,14 +48,36 @@ cdef class Vectors:
def __setitem__(self, key, vector):
if isinstance(key, basestring):
key = self.strings.add(key)
i = self.key2i[key]
i = self.key2row[key]
self.data[i] = vector
def __iter__(self):
yield from self.data
def __len__(self):
return len(self.strings)
return self.i
def __contains__(self, key):
if isinstance(key, basestring_):
key = self.strings[key]
return key in self.key2row
def add(self, key, vector=None):
if isinstance(key, basestring_):
key = self.strings.add(key)
if key not in self.key2row:
i = self.i
if i >= self.keys.shape[0]:
self.keys.resize((self.keys.shape[0]*2,))
self.data.resize((self.data.shape[0]*2, self.data.shape[1]))
self.key2row[key] = self.i
self.keys[self.i] = key
self.i += 1
else:
i = self.key2row[key]
if vector is not None:
self.data[i] = vector
return i
def items(self):
for i, string in enumerate(self.strings):
@ -61,34 +90,87 @@ cdef class Vectors:
def most_similar(self, key):
raise NotImplementedError
def to_disk(self, path):
raise NotImplementedError
def from_glove(self, path):
'''Load GloVe vectors from a directory. Assumes binary format,
that the vocab is in a vocab.txt, and that vectors are named
vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
By default GloVe outputs 64-bit vectors.'''
path = util.ensure_path(path)
for name in path.iterdir():
if name.parts[-1].startswith('vectors'):
_, dims, dtype, _2 = name.parts[-1].split('.')
self.width = int(dims)
break
else:
raise IOError("Expected file named e.g. vectors.128.f.bin")
bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims,
dtype=dtype)
with bin_loc.open('rb') as file_:
self.data = numpy.fromfile(file_, dtype='float64')
self.data = numpy.ascontiguousarray(self.data, dtype='float32')
n = 0
with (path / 'vocab.txt').open('r') as file_:
for line in file_:
self.add(line.strip())
n += 1
if (self.data.size % self.width) == 0:
self.data = self.data.reshape((self.data.size // self.width, self.width))
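# Directory layout expected by from_glove per the docstring above (a sketch; the
# directory name is illustrative, and vocab.txt is assumed to be row-aligned with
# the binary vectors file):
#
#     glove_dir/vocab.txt           one token per line
#     glove_dir/vectors.128.f.bin   128d float32 vectors
#     glove_dir/vectors.300.d.bin   or 300d float64 vectors, etc.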
def from_disk(self, path):
raise NotImplementedError
def to_disk(self, path, **exclude):
serializers = OrderedDict((
('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)),
('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)),
))
return util.to_disk(path, serializers, exclude)
def from_disk(self, path, **exclude):
def load_keys(path):
if path.exists():
self.keys = numpy.load(path)
for i, key in enumerate(self.keys):
self.keys[i] = key
self.key2row[key] = i
def load_vectors(path):
if path.exists():
self.data = numpy.load(path)
serializers = OrderedDict((
('keys', load_keys),
('vectors', load_vectors),
))
util.from_disk(path, serializers, exclude)
return self
def to_bytes(self, **exclude):
def serialize_weights():
if hasattr(self.weights, 'to_bytes'):
return self.weights.to_bytes()
if hasattr(self.data, 'to_bytes'):
return self.data.to_bytes()
else:
return msgpack.dumps(self.weights)
return msgpack.dumps(self.data)
serializers = OrderedDict((
('strings', lambda: self.strings.to_bytes()),
('weights', serialize_weights)
('keys', lambda: msgpack.dumps(self.keys)),
('vectors', serialize_weights)
))
return util.to_bytes(serializers, exclude)
def from_bytes(self, data, **exclude):
def deserialize_weights(b):
if hasattr(self.weights, 'from_bytes'):
self.weights.from_bytes()
if hasattr(self.data, 'from_bytes'):
self.data.from_bytes()
else:
self.weights = msgpack.loads(b)
self.data = msgpack.loads(b)
def load_keys(keys):
self.keys.resize((len(keys),))
for i, key in enumerate(keys):
self.keys[i] = key
self.key2row[key] = i
deserializers = OrderedDict((
('strings', lambda b: self.strings.from_bytes(b)),
('weights', deserialize_weights)
('keys', lambda b: load_keys(msgpack.loads(b))),
('vectors', deserialize_weights)
))
return util.from_bytes(deserializers, exclude)
util.from_bytes(data, deserializers, exclude)
return self
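# Sketch of the reworked Vectors API, following the vector tests added earlier
# in this diff (the words and values are illustrative):
import numpy
from spacy.vectors import Vectors
data = numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f')
v = Vectors(['apple', 'orange'], data)
v.add('apple')                      # registers the key; its row comes from `data`
v.add('orange')
assert 'apple' in v
assert list(v['apple']) == [0.0, 1.0, 2.0]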

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import bz2
import ujson
import re
import numpy
from libc.string cimport memset, memcpy
from libc.stdint cimport int32_t
@ -19,9 +20,10 @@ from .tokens.token cimport Token
from .attrs cimport PROB, LANG
from .structs cimport SerializedLexemeC
from .compat import copy_reg, pickle
from .compat import copy_reg, pickle, basestring_
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .vectors import Vectors
from . import util
from . import attrs
from . import symbols
@ -63,6 +65,7 @@ cdef class Vocab:
self.strings.add(name)
self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.vectors = Vectors(self.strings, 300)
property lang:
def __get__(self):
@ -242,13 +245,15 @@ cdef class Vocab:
@property
def vectors_length(self):
raise NotImplementedError
return self.vectors.data.shape[1]
def clear_vectors(self):
def clear_vectors(self, new_dim=None):
"""Drop the current vector table. Because all vectors must be the same
width, you have to call this to change the size of the vectors.
"""
raise NotImplementedError
if new_dim is None:
new_dim = self.vectors.data.shape[1]
self.vectors = Vectors(self.strings, new_dim)
def get_vector(self, orth):
"""Retrieve a vector for a word in the vocabulary.
@ -262,7 +267,12 @@ cdef class Vocab:
RAISES: If no vectors data is loaded, ValueError is raised.
"""
raise NotImplementedError
if isinstance(orth, basestring_):
orth = self.strings.add(orth)
if orth in self.vectors.key2row:
return self.vectors[orth]
else:
return numpy.zeros((self.vectors_length,), dtype='f')
def set_vector(self, orth, vector):
"""Set a vector for a word in the vocabulary.
@ -272,15 +282,19 @@ cdef class Vocab:
RETURNS:
None
"""
raise NotImplementedError
if not isinstance(orth, basestring_):
orth = self.strings[orth]
self.vectors.add(orth, vector=vector)
def has_vector(self, orth):
"""Check whether a word has a vector. Returns False if no
vectors have been loaded. Words can be looked up by string
or int ID."""
return False
if isinstance(orth, basestring_):
orth = self.strings.add(orth)
return orth in self.vectors
def to_disk(self, path):
def to_disk(self, path, **exclude):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
@ -292,8 +306,10 @@ cdef class Vocab:
self.strings.to_disk(path / 'strings.json')
with (path / 'lexemes.bin').open('wb') as file_:
file_.write(self.lexemes_to_bytes())
if self.vectors is not None:
self.vectors.to_disk(path)
def from_disk(self, path):
def from_disk(self, path, **exclude):
"""Loads state from a directory. Modifies the object in place and
returns it.
@ -305,6 +321,8 @@ cdef class Vocab:
self.strings.from_disk(path / 'strings.json')
with (path / 'lexemes.bin').open('rb') as file_:
self.lexemes_from_bytes(file_.read())
if self.vectors is not None:
self.vectors.from_disk(path, exclude='strings.json')
return self
def to_bytes(self, **exclude):
@ -313,9 +331,16 @@ cdef class Vocab:
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Vocab` object.
"""
def serialize_vectors():
if self.vectors is None:
return None
else:
return self.vectors.to_bytes(exclude='strings.json')
getters = OrderedDict((
('strings', lambda: self.strings.to_bytes()),
('lexemes', lambda: self.lexemes_to_bytes()),
('vectors', serialize_vectors)
))
return util.to_bytes(getters, exclude)
@ -326,9 +351,15 @@ cdef class Vocab:
**exclude: Named attributes to prevent from being loaded.
RETURNS (Vocab): The `Vocab` object.
"""
def deserialize_vectors(b):
if self.vectors is None:
return None
else:
return self.vectors.from_bytes(b, exclude='strings')
setters = OrderedDict((
('strings', lambda b: self.strings.from_bytes(b)),
('lexemes', lambda b: self.lexemes_from_bytes(b)),
('vectors', lambda b: deserialize_vectors(b))
))
util.from_bytes(bytes_data, setters, exclude)
return self
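# Sketch of the new vector methods on Vocab, following the updated
# add_vecs_to_vocab helper earlier in this diff (uses a blank English vocab;
# the words and values are illustrative):
import numpy
from spacy.lang.en import English
vocab = English().vocab
vocab.clear_vectors(3)                # drop the table and set the width to 3
vocab.set_vector('apple', numpy.asarray([1., 2., 3.], dtype='f'))
assert vocab.has_vector('apple')
assert not vocab.get_vector('orange').any()   # unknown words come back as zeros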

View File

@ -2,9 +2,8 @@
if [ "${VIA}" == "pypi" ]; then
rm -rf *
pip install spacy
python -m spacy.en.download
python -m spacy.de.download
pip install spacy-nightly
python -m spacy download en
fi
if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then

View File

@ -103,20 +103,20 @@ mixin button(url, trusted, ...style)
label - [string] aside title (optional or false for no label)
language - [string] language for syntax highlighting (default: "python")
supports basic relevant languages available for PrismJS
icon - [string] icon to display next to code block, mostly used for old/new
prompt - [string] prompt or icon to display next to code block, (mostly used for old/new)
height - [integer] optional height to clip code block to
mixin code(label, language, icon, height)
mixin code(label, language, prompt, height)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
if label
h4.u-text-label.u-text-label--dark=label
- var icon = (prompt == 'accept' || prompt == 'reject')
if icon
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
.c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
+icon(icon, 18)
code.c-code-block__content
code.c-code-block__content(data-prompt=icon ? null : prompt)
block

View File

@ -112,6 +112,10 @@
.u-nowrap
white-space: nowrap
.u-break.u-break
word-wrap: break-word
white-space: initial
.u-no-border
border: none

View File

@ -35,6 +35,13 @@
font: normal normal 1.1rem/#{2} $font-code
padding: 1em 2em
&[data-prompt]:before
content: attr(data-prompt)
margin-right: 0.65em
display: inline-block
vertical-align: middle
opacity: 0.5
//- Inline code

Some files were not shown because too many files have changed in this diff.