Merge branch 'develop' into polish

This commit is contained in:
Jim Regan 2017-06-26 22:42:27 +01:00 committed by GitHub
commit d81ceb0cd5
270 changed files with 37713 additions and 24982 deletions

1
.appveyor.yml Normal file
View File

@ -0,0 +1 @@
build: off

1
.gitignore vendored
View File

@ -30,6 +30,7 @@ Profile.prof
__pycache__/
*.py[cod]
.env/
.env*
.~env/
.venv
venv/

View File

@ -4,12 +4,10 @@ spaCy: Industrial-strength NLP
spaCy is a library for advanced natural language processing in Python and
Cython. spaCy is built on the very latest research, but it isn't researchware.
It was designed from day one to be used in real products. spaCy currently supports
English, German and French, as well as tokenization for Spanish, Italian,
Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew,
Chinese and Japanese. It's commercial open-source software, released under the
MIT license.
📊 **Help us improve the library!** `Take the spaCy user survey <https://survey.spacy.io>`_.
English, German, French and Spanish, as well as tokenization for Italian,
Portuguese, Dutch, Swedish, Finnish, Norwegian, Danish, Hungarian, Polish,
Bengali, Hebrew, Chinese and Japanese. It's commercial open-source software,
released under the MIT license.
💫 **Version 1.8 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
@ -85,7 +83,7 @@ Features
* GIL-free **multi-threading**
* Efficient binary serialization
* Easy **deep learning** integration
* Statistical models for **English** and **German**
* Statistical models for **English**, **German**, **French** and **Spanish**
* State-of-the-art speed
* Robust, rigorously evaluated accuracy
@ -197,7 +195,7 @@ To load a model, use ``spacy.load()`` with the model's shortcut link:
.. code:: python
import spacy
nlp = spacy.load('en_default')
nlp = spacy.load('en')
doc = nlp(u'This is a sentence.')
If you've installed a model via pip, you can also ``import`` it directly and
@ -313,7 +311,7 @@ and ``--model`` are optional and enable additional tests:
# make sure you are using recent pytest version
python -m pip install -U pytest
python -m pytest <spacy-directory> --vectors --models --slow
python -m pytest <spacy-directory>
🛠 Changelog
============

View File

@ -1,68 +1,27 @@
from __future__ import unicode_literals, print_function
import json
import pathlib
import random
import spacy
from spacy.pipeline import EntityRecognizer
from spacy.gold import GoldParse
from spacy.tagger import Tagger
try:
unicode
except:
unicode = str
from spacy.lang.en import English
from spacy.gold import GoldParse, biluo_tags_from_offsets
def train_ner(nlp, train_data, entity_types):
    """Train a new EntityRecognizer on (text, entity_offsets) examples.

    nlp: Pipeline object; its `make_doc` and `vocab` are used.
    train_data: List of (raw_text, entity_offsets) pairs. It is shuffled
        in place at the start of every training pass.
    entity_types: Entity labels handed to the EntityRecognizer.
    RETURNS: The trained EntityRecognizer.
    """
    # Look every token up in the vocab first, so unseen words get an entry
    # before training starts.
    for text, _annotations in train_data:
        for token in nlp.make_doc(text):
            nlp.vocab[token.orth]

    recognizer = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    n_passes = 5
    for _pass in range(n_passes):
        random.shuffle(train_data)
        for text, offsets in train_data:
            tokens = nlp.make_doc(text)
            target = GoldParse(tokens, entities=offsets)
            recognizer.update(tokens, target)
    return recognizer
def save_model(ner, model_dir):
    """Write the recognizer's config, model and vocab below `model_dir`.

    ner: Object exposing `cfg`, `model.dump()`, `vocab.dump()` and
        `vocab.strings.dump()`.
    model_dir: Target directory (string or path); created if missing.
    """
    root = pathlib.Path(model_dir)
    if not root.exists():
        root.mkdir()
    assert root.is_dir()

    with (root / 'config.json').open('wb') as config_file:
        payload = json.dumps(ner.cfg)
        # The file is opened in binary mode, so text has to be encoded first.
        if isinstance(payload, unicode):
            payload = payload.encode('utf8')
        config_file.write(payload)
    ner.model.dump(str(root / 'model'))

    vocab_dir = root / 'vocab'
    if not vocab_dir.exists():
        vocab_dir.mkdir()
    ner.vocab.dump(str(vocab_dir / 'lexemes.bin'))
    with (vocab_dir / 'strings.json').open('w', encoding='utf8') as strings_file:
        ner.vocab.strings.dump(strings_file)
def reformat_train_data(tokenizer, examples):
    """Reformat (text, entity_offsets) examples to match the JSON format.

    tokenizer: Callable applied to each raw text. Its result is iterated
        for tokens (read through `.text`) and measured with `len()`.
    examples: Iterable of (text, entity_offsets) pairs.
    RETURNS: A list with one `(text, [(sentence, [])])` entry per example,
        where `sentence` is the tuple
        (ids, words, tags, heads, deps, ner_tags).
    """
    output = []
    for text, entity_offsets in examples:
        # Tokenize once and reuse the result: the words and the BILUO tags
        # are then derived from the very same tokenization.
        doc = tokenizer(text)
        ner_tags = biluo_tags_from_offsets(doc, entity_offsets)
        words = [w.text for w in doc]
        # Only the NER column carries annotation here; tags, heads and deps
        # are filled with placeholder values of matching length.
        tags = ['-'] * len(doc)
        heads = [0] * len(doc)
        deps = [''] * len(doc)
        sentence = (range(len(doc)), words, tags, heads, deps, ner_tags)
        output.append((text, [(sentence, [])]))
    return output
def main(model_dir=None):
nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
# v1.1.2 onwards
if nlp.tagger is None:
print('---- WARNING ----')
print('Data directory not found')
print('please run: `python -m spacy.en.download --force all` for better performance')
print('Using feature templates for tagging')
print('-----------------')
nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)
train_data = [
(
'Who is Shaka Khan?',
@ -74,23 +33,35 @@ def main(model_dir=None):
(len('I like London and '), len('I like London and Berlin'), 'LOC')]
)
]
ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])
doc = nlp.make_doc('Who is Shaka Khan?')
nlp.tagger(doc)
ner(doc)
for word in doc:
print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
if model_dir is not None:
save_model(ner, model_dir)
nlp = English(pipeline=['tensorizer', 'ner'])
get_data = lambda: reformat_train_data(nlp.tokenizer, train_data)
optimizer = nlp.begin_training(get_data)
for itn in range(100):
random.shuffle(train_data)
losses = {}
for raw_text, entity_offsets in train_data:
doc = nlp.make_doc(raw_text)
gold = GoldParse(doc, entities=entity_offsets)
nlp.update(
[doc], # Batch of Doc objects
[gold], # Batch of GoldParse objects
drop=0.5, # Dropout -- make it harder to memorise data
sgd=optimizer, # Callable to update weights
losses=losses)
print(losses)
print("Save to", model_dir)
nlp.to_disk(model_dir)
print("Load from", model_dir)
nlp = spacy.lang.en.English(pipeline=['tensorizer', 'ner'])
nlp.from_disk(model_dir)
for raw_text, _ in train_data:
doc = nlp(raw_text)
for word in doc:
print(word.text, word.ent_type_, word.ent_iob_)
if __name__ == '__main__':
main('ner')
import plac
plac.call(main)
# Who "" 2
# is "" 2
# Shaka "" PERSON 3

View File

@ -3,8 +3,8 @@ pathlib
numpy>=1.7
cymem>=1.30,<1.32
preshed>=1.0.0,<2.0.0
thinc>=6.6.0,<6.7.0
murmurhash>=0.26,<0.27
thinc>=6.7.3,<6.8.0
murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6
six
ujson>=1.35
@ -14,3 +14,6 @@ regex==2017.4.5
ftfy>=4.4.2,<5.0.0
pytest>=3.0.6,<4.0.0
pip>=9.0.0,<10.0.0
mock>=2.0.0,<3.0.0
msgpack-python
msgpack-numpy

View File

@ -44,7 +44,8 @@ MOD_NAMES = [
'spacy.matcher',
'spacy.syntax.ner',
'spacy.symbols',
'spacy.syntax.iterators']
'spacy.vectors',
]
COMPILE_OPTIONS = {
@ -188,10 +189,10 @@ def setup_package():
ext_modules=ext_modules,
install_requires=[
'numpy>=1.7',
'murmurhash>=0.26,<0.27',
'murmurhash>=0.28,<0.29',
'cymem>=1.30,<1.32',
'preshed>=1.0.0,<2.0.0',
'thinc>=6.6.0,<6.7.0',
'thinc>=6.7.3,<6.8.0',
'plac<1.0.0,>=0.9.6',
'pip>=9.0.0,<10.0.0',
'six',
@ -200,7 +201,9 @@ def setup_package():
'dill>=0.2,<0.3',
'requests>=2.13.0,<3.0.0',
'regex==2017.4.5',
'ftfy>=4.4.2,<5.0.0'],
'ftfy>=4.4.2,<5.0.0',
'msgpack-python',
'msgpack-numpy'],
classifiers=[
'Development Status :: 5 - Production/Stable',
'Environment :: Console',

View File

@ -1,22 +1,17 @@
# coding: utf8
from __future__ import unicode_literals
import importlib
from .compat import basestring_
from .cli.info import info
from .cli.info import info as cli_info
from .glossary import explain
from .deprecated import resolve_load_name
from .about import __version__
from . import util
def load(name, **overrides):
name = resolve_load_name(name, **overrides)
model_path = util.resolve_model_path(name)
meta = util.parse_package_meta(model_path)
if 'lang' not in meta:
raise IOError('No language setting found in model meta.')
cls = util.get_lang_class(meta['lang'])
overrides['meta'] = meta
overrides['path'] = model_path
return cls(**overrides)
return util.load_model(name, **overrides)
def info(model=None, markdown=False):
    """Call the `info` CLI command from Python.

    model: Forwarded to the CLI command as `model`.
    markdown: Forwarded to the CLI command as `markdown`.
    RETURNS: Whatever the CLI command returns.
    """
    # The CLI function takes a leading `cmd` argument, which is passed as None.
    return cli_info(None, model=model, markdown=markdown)

View File

@ -3,135 +3,21 @@ from __future__ import print_function
# NB! This breaks in plac on Python 2!!
#from __future__ import unicode_literals
import plac
from spacy.cli import download as cli_download
from spacy.cli import link as cli_link
from spacy.cli import info as cli_info
from spacy.cli import package as cli_package
from spacy.cli import train as cli_train
from spacy.cli import model as cli_model
from spacy.cli import convert as cli_convert
class CLI(object):
    """
    Command-line interface for spaCy
    """
    # Each name listed here is a method of this class. Every method is a thin
    # wrapper: the plac annotations describe its arguments and the body
    # forwards them to the matching `spacy.cli` function.
    commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')

    @plac.annotations(
        model=("model to download (shortcut or model name)", "positional", None, str),
        direct=("force direct download. Needs model name with version and won't "
                "perform compatibility check", "flag", "d", bool)
    )
    def download(self, model, direct=False):
        """
        Download compatible model from default download path using pip. Model
        can be shortcut, model name or, if --direct flag is set, full model name
        with version.
        """
        cli_download(model, direct)

    @plac.annotations(
        origin=("package name or local path to model", "positional", None, str),
        link_name=("name of shortcut link to create", "positional", None, str),
        force=("force overwriting of existing link", "flag", "f", bool)
    )
    def link(self, origin, link_name, force=False):
        """
        Create a symlink for models within the spacy/data directory. Accepts
        either the name of a pip package, or the local path to the model data
        directory. Linking models allows loading them via spacy.load(link_name).
        """
        cli_link(origin, link_name, force)

    # `markdown` is an on/off flag, so it is typed bool like the other flags.
    @plac.annotations(
        model=("optional: shortcut link of model", "positional", None, str),
        markdown=("generate Markdown for GitHub issues", "flag", "md", bool)
    )
    def info(self, model=None, markdown=False):
        """
        Print info about spaCy installation. If a model shortcut link is
        specified as an argument, print model information. Flag --markdown
        prints details in Markdown for easy copy-pasting to GitHub issues.
        """
        cli_info(model, markdown)

    @plac.annotations(
        input_dir=("directory with model data", "positional", None, str),
        output_dir=("output parent directory", "positional", None, str),
        meta=("path to meta.json", "option", "m", str),
        force=("force overwriting of existing folder in output directory", "flag", "f", bool)
    )
    def package(self, input_dir, output_dir, meta=None, force=False):
        """
        Generate Python package for model data, including meta and required
        installation files. A new directory will be created in the specified
        output directory, and model data will be copied over.
        """
        cli_package(input_dir, output_dir, meta, force)

    @plac.annotations(
        lang=("model language", "positional", None, str),
        output_dir=("output directory to store model in", "positional", None, str),
        train_data=("location of JSON-formatted training data", "positional", None, str),
        dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
        n_iter=("number of iterations", "option", "n", int),
        nsents=("number of sentences", "option", None, int),
        parser_L1=("L1 regularization penalty for parser", "option", "L", float),
        use_gpu=("Use GPU", "flag", "g", bool),
        no_tagger=("Don't train tagger", "flag", "T", bool),
        no_parser=("Don't train parser", "flag", "P", bool),
        no_ner=("Don't train NER", "flag", "N", bool)
    )
    def train(self, lang, output_dir, train_data, dev_data=None, n_iter=15,
              nsents=0, parser_L1=0.0, use_gpu=False,
              no_tagger=False, no_parser=False, no_ner=False):
        """
        Train a model. Expects data in spaCy's JSON format.
        """
        # The command-line default of 0 is passed on as None.
        nsents = nsents or None
        # The CLI exposes "don't train X" flags; they are negated here before
        # being handed to the training function.
        cli_train(lang, output_dir, train_data, dev_data, n_iter, nsents,
                  use_gpu, not no_tagger, not no_parser, not no_ner, parser_L1)

    @plac.annotations(
        lang=("model language", "positional", None, str),
        model_dir=("output directory to store model in", "positional", None, str),
        freqs_data=("tab-separated frequencies file", "positional", None, str),
        clusters_data=("Brown clusters file", "positional", None, str),
        vectors_data=("word vectors file", "positional", None, str)
    )
    def model(self, lang, model_dir, freqs_data, clusters_data=None, vectors_data=None):
        """
        Initialize a new model and its data directory.
        """
        cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)

    # `n_sents` is a count of sentences, so it is parsed as an int.
    @plac.annotations(
        input_file=("input file", "positional", None, str),
        output_dir=("output directory for converted file", "positional", None, str),
        n_sents=("Number of sentences per doc", "option", "n", int),
        morphology=("Enable appending morphology to tags", "flag", "m", bool)
    )
    def convert(self, input_file, output_dir, n_sents=10, morphology=False):
        """
        Convert files into JSON format for use with train command and other
        experiment management functions.
        """
        cli_convert(input_file, output_dir, n_sents, morphology)

    def __missing__(self, name):
        # Prints a hint for a command name that is not available.
        print("\n   Command %r does not exist."
              "\n   Use the --help flag for a list of available commands.\n" % name)
if __name__ == '__main__':
import plac
import sys
sys.argv[0] = 'spacy'
plac.Interpreter.call(CLI)
from spacy.cli import download, link, info, package, train, convert
from spacy.util import prints
commands = {'download': download, 'link': link, 'info': info, 'train': train,
'convert': convert, 'package': package}
if len(sys.argv) == 1:
prints(', '.join(commands), title="Available commands", exits=1)
command = sys.argv.pop(1)
sys.argv[0] = 'spacy %s' % command
if command in commands:
plac.call(commands[command])
else:
prints("Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command, exits=1)

View File

@ -1,3 +1,4 @@
import ujson
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed
@ -7,24 +8,33 @@ from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.static_vectors import StaticVectors
from thinc.neural._classes.batchnorm import BatchNorm
from thinc.neural._classes.resnet import Residual
from thinc.neural import ReLu
from thinc import describe
from thinc.describe import Dimension, Synapses, Biases, Gradient
from thinc.neural._classes.affine import _set_dimensions_if_needed
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP
from .attrs import ID, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
from .tokens.doc import Doc
import numpy
import io
def _init_for_precomputed(W, ops):
if (W**2).sum() != 0.:
return
reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
ops.xavier_uniform_init(reshaped)
W[:] = reshaped.reshape(W.shape)
@describe.on_data(_set_dimensions_if_needed)
@describe.attributes(
nI=Dimension("Input size"),
nF=Dimension("Number of features"),
nO=Dimension("Output size"),
W=Synapses("Weights matrix",
lambda obj: (obj.nO, obj.nF, obj.nI),
lambda W, ops: ops.xavier_uniform_init(W)),
lambda obj: (obj.nF, obj.nO, obj.nI),
lambda W, ops: _init_for_precomputed(W, ops)),
b=Biases("Bias vector",
lambda obj: (obj.nO,)),
d_W=Gradient("W"),
@ -39,25 +49,25 @@ class PrecomputableAffine(Model):
def begin_update(self, X, drop=0.):
# X: (b, i)
# Xf: (b, f, i)
# Yf: (b, f, i)
# dY: (b, o)
# dYf: (b, f, o)
#Yf = numpy.einsum('bi,ofi->bfo', X, self.W)
#Yf = numpy.einsum('bi,foi->bfo', X, self.W)
Yf = self.ops.xp.tensordot(
X, self.W, axes=[[1], [2]]).transpose((0, 2, 1))
X, self.W, axes=[[1], [2]])
Yf += self.b
def backward(dY_ids, sgd=None):
tensordot = self.ops.xp.tensordot
dY, ids = dY_ids
Xf = X[ids]
#dXf = numpy.einsum('bo,foi->bfi', dY, self.W)
dXf = tensordot(dY, self.W, axes=[[1], [1]])
#dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
dW = self.ops.xp.tensordot(dY, Xf, axes=[[0], [0]])
db = dY.sum(axis=0)
#dXf = numpy.einsum('bo,ofi->bfi', dY, self.W)
dXf = self.ops.xp.tensordot(dY, self.W, axes=[[1], [0]])
self.d_W += dW
self.d_b += db
dW = tensordot(dY, Xf, axes=[[0], [0]])
# ofi -> foi
self.d_W += dW.transpose((1, 0, 2))
self.d_b += dY.sum(axis=0)
if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id)
@ -80,10 +90,10 @@ class PrecomputableAffine(Model):
d_b=Gradient("b")
)
class PrecomputableMaxouts(Model):
def __init__(self, nO=None, nI=None, nF=None, pieces=3, **kwargs):
def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs):
Model.__init__(self, **kwargs)
self.nO = nO
self.nP = pieces
self.nP = nP
self.nI = nI
self.nF = nF
@ -121,37 +131,103 @@ class PrecomputableMaxouts(Model):
return Yfp, backward
def Tok2Vec(width, embed_size, preprocess=None):
cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE]
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
lower = get_col(cols.index(LOWER)) >> HashEmbed(width, embed_size)
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2)
suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2)
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2)
norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')
embed = (norm | prefix | suffix | shape )
tok2vec = (
flatten
>> (lower | prefix | suffix | shape )
>> Maxout(width, width*4, pieces=3)
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
with_flatten(
asarray(Model.ops, dtype='uint64')
>> embed
>> Maxout(width, width*4, pieces=3)
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
pad=4)
)
if preprocess not in (False, None):
tok2vec = preprocess >> tok2vec
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
tok2vec.nO = width
tok2vec.embed = embed
return tok2vec
def get_col(idx):
def asarray(ops, dtype):
    """Build a layer that converts its input via `ops.asarray`.

    ops: Object providing `asarray(X, dtype=...)`.
    dtype: dtype passed on to `ops.asarray`.
    RETURNS: The result of `layerize()` applied to the conversion function.
    """
    def forward(X, drop=0.):
        converted = ops.asarray(X, dtype=dtype)
        # The second item is the backprop callback; there is none here.
        return converted, None
    return layerize(forward)
def foreach(layer):
    """Wrap `layer` so that it is applied to each item of a sequence.

    layer: Object with `begin_update(X, drop=...)` returning an
        (output, backprop) pair.
    RETURNS: A layerized model whose forward pass returns the list of
        per-item outputs and a callback that backpropagates item by item.
    """
    def forward(Xs, drop=0.):
        outputs = []
        callbacks = []
        for item in Xs:
            output, callback = layer.begin_update(item, drop=drop)
            outputs.append(output)
            callbacks.append(callback)

        def backward(d_results, sgd=None):
            # Pair every incoming gradient with the callback of its item.
            return [callback(gradient, sgd)
                    for gradient, callback in zip(d_results, callbacks)]
        return outputs, backward

    wrapped = layerize(forward)
    # Register the wrapped layer on the new model.
    wrapped._layers.append(layer)
    return wrapped
def rebatch(size, layer):
    """Wrap `layer` so that large inputs are processed in chunks of `size`.

    size: Maximum number of rows handed to `layer` at once.
    layer: Object with `ops` and `begin_update(X, drop=...)` returning an
        (output, backprop) pair.
    RETURNS: A layerized model. Inputs with fewer than `size` rows are passed
        to `layer` unchanged; larger inputs are split, processed part by
        part, and the outputs joined with `ops.flatten`.
    """
    ops = layer.ops

    def forward(X, drop=0.):
        if X.shape[0] < size:
            # Small input: no splitting needed, but dropout must still be
            # forwarded to the wrapped layer.
            return layer.begin_update(X, drop=drop)
        parts = _divide_array(X, size)
        results, bp_results = zip(*[layer.begin_update(p, drop=drop)
                                    for p in parts])
        y = ops.flatten(results)

        def backward(dy, sgd=None):
            # Split the gradient the same way the input was split.
            d_parts = [bp(d_part, sgd=sgd) for bp, d_part in
                       zip(bp_results, _divide_array(dy, size))]
            try:
                dX = ops.flatten(d_parts)
            except (TypeError, ValueError):
                # The per-part gradients could not be joined (presumably
                # because the layer returned no input gradient).
                dX = None
            return dX
        return y, backward

    model = layerize(forward)
    model._layers.append(layer)
    return model
def _divide_array(X, size):
parts = []
index = 0
while index < len(X):
parts.append(X[index : index + size])
index += size
return parts
def get_col(idx):
assert idx >= 0, idx
def forward(X, drop=0.):
assert idx >= 0, idx
if isinstance(X, numpy.ndarray):
ops = NumpyOps()
else:
ops = CupyOps()
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
def backward(y, sgd=None):
assert idx >= 0, idx
dX = ops.allocate(X.shape)
dX[:, idx] += y
return dX
@ -167,21 +243,17 @@ def zero_init(model):
def doc2feats(cols=None):
cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE]
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
def forward(docs, drop=0.):
feats = []
for doc in docs:
if 'cached_feats' not in doc.user_data:
doc.user_data['cached_feats'] = model.ops.asarray(
doc.to_array(cols),
dtype='uint64')
feats.append(doc.user_data['cached_feats'])
assert feats[-1].dtype == 'uint64'
feats.append(doc.to_array(cols))
return feats, None
model = layerize(forward)
model.cols = cols
return model
def print_shape(prefix):
def forward(X, drop=0.):
return X, lambda dX, **kwargs: dX

View File

@ -2,16 +2,16 @@
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy'
__version__ = '1.8.2'
__title__ = 'spacy-nightly'
__version__ = '2.0.0a1'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'
__email__ = 'matt@explosion.ai'
__author__ = 'Explosion AI'
__email__ = 'contact@explosion.ai'
__license__ = 'MIT'
__docs_models__ = 'https://spacy.io/docs/usage/models'
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'
__model_files__ = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
__model_files__ = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/develop/templates/model/'

View File

@ -83,6 +83,7 @@ cpdef enum attr_id_t:
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB

View File

@ -85,6 +85,7 @@ IDS = {
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY,
"PROB": PROB,
"LANG": LANG,
@ -149,6 +150,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
else:
int_key = IDS[name.upper()]
if strings_map is not None and isinstance(value, basestring):
value = strings_map[value]
if hasattr(strings_map, 'add'):
value = strings_map.add(value)
else:
value = strings_map[value]
inty_attrs[int_key] = value
return inty_attrs

View File

@ -2,6 +2,5 @@ from .download import download
from .info import info
from .link import link
from .package import package
from .train import train, train_config
from .model import model
from .train import train
from .convert import convert

View File

@ -1,31 +1,43 @@
# coding: utf8
from __future__ import unicode_literals
import plac
from pathlib import Path
from .converters import conllu2json
from .converters import conllu2json, iob2json
from ..util import prints
# Converters are matched by file extension. To add a converter, add a new entry
# to this dict with the file extension mapped to the converter function imported
# from /converters.
CONVERTERS = {
'.conllu': conllu2json,
'.conll': conllu2json
'.conll': conllu2json,
'.iob': iob2json
}
def convert(input_file, output_dir, *args):
@plac.annotations(
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", float),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(cmd, input_file, output_dir, n_sents, morphology):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
"""
input_path = Path(input_file)
output_path = Path(output_dir)
if not input_path.exists():
prints(input_path, title="Input file not found", exits=True)
prints(input_path, title="Input file not found", exits=1)
if not output_path.exists():
prints(output_path, title="Output directory not found", exits=True)
prints(output_path, title="Output directory not found", exits=1)
file_ext = input_path.suffix
if not file_ext in CONVERTERS:
prints("Can't find converter for %s" % input_path.parts[-1],
title="Unknown format", exits=True)
CONVERTERS[file_ext](input_path, output_path, *args)
title="Unknown format", exits=1)
CONVERTERS[file_ext](input_path, output_path,
n_sents=n_sents, use_morphology=morphology)

View File

@ -1 +1,2 @@
from .conllu2json import conllu2json
from .iob2json import iob2json

View File

@ -73,10 +73,10 @@ def generate_sentence(sent):
tokens = []
for i, id in enumerate(id_):
token = {}
token["orth"] = word[id]
token["tag"] = tag[id]
token["head"] = head[id] - i
token["dep"] = dep[id]
token["orth"] = word[i]
token["tag"] = tag[i]
token["head"] = head[i] - id
token["dep"] = dep[i]
tokens.append(token)
sentence["tokens"] = tokens
return sentence

View File

@ -0,0 +1,45 @@
# coding: utf8
from __future__ import unicode_literals
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo
def iob2json(input_path, output_path, n_sents=10, *a, **k):
    """
    Convert IOB files into JSON format for use with train cli.

    input_path: Path object of the IOB file to read.
    output_path: Path object of the directory the JSON file is written to.
    n_sents, *a, **k: Accepted but not used in this function.
    """
    # TODO: Flagged by the original author as not complete yet (mapping from
    # IOB to BILUO).
    with input_path.open('r', encoding='utf8') as iob_file:
        docs = read_iob(iob_file)

    # Same file name as the input, with ".iob" swapped for ".json".
    json_name = input_path.parts[-1].replace(".iob", ".json")
    destination = output_path / json_name
    with destination.open('w', encoding='utf-8') as json_file:
        json_file.write(json_dumps(docs))
    prints("Created %d documents" % len(docs),
           title="Generated output file %s" % path2str(destination))
def read_iob(file_):
    """Parse an IOB file into a list of document dicts.

    file_: Iterable of lines. Each non-blank line is one sentence made of
        whitespace-separated tokens, whose fields are joined by '|': either
        word|pos|iob (three fields) or word|iob.
    RETURNS: One dict per sentence, shaped
        {'id': 0, 'paragraphs': [{'sentences': [{'tokens': [...]}]}]}.
    """
    docs = []
    for line in file_:
        if not line.strip():
            continue
        fields = [token.split('|') for token in line.split()]
        # The first token decides how the whole line is interpreted.
        if len(fields[0]) == 3:
            words, pos, iob = zip(*fields)
        else:
            words, iob = zip(*fields)
            # No POS column: use a placeholder tag for every word.
            pos = ['-'] * len(words)
        biluo = iob_to_biluo(iob)
        tokens = [{'orth': word, 'tag': tag, 'ner': entity}
                  for (word, tag, entity) in zip(words, pos, biluo)]
        # Every sentence becomes its own single-paragraph document.
        paragraph = {'sentences': [{'tokens': tokens}]}
        docs.append({'id': 0, 'paragraphs': [paragraph]})
    return docs

View File

@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import requests
import os
import subprocess
@ -11,7 +12,17 @@ from ..util import prints
from .. import about
def download(model, direct=False):
@plac.annotations(
model=("model to download (shortcut or model name)", "positional", None, str),
direct=("force direct download. Needs model name with version and won't "
"perform compatibility check", "flag", "d", bool)
)
def download(cmd, model, direct=False):
"""
Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name
with version.
"""
if direct:
download_model('{m}/{m}.tar.gz'.format(m=model))
else:
@ -20,7 +31,17 @@ def download(model, direct=False):
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
link(model_name, model, force=True)
try:
link(None, model_name, model, force=True)
except:
# Dirty, but since spacy.download and the auto-linking is mostly
# a convenience wrapper, it's best to show a success message and
# loading instructions, even if linking fails.
prints("Creating a shortcut link for 'en' didn't work (maybe you "
"don't have admin permissions?), but you can still load "
"the model via its full package name:",
"nlp = spacy.load('%s')" % model_name,
title="Download successful")
def get_json(url, desc):
@ -28,7 +49,7 @@ def get_json(url, desc):
if r.status_code != 200:
prints("Couldn't fetch %s. Please find a model for your spaCy installation "
"(v%s), and download it manually." % (desc, about.__version__),
about.__docs_models__, title="Server error (%d)" % r.status_code, exits=True)
about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1)
return r.json()
@ -38,7 +59,7 @@ def get_compatibility():
comp = comp_table['spacy']
if version not in comp:
prints("No compatible models found for v%s of spaCy." % version,
title="Compatibility error", exits=True)
title="Compatibility error", exits=1)
return comp[version]
@ -46,7 +67,7 @@ def get_version(model, comp):
if model not in comp:
version = about.__version__
prints("No compatible model found for '%s' (spaCy v%s)." % (model, version),
title="Compatibility error", exits=True)
title="Compatibility error", exits=1)
return comp[model][0]

View File

@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import platform
from pathlib import Path
@ -9,17 +10,30 @@ from .. import about
from .. import util
def info(model=None, markdown=False):
@plac.annotations(
model=("optional: shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
)
def info(cmd, model=None, markdown=False):
"""Print info about spaCy installation. If a model shortcut link is
specified as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues.
"""
if model:
data_path = util.get_data_path()
data = util.parse_package_meta(data_path / model, require=True)
model_path = Path(__file__).parent / data_path / model
if model_path.resolve() != model_path:
data['link'] = path2str(model_path)
data['source'] = path2str(model_path.resolve())
if util.is_package(model):
model_path = util.get_package_path(model)
else:
data['source'] = path2str(model_path)
print_info(data, 'model %s' % model, markdown)
model_path = util.get_data_path() / model
meta_path = model_path / 'meta.json'
if not meta_path.is_file():
util.prints(meta_path, title="Can't find model meta.json", exits=1)
meta = util.read_json(meta_path)
if model_path.resolve() != model_path:
meta['link'] = path2str(model_path)
meta['source'] = path2str(model_path.resolve())
else:
meta['source'] = path2str(model_path)
print_info(meta, 'model %s' % model, markdown)
else:
data = {'spaCy version': about.__version__,
'Location': path2str(Path(__file__).parent.parent),

View File

@ -1,24 +1,36 @@
# coding: utf8
from __future__ import unicode_literals
import plac
from pathlib import Path
from ..compat import symlink_to, path2str
from ..util import prints
from .. import util
def link(origin, link_name, force=False):
@plac.annotations(
origin=("package name or local path to model", "positional", None, str),
link_name=("name of shortuct link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
)
def link(cmd, origin, link_name, force=False):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""
if util.is_package(origin):
model_path = util.get_model_package_path(origin)
model_path = util.get_package_path(origin)
else:
model_path = Path(origin)
if not model_path.exists():
prints("The data should be located in %s" % path2str(model_path),
title="Can't locate model data", exits=True)
title="Can't locate model data", exits=1)
link_path = util.get_data_path() / link_name
if link_path.exists() and not force:
prints("To overwrite an existing link, use the --force flag.",
title="Link %s already exists" % link_name, exits=True)
title="Link %s already exists" % link_name, exits=1)
elif link_path.exists():
link_path.unlink()
try:
@ -33,5 +45,5 @@ def link(origin, link_name, force=False):
title="Error: Couldn't link model to '%s'" % link_name)
raise
prints("%s --> %s" % (path2str(model_path), path2str(link_path)),
"You can now load the model via spacy.load('%s')." % link_name,
"You can now load the model via spacy.load('%s')" % link_name,
title="Linking successful")

View File

@ -1,122 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
import gzip
import math
from ast import literal_eval
from preshed.counter import PreshCounter
from ..vocab import write_binary_vectors
from ..compat import fix_text, path2str
from ..util import prints
from .. import util
def model(lang, model_dir, freqs_data, clusters_data, vectors_data):
    """Initialise a model directory from frequency, cluster and vector files.

    lang: Language identifier used to look up the language class.
    model_dir: Output directory for the model.
    freqs_data: Location of the frequencies file (must exist).
    clusters_data: Location of the Brown clusters file; checked only when
        the resolved path is truthy.
    vectors_data: Location of the word vectors file; checked only when the
        resolved path is truthy.
    """
    model_path, freqs_path, clusters_path, vectors_path = [
        util.ensure_path(location)
        for location in (model_dir, freqs_data, clusters_data, vectors_data)]

    # Validate the inputs before doing any work.
    if not freqs_path.is_file():
        prints(freqs_path, title="No frequencies file found", exits=True)
    if clusters_path and not clusters_path.is_file():
        prints(clusters_path, title="No Brown clusters file found", exits=True)
    if vectors_path and not vectors_path.is_file():
        prints(vectors_path, title="No word vectors file found", exits=True)

    vocab = util.get_lang_class(lang).Defaults.create_vocab()
    probs, oov_prob = read_probs(freqs_path)
    if clusters_path:
        clusters = read_clusters(clusters_path)
    else:
        clusters = {}
    populate_vocab(vocab, clusters, probs, oov_prob)
    create_model(model_path, vectors_path, vocab, oov_prob)
def create_model(model_path, vectors_path, vocab, oov_prob):
    """Write the vocab data files into `model_path / 'vocab'`.

    model_path (Path): Model directory, created if it doesn't exist.
    vectors_path (Path): Word vectors to convert to binary, or a falsy
        value to skip vectors.
    vocab: The vocab to dump (lexemes and strings).
    oov_prob (float): Out-of-vocabulary probability, written as text.
    """
    vocab_path = model_path / 'vocab'
    # Parent first, so the plain mkdir() on the child succeeds.
    for directory in (model_path, vocab_path):
        if not directory.exists():
            directory.mkdir()
    vocab.dump(path2str(vocab_path / 'lexemes.bin'))
    with (vocab_path / 'strings.json').open('w') as strings_file:
        vocab.strings.dump(strings_file)
    with (vocab_path / 'oov_prob').open('w') as oov_file:
        oov_file.write('%f' % oov_prob)
    if vectors_path:
        write_binary_vectors(path2str(vectors_path),
                             path2str(vocab_path / 'vec.bin'))
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
    """Read word frequencies and convert them to smoothed log probabilities.

    The file is read twice: once to accumulate the counts used for
    smoothing, and once to compute the per-word probabilities. Each line
    holds three tab-separated fields: frequency, document frequency and the
    word written as a Python literal.

    freqs_path (Path): Location of the frequencies file (may be gzipped).
    max_length (int): Entries whose literal is this long or longer are skipped.
    min_doc_freq (int): Minimum document frequency for a word to be kept.
    min_freq (int): Minimum frequency for a word to be kept.
    RETURNS (tuple): `(probs, oov_prob)`, where `probs` maps each kept word
        to its log probability and `oov_prob` is the log probability
        assigned to a count of zero.
    """
    counts = PreshCounter()
    total = 0
    # First pass: collect counts. The handle is closed as soon as the pass
    # is done instead of being left open until garbage collection.
    with check_unzip(freqs_path) as freqs_file:
        for i, line in enumerate(freqs_file):
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            freq = int(freq)
            counts.inc(i+1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    # Second pass: keep only the entries that pass the frequency and length
    # thresholds and turn their smoothed counts into log probabilities.
    with check_unzip(freqs_path) as freqs_file:
        for line in freqs_file:
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
                word = literal_eval(key)
                smooth_count = counts.smoother(freq)
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
def read_clusters(clusters_path):
    """Read a clusters file into a word -> cluster mapping.

    Each usable line has three whitespace-separated fields: cluster, word
    and frequency. Lines that don't split into exactly three fields are
    skipped.

    clusters_path (Path): Location of the clusters file.
    RETURNS (dict): Cluster strings keyed by word, including lower-, title-
        and upper-cased variants of each word where those weren't present.
    """
    clusters = {}
    with clusters_path.open() as file_:
        for line in file_:
            try:
                cluster, word, freq = line.split()
                word = fix_text(word)
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its
            # cluster is unreliable.
            clusters[word] = cluster if int(freq) >= 3 else '0'
    # Expand clusters with re-casing. Existing entries always win, so a
    # snapshot of the items is iterated while new variants are added.
    for word, cluster in list(clusters.items()):
        for variant in (word.lower(), word.title(), word.upper()):
            clusters.setdefault(variant, cluster)
    return clusters
def populate_vocab(vocab, clusters, probs, oov_prob):
    """Set probability, OOV flag and cluster on the vocab's lexemes.

    Words are visited from highest to lowest probability.

    vocab: The vocab to populate; indexed by word to obtain lexemes.
    clusters (dict): Binary cluster strings keyed by word.
    probs (dict): Log probabilities keyed by word.
    oov_prob (float): Not used by this function.
    """
    ascending = sorted(probs.items(), key=lambda entry: entry[1])
    for word, prob in reversed(ascending):
        lexeme = vocab[word]
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        lexeme.cluster = int(clusters[word][::-1], 2) if word in clusters else 0
def check_unzip(file_path):
    """Open a file for line-by-line text reading, decompressing it if its
    name ends in 'gz'.

    file_path (Path): Location of the file.
    RETURNS: A file-like object that yields text lines and can be used as a
        context manager.
    """
    import codecs
    file_path_str = path2str(file_path)
    if file_path_str.endswith('gz'):
        # gzip.open() yields bytes on Python 3, whereas the plain branch
        # below yields text. Callers split lines on a unicode separator, so
        # decode here to make both branches behave the same.
        # NOTE(review): assumes gzipped data files are UTF-8 -- confirm.
        return codecs.getreader('utf8')(gzip.open(file_path_str, 'rb'))
    return file_path.open()

View File

@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import shutil
import requests
from pathlib import Path
@ -11,27 +12,38 @@ from .. import util
from .. import about
def package(input_dir, output_dir, meta_path, force):
@plac.annotations(
input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str),
meta=("path to meta.json", "option", "m", str),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
)
def package(cmd, input_dir, output_dir, meta=None, force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
output directory, and model data will be copied over.
"""
input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta_path)
meta_path = util.ensure_path(meta)
if not input_path or not input_path.exists():
prints(input_path, title="Model directory not found", exits=True)
prints(input_path, title="Model directory not found", exits=1)
if not output_path or not output_path.exists():
prints(output_path, title="Output directory not found", exits=True)
prints(output_path, title="Output directory not found", exits=1)
if meta_path and not meta_path.exists():
prints(meta_path, title="meta.json not found", exits=True)
prints(meta_path, title="meta.json not found", exits=1)
template_setup = get_template('setup.py')
template_manifest = get_template('MANIFEST.in')
template_init = get_template('en_model_name/__init__.py')
template_init = get_template('xx_model_name/__init__.py')
meta_path = meta_path or input_path / 'meta.json'
if meta_path.is_file():
prints(meta_path, title="Reading meta.json from file")
meta = util.read_json(meta_path)
else:
meta = generate_meta()
validate_meta(meta, ['lang', 'name', 'version'])
meta = validate_meta(meta, ['lang', 'name', 'version'])
model_name = meta['lang'] + '_' + meta['name']
model_name_v = model_name + '-' + meta['version']
@ -55,7 +67,7 @@ def create_dirs(package_path, force):
else:
prints(package_path, "Please delete the directory and try again, or "
"use the --force flag to overwrite existing directories.",
title="Package directory already exists", exits=True)
title="Package directory already exists", exits=1)
Path.mkdir(package_path, parents=True)
@ -68,31 +80,45 @@ def generate_meta():
settings = [('lang', 'Model language', 'en'),
('name', 'Model name', 'model'),
('version', 'Model version', '0.0.0'),
('spacy_version', 'Required spaCy version', '>=2.0.0,<3.0.0'),
('spacy_version', 'Required spaCy version', '>=%s,<3.0.0' % about.__version__),
('description', 'Model description', False),
('author', 'Author', False),
('email', 'Author email', False),
('url', 'Author website', False),
('license', 'License', 'CC BY-NC 3.0')]
prints("Enter the package settings for your model.", title="Generating meta.json")
meta = {}
for setting, desc, default in settings:
response = util.get_raw_input(desc, default)
meta[setting] = default if response == '' and default else response
meta['pipeline'] = generate_pipeline()
if about.__title__ != 'spacy':
meta['parent_package'] = about.__title__
return meta
def generate_pipeline():
    """Prompt the user for the model's pipeline components.

    RETURNS (bool or list): `True` or `False` if that literal setting was
        entered (or nothing was entered, in which case the advertised
        default `True` applies), otherwise the list of component names.
    """
    prints("If set to 'True', the default pipeline is used. If set to 'False', "
           "the pipeline will be disabled. Components should be specified as a "
           "comma-separated list of component names, e.g. vectorizer, tagger, "
           "parser, ner. For more information, see the docs on processing pipelines.",
           title="Enter your model's pipeline components")
    pipeline = util.get_raw_input("Pipeline components", True)
    # generate_meta() treats an empty response as "use the default", so do
    # the same here instead of producing a pipeline of one empty name.
    # The bool check covers get_raw_input handing the default straight back.
    if isinstance(pipeline, bool):
        return pipeline
    pipeline = pipeline.strip()
    if not pipeline:
        return True
    replace = {'True': True, 'False': False}
    if pipeline in replace:
        return replace[pipeline]
    # Accept any spacing around the commas ("a,b", "a, b", "a ,  b").
    names = [name.strip() for name in pipeline.split(',')]
    return [name for name in names if name]
def validate_meta(meta, keys):
for key in keys:
if key not in meta or meta[key] == '':
prints("This setting is required to build your package.",
title='No "%s" setting found in meta.json' % key, exits=True)
title='No "%s" setting found in meta.json' % key, exits=1)
return meta
def get_template(filepath):
r = requests.get(about.__model_files__ + filepath)
if r.status_code != 200:
prints("Couldn't fetch template files from GitHub.",
title="Server error (%d)" % r.status_code, exits=True)
title="Server error (%d)" % r.status_code, exits=1)
return r.text

View File

@ -1,132 +1,152 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import plac
import json
from collections import defaultdict
import cytoolz
from pathlib import Path
import dill
import tqdm
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer
from ..tokens.doc import Doc
from ..scorer import Scorer
from ..gold import GoldParse, merge_sents
from ..gold import read_json_file as read_gold_json
from ..gold import GoldCorpus, minibatch
from ..util import prints
from .. import util
from .. import displacy
from ..compat import json_dumps
def train(language, output_dir, train_data, dev_data, n_iter, n_sents,
use_gpu, tagger, parser, ner, parser_L1):
@plac.annotations(
lang=("model language", "positional", None, str),
output_dir=("output directory to store model in", "positional", None, str),
train_data=("location of JSON-formatted training data", "positional", None, str),
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
n_iter=("number of iterations", "option", "n", int),
n_sents=("number of sentences", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int),
resume=("Whether to resume training", "flag", "R", bool),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool)
)
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False):
"""
Train a model. Expects data in spaCy's JSON format.
"""
util.set_env_log(True)
n_sents = n_sents or None
output_path = util.ensure_path(output_dir)
train_path = util.ensure_path(train_data)
dev_path = util.ensure_path(dev_data)
if not output_path.exists():
prints(output_path, title="Output directory not found", exits=True)
output_path.mkdir()
if not train_path.exists():
prints(train_path, title="Training data not found", exits=True)
prints(train_path, title="Training data not found", exits=1)
if dev_path and not dev_path.exists():
prints(dev_path, title="Development data not found", exits=True)
prints(dev_path, title="Development data not found", exits=1)
lang = util.get_lang_class(language)
parser_cfg = {
'pseudoprojective': True,
'L1': parser_L1,
'n_iter': n_iter,
'lang': language,
'features': lang.Defaults.parser_features}
entity_cfg = {
'n_iter': n_iter,
'lang': language,
'features': lang.Defaults.entity_features}
tagger_cfg = {
'n_iter': n_iter,
'lang': language,
'features': lang.Defaults.tagger_features}
gold_train = list(read_gold_json(train_path, limit=n_sents))
gold_dev = list(read_gold_json(dev_path, limit=n_sents)) if dev_path else None
lang_class = util.get_lang_class(lang)
train_model(lang, gold_train, gold_dev, output_path, n_iter, use_gpu=use_gpu)
if gold_dev:
scorer = evaluate(lang, gold_dev, output_path)
print_results(scorer)
pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies')
if no_entities and 'entities' in pipeline: pipeline.remove('entities')
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
# Batch size starts at 1 and grows, so that we make updates quickly
# at the beginning of training.
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
util.env_opt('dropout_to', 0.2),
util.env_opt('dropout_decay', 0.0))
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
util.env_opt('batch_to', 64),
util.env_opt('batch_compound', 1.001))
if resume:
prints(output_path / 'model19.pickle', title="Resuming training")
nlp = dill.load((output_path / 'model19.pickle').open('rb'))
else:
nlp = lang_class(pipeline=pipeline)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
try:
for i in range(n_iter):
if resume:
i += 20
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
train_docs = corpus.train_docs(nlp, projectivize=True,
gold_preproc=False, max_length=0)
losses = {}
for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses)
pbar.update(sum(len(doc) for doc in docs))
with nlp.use_params(optimizer.averages):
util.set_env_log(False)
epoch_model_path = output_path / ('model%d' % i)
nlp.to_disk(epoch_model_path)
with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
dill.dump(nlp, file_, -1)
nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
scorer = nlp_loaded.evaluate(
corpus.dev_docs(
nlp_loaded,
gold_preproc=False))
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_:
file_.write(json_dumps(scorer.scores))
util.set_env_log(True)
print_progress(i, losses, scorer.scores)
finally:
print("Saving model...")
with (output_path / 'model-final.pickle').open('wb') as file_:
with nlp.use_params(optimizer.averages):
dill.dump(nlp, file_, -1)
def train_config(config):
    """Load a JSON training config and report any missing settings.

    config: Location of the JSON config file, resolved via
        `util.ensure_path`.
    """
    config_path = util.ensure_path(config)
    if not config_path.is_file():
        prints(config_path, title="Config file not found", exits=True)
    # json.load() expects an open file object, not a path.
    with config_path.open() as file_:
        config = json.load(file_)
    # The list of required settings is currently empty, so nothing is
    # reported yet.
    for setting in []:
        if setting not in config:
            prints("%s not found in config file." % setting, title="Missing setting")
def _render_parses(i, to_render):
    """Dump displaCy visualisations of the first five docs to /tmp.

    i (int): Batch number, used as the title of the first doc.
    to_render (list): Docs to visualise; must not be empty.
    """
    to_render[0].user_data['title'] = "Batch %d" % i
    outputs = (('ent', '/tmp/entities.html'), ('dep', '/tmp/parses.html'))
    for style, location in outputs:
        with Path(location).open('w') as file_:
            file_.write(displacy.render(to_render[:5], style=style, page=True))
# Train a pipeline of token vectors, tagger and dependency parser for
# `n_iter` epochs, printing a progress row per epoch and pickling the
# resulting `nlp` object with dill to `output_path / 'model.bin'`.
#   Language:    callable that builds the pipeline object
#   train_data:  passed to `nlp.begin_training`
#   dev_data:    if truthy, evaluated with the averaged parameters
#   output_path: path object supporting `/` and `.open()`
#   n_iter:      number of epochs requested from the trainer
#   **cfg:       forwarded to `nlp.begin_training`
# NOTE(review): indentation is not visible here, so whether the HTML dumps,
# the dev evaluation and the model pickling run per batch, per epoch or once
# at the end could not be confirmed -- check against the original layout.
def train_model(Language, train_data, dev_data, output_path, n_iter, **cfg):
print("Itn.\tDep. Loss\tUAS\tNER F.\tTag %\tToken %")
nlp = Language(pipeline=['token_vectors', 'tags', 'dependencies'])
dropout = util.env_opt('dropout', 0.0)
# TODO: Get spaCy using Thinc's trainer and optimizer
with nlp.begin_training(train_data, **cfg) as (trainer, optimizer):
for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=True)):
losses = defaultdict(float)
to_render = []
for i, (docs, golds) in enumerate(epoch):
state = nlp.update(docs, golds, drop=dropout, sgd=optimizer)
# Accumulate the losses reported by the update; missing keys count as 0.
losses['dep_loss'] += state.get('parser_loss', 0.0)
losses['tag_loss'] += state.get('tag_loss', 0.0)
# Re-parse the last doc of the batch and keep the newest parse first.
to_render.insert(0, nlp(docs[-1].text))
to_render[0].user_data['title'] = "Batch %d" % i
# Write the five most recent parses as full HTML pages for inspection.
with Path('/tmp/entities.html').open('w') as file_:
html = displacy.render(to_render[:5], style='ent', page=True)
file_.write(html)
with Path('/tmp/parses.html').open('w') as file_:
html = displacy.render(to_render[:5], style='dep', page=True)
file_.write(html)
if dev_data:
with nlp.use_params(optimizer.averages):
dev_scores = trainer.evaluate(dev_data).scores
else:
# Without dev data every score column defaults to 0.0.
dev_scores = defaultdict(float)
print_progress(itn, losses, dev_scores)
with (output_path / 'model.bin').open('wb') as file_:
dill.dump(nlp, file_, -1)
#nlp.to_disk(output_path, tokenizer=False)
# Load the pickled pipeline from `path / 'model.bin'` and score it.
# NOTE(review): the body below references `dev_sents`, `gold_preproc` and
# `self`, none of which are parameters or assigned in this function, while
# `Language`, `gold_tuples` and the loaded `nlp` are never used. It looks
# like a method body pasted from a Trainer class (see the TODO); unless
# those names exist at module level, calling this raises NameError.
def evaluate(Language, gold_tuples, path):
with (path / 'model.bin').open('rb') as file_:
nlp = dill.load(file_)
# TODO:
# 1. This code is duplicate with spacy.train.Trainer.evaluate
# 2. There's currently a semantic difference between pipe and
# not pipe! It matters whether we batch the inputs. Must fix!
all_docs = []
all_golds = []
for raw_text, paragraph_tuples in dev_sents:
if gold_preproc:
raw_text = None
else:
paragraph_tuples = merge_sents(paragraph_tuples)
docs = self.make_docs(raw_text, paragraph_tuples)
golds = self.make_golds(docs, paragraph_tuples)
all_docs.extend(docs)
all_golds.extend(golds)
# Score every processed doc against its gold annotation.
scorer = Scorer()
for doc, gold in zip(self.nlp.pipe(all_docs), all_golds):
scorer.score(doc, gold)
return scorer
def print_progress(itn, losses, dev_scores):
# TODO: Fix!
def print_progress(itn, losses, dev_scores, wps=0.0):
scores = {}
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', 'ents_f']:
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
'ents_p', 'ents_r', 'ents_f', 'wps']:
scores[col] = 0.0
scores.update(losses)
scores['dep_loss'] = losses.get('parser', 0.0)
scores['tag_loss'] = losses.get('tagger', 0.0)
scores.update(dev_scores)
tpl = '{:d}\t{dep_loss:.3f}\t{tag_loss:.3f}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
scores['wps'] = wps
tpl = '\t'.join((
'{:d}',
'{dep_loss:.3f}',
'{uas:.3f}',
'{ents_p:.3f}',
'{ents_r:.3f}',
'{ents_f:.3f}',
'{tags_acc:.3f}',
'{token_acc:.3f}',
'{wps:.1f}'))
print(tpl.format(itn, **scores))

View File

@ -6,6 +6,8 @@ import ftfy
import sys
import ujson
from thinc.neural.util import copy_array
try:
import cPickle as pickle
except ImportError:
@ -32,6 +34,7 @@ copy_reg = copy_reg
CudaStream = CudaStream
cupy = cupy
fix_text = ftfy.fix_text
copy_array = copy_array
is_python2 = six.PY2
is_python3 = six.PY3
@ -56,6 +59,11 @@ elif is_python3:
json_dumps = lambda data: ujson.dumps(data, indent=2)
path2str = lambda path: str(path)
def getattr_(obj, name, *default):
    """Version of `getattr` that also accepts a bytes attribute name on
    Python 3, decoding it as UTF-8 first.

    obj: Object to read the attribute from.
    name: Attribute name.
    *default: Optional fallback, passed through to `getattr`.
    RETURNS: The attribute value (or the fallback).
    """
    needs_decoding = is_python3 and isinstance(name, bytes)
    attr_name = name.decode('utf8') if needs_decoding else name
    return getattr(obj, attr_name, *default)
def symlink_to(orig, dest):
if is_python2 and is_windows:
@ -71,3 +79,16 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
(windows == None or windows == is_windows) and
(linux == None or linux == is_linux) and
(osx == None or osx == is_osx))
def normalize_string_keys(old):
    """Return a copy of a dictionary whose bytes keys have been decoded to
    unicode strings (as UTF-8); all other keys are kept as they are.

    old (dict): The dictionary to normalize.
    RETURNS (dict): A new dictionary with the same values.
    """
    return {(key.decode('utf8') if isinstance(key, bytes_) else key): value
            for key, value in old.items()}

View File

@ -10,27 +10,28 @@ _html = {}
IS_JUPYTER = is_in_jupyter()
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, options={}):
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
options={}, manual=False):
"""Render displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
style (unicode): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
jupyter (bool): Experimental, use Jupyter's display() to output markup.
jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
RETURNS (unicode): Rendered HTML markup.
"""
if isinstance(docs, Doc):
docs = [docs]
if style == 'dep':
renderer = DependencyRenderer(options=options)
parsed = [parse_deps(doc, options) for doc in docs]
elif style == 'ent':
renderer = EntityRenderer(options=options)
parsed = [parse_ents(doc, options) for doc in docs]
else:
factories = {'dep': (DependencyRenderer, parse_deps),
'ent': (EntityRenderer, parse_ents)}
if style not in factories:
raise ValueError("Unknown style: %s" % style)
if isinstance(docs, Doc) or isinstance(docs, dict):
docs = [docs]
renderer, converter = factories[style]
renderer = renderer(options=options)
parsed = [converter(doc, options) for doc in docs] if not manual else docs
_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
html = _html['parsed']
if jupyter: # return HTML rendered by IPython display()
@ -39,7 +40,8 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, opti
return html
def serve(docs, style='dep', page=True, minify=False, options={}, port=5000):
def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
port=5000):
"""Serve displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
@ -47,13 +49,19 @@ def serve(docs, style='dep', page=True, minify=False, options={}, port=5000):
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
port (int): Port to serve visualisation.
"""
from wsgiref import simple_server
render(docs, style=style, page=page, minify=minify, options=options)
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server('0.0.0.0', port, app)
prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
httpd.serve_forever()
try:
httpd.serve_forever()
except KeyboardInterrupt:
prints("Shutting down server on port %d." % port)
finally:
httpd.server_close()
def app(environ, start_response):
@ -62,12 +70,13 @@ def app(environ, start_response):
return [res]
def parse_deps(doc, options={}):
def parse_deps(orig_doc, options={}):
"""Generate dependency parse in {'words': [], 'arcs': []} format.
doc (Doc): Document do parse.
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
if options.get('collapse_punct', True):
spans = []
for word in doc[:-1]:

View File

@ -18,12 +18,11 @@ class DependencyRenderer(object):
offset_x, color, bg, font)
"""
self.compact = options.get('compact', False)
distance, arrow_width = (85, 8) if self.compact else (175, 10)
self.word_spacing = options.get('word_spacing', 45)
self.arrow_spacing = options.get('arrow_spacing', 20)
self.arrow_width = options.get('arrow_width', arrow_width)
self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
self.arrow_stroke = options.get('arrow_stroke', 2)
self.distance = options.get('distance', distance)
self.distance = options.get('distance', 150 if self.compact else 175)
self.offset_x = options.get('offset_x', 50)
self.color = options.get('color', '#000000')
self.bg = options.get('bg', '#ffffff')
@ -99,6 +98,8 @@ class DependencyRenderer(object):
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
-self.arrow_spacing*(self.highest_level-level)/4)
y_curve = self.offset_y-level*self.distance/2
if self.compact:
y_curve = self.offset_y-level*self.distance/6
if y_curve == 0 and len(self.levels) > 5:
y_curve = -self.distance
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
@ -175,7 +176,7 @@ class EntityRenderer(object):
minify (bool): Minify HTML markup.
RETURNS (unicode): Rendered HTML markup.
"""
rendered = [self.render_ents(p['text'], p['ents'], p['title']) for p in parsed]
rendered = [self.render_ents(p['text'], p['ents'], p.get('title', None)) for p in parsed]
if page:
docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered])
markup = TPL_PAGE.format(content=docs)

View File

@ -21,7 +21,7 @@ TPL_DEP_WORDS = """
TPL_DEP_ARCS = """
<g class="displacy-arrow">
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
<text dy="1.25em" style="font-size: 0.8em">
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
<textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath>
</text>
<path class="displacy-arrowhead" d="{head}" fill="currentColor"/>

View File

@ -1,13 +1,15 @@
from cymem.cymem cimport Pool
from .structs cimport TokenC
from .typedefs cimport attr_t
from .syntax.transition_system cimport Transition
cdef struct GoldParseC:
int* tags
int* heads
int* labels
int* has_dep
attr_t* labels
int** brackets
Transition* ner
@ -18,15 +20,15 @@ cdef class GoldParse:
cdef GoldParseC c
cdef int length
cdef readonly int loss
cdef readonly list words
cdef readonly list tags
cdef readonly list heads
cdef readonly list labels
cdef readonly dict orths
cdef readonly list ner
cdef readonly list ents
cdef readonly dict brackets
cdef public int loss
cdef public list words
cdef public list tags
cdef public list heads
cdef public list labels
cdef public dict orths
cdef public list ner
cdef public list ents
cdef public dict brackets
cdef readonly list cand_to_gold
cdef readonly list gold_to_cand

View File

@ -5,10 +5,13 @@ from __future__ import unicode_literals, print_function
import io
import re
import ujson
import random
import cytoolz
from .syntax import nonproj
from .util import ensure_path
from . import util
from .tokens import Doc
def tags_to_entities(tags):
@ -86,8 +89,8 @@ def _min_edit_path(cand_words, gold_words):
# TODO: Fix this --- just do it properly, make the full edit matrix and
# then walk back over it...
# Preprocess inputs
cand_words = [punct_re.sub('', w) for w in cand_words]
gold_words = [punct_re.sub('', w) for w in gold_words]
cand_words = [punct_re.sub('', w).lower() for w in cand_words]
gold_words = [punct_re.sub('', w).lower() for w in gold_words]
if cand_words == gold_words:
return 0, ''.join(['M' for _ in gold_words])
@ -139,8 +142,164 @@ def _min_edit_path(cand_words, gold_words):
return prev_costs[n_gold], previous_row[-1]
def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
make_supertags = util.env_opt('make_supertags', make_supertags)
def minibatch(items, size=8):
    """Iterate over batches of items.

    items (iterable): The items to batch.
    size (int, float or iterator): The batch size. Either a single number
        used for every batch, or an iterator that yields the size of each
        successive batch, so that batch-size can vary on each step. Sizes
        are truncated to integers.
    YIELDS (list): Consecutive batches; the last one may be shorter.
    """
    from itertools import islice, repeat
    # A plain number (including the default) is not iterable, so repeat it
    # forever; iterators and generators pass through iter() unchanged.
    try:
        sizes = iter(size)
    except TypeError:
        sizes = repeat(size)
    items = iter(items)
    while True:
        batch_size = next(sizes)
        batch = list(islice(items, int(batch_size)))
        if not batch:
            break
        yield batch
class GoldCorpus(object):
    """An annotated corpus, using the JSON file format. Manages
    annotations for tagging, dependency parsing and NER."""
    def __init__(self, train_path, dev_path, gold_preproc=True, limit=None):
        """Create a GoldCorpus.

        train_path (unicode or Path): File or directory of training data.
        dev_path (unicode or Path): File or directory of development data.
        gold_preproc (bool): Currently unused by the constructor.
        limit (int): Maximum amount of data to read, or a falsy value for
            no limit.
        """
        self.train_path = util.ensure_path(train_path)
        self.dev_path = util.ensure_path(dev_path)
        self.limit = limit
        self.train_locs = self.walk_corpus(self.train_path)
        self.dev_locs = self.walk_corpus(self.dev_path)

    @property
    def train_tuples(self):
        """Yield `[raw_text, sentences]` items from the training files.

        The limit is counted in sentences and applies across all files.
        """
        i = 0
        for loc in self.train_locs:
            gold_tuples = read_json_file(loc)
            for item in gold_tuples:
                yield item
                i += len(item[1])
                # Stop the whole generator: a `break` would only leave the
                # current file and keep yielding from the following ones.
                if self.limit and i >= self.limit:
                    return

    @property
    def dev_tuples(self):
        """Yield `[raw_text, sentences]` items from the development files.

        The limit is counted in yielded items and applies across all files.
        """
        i = 0
        for loc in self.dev_locs:
            gold_tuples = read_json_file(loc)
            for item in gold_tuples:
                yield item
                i += 1
                if self.limit and i >= self.limit:
                    return

    def count_train(self):
        """Count the words in the training data.

        RETURNS (int): The number of words.
        """
        n = 0
        i = 0
        for raw_text, paragraph_tuples in self.train_tuples:
            # Each sentence is `[[ids, words, ...], brackets]`.
            n += sum(len(s[0][1]) for s in paragraph_tuples)
            if self.limit and i >= self.limit:
                break
            i += len(paragraph_tuples)
        return n

    def train_docs(self, nlp, gold_preproc=False,
                   projectivize=False, max_length=None,
                   noise_level=0.0):
        """Yield shuffled `(doc, gold)` pairs for the training data.

        nlp: Object providing `make_doc()` and `vocab`.
        gold_preproc (bool): Use the gold-standard sentences and tokens
            instead of the raw text.
        projectivize (bool): Preprocess the data with
            `nonproj.preprocess_training_data`.
        max_length (int): Skip docs of this length or longer; a falsy
            value disables the check.
        noise_level (float): Passed to `add_noise` when creating the docs.
        YIELDS (tuple): `(doc, gold)` pairs.
        """
        train_tuples = self.train_tuples
        if projectivize:
            train_tuples = nonproj.preprocess_training_data(train_tuples)
        # random.shuffle() needs a sequence; `self.train_tuples` itself is a
        # generator, so materialize before shuffling.
        train_tuples = list(train_tuples)
        random.shuffle(train_tuples)
        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                        max_length=max_length,
                                        noise_level=noise_level)
        yield from gold_docs

    def dev_docs(self, nlp, gold_preproc=False):
        """Yield `(doc, gold)` pairs for the development data.

        nlp: Object providing `make_doc()` and `vocab`.
        gold_preproc (bool): Use the gold-standard sentences and tokens
            instead of the raw text.
        YIELDS (tuple): `(doc, gold)` pairs.
        """
        gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
        yield from gold_docs

    @classmethod
    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
                       noise_level=0.0):
        """Turn `[raw_text, sentences]` items into `(doc, gold)` pairs.

        With `gold_preproc`, one doc is made per gold sentence; otherwise
        the sentences are merged and one doc is made from the raw text.
        """
        for raw_text, paragraph_tuples in tuples:
            if gold_preproc:
                raw_text = None
            else:
                paragraph_tuples = merge_sents(paragraph_tuples)
            docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
                                  gold_preproc, noise_level=noise_level)
            golds = cls._make_golds(docs, paragraph_tuples)
            for doc, gold in zip(docs, golds):
                if (not max_length) or len(doc) < max_length:
                    yield doc, gold

    @classmethod
    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
                   noise_level=0.0):
        """Create the docs for one item: a single doc from the (noised) raw
        text if there is one, else one doc per sentence from its words."""
        if raw_text is not None:
            raw_text = add_noise(raw_text, noise_level)
            return [nlp.make_doc(raw_text)]
        else:
            return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
                    for (sent_tuples, brackets) in paragraph_tuples]

    @classmethod
    def _make_golds(cls, docs, paragraph_tuples):
        """Create one GoldParse per doc from the matching sentence tuples."""
        assert len(docs) == len(paragraph_tuples)
        return [GoldParse.from_annot_tuples(doc, sent[0])
                for doc, sent in zip(docs, paragraph_tuples)]

    @staticmethod
    def walk_corpus(path):
        """Collect the corpus files below a path.

        path (Path): A file or directory, or None.
        RETURNS (list): `[path]` if it isn't a directory, otherwise all
            `.json` files found below it, skipping files and directories
            whose name starts with a dot. An empty list for None.
        """
        # The development data is optional, in which case no path is set.
        if path is None:
            return []
        if not path.is_dir():
            return [path]
        # Worklist: directories append their children while we iterate.
        pending = [path]
        locs = []
        seen = set()
        for current in pending:
            if str(current) in seen:
                continue
            seen.add(str(current))
            if current.parts[-1].startswith('.'):
                continue
            elif current.is_dir():
                pending.extend(current.iterdir())
            elif current.parts[-1].endswith('.json'):
                locs.append(current)
        return locs
def add_noise(orig, noise_level):
    """Randomly corrupt a text or a list of words.

    With probability `1 - noise_level` the input is returned untouched.
    Otherwise each character (for text) or word (for a list) is passed
    through `_corrupt`, and words that end up empty are dropped.

    orig (unicode or list): The text or the words to corrupt.
    noise_level (float): Probability used for both decisions.
    RETURNS (unicode or list): The possibly corrupted input.
    """
    if random.random() >= noise_level:
        return orig
    if type(orig) != list:
        return ''.join(_corrupt(c, noise_level) for c in orig)
    noisy_words = (_corrupt(word, noise_level) for word in orig)
    return [word for word in noisy_words if word]
def _corrupt(c, noise_level):
if random.random() >= noise_level:
return c
elif c == ' ':
return '\n'
elif c == '\n':
return ' '
elif c in ['.', "'", "!", "?"]:
return ''
else:
return c.lower()
def read_json_file(loc, docs_filter=None, limit=None):
loc = ensure_path(loc)
if loc.is_dir():
for filename in loc.iterdir():
@ -173,16 +332,14 @@ def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
if labels[-1].lower() == 'root':
labels[-1] = 'ROOT'
ner.append(token.get('ner', '-'))
if make_supertags:
tags[-1] = '-'.join((tags[-1], labels[-1], ner[-1]))
sents.append([
[ids, words, tags, heads, labels, ner],
sent.get('brackets', [])])
sent.get('brackets', [])])
if sents:
yield [paragraph.get('raw', None), sents]
def _iob_to_biluo(tags):
def iob_to_biluo(tags):
out = []
curr_label = None
tags = list(tags)
@ -225,25 +382,17 @@ cdef class GoldParse:
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
deps=None, entities=None, make_projective=False):
"""
Create a GoldParse.
"""Create a GoldParse.
Arguments:
doc (Doc):
The document the annotations refer to.
words:
A sequence of unicode word strings.
tags:
A sequence of strings, representing tag annotations.
heads:
A sequence of integers, representing syntactic head offsets.
deps:
A sequence of strings, representing the syntactic relation types.
entities:
A sequence of named entity annotations, either as BILUO tag strings,
or as (start_char, end_char, label) tuples, representing the entity
positions.
Returns (GoldParse): The newly constructed object.
doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings.
tags (iterable): A sequence of strings, representing tag annotations.
heads (iterable): A sequence of integers, representing syntactic head offsets.
deps (iterable): A sequence of strings, representing the syntactic relation types.
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
RETURNS (GoldParse): The newly constructed object.
"""
if words is None:
words = [token.text for token in doc]
@ -268,7 +417,8 @@ cdef class GoldParse:
# These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.words = [None] * len(doc)
@ -295,7 +445,10 @@ cdef class GoldParse:
else:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
self.heads[i] = self.gold_to_cand[heads[gold_i]]
if heads[gold_i] is None:
self.heads[i] = None
else:
self.heads[i] = self.gold_to_cand[heads[gold_i]]
self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i]
@ -304,59 +457,49 @@ cdef class GoldParse:
raise Exception("Cycle found: %s" % cycle)
if make_projective:
proj_heads,_ = nonproj.PseudoProjectivity.projectivize(self.heads, self.labels)
proj_heads,_ = nonproj.projectivize(self.heads, self.labels)
self.heads = proj_heads
def __len__(self):
"""
Get the number of gold-standard tokens.
"""Get the number of gold-standard tokens.
Returns (int): The number of gold-standard tokens.
RETURNS (int): The number of gold-standard tokens.
"""
return self.length
@property
def is_projective(self):
"""
Whether the provided syntactic annotations form a projective dependency
tree.
"""Whether the provided syntactic annotations form a projective
dependency tree.
"""
return not nonproj.is_nonproj_tree(self.heads)
def biluo_tags_from_offsets(doc, entities):
"""
Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
scheme (biluo).
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
scheme (BILUO).
Arguments:
doc (Doc):
The document that the entity offsets refer to. The output tags will
refer to the token boundaries within the document.
doc (Doc): The document that the entity offsets refer to. The output tags
will refer to the token boundaries within the document.
entities (iterable): A sequence of `(start, end, label)` triples. `start` and
`end` should be character-offset integers denoting the slice into the
original string.
entities (sequence):
A sequence of (start, end, label) triples. start and end should be
character-offset integers denoting the slice into the original string.
RETURNS (list): A list of unicode strings, describing the tags. Each tag
string will be of the form either "", "O" or "{action}-{label}", where
action is one of "B", "I", "L", "U". The string "-" is used where the
entity offsets don't align with the tokenization in the `Doc` object. The
training algorithm will view these as missing values. "O" denotes a
non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
Returns:
tags (list):
A list of unicode strings, describing the tags. Each tag string will
be of the form either "", "O" or "{action}-{label}", where action is one
of "B", "I", "L", "U". The string "-" is used where the entity
offsets don't align with the tokenization in the Doc object. The
training algorithm will view these as missing values. "O" denotes
a non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
Example:
text = 'I like London.'
entities = [(len('I like '), len('I like London'), 'LOC')]
doc = nlp.tokenizer(text)
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'U-LOC', 'O']
EXAMPLE:
>>> text = 'I like London.'
>>> entities = [(len('I like '), len('I like London'), 'LOC')]
>>> doc = nlp.tokenizer(text)
>>> tags = biluo_tags_from_offsets(doc, entities)
>>> assert tags == ['O', 'O', 'U-LOC', 'O']
"""
starts = {token.idx: token.i for token in doc}
ends = {token.idx+len(token): token.i for token in doc}

View File

@ -13,21 +13,23 @@ from ...attrs import LANG
from ...util import update_exc
class BengaliDefaults(Language.Defaults):
    """Language data and tokenizer settings for Bengali ('bn').

    Overrides selected attributes of `Language.Defaults` with the
    Bengali-specific tables imported by this module.
    """
    # Work on a fresh dict so the getters on Language.Defaults are not mutated.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    # The LANG attribute is constant: every lexeme is reported as 'bn'.
    lex_attr_getters[LANG] = lambda text: 'bn'

    # Built from the shared base exceptions plus the Bengali-specific ones.
    # NOTE(review): presumably later arguments take precedence in update_exc
    # -- confirm against its implementation.
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    # These three tables are bound by reference (no copy is made here).
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    lemma_rules = LEMMA_RULES

    # Punctuation rules are stored as tuples built from the imported sequences.
    prefixes = tuple(TOKENIZER_PREFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    infixes = tuple(TOKENIZER_INFIXES)
class Bengali(Language):
lang = 'bn'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'bn'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP
stop_words = STOP_WORDS
lemma_rules = LEMMA_RULES
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
Defaults = BengaliDefaults
__all__ = ['Bengali']

View File

@ -1,8 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
_currency = r"\$|¢|£|€|¥|฿|৳"
@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
_list_punct = LIST_PUNCT + '। ॥'.strip().split()
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
[r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(_currency),
r'(?<=[0-9])(?:{})'.format(UNITS),
r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
_infixes = (LIST_ELLIPSES +
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),

View File

@ -20,7 +20,6 @@ _upper = [_latin_upper]
_lower = [_latin_lower]
_uncased = [_bengali, _hebrew]
ALPHA = merge_char_classes(_upper + _lower + _uncased)
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'
_hyphens = '- — -- ---'
_other_symbols = r'[\p{So}]'
UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency)
QUOTES = merge_chars(_quotes)
PUNCT = merge_chars(_punct)
HYPHENS = merge_chars(_hyphens)
ICONS = _other_symbols
LIST_UNITS = split_chars(_units)
LIST_CURRENCY = split_chars(_currency)
@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
LIST_PUNCT = split_chars(_punct)
LIST_HYPHENS = split_chars(_hyphens)
LIST_ELLIPSES = [r'\.\.+', '']
LIST_ICONS = [_other_symbols]

View File

@ -5,20 +5,24 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class DanishDefaults(Language.Defaults):
    """Language data and tokenizer settings for Danish ('da').

    Overrides selected attributes of `Language.Defaults` with the
    Danish-specific tables imported by this module.
    """
    # Work on a fresh dict so the getters on Language.Defaults are not mutated.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    # The LANG attribute is constant: every lexeme is reported as 'da'.
    lex_attr_getters[LANG] = lambda text: 'da'
    # Wrap the default NORM getter with the shared BASE_NORMS table.
    # NOTE(review): presumably add_lookups consults the table first and falls
    # back to the wrapped getter -- confirm against its implementation.
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    # Built from the shared base exceptions plus the Danish-specific ones.
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    # Copied into a new set, so the class does not share STOP_WORDS itself.
    stop_words = set(STOP_WORDS)
class Danish(Language):
lang = 'da'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'da'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = DanishDefaults
__all__ = ['Danish']

View File

@ -2,33 +2,39 @@
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class GermanDefaults(Language.Defaults):
    """Language data, tokenizer settings and lemmatizer factory for German ('de').

    Overrides selected attributes of `Language.Defaults` with the
    German-specific tables imported by this module.
    """
    # Work on a fresh dict so the getters on Language.Defaults are not mutated.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    # The LANG attribute is constant: every lexeme is reported as 'de'.
    lex_attr_getters[LANG] = lambda text: 'de'
    # Wrap the default NORM getter with the German NORM_EXCEPTIONS and the
    # shared BASE_NORMS tables (passed in that order).
    # NOTE(review): presumably earlier tables win on conflicts -- confirm
    # against add_lookups.
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                         NORM_EXCEPTIONS, BASE_NORMS)

    # Built from the shared base exceptions plus the German-specific ones.
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    # Copies, so the class does not share the imported module-level objects.
    tag_map = dict(TAG_MAP)
    stop_words = set(STOP_WORDS)
    syntax_iterators = dict(SYNTAX_ITERATORS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        """Create the lemmatizer used for German.

        nlp: Accepted but not used by this implementation.
        RETURNS (Lemmatizer): A `Lemmatizer` constructed from the `LOOKUP` table.
        """
        return Lemmatizer(LOOKUP)
class German(Language):
lang = 'de'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'de'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = GermanDefaults
__all__ = ['German']

View File

@ -0,0 +1,17 @@
# coding: utf8
from __future__ import unicode_literals
# Here we only want to include the absolute most common words. Otherwise,
# this list would get impossibly long for German, especially considering the
# old vs. new spelling rules, and all possible cases.

# Maps an old or variant spelling (written in lowercase) to its norm.
_exc = {
    "daß": "dass"
}


# Lookup table exported to the language defaults. Each entry is registered
# under its original lowercase spelling *and* its title-cased spelling, so
# that both "daß" and sentence-initial "Daß" are normalised. Registering
# only the title-cased key would leave the far more common lowercase form
# without a norm.
NORM_EXCEPTIONS = {}

for string, norm in _exc.items():
    NORM_EXCEPTIONS[string] = norm
    NORM_EXCEPTIONS[string.title()] = norm

View File

@ -15,9 +15,9 @@ def noun_chunks(obj):
# and not just "eine Tasse", same for "das Thema Familie".
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
doc = obj.doc # Ensure works on both Doc and Span.
np_label = doc.vocab.strings['NP']
np_deps = set(doc.vocab.strings[label] for label in labels)
close_app = doc.vocab.strings['nk']
np_label = doc.vocab.strings.add('NP')
np_deps = set(doc.vocab.strings.add(label) for label in labels)
close_app = doc.vocab.strings.add('nk')
rbracket = 0
for i, word in enumerate(obj):

View File

@ -8,7 +8,7 @@ from ...deprecated import PRON_LEMMA
_exc = {
"auf'm": [
{ORTH: "auf", LEMMA: "auf"},
{ORTH: "'m", LEMMA: "der", NORM: "dem" }],
{ORTH: "'m", LEMMA: "der", NORM: "dem"}],
"du's": [
{ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"},
@ -53,97 +53,97 @@ _exc = {
for exc_data in [
{ORTH: "'S", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "S'", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "s'", LEMMA: PRON_LEMMA, TAG: "PPER"},
{ORTH: "'S", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "S'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "s'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "'n", LEMMA: "ein", NORM: "ein"},
{ORTH: "'ne", LEMMA: "eine", NORM: "eine"},
{ORTH: "'nen", LEMMA: "ein", NORM: "einen"},
{ORTH: "'nem", LEMMA: "ein", NORM: "einem"},
{ORTH: "Abb.", LEMMA: "Abbildung"},
{ORTH: "Abk.", LEMMA: "Abkürzung"},
{ORTH: "Abt.", LEMMA: "Abteilung"},
{ORTH: "Apr.", LEMMA: "April"},
{ORTH: "Aug.", LEMMA: "August"},
{ORTH: "Bd.", LEMMA: "Band"},
{ORTH: "Betr.", LEMMA: "Betreff"},
{ORTH: "Bf.", LEMMA: "Bahnhof"},
{ORTH: "Bhf.", LEMMA: "Bahnhof"},
{ORTH: "Bsp.", LEMMA: "Beispiel"},
{ORTH: "Dez.", LEMMA: "Dezember"},
{ORTH: "Di.", LEMMA: "Dienstag"},
{ORTH: "Do.", LEMMA: "Donnerstag"},
{ORTH: "Fa.", LEMMA: "Firma"},
{ORTH: "Fam.", LEMMA: "Familie"},
{ORTH: "Feb.", LEMMA: "Februar"},
{ORTH: "Fr.", LEMMA: "Frau"},
{ORTH: "Frl.", LEMMA: "Fräulein"},
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof"},
{ORTH: "Hr.", LEMMA: "Herr"},
{ORTH: "Hrn.", LEMMA: "Herr"},
{ORTH: "Jan.", LEMMA: "Januar"},
{ORTH: "Jh.", LEMMA: "Jahrhundert"},
{ORTH: "Jhd.", LEMMA: "Jahrhundert"},
{ORTH: "Jul.", LEMMA: "Juli"},
{ORTH: "Jun.", LEMMA: "Juni"},
{ORTH: "Mi.", LEMMA: "Mittwoch"},
{ORTH: "Mio.", LEMMA: "Million"},
{ORTH: "Mo.", LEMMA: "Montag"},
{ORTH: "Mrd.", LEMMA: "Milliarde"},
{ORTH: "Mrz.", LEMMA: "März"},
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"},
{ORTH: "Mär.", LEMMA: "März"},
{ORTH: "Nov.", LEMMA: "November"},
{ORTH: "Nr.", LEMMA: "Nummer"},
{ORTH: "Okt.", LEMMA: "Oktober"},
{ORTH: "Orig.", LEMMA: "Original"},
{ORTH: "Pkt.", LEMMA: "Punkt"},
{ORTH: "Prof.", LEMMA: "Professor"},
{ORTH: "Red.", LEMMA: "Redaktion"},
{ORTH: "Sa.", LEMMA: "Samstag"},
{ORTH: "Sep.", LEMMA: "September"},
{ORTH: "Sept.", LEMMA: "September"},
{ORTH: "So.", LEMMA: "Sonntag"},
{ORTH: "Std.", LEMMA: "Stunde"},
{ORTH: "Str.", LEMMA: "Straße"},
{ORTH: "Tel.", LEMMA: "Telefon"},
{ORTH: "Tsd.", LEMMA: "Tausend"},
{ORTH: "Univ.", LEMMA: "Universität"},
{ORTH: "abzgl.", LEMMA: "abzüglich"},
{ORTH: "allg.", LEMMA: "allgemein"},
{ORTH: "bspw.", LEMMA: "beispielsweise"},
{ORTH: "bzgl.", LEMMA: "bezüglich"},
{ORTH: "bzw.", LEMMA: "beziehungsweise"},
{ORTH: "Abb.", LEMMA: "Abbildung", NORM: "Abbildung"},
{ORTH: "Abk.", LEMMA: "Abkürzung", NORM: "Abkürzung"},
{ORTH: "Abt.", LEMMA: "Abteilung", NORM: "Abteilung"},
{ORTH: "Apr.", LEMMA: "April", NORM: "April"},
{ORTH: "Aug.", LEMMA: "August", NORM: "August"},
{ORTH: "Bd.", LEMMA: "Band", NORM: "Band"},
{ORTH: "Betr.", LEMMA: "Betreff", NORM: "Betreff"},
{ORTH: "Bf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
{ORTH: "Bhf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
{ORTH: "Bsp.", LEMMA: "Beispiel", NORM: "Beispiel"},
{ORTH: "Dez.", LEMMA: "Dezember", NORM: "Dezember"},
{ORTH: "Di.", LEMMA: "Dienstag", NORM: "Dienstag"},
{ORTH: "Do.", LEMMA: "Donnerstag", NORM: "Donnerstag"},
{ORTH: "Fa.", LEMMA: "Firma", NORM: "Firma"},
{ORTH: "Fam.", LEMMA: "Familie", NORM: "Familie"},
{ORTH: "Feb.", LEMMA: "Februar", NORM: "Februar"},
{ORTH: "Fr.", LEMMA: "Frau", NORM: "Frau"},
{ORTH: "Frl.", LEMMA: "Fräulein", NORM: "Fräulein"},
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof", NORM: "Hauptbahnhof"},
{ORTH: "Hr.", LEMMA: "Herr", NORM: "Herr"},
{ORTH: "Hrn.", LEMMA: "Herr", NORM: "Herrn"},
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
{ORTH: "Jh.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
{ORTH: "Jhd.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
{ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"},
{ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"},
{ORTH: "Mi.", LEMMA: "Mittwoch", NORM: "Mittwoch"},
{ORTH: "Mio.", LEMMA: "Million", NORM: "Million"},
{ORTH: "Mo.", LEMMA: "Montag", NORM: "Montag"},
{ORTH: "Mrd.", LEMMA: "Milliarde", NORM: "Milliarde"},
{ORTH: "Mrz.", LEMMA: "März", NORM: "März"},
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer", NORM: "Mehrwertsteuer"},
{ORTH: "Mär.", LEMMA: "März", NORM: "März"},
{ORTH: "Nov.", LEMMA: "November", NORM: "November"},
{ORTH: "Nr.", LEMMA: "Nummer", NORM: "Nummer"},
{ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"},
{ORTH: "Orig.", LEMMA: "Original", NORM: "Original"},
{ORTH: "Pkt.", LEMMA: "Punkt", NORM: "Punkt"},
{ORTH: "Prof.", LEMMA: "Professor", NORM: "Professor"},
{ORTH: "Red.", LEMMA: "Redaktion", NORM: "Redaktion"},
{ORTH: "Sa.", LEMMA: "Samstag", NORM: "Samstag"},
{ORTH: "Sep.", LEMMA: "September", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September", NORM: "September"},
{ORTH: "So.", LEMMA: "Sonntag", NORM: "Sonntag"},
{ORTH: "Std.", LEMMA: "Stunde", NORM: "Stunde"},
{ORTH: "Str.", LEMMA: "Straße", NORM: "Straße"},
{ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
{ORTH: "Tsd.", LEMMA: "Tausend", NORM: "Tausend"},
{ORTH: "Univ.", LEMMA: "Universität", NORM: "Universität"},
{ORTH: "abzgl.", LEMMA: "abzüglich", NORM: "abzüglich"},
{ORTH: "allg.", LEMMA: "allgemein", NORM: "allgemein"},
{ORTH: "bspw.", LEMMA: "beispielsweise", NORM: "beispielsweise"},
{ORTH: "bzgl.", LEMMA: "bezüglich", NORM: "bezüglich"},
{ORTH: "bzw.", LEMMA: "beziehungsweise", NORM: "beziehungsweise"},
{ORTH: "d.h.", LEMMA: "das heißt"},
{ORTH: "dgl.", LEMMA: "dergleichen"},
{ORTH: "ebd.", LEMMA: "ebenda"},
{ORTH: "eigtl.", LEMMA: "eigentlich"},
{ORTH: "engl.", LEMMA: "englisch"},
{ORTH: "evtl.", LEMMA: "eventuell"},
{ORTH: "frz.", LEMMA: "französisch"},
{ORTH: "gegr.", LEMMA: "gegründet"},
{ORTH: "ggf.", LEMMA: "gegebenenfalls"},
{ORTH: "ggfs.", LEMMA: "gegebenenfalls"},
{ORTH: "ggü.", LEMMA: "gegenüber"},
{ORTH: "dgl.", LEMMA: "dergleichen", NORM: "dergleichen"},
{ORTH: "ebd.", LEMMA: "ebenda", NORM: "ebenda"},
{ORTH: "eigtl.", LEMMA: "eigentlich", NORM: "eigentlich"},
{ORTH: "engl.", LEMMA: "englisch", NORM: "englisch"},
{ORTH: "evtl.", LEMMA: "eventuell", NORM: "eventuell"},
{ORTH: "frz.", LEMMA: "französisch", NORM: "französisch"},
{ORTH: "gegr.", LEMMA: "gegründet", NORM: "gegründet"},
{ORTH: "ggf.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
{ORTH: "ggfs.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
{ORTH: "ggü.", LEMMA: "gegenüber", NORM: "gegenüber"},
{ORTH: "i.O.", LEMMA: "in Ordnung"},
{ORTH: "i.d.R.", LEMMA: "in der Regel"},
{ORTH: "incl.", LEMMA: "inklusive"},
{ORTH: "inkl.", LEMMA: "inklusive"},
{ORTH: "insb.", LEMMA: "insbesondere"},
{ORTH: "kath.", LEMMA: "katholisch"},
{ORTH: "lt.", LEMMA: "laut"},
{ORTH: "max.", LEMMA: "maximal"},
{ORTH: "min.", LEMMA: "minimal"},
{ORTH: "mind.", LEMMA: "mindestens"},
{ORTH: "mtl.", LEMMA: "monatlich"},
{ORTH: "incl.", LEMMA: "inklusive", NORM: "inklusive"},
{ORTH: "inkl.", LEMMA: "inklusive", NORM: "inklusive"},
{ORTH: "insb.", LEMMA: "insbesondere", NORM: "insbesondere"},
{ORTH: "kath.", LEMMA: "katholisch", NORM: "katholisch"},
{ORTH: "lt.", LEMMA: "laut", NORM: "laut"},
{ORTH: "max.", LEMMA: "maximal", NORM: "maximal"},
{ORTH: "min.", LEMMA: "minimal", NORM: "minimal"},
{ORTH: "mind.", LEMMA: "mindestens", NORM: "mindestens"},
{ORTH: "mtl.", LEMMA: "monatlich", NORM: "monatlich"},
{ORTH: "n.Chr.", LEMMA: "nach Christus"},
{ORTH: "orig.", LEMMA: "original"},
{ORTH: "röm.", LEMMA: "römisch"},
{ORTH: "orig.", LEMMA: "original", NORM: "original"},
{ORTH: "röm.", LEMMA: "römisch", NORM: "römisch"},
{ORTH: "s.o.", LEMMA: "siehe oben"},
{ORTH: "sog.", LEMMA: "so genannt"},
{ORTH: "stellv.", LEMMA: "stellvertretend"},
{ORTH: "tägl.", LEMMA: "täglich"},
{ORTH: "tägl.", LEMMA: "täglich", NORM: "täglich"},
{ORTH: "u.U.", LEMMA: "unter Umständen"},
{ORTH: "u.s.w.", LEMMA: "und so weiter"},
{ORTH: "u.v.m.", LEMMA: "und vieles mehr"},
@ -153,9 +153,9 @@ for exc_data in [
{ORTH: "v.Chr.", LEMMA: "vor Christus"},
{ORTH: "v.a.", LEMMA: "vor allem"},
{ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"},
{ORTH: "vgl.", LEMMA: "vergleiche"},
{ORTH: "vllt.", LEMMA: "vielleicht"},
{ORTH: "vlt.", LEMMA: "vielleicht"},
{ORTH: "vgl.", LEMMA: "vergleiche", NORM: "vergleiche"},
{ORTH: "vllt.", LEMMA: "vielleicht", NORM: "vielleicht"},
{ORTH: "vlt.", LEMMA: "vielleicht", NORM: "vielleicht"},
{ORTH: "z.B.", LEMMA: "zum Beispiel"},
{ORTH: "z.Bsp.", LEMMA: "zum Beispiel"},
{ORTH: "z.T.", LEMMA: "zum Teil"},
@ -163,7 +163,7 @@ for exc_data in [
{ORTH: "z.Zt.", LEMMA: "zur Zeit"},
{ORTH: "z.b.", LEMMA: "zum Beispiel"},
{ORTH: "zzgl.", LEMMA: "zuzüglich"},
{ORTH: "österr.", LEMMA: "österreichisch"}]:
{ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
@ -10,27 +11,32 @@ from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class EnglishDefaults(Language.Defaults):
    """Language data and tokenizer settings for English ('en').

    Overrides selected attributes of `Language.Defaults` with the
    English-specific tables imported by this module.
    """
    # Work on a fresh dict so the getters on Language.Defaults are not mutated.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    # Add the English-specific lexical attribute getters; the LANG and NORM
    # entries assigned below are set afterwards and therefore replace any
    # same-keyed entries coming from LEX_ATTRS.
    lex_attr_getters.update(LEX_ATTRS)
    # The LANG attribute is constant: every lexeme is reported as 'en'.
    lex_attr_getters[LANG] = lambda text: 'en'
    # Wrap the default NORM getter with the shared BASE_NORMS and the English
    # NORM_EXCEPTIONS tables (passed in that order).
    # NOTE(review): the table order here is (BASE_NORMS, NORM_EXCEPTIONS);
    # confirm which table add_lookups gives precedence to on conflicts.
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                         BASE_NORMS, NORM_EXCEPTIONS)

    # Built from the shared base exceptions plus the English-specific ones.
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    # Copies, so the class does not share the imported module-level objects.
    tag_map = dict(TAG_MAP)
    stop_words = set(STOP_WORDS)
    morph_rules = dict(MORPH_RULES)
    lemma_rules = dict(LEMMA_RULES)
    lemma_index = dict(LEMMA_INDEX)
    lemma_exc = dict(LEMMA_EXC)
    syntax_iterators = dict(SYNTAX_ITERATORS)
class English(Language):
lang = 'en'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'en'
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
morph_rules = dict(MORPH_RULES)
lemma_rules = dict(LEMMA_RULES)
lemma_index = dict(LEMMA_INDEX)
lemma_exc = dict(LEMMA_EXC)
sytax_iterators = dict(SYNTAX_ITERATORS)
Defaults = EnglishDefaults
__all__ = ['English']

File diff suppressed because it is too large Load Diff

View File

@ -11,9 +11,9 @@ def noun_chunks(obj):
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
'attr', 'ROOT']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings['conj']
np_label = doc.vocab.strings['NP']
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):

View File

@ -15,20 +15,20 @@ _exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
for pron in ["i"]:
for orth in [pron, pron.title()]:
_exc[orth + "'m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP", "tenspect": 1, "number": 1}]
_exc[orth + "m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
_exc[orth + "'ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
_exc[orth + "ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
@ -36,72 +36,72 @@ for pron in ["i"]:
for pron in ["i", "you", "he", "she", "it", "we", "they"]:
for orth in [pron, pron.title()]:
_exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "llve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}]
_exc[orth + "d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}]
_exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "dve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for pron in ["i", "you", "we", "they"]:
for orth in [pron, pron.title()]:
_exc[orth + "'ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for pron in ["you", "we", "they"]:
for orth in [pron, pron.title()]:
_exc[orth + "'re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
_exc[orth + "re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
for pron in ["he", "she", "it"]:
for orth in [pron, pron.title()]:
_exc[orth + "'s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: "'s"}]
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'s", NORM: "'s"}]
_exc[orth + "s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "s"}]
@ -110,111 +110,111 @@ for pron in ["he", "she", "it"]:
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
for orth in [word, word.title()]:
_exc[orth + "'s"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'s"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'s", NORM: "'s"}]
_exc[orth + "s"] = [
{ORTH: orth, LEMMA: word},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "s"}]
_exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "ll"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "llve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'re"] = [
{ORTH: orth, LEMMA: word},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
_exc[orth + "re"] = [
{ORTH: orth, LEMMA: word},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "re", LEMMA: "be", NORM: "are"}]
_exc[orth + "'ve"] = [
{ORTH: orth},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
_exc[orth + "ve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'d"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'d"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'d", NORM: "'d"}]
_exc[orth + "d"] = [
{ORTH: orth, LEMMA: word},
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d"}]
_exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "dve"] = [
{ORTH: orth, LEMMA: word},
{ORTH: "d", LEMMA: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
# Verbs
for verb_data in [
{ORTH: "ca", LEMMA: "can", TAG: "MD"},
{ORTH: "could", TAG: "MD"},
{ORTH: "do", LEMMA: "do"},
{ORTH: "does", LEMMA: "do"},
{ORTH: "did", LEMMA: "do", TAG: "VBD"},
{ORTH: "had", LEMMA: "have", TAG: "VBD"},
{ORTH: "may", TAG: "MD"},
{ORTH: "might", TAG: "MD"},
{ORTH: "must", TAG: "MD"},
{ORTH: "need"},
{ORTH: "ought"},
{ORTH: "sha", LEMMA: "shall", TAG: "MD"},
{ORTH: "should", TAG: "MD"},
{ORTH: "wo", LEMMA: "will", TAG: "MD"},
{ORTH: "would", TAG: "MD"}]:
{ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "could", NORM: "could", TAG: "MD"},
{ORTH: "do", LEMMA: "do", NORM: "do"},
{ORTH: "does", LEMMA: "do", NORM: "does"},
{ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"},
{ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"},
{ORTH: "may", NORM: "may", TAG: "MD"},
{ORTH: "might", NORM: "might", TAG: "MD"},
{ORTH: "must", NORM: "must", TAG: "MD"},
{ORTH: "need", NORM: "need"},
{ORTH: "ought", NORM: "ought", TAG: "MD"},
{ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"},
{ORTH: "should", NORM: "should", TAG: "MD"},
{ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "would", NORM: "would", TAG: "MD"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "nt"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "n't've"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[data[ORTH] + "ntve"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for verb_data in [
{ORTH: "could", TAG: "MD"},
{ORTH: "might"},
{ORTH: "must"},
{ORTH: "should"}]:
{ORTH: "could", NORM: "could", TAG: "MD"},
{ORTH: "might", NORM: "might", TAG: "MD"},
{ORTH: "must", NORM: "must", TAG: "MD"},
{ORTH: "should", NORM: "should", TAG: "MD"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
@ -228,21 +228,21 @@ for verb_data in [
for verb_data in [
{ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"},
{ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be"},
{ORTH: "were", LEMMA: "be"}]:
{ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2},
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be", NORM: "was"},
{ORTH: "were", LEMMA: "be", NORM: "were"}]:
verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [
dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "nt"] = [
dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
# Other contractions with trailing apostrophe
@ -250,10 +250,10 @@ for verb_data in [
for exc_data in [
{ORTH: "doin", LEMMA: "do", NORM: "doing"},
{ORTH: "goin", LEMMA: "go", NORM: "going"},
{ORTH: "nothin", LEMMA: "nothing"},
{ORTH: "nuthin", LEMMA: "nothing"},
{ORTH: "ol", LEMMA: "old"},
{ORTH: "somethin", LEMMA: "something"}]:
{ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"},
{ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"},
{ORTH: "ol", LEMMA: "old", NORM: "old"},
{ORTH: "somethin", LEMMA: "something", NORM: "something"}]:
exc_data_tc = dict(exc_data)
exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
for data in [exc_data, exc_data_tc]:
@ -266,10 +266,10 @@ for exc_data in [
# Other contractions with leading apostrophe
for exc_data in [
{ORTH: "cause", LEMMA: "because"},
{ORTH: "cause", LEMMA: "because", NORM: "because"},
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
{ORTH: "ll", LEMMA: "will"},
{ORTH: "nuff", LEMMA: "enough"}]:
{ORTH: "ll", LEMMA: "will", NORM: "will"},
{ORTH: "nuff", LEMMA: "enough", NORM: "enough"}]:
exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
for data in [exc_data, exc_data_apos]:
@ -282,11 +282,11 @@ for h in range(1, 12 + 1):
for period in ["a.m.", "am"]:
_exc["%d%s" % (h, period)] = [
{ORTH: "%d" % h},
{ORTH: period, LEMMA: "a.m."}]
{ORTH: period, LEMMA: "a.m.", NORM: "a.m."}]
for period in ["p.m.", "pm"]:
_exc["%d%s" % (h, period)] = [
{ORTH: "%d" % h},
{ORTH: period, LEMMA: "p.m."}]
{ORTH: period, LEMMA: "p.m.", NORM: "p.m."}]
# Rest
@ -306,56 +306,56 @@ _other_exc = {
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"How'd'y": [
{ORTH: "How", LEMMA: "how"},
{ORTH: "How", LEMMA: "how", NORM: "how"},
{ORTH: "'d", LEMMA: "do"},
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"not've": [
{ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"notve": [
{ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"Not've": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"Notve": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"cannot": [
{ORTH: "can", LEMMA: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
"Cannot": [
{ORTH: "Can", LEMMA: "can", TAG: "MD"},
{ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
"gonna": [
{ORTH: "gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}],
{ORTH: "na", LEMMA: "to", NORM: "to"}],
"Gonna": [
{ORTH: "Gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}],
{ORTH: "na", LEMMA: "to", NORM: "to"}],
"gotta": [
{ORTH: "got"},
{ORTH: "ta", LEMMA: "to"}],
{ORTH: "ta", LEMMA: "to", NORM: "to"}],
"Gotta": [
{ORTH: "Got"},
{ORTH: "ta", LEMMA: "to"}],
{ORTH: "Got", NORM: "got"},
{ORTH: "ta", LEMMA: "to", NORM: "to"}],
"let's": [
{ORTH: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
"Let's": [
{ORTH: "Let", LEMMA: "let"},
{ORTH: "Let", LEMMA: "let", NORM: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
}
@ -363,72 +363,80 @@ _exc.update(_other_exc)
for exc_data in [
{ORTH: "'S", LEMMA: "'s"},
{ORTH: "'s", LEMMA: "'s"},
{ORTH: "\u2018S", LEMMA: "'s"},
{ORTH: "\u2018s", LEMMA: "'s"},
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"},
{ORTH: "'S", LEMMA: "'s", NORM: "'s"},
{ORTH: "'s", LEMMA: "'s", NORM: "'s"},
{ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"},
{ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"},
{ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"},
{ORTH: "w/o", LEMMA: "without", NORM: "without"},
{ORTH: "'re", LEMMA: "be", NORM: "are"},
{ORTH: "'Cause", LEMMA: "because"},
{ORTH: "'cause", LEMMA: "because"},
{ORTH: "ma'am", LEMMA: "madam"},
{ORTH: "Ma'am", LEMMA: "madam"},
{ORTH: "o'clock", LEMMA: "o'clock"},
{ORTH: "O'clock", LEMMA: "o'clock"},
{ORTH: "'Cause", LEMMA: "because", NORM: "because"},
{ORTH: "'cause", LEMMA: "because", NORM: "because"},
{ORTH: "'cos", LEMMA: "because", NORM: "because"},
{ORTH: "'Cos", LEMMA: "because", NORM: "because"},
{ORTH: "'coz", LEMMA: "because", NORM: "because"},
{ORTH: "'Coz", LEMMA: "because", NORM: "because"},
{ORTH: "'cuz", LEMMA: "because", NORM: "because"},
{ORTH: "'Cuz", LEMMA: "because", NORM: "because"},
{ORTH: "'bout", LEMMA: "about", NORM: "about"},
{ORTH: "ma'am", LEMMA: "madam", NORM: "madam"},
{ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"},
{ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"},
{ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
{ORTH: "Mt.", LEMMA: "Mount"},
{ORTH: "Ak.", LEMMA: "Alaska"},
{ORTH: "Ala.", LEMMA: "Alabama"},
{ORTH: "Apr.", LEMMA: "April"},
{ORTH: "Ariz.", LEMMA: "Arizona"},
{ORTH: "Ark.", LEMMA: "Arkansas"},
{ORTH: "Aug.", LEMMA: "August"},
{ORTH: "Calif.", LEMMA: "California"},
{ORTH: "Colo.", LEMMA: "Colorado"},
{ORTH: "Conn.", LEMMA: "Connecticut"},
{ORTH: "Dec.", LEMMA: "December"},
{ORTH: "Del.", LEMMA: "Delaware"},
{ORTH: "Feb.", LEMMA: "February"},
{ORTH: "Fla.", LEMMA: "Florida"},
{ORTH: "Ga.", LEMMA: "Georgia"},
{ORTH: "Ia.", LEMMA: "Iowa"},
{ORTH: "Id.", LEMMA: "Idaho"},
{ORTH: "Ill.", LEMMA: "Illinois"},
{ORTH: "Ind.", LEMMA: "Indiana"},
{ORTH: "Jan.", LEMMA: "January"},
{ORTH: "Jul.", LEMMA: "July"},
{ORTH: "Jun.", LEMMA: "June"},
{ORTH: "Kan.", LEMMA: "Kansas"},
{ORTH: "Kans.", LEMMA: "Kansas"},
{ORTH: "Ky.", LEMMA: "Kentucky"},
{ORTH: "La.", LEMMA: "Louisiana"},
{ORTH: "Mar.", LEMMA: "March"},
{ORTH: "Mass.", LEMMA: "Massachusetts"},
{ORTH: "May.", LEMMA: "May"},
{ORTH: "Mich.", LEMMA: "Michigan"},
{ORTH: "Minn.", LEMMA: "Minnesota"},
{ORTH: "Miss.", LEMMA: "Mississippi"},
{ORTH: "N.C.", LEMMA: "North Carolina"},
{ORTH: "N.D.", LEMMA: "North Dakota"},
{ORTH: "N.H.", LEMMA: "New Hampshire"},
{ORTH: "N.J.", LEMMA: "New Jersey"},
{ORTH: "N.M.", LEMMA: "New Mexico"},
{ORTH: "N.Y.", LEMMA: "New York"},
{ORTH: "Neb.", LEMMA: "Nebraska"},
{ORTH: "Nebr.", LEMMA: "Nebraska"},
{ORTH: "Nev.", LEMMA: "Nevada"},
{ORTH: "Nov.", LEMMA: "November"},
{ORTH: "Oct.", LEMMA: "October"},
{ORTH: "Okla.", LEMMA: "Oklahoma"},
{ORTH: "Ore.", LEMMA: "Oregon"},
{ORTH: "Pa.", LEMMA: "Pennsylvania"},
{ORTH: "S.C.", LEMMA: "South Carolina"},
{ORTH: "Sep.", LEMMA: "September"},
{ORTH: "Sept.", LEMMA: "September"},
{ORTH: "Tenn.", LEMMA: "Tennessee"},
{ORTH: "Va.", LEMMA: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin"}]:
{ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
{ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
{ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"},
{ORTH: "Apr.", LEMMA: "April", NORM: "April"},
{ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"},
{ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"},
{ORTH: "Aug.", LEMMA: "August", NORM: "August"},
{ORTH: "Calif.", LEMMA: "California", NORM: "California"},
{ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"},
{ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"},
{ORTH: "Dec.", LEMMA: "December", NORM: "December"},
{ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"},
{ORTH: "Feb.", LEMMA: "February", NORM: "February"},
{ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"},
{ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"},
{ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"},
{ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"},
{ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"},
{ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"},
{ORTH: "Jan.", LEMMA: "January", NORM: "January"},
{ORTH: "Jul.", LEMMA: "July", NORM: "July"},
{ORTH: "Jun.", LEMMA: "June", NORM: "June"},
{ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"},
{ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"},
{ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"},
{ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"},
{ORTH: "Mar.", LEMMA: "March", NORM: "March"},
{ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"},
{ORTH: "May.", LEMMA: "May", NORM: "May"},
{ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"},
{ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"},
{ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"},
{ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"},
{ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"},
{ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"},
{ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"},
{ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"},
{ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"},
{ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"},
{ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"},
{ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"},
{ORTH: "Nov.", LEMMA: "November", NORM: "November"},
{ORTH: "Oct.", LEMMA: "October", NORM: "October"},
{ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"},
{ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"},
{ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"},
{ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"},
{ORTH: "Sep.", LEMMA: "September", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September", NORM: "September"},
{ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"},
{ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)]

View File

@ -5,21 +5,25 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class SpanishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'es'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
syntax_iterators = dict(SYNTAX_ITERATORS)  # was misspelled "sytax_iterators", so the base class's empty dict stayed in effect
@classmethod
def create_lemmatizer(cls, nlp=None):
@ -28,7 +32,7 @@ class SpanishDefaults(Language.Defaults):
class Spanish(Language):
lang = 'es'
Defaults = SpanishDefaults
__all__ = ['Spanish']

View File

@ -0,0 +1,55 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
def noun_chunks(obj):
    """Detect base noun phrases from a dependency parse.

    obj (Doc or Span): Object exposing a `.doc` attribute. Iteration always
        starts at the first token of `obj.doc`.
    YIELDS (tuple): `(start, end, label)` where `start`/`end` are token
        indices (end exclusive) and `label` is the string-store ID of 'NP'.
    """
    doc = obj.doc
    if not len(doc):
        # Nothing to chunk; indexing doc[0] below would raise on an empty doc.
        return
    np_label = doc.vocab.strings['NP']
    left_labels = ['det', 'fixed', 'neg']  # ['nunmod', 'det', 'appos', 'fixed']
    right_labels = ['flat', 'fixed', 'compound', 'neg']
    stop_labels = ['punct']
    np_left_deps = [doc.vocab.strings[label] for label in left_labels]
    np_right_deps = [doc.vocab.strings[label] for label in right_labels]
    stop_deps = [doc.vocab.strings[label] for label in stop_labels]
    token = doc[0]
    while token and token.i < len(doc):
        if token.pos in [PROPN, NOUN, PRON]:
            # The dependency ID lists are locals of this function, so they
            # are passed explicitly to the module-level helper.
            left, right = noun_bounds(token, np_left_deps, np_right_deps,
                                      stop_deps)
            yield left.i, right.i+1, np_label
            # Continue scanning after the chunk that was just emitted.
            token = right
        token = next_token(token)


def is_verb_token(token):
    """Return True if the token's coarse part-of-speech is VERB or AUX."""
    return token.pos in [VERB, AUX]


def next_token(token):
    """Return the token following `token`, or None at the end of the doc."""
    try:
        return token.nbor()
    except IndexError:
        # Only the out-of-range lookup at the end of the document is treated
        # as "no next token"; other errors are no longer silently swallowed.
        return None


def noun_bounds(root, np_left_deps=(), np_right_deps=(), stop_deps=()):
    """Find the leftmost and rightmost tokens of the noun phrase around `root`.

    root (Token): Head token of the candidate noun phrase.
    np_left_deps (iterable): Dependency IDs allowed to extend the phrase left.
    np_right_deps (iterable): Dependency IDs allowed to extend the phrase right.
    stop_deps (iterable): Dependency IDs that block a rightward extension.
    RETURNS (tuple): `(left_bound, right_bound)` tokens.
    """
    # NOTE(review): relies on Token.doc to reach the parent document; the
    # previous code read an undefined global `doc` here.
    doc = root.doc
    left_bound = root
    for token in reversed(list(root.lefts)):
        if token.dep in np_left_deps:
            left_bound = token
    right_bound = root
    for token in root.rights:
        if token.dep in np_right_deps:
            left, right = noun_bounds(token, np_left_deps, np_right_deps,
                                      stop_deps)
            # Stop extending as soon as the candidate span would contain a
            # verb or a token with a stop dependency (e.g. punctuation).
            if any(is_verb_token(t) or t.dep in stop_deps
                   for t in doc[left_bound.i: right.i]):
                break
            right_bound = right
    return left_bound, right_bound
SYNTAX_ITERATORS = {
'noun_chunks': noun_chunks
}

View File

@ -6,37 +6,13 @@ from ...deprecated import PRON_LEMMA
_exc = {
"al": [
{ORTH: "a", LEMMA: "a", TAG: ADP},
{ORTH: "l", LEMMA: "el", TAG: DET}],
"consigo": [
{ORTH: "con", LEMMA: "con"},
{ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: ""}],
"conmigo": [
{ORTH: "con", LEMMA: "con"},
{ORTH: "migo", LEMMA: PRON_LEMMA, NORM: ""}],
"contigo": [
{ORTH: "con", LEMMA: "con"},
{ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}],
"del": [
{ORTH: "de", LEMMA: "de", TAG: ADP},
{ORTH: "l", LEMMA: "el", TAG: DET}],
"pel": [
{ORTH: "pe", LEMMA: "per", TAG: ADP},
{ORTH: "l", LEMMA: "el", TAG: DET}],
"pal": [
{ORTH: "pa", LEMMA: "para"},
{ORTH: "l", LEMMA: "el"}],
{ORTH: "l", LEMMA: "el", NORM: "el"}],
"pala": [
{ORTH: "pa", LEMMA: "para"},
{ORTH: "la"}]
{ORTH: "la", LEMMA: "la", NORM: "la"}]
}

View File

@ -5,20 +5,24 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class FinnishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fi'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Finnish(Language):
lang = 'fi'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fi'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = FinnishDefaults
__all__ = ['Finnish']

View File

@ -5,30 +5,36 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class FrenchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fr'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
infixes = tuple(TOKENIZER_INFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
token_match = TOKEN_MATCH
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class French(Language):
lang = 'fr'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fr'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
infixes = tuple(TOKENIZER_INFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
token_match = TOKEN_MATCH
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = FrenchDefaults
__all__ = ['French']

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
def noun_chunks(obj):
    """Yield base noun phrases found in a dependency parse.

    Accepts either a Doc or a Span; the parent document is reached through
    `obj.doc`. Each result is a `(start, end, label)` triple of token indices
    (end exclusive) and the string-store ID of 'NP'.
    """
    doc = obj.doc  # Works for both Doc and Span.
    strings = doc.vocab.strings
    np_deps = [strings[label] for label in
               ('nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos',
                'nmod', 'nmod:poss')]
    conj = strings.add('conj')
    np_label = strings.add('NP')
    covered = set()
    for word in obj:
        # Only nominal tokens that are not already inside an emitted chunk
        # can head a new chunk (this prevents nested chunks).
        if word.pos not in (NOUN, PROPN, PRON) or word.i in covered:
            continue
        if word.dep in np_deps:
            heads_chunk = True
        elif word.dep == conj:
            # Walk up the coordination chain to the first conjunct.
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # Coordinated to an NP head, so this word is an NP as well.
            heads_chunk = head.dep in np_deps
        else:
            heads_chunk = False
        if not heads_chunk:
            continue
        if any(w.i in covered for w in word.subtree):
            continue
        start, end = word.left_edge.i, word.right_edge.i + 1
        covered.update(range(start, end))
        yield start, end, np_label
SYNTAX_ITERATORS = {
'noun_chunks': noun_chunks
}

View File

@ -9,15 +9,17 @@ from ...attrs import LANG
from ...util import update_exc
class HebrewDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'he'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Hebrew(Language):
lang = 'he'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'he'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = HebrewDefaults
__all__ = ['Hebrew']

View File

@ -7,29 +7,33 @@ from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class HungarianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'hu'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
token_match = TOKEN_MATCH
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Hungarian(Language):
lang = 'hu'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'hu'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
token_match = TOKEN_MATCH
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = HungarianDefaults
__all__ = ['Hungarian']

View File

@ -1,18 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
LIST_ICONS = [r'[\p{So}--[°]]']
_currency = r'\$|¢|£|€|¥|฿'
_quotes = QUOTES.replace("'", '')
_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
[r'[,.:](?=[{a}])'.format(a=ALPHA)])
_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES)
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
[r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(_currency),
@ -20,16 +20,14 @@ _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
r'(?<=[{}{}{}(?:{})])\.'.format(ALPHA_LOWER, r'%²\-\)\]\+', QUOTES, _currency),
r'(?<=[{})])-e'.format(ALPHA_LOWER)])
_infixes = (LIST_ELLIPSES +
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)])
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes

View File

@ -5,25 +5,29 @@ from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class ItalianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'it'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Italian(Language):
lang = 'it'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'it'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = ItalianDefaults
__all__ = ['Italian']

View File

@ -125,7 +125,7 @@ def word_shape(text):
LEX_ATTRS = {
attrs.LOWER: lambda string: string.lower(),
attrs.NORM: lambda string: string,
attrs.NORM: lambda string: string.lower(),
attrs.PREFIX: lambda string: string[0],
attrs.SUFFIX: lambda string: string[-3:],
attrs.CLUSTER: lambda string: 0,

View File

@ -6,20 +6,24 @@ from .stop_words import STOP_WORDS
from .morph_rules import MORPH_RULES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class NorwegianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nb'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Norwegian(Language):
lang = 'nb'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nb'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = NorwegianDefaults
__all__ = ['Norwegian']

View File

@ -4,21 +4,24 @@ from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class DutchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nl'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Dutch(Language):
lang = 'nl'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nl'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = DutchDefaults
__all__ = ['Dutch']

View File

@ -0,0 +1,46 @@
# coding: utf8
from __future__ import unicode_literals
# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.
# Norms are only set if no alternative is provided in the tokenizer exceptions.
# Note that this does not change any other token attributes. Its main purpose
# is to normalise the word representations so that equivalent tokens receive
# similar representations. For example: $ and € are very different, but they're
# both currency symbols. By normalising currency symbols to $, all symbols are
# seen as similar, no matter how common they are in the training data.
# Maps a token's ORTH to its normalised form. Typographic variants (curly
# quotes, long dashes, the ellipsis character) collapse to their ASCII
# equivalents, and all currency symbols collapse to "$".
BASE_NORMS = {
    "'s": "'s",
    "'S": "'s",
    "’s": "'s",
    "’S": "'s",
    "’": "'",
    "‘": "'",
    "´": "'",
    "`": "'",
    "”": '"',
    "“": '"',
    "''": '"',
    "``": '"',
    "´´": '"',
    "„": '"',
    "»": '"',
    "«": '"',
    "…": "...",
    "—": "-",
    "–": "-",
    "--": "-",
    "---": "-",
    "€": "$",
    "£": "$",
    "¥": "$",
    "฿": "$",
    "US$": "$",
    "C$": "$",
    "A$": "$"
}

View File

@ -5,20 +5,24 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class PolishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pl'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Polish(Language):
lang = 'pl'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pl'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = PolishDefaults
__all__ = ['Polish']

View File

@ -7,26 +7,30 @@ from .lex_attrs import LEX_ATTRS
from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class PortugueseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pt'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Portuguese(Language):
lang = 'pt'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pt'
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = PortugueseDefaults
__all__ = ['Portuguese']

View File

@ -2,15 +2,16 @@
from __future__ import unicode_literals
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
from .char_classes import CURRENCY, UNITS
from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from .char_classes import QUOTES, CURRENCY, UNITS
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
LIST_CURRENCY)
LIST_CURRENCY + LIST_ICONS)
_suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
["'s", "'S", "s", "S"] +
[r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(CURRENCY),
@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
_infixes = (LIST_ELLIPSES +
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[0-9])[+\-\*^](?=[0-9-])',
r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),

View File

@ -7,25 +7,29 @@ from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class SwedishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'sv'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Swedish(Language):
lang = 'sv'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'sv'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = SwedishDefaults
__all__ = ['Swedish']

28
spacy/lang/xx/__init__.py Normal file
View File

@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class MultiLanguageDefaults(Language.Defaults):
# Defaults used by the MultiLanguage class (language ID 'xx').
# Copy the inherited getters so the overrides below are set on a separate dict.
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
# Every lexeme reports the language ID 'xx', whatever its text.
lex_attr_getters[LANG] = lambda text: 'xx'
# NOTE(review): presumably wraps the base NORM getter so BASE_NORMS entries are consulted as well -- confirm against util.add_lookups.
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
# Only the shared base exceptions are passed; no language-specific ones are added here.
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
class MultiLanguage(Language):
"""Language class to be used for models that support multiple languages.
This module allows models to specify their language ID as 'xx'.
"""
lang = 'xx'
Defaults = MultiLanguageDefaults
__all__ = ['MultiLanguage']

View File

@ -6,16 +6,23 @@ import dill
import numpy
from thinc.neural import Model
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.optimizers import Adam, SGD
import random
import ujson
from collections import OrderedDict
from .tokenizer import Tokenizer
from .vocab import Vocab
from .tagger import Tagger
from .lemmatizer import Lemmatizer
from .train import Trainer
from .syntax.parser import get_templates
from .syntax.nonproj import PseudoProjectivity
from .syntax import nonproj
from .pipeline import NeuralDependencyParser, EntityRecognizer
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
from .pipeline import NeuralLabeller
from .pipeline import SimilarityHook
from .compat import json_dumps
from .attrs import IS_STOP
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
@ -23,6 +30,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
from .lang.lex_attrs import LEX_ATTRS
from . import util
from .scorer import Scorer
class BaseDefaults(object):
@ -80,21 +88,34 @@ class BaseDefaults(object):
return NeuralEntityRecognizer(nlp.vocab, **cfg)
@classmethod
def create_pipeline(cls, nlp=None):
def create_pipeline(cls, nlp=None, disable=tuple()):
meta = nlp.meta if nlp is not None else {}
# Resolve strings, like "cnn", "lstm", etc
pipeline = []
for entry in cls.pipeline:
if entry in disable or getattr(entry, 'name', entry) in disable:
continue
factory = cls.Defaults.factories[entry]
pipeline.append(factory(nlp, **meta.get(entry, {})))
return pipeline
factories = {
'make_doc': create_tokenizer,
'token_vectors': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
'tags': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
'dependencies': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),
'entities': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
'parser': lambda nlp, **cfg: [
NeuralDependencyParser(nlp.vocab, **cfg),
nonproj.deprojectivize],
'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
# Temporary compatibility -- delete after pivot
'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
'dependencies': lambda nlp, **cfg: [
NeuralDependencyParser(nlp.vocab, **cfg),
nonproj.deprojectivize,
],
'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)]
}
token_match = TOKEN_MATCH
@ -112,19 +133,39 @@ class BaseDefaults(object):
lemma_index = {}
morph_rules = {}
lex_attr_getters = LEX_ATTRS
syntax_iterators = {}
class Language(object):
"""
A text-processing pipeline. Usually you'll load this once per process, and
pass the instance around your program.
"""A text-processing pipeline. Usually you'll load this once per process,
and pass the instance around your application.
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
lang (unicode): Two-letter language ID, i.e. ISO code.
"""
Defaults = BaseDefaults
lang = None
def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
self.meta = dict(meta)
def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={},
disable=tuple(), **kwargs):
"""Initialise a Language object.
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
`Language.Defaults.create_vocab`.
make_doc (callable): A function that takes text and returns a `Doc`
object. Usually a `Tokenizer`.
pipeline (list): A list of annotation processes or IDs of annotation,
processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
up in `Language.Defaults.factories`.
disable (list): A list of component names to exclude from the pipeline.
The disable list has priority over the pipeline list -- if the same
string occurs in both, the component is not loaded.
meta (dict): Custom meta data for the Language class. Is written to by
models to add model meta data.
RETURNS (Language): The newly constructed object.
"""
self.meta = dict(meta)
if vocab is True:
factory = self.Defaults.create_vocab
vocab = factory(self, **meta.get('vocab', {}))
@ -132,11 +173,15 @@ class Language(object):
if make_doc is True:
factory = self.Defaults.create_tokenizer
make_doc = factory(self, **meta.get('tokenizer', {}))
self.make_doc = make_doc
self.tokenizer = make_doc
if pipeline is True:
self.pipeline = self.Defaults.create_pipeline(self)
self.pipeline = self.Defaults.create_pipeline(self, disable)
elif pipeline:
self.pipeline = list(pipeline)
# Careful not to do getattr(p, 'name', None) here
# If we had disable=[None], we'd disable everything!
self.pipeline = [p for p in pipeline
if p not in disable
and getattr(p, 'name', p) not in disable]
# Resolve strings, like "cnn", "lstm", etc
for i, entry in enumerate(self.pipeline):
if entry in self.Defaults.factories:
@ -144,82 +189,187 @@ class Language(object):
self.pipeline[i] = factory(self, **meta.get(entry, {}))
else:
self.pipeline = []
flat_list = []
for pipe in self.pipeline:
if isinstance(pipe, list):
flat_list.extend(pipe)
else:
flat_list.append(pipe)
self.pipeline = flat_list
def __call__(self, text, state=None, **disabled):
"""
Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbtrary whitespace. Alignment into the original string
# Conveniences to access pipeline components
# Read-only shortcut for get_component('tensorizer'); yields None when
# get_component finds no match.
@property
def tensorizer(self):
return self.get_component('tensorizer')
# Read-only shortcut for get_component('tagger').
@property
def tagger(self):
return self.get_component('tagger')
# Read-only shortcut for get_component('parser').
@property
def parser(self):
return self.get_component('parser')
# Read-only shortcut for the entity recognizer. Note that the lookup name
# is 'ner', not 'entity'.
@property
def entity(self):
return self.get_component('ner')
# Read-only shortcut for get_component('matcher').
@property
def matcher(self):
return self.get_component('matcher')
def get_component(self, name):
    """Look up a pipeline component by (the end of) its name.

    Components without a `name` attribute are ignored. A component matches
    when its `name` ends with the requested string, so this is a suffix
    match rather than an exact comparison.

    name (unicode): The name, or name suffix, to look for.
    RETURNS: The first matching component in pipeline order, or `None` if
        the pipeline is unset (`True`/`None`) or nothing matches.
    """
    if self.pipeline in (True, None):
        return None
    candidates = (component for component in self.pipeline
                  if hasattr(component, 'name')
                  and component.name.endswith(name))
    return next(candidates, None)
def __call__(self, text, disable=[]):
"""'Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbtrary whitespace. Alignment into the original string
is preserved.
Args:
text (unicode): The text to be processed.
state: Arbitrary
text (unicode): The text to be processed.
disable (list): Names of the pipeline components to disable.
RETURNS (Doc): A container for accessing the annotations.
Returns:
doc (Doc): A container for accessing the annotations.
Example:
>>> from spacy.en import English
>>> nlp = English()
EXAMPLE:
>>> tokens = nlp('An example sentence. Another example sentence.')
>>> tokens[0].orth_, tokens[0].head.tag_
>>> tokens[0].text, tokens[0].head.tag_
('An', 'NN')
"""
doc = self.make_doc(text)
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]:
if name in disable:
continue
state = proc(doc, state=state)
doc = proc(doc)
return doc
def update(self, docs, golds, state=None, drop=0., sgd=None):
def make_doc(self, text):
    """Create a document from raw text by calling `self.tokenizer` on it.

    text (unicode): The text to tokenize.
    RETURNS: Whatever `self.tokenizer` returns for the text (usually a
        `Doc`).
    """
    tokenize = self.tokenizer
    return tokenize(text)
def update(self, docs, golds, drop=0., sgd=None, losses=None):
"""Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects.
golds (iterable): A batch of `GoldParse` objects.
drop (float): The droput rate.
sgd (callable): An optimizer.
RETURNS (dict): Results from the update.
EXAMPLE:
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
>>> for epoch in trainer.epochs(gold):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
"""
tok2vec = self.pipeline[0]
feats = tok2vec.doc2feats(docs)
grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
state = {} if state is None else state
for process in self.pipeline:
if hasattr(process, 'update'):
state = process.update(docs, golds,
state=state,
drop=drop,
sgd=get_grads)
else:
process(docs, state=state)
if sgd is not None:
for key, (W, dW) in grads.items():
# TODO: Unhack this when thinc improves
if isinstance(W, numpy.ndarray):
sgd.ops = NumpyOps()
else:
sgd.ops = CupyOps()
sgd(W, dW, key=key)
return state
pipes = list(self.pipeline[1:])
random.shuffle(pipes)
for proc in pipes:
if not hasattr(proc, 'update'):
continue
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
d_tokvecses = proc.update((docs, tokvecses), golds,
drop=drop, sgd=get_grads, losses=losses)
if d_tokvecses is not None:
bp_tokvecses(d_tokvecses, sgd=sgd)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
# Clear the tensor variable, to free GPU memory.
# If we don't do this, the memory leak gets pretty
# bad, because we may be holding part of a batch.
for doc in docs:
doc.tensor = None
@contextmanager
def begin_training(self, gold_tuples, **cfg):
def preprocess_gold(self, docs_golds):
    """Run the gold-standard data through every pipeline component that
    offers a `preprocess_gold` hook, then yield the resulting pairs.

    Each hook receives the output of the previous one, in pipeline order.
    Because this is a generator, no hook runs until iteration starts.

    docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
    YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
    """
    stream = docs_golds
    for component in self.pipeline:
        if not hasattr(component, 'preprocess_gold'):
            continue
        stream = component.preprocess_gold(stream)
    for pair in stream:
        doc, gold = pair
        yield doc, gold
def begin_training(self, get_gold_tuples, **cfg):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer. Used as a contextmanager.
gold_tuples (iterable): Gold-standard training data.
**cfg: Config parameters.
YIELDS (tuple): A trainer and an optimizer.
EXAMPLE:
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
>>> for epoch in trainer.epochs(gold):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
"""
if self.parser:
self.pipeline.append(NeuralLabeller(self.vocab))
# Populate vocab
for _, annots_brackets in gold_tuples:
for _, annots_brackets in get_gold_tuples():
for annots, _ in annots_brackets:
for word in annots[1]:
_ = self.vocab[word]
# Handle crossing dependencies
gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
contexts = []
if cfg.get('use_gpu'):
if cfg.get('device', -1) >= 0:
import cupy.cuda.device
device = cupy.cuda.device.Device(cfg['device'])
device.use()
Model.ops = CupyOps()
Model.Ops = CupyOps
print("Use GPU")
else:
device = None
for proc in self.pipeline:
if hasattr(proc, 'begin_training'):
context = proc.begin_training(gold_tuples,
context = proc.begin_training(get_gold_tuples(),
pipeline=self.pipeline)
contexts.append(context)
trainer = Trainer(self, gold_tuples, **cfg)
yield trainer, trainer.optimizer
learn_rate = util.env_opt('learn_rate', 0.001)
beta1 = util.env_opt('optimizer_B1', 0.9)
beta2 = util.env_opt('optimizer_B2', 0.999)
eps = util.env_opt('optimizer_eps', 1e-08)
L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
optimizer.max_grad_norm = max_grad_norm
optimizer.device = device
return optimizer
def evaluate(self, docs_golds):
    """Process the documents with the pipeline and score them against
    their gold-standard annotations.

    docs_golds (iterable): Pairs of a document and its gold annotation.
        The iterable is unzipped up front, so it is consumed completely
        and must contain at least one pair.
    RETURNS (Scorer): The scorer, after `score()` has been called on it
        for every processed document.
    """
    docs, golds = zip(*docs_golds)
    scorer = Scorer()
    processed = self.pipe(docs, batch_size=32)
    for parsed, gold in zip(processed, golds):
        scorer.score(parsed, gold)
        # Drop the tensor once the doc is scored -- presumably to release
        # memory, as `update` does for its batch.
        parsed.tensor = None
    return scorer
@contextmanager
def use_params(self, params, **cfg):
"""Replace weights of models in the pipeline with those provided in the
params dictionary. Can be used as a contextmanager, in which case,
models go back to their original weights after the block.
params (dict): A dictionary of parameters keyed by model ID.
**cfg: Config parameters.
EXAMPLE:
>>> with nlp.use_params(optimizer.averages):
>>> nlp.to_disk('/tmp/checkpoint')
"""
contexts = [pipe.use_params(params) for pipe
in self.pipeline if hasattr(pipe, 'use_params')]
# TODO: Having trouble with contextlib
@ -236,98 +386,141 @@ class Language(object):
except StopIteration:
pass
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
"""
Process texts as a stream, and yield Doc objects in order.
def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
"""Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading.
Supports GIL-free multi-threading.
texts (iterator): A sequence of texts to process.
n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer.
disable (list): Names of the pipeline components to disable.
YIELDS (Doc): Documents in the order of the original text.
Arguments:
texts (iterator)
tag (bool)
parse (bool)
entity (bool)
EXAMPLE:
>>> texts = [u'One document.', u'...', u'Lots of documents']
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
>>> assert doc.is_parsed
"""
#stream = ((self.make_doc(text), None) for text in texts)
stream = ((doc, {}) for doc in texts)
docs = (self.make_doc(text) for text in texts)
docs = texts
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]:
if name in disable:
continue
if hasattr(proc, 'pipe'):
stream = proc.pipe(stream, n_threads=n_threads, batch_size=batch_size)
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
else:
stream = (proc(doc, state) for doc, state in stream)
for doc, state in stream:
# Apply the function, but yield the doc
docs = _pipe(proc, docs)
for doc in docs:
yield doc
def to_disk(self, path, **exclude):
"""Save the current state to a directory.
def to_disk(self, path, disable=tuple()):
"""Save the current state to a directory. If a model is loaded, this
will include the model.
Args:
path: A path to a directory, which will be created if it doesn't
exist. Paths may be either strings or pathlib.Path-like
objects.
**exclude: Prevent named attributes from being saved.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
disable (list): Names of pipeline components to disable and prevent
from being saved.
EXAMPLE:
>>> nlp.to_disk('/path/to/models')
"""
path = util.ensure_path(path)
if not path.exists():
path.mkdir()
if not path.is_dir():
raise IOError("Output path must be a directory")
props = {}
for name, value in self.__dict__.items():
if name in exclude:
serializers = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)),
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
))
for proc in self.pipeline:
if not hasattr(proc, 'name'):
continue
if hasattr(value, 'to_disk'):
value.to_disk(path / name)
else:
props[name] = value
with (path / 'props.pickle').open('wb') as file_:
dill.dump(props, file_)
if proc.name in disable:
continue
if not hasattr(proc, 'to_disk'):
continue
serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
util.to_disk(path, serializers, {p: False for p in disable})
def from_disk(self, path, **exclude):
"""Load the current state from a directory.
def from_disk(self, path, disable=tuple()):
    """Loads state from a directory. Modifies the object in place and
    returns it. If the saved `Language` object contains a model, the
    model will be loaded.

    path (unicode or Path): A path to a directory. Paths may be either
        strings or `Path`-like objects.
    disable (list): Names of the pipeline components to disable.
    RETURNS (Language): The modified `Language` object.

    EXAMPLE:
        >>> from spacy.language import Language
        >>> nlp = Language().from_disk('/path/to/models')
    """
    path = util.ensure_path(path)

    def read_meta(p):
        # Loading must READ the saved meta data. Opening the file with
        # 'w' here would truncate meta.json and replace it with the
        # in-memory meta. A directory without meta.json is tolerated.
        if not p.exists():
            return
        with p.open('r', encoding='utf8') as file_:
            self.meta.update(ujson.loads(file_.read()))

    deserializers = OrderedDict((
        ('vocab', lambda p: self.vocab.from_disk(p)),
        ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
        ('meta.json', read_meta)
    ))
    for proc in self.pipeline:
        if not hasattr(proc, 'name'):
            continue
        if proc.name in disable:
            continue
        # Check for the method that is actually called below.
        if not hasattr(proc, 'from_disk'):
            continue
        # Bind `proc` as a default argument so each lambda keeps its own
        # component instead of the last one seen by the loop.
        deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
    exclude = {p: False for p in disable}
    # Don't ask for a vocab that was never saved.
    if not (path / 'vocab').exists():
        exclude['vocab'] = True
    util.from_disk(path, deserializers, exclude)
    return self
def to_bytes(self, **exclude):
def to_bytes(self, disable=[]):
"""Serialize the current state to a binary string.
Args:
path: A path to a directory. Paths may be either strings or
pathlib.Path-like objects.
**exclude: Prevent named attributes from being serialized.
disable (list): Nameds of pipeline components to disable and prevent
from being serialized.
RETURNS (bytes): The serialized form of the `Language` object.
"""
props = dict(self.__dict__)
for key in exclude:
if key in props:
props.pop(key)
return dill.dumps(props, -1)
serializers = OrderedDict((
('vocab', lambda: self.vocab.to_bytes()),
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
('meta', lambda: ujson.dumps(self.meta))
))
for i, proc in enumerate(self.pipeline):
if getattr(proc, 'name', None) in disable:
continue
if not hasattr(proc, 'to_bytes'):
continue
serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False)
return util.to_bytes(serializers, {})
def from_bytes(self, bytes_data, **exclude):
def from_bytes(self, bytes_data, disable=[]):
"""Load state from a binary string.
Args:
bytes_data (bytes): The data to load from.
**exclude: Prevent named attributes from being loaded.
bytes_data (bytes): The data to load from.
disable (list): Names of the pipeline components to disable.
RETURNS (Language): The `Language` object.
"""
props = dill.loads(bytes_data)
for key, value in props.items():
if key not in exclude:
setattr(self, key, value)
deserializers = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
('meta', lambda b: self.meta.update(ujson.loads(b)))
))
for i, proc in enumerate(self.pipeline):
if getattr(proc, 'name', None) in disable:
continue
if not hasattr(proc, 'from_bytes'):
continue
deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
msg = util.from_bytes(bytes_data, deserializers, {})
return self
def _pipe(func, docs):
    """Lazily apply `func` to each item of `docs`, yielding the item itself.

    The return value of `func` is discarded, so `func` is expected to work
    through side effects on the item it receives.

    func (callable): Called once per item, just before that item is yielded.
    docs (iterable): The stream of items to pass through.
    YIELDS: Each item of `docs`, in order.
    """
    stream = iter(docs)
    for item in stream:
        func(item)
        yield item

View File

@ -27,7 +27,7 @@ cdef class Lexeme:
cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil:
cdef SerializedLexemeC lex_data
buff = <const unsigned char*>&lex.flags
end = <const unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
end = <const unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
for i in range(sizeof(lex_data.data)):
lex_data.data[i] = buff[i]
return lex_data
@ -35,7 +35,7 @@ cdef class Lexeme:
@staticmethod
cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
buff = <unsigned char*>&lex.flags
end = <unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
for i in range(sizeof(lex_data.data)):
buff[i] = lex_data.data[i]

View File

@ -30,19 +30,16 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef class Lexeme:
"""
An entry in the vocabulary. A Lexeme has no string context --- it's a
"""An entry in the vocabulary. A `Lexeme` has no string context it's a
word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
tag).
"""
def __init__(self, Vocab vocab, int orth):
"""
Create a Lexeme object.
def __init__(self, Vocab vocab, attr_t orth):
"""Create a Lexeme object.
Arguments:
vocab (Vocab): The parent vocabulary
orth (int): The orth id of the lexeme.
vocab (Vocab): The parent vocabulary
orth (uint64): The orth id of the lexeme.
Returns (Lexeme): The newly constructd object.
"""
self.vocab = vocab
@ -54,7 +51,7 @@ cdef class Lexeme:
if isinstance(other, Lexeme):
a = self.orth
b = other.orth
elif isinstance(other, int):
elif isinstance(other, long):
a = self.orth
b = other
elif isinstance(other, str):
@ -82,35 +79,28 @@ cdef class Lexeme:
return self.c.orth
def set_flag(self, attr_id_t flag_id, bint value):
"""
Change the value of a boolean flag.
"""Change the value of a boolean flag.
Arguments:
flag_id (int): The attribute ID of the flag to set.
value (bool): The new value of the flag.
flag_id (int): The attribute ID of the flag to set.
value (bool): The new value of the flag.
"""
Lexeme.c_set_flag(self.c, flag_id, value)
def check_flag(self, attr_id_t flag_id):
"""
Check the value of a boolean flag.
"""Check the value of a boolean flag.
Arguments:
flag_id (int): The attribute ID of the flag to query.
Returns (bool): The value of the flag.
flag_id (int): The attribute ID of the flag to query.
RETURNS (bool): The value of the flag.
"""
return True if Lexeme.c_check_flag(self.c, flag_id) else False
def similarity(self, other):
"""
Compute a semantic similarity estimate. Defaults to cosine over vectors.
"""Compute a semantic similarity estimate. Defaults to cosine over
vectors.
Arguments:
other:
The object to compare with. By default, accepts Doc, Span,
Token and Lexeme objects.
Returns:
score (float): A scalar similarity score. Higher is more similar.
other (object): The object to compare with. By default, accepts `Doc`,
`Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.
"""
if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0
@ -119,7 +109,7 @@ cdef class Lexeme:
def to_bytes(self):
lex_data = Lexeme.c_to_bytes(self.c)
start = <const char*>&self.c.flags
end = <const char*>&self.c.l2_norm + sizeof(self.c.l2_norm)
end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data))
byte_string = b'\0' * sizeof(lex_data.data)
byte_chars = <char*>byte_string
@ -140,22 +130,29 @@ cdef class Lexeme:
self.orth = self.c.orth
property has_vector:
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self):
cdef int i
for i in range(self.vocab.vectors_length):
if self.c.vector[i] != 0:
return True
else:
return False
return self.vocab.has_vector(self.c.orth)
property vector_norm:
def __get__(self):
return self.c.l2_norm
"""The L2 norm of the lexeme's vector representation.
def __set__(self, float value):
self.c.l2_norm = value
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self):
vector = self.vector
return numpy.sqrt((vector**2).sum())
property vector:
"""A real-valued meaning representation.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the lexeme's semantics.
"""
def __get__(self):
cdef int length = self.vocab.vectors_length
if length == 0:
@ -165,27 +162,16 @@ cdef class Lexeme:
"model doesn't include word vectors. For more info, see "
"the documentation: \n%s\n" % about.__docs_models__
)
vector_view = <float[:length,]>self.c.vector
return numpy.asarray(vector_view)
return self.vocab.get_vector(self.c.orth)
def __set__(self, vector):
assert len(vector) == self.vocab.vectors_length
cdef float value
cdef double norm = 0.0
for i, value in enumerate(vector):
self.c.vector[i] = value
norm += value * value
self.c.l2_norm = sqrt(norm)
self.vocab.set_vector(self.c.orth, vector)
property rank:
def __get__(self):
return self.c.id
property repvec:
def __get__(self):
raise AttributeError("lex.repvec has been renamed to lex.vector")
property sentiment:
def __get__(self):
return self.c.sentiment
@ -196,33 +182,41 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.orth]
property text:
"""A unicode representation of the token text.
RETURNS (unicode): The original verbatim text of the token.
"""
def __get__(self):
return self.orth_
property lower:
def __get__(self): return self.c.lower
def __set__(self, int x): self.c.lower = x
def __set__(self, attr_t x): self.c.lower = x
property norm:
def __get__(self): return self.c.norm
def __set__(self, int x): self.c.norm = x
def __set__(self, attr_t x): self.c.norm = x
property shape:
def __get__(self): return self.c.shape
def __set__(self, int x): self.c.shape = x
def __set__(self, attr_t x): self.c.shape = x
property prefix:
def __get__(self): return self.c.prefix
def __set__(self, int x): self.c.prefix = x
def __set__(self, attr_t x): self.c.prefix = x
property suffix:
def __get__(self): return self.c.suffix
def __set__(self, int x): self.c.suffix = x
def __set__(self, attr_t x): self.c.suffix = x
property cluster:
def __get__(self): return self.c.cluster
def __set__(self, int x): self.c.cluster = x
def __set__(self, attr_t x): self.c.cluster = x
property lang:
def __get__(self): return self.c.lang
def __set__(self, int x): self.c.lang = x
def __set__(self, attr_t x): self.c.lang = x
property prob:
def __get__(self): return self.c.prob
@ -230,27 +224,27 @@ cdef class Lexeme:
property lower_:
def __get__(self): return self.vocab.strings[self.c.lower]
def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]
def __set__(self, unicode x): self.c.lower = self.vocab.strings.add(x)
property norm_:
def __get__(self): return self.vocab.strings[self.c.norm]
def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]
def __set__(self, unicode x): self.c.norm = self.vocab.strings.add(x)
property shape_:
def __get__(self): return self.vocab.strings[self.c.shape]
def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]
def __set__(self, unicode x): self.c.shape = self.vocab.strings.add(x)
property prefix_:
def __get__(self): return self.vocab.strings[self.c.prefix]
def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x]
def __set__(self, unicode x): self.c.prefix = self.vocab.strings.add(x)
property suffix_:
def __get__(self): return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
def __set__(self, unicode x): self.c.suffix = self.vocab.strings.add(x)
property lang_:
def __get__(self): return self.vocab.strings[self.c.lang]
def __set__(self, unicode x): self.c.lang = self.vocab.strings[x]
def __set__(self, unicode x): self.c.lang = self.vocab.strings.add(x)
property flags:
def __get__(self): return self.c.flags
@ -258,7 +252,7 @@ cdef class Lexeme:
property is_oov:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_OOV, x)
def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x)
property is_stop:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)
@ -308,7 +302,6 @@ cdef class Lexeme:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
property like_url:
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)

View File

@ -87,7 +87,7 @@ ctypedef TokenPatternC* TokenPatternC_ptr
ctypedef pair[int, TokenPatternC_ptr] StateC
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
object token_specs) except NULL:
pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
cdef int i
@ -99,15 +99,21 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
pattern[i].attrs[j].attr = attr
pattern[i].attrs[j].value = value
i = len(token_specs)
pattern[i].attrs = <AttrValueC*>mem.alloc(3, sizeof(AttrValueC))
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
pattern[i].attrs[0].attr = ID
pattern[i].attrs[0].value = entity_id
pattern[i].attrs[1].attr = ENT_TYPE
pattern[i].attrs[1].value = label
pattern[i].nr_attr = 0
return pattern
# Return the key stored in the terminating entry of a pattern array.
# init_pattern() ends each array with an entry whose nr_attr is 0 and whose
# first attr slot holds (ID, entity_id); this reads that value back.
# Declared `except 0`, so a return value of 0 is the error indicator.
cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
# Step past the per-token entries until the nr_attr == 0 terminator.
while pattern.nr_attr != 0:
pattern += 1
id_attr = pattern[0].attrs[0]
# Sanity check: the terminator's first slot must be the ID attribute.
assert id_attr.attr == ID
return id_attr.value
cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
for attr in pattern.attrs[:pattern.nr_attr]:
if get_token_attr(token, attr.attr) != attr.value:
@ -148,7 +154,7 @@ def _convert_strings(token_specs, string_store):
if isinstance(attr, basestring):
attr = attrs.IDS.get(attr.upper())
if isinstance(value, basestring):
value = string_store[value]
value = string_store.add(value)
if isinstance(value, bool):
value = int(value)
if attr is not None:
@ -159,14 +165,14 @@ def _convert_strings(token_specs, string_store):
def merge_phrase(matcher, doc, i, matches):
'''Callback to merge a phrase on match'''
"""Callback to merge a phrase on match."""
ent_id, label, start, end = matches[i]
span = doc[start : end]
span.merge(ent_type=label, ent_id=ent_id)
cdef class Matcher:
'''Match sequences of tokens, based on pattern rules.'''
"""Match sequences of tokens, based on pattern rules."""
cdef Pool mem
cdef vector[TokenPatternC*] patterns
cdef readonly Vocab vocab
@ -175,37 +181,12 @@ cdef class Matcher:
cdef public object _callbacks
cdef public object _acceptors
@classmethod
def load(cls, path, vocab):
"""
Load the matcher and patterns from a file path.
def __init__(self, vocab):
"""Create the Matcher.
Arguments:
path (Path):
Path to a JSON-formatted patterns file.
vocab (Vocab):
The vocabulary that the documents to match over will refer to.
Returns:
Matcher: The newly constructed object.
"""
if (path / 'gazetteer.json').exists():
with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
patterns = ujson.load(file_)
else:
patterns = {}
return cls(vocab, patterns)
def __init__(self, vocab, patterns={}):
"""
Create the Matcher.
Arguments:
vocab (Vocab):
The vocabulary object, which must be shared with the documents
the matcher will operate on.
patterns (dict): Patterns to add to the matcher.
Returns:
The newly constructed object.
vocab (Vocab): The vocabulary object, which must be shared with the
documents the matcher will operate on.
RETURNS (Matcher): The newly constructed object.
"""
self._patterns = {}
self._entities = {}
@ -213,144 +194,111 @@ cdef class Matcher:
self._callbacks = {}
self.vocab = vocab
self.mem = Pool()
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
self.add_entity(entity_key, attrs)
for spec in specs:
self.add_pattern(entity_key, spec, label=etype)
def __reduce__(self):
return (self.__class__, (self.vocab, self._patterns), None, None)
property n_patterns:
def __get__(self): return self.patterns.size()
def __len__(self):
"""Get the number of rules added to the matcher. Note that this only
returns the number of rules (identical with the number of IDs), not the
number of individual patterns.
def add_entity(self, entity_key, attrs=None, if_exists='raise',
acceptor=None, on_match=None):
RETURNS (int): The number of rules.
"""
Add an entity to the matcher.
return len(self._patterns)
Arguments:
entity_key (unicode or int):
An ID for the entity.
attrs:
Attributes to associate with the Matcher.
if_exists ('raise', 'ignore' or 'update'):
Controls what happens if the entity ID already exists. Defaults to 'raise'.
acceptor:
Callback function to filter matches of the entity.
on_match:
Callback function to act on matches of the entity.
Returns:
None
def __contains__(self, key):
    """Check whether the matcher contains rules for a match ID.

    key (unicode): The match ID.
    RETURNS (bool): Whether the matcher contains rules for this match ID.
    """
    # Test membership of the normalised key, as has_key() does. Returning
    # len(self._patterns) would make `key in matcher` true for every key
    # as soon as any rule has been added.
    return self._normalize_key(key) in self._patterns
def add_pattern(self, entity_key, token_specs, label=""):
def add(self, key, on_match, *patterns):
"""Add a match-rule to the matcher.
A match-rule consists of: an ID key, an on_match callback, and one or
more patterns. If the key exists, the patterns are appended to the
previous ones, and the previous on_match callback is replaced. The
`on_match` callback will receive the arguments `(matcher, doc, i,
matches)`. You can also set `on_match` to `None` to not perform any
actions. A pattern consists of one or more `token_specs`, where a
`token_spec` is a dictionary mapping attribute IDs to values. Token
descriptors can also include quantifiers. There are currently important
known problems with the quantifiers see the docs.
"""
Add a pattern to the matcher.
for pattern in patterns:
if len(pattern) == 0:
msg = ("Cannot add pattern for zero tokens to matcher.\n"
"key: {key}\n")
raise ValueError(msg.format(key=key))
key = self._normalize_key(key)
self._patterns.setdefault(key, [])
self._callbacks[key] = on_match
Arguments:
entity_key (unicode or int):
An ID for the entity.
token_specs:
Description of the pattern to be matched.
label:
Label to assign to the matched pattern. Defaults to "".
Returns:
None
for pattern in patterns:
specs = _convert_strings(pattern, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, key, specs))
self._patterns[key].append(specs)
def remove(self, key):
"""Remove a rule from the matcher. A KeyError is raised if the key does
not exist.
key (unicode): The ID of the match rule.
"""
token_specs = list(token_specs)
if len(token_specs) == 0:
msg = ("Cannot add pattern for zero tokens to matcher.\n"
"entity_key: {entity_key}\n"
"label: {label}")
raise ValueError(msg.format(entity_key=entity_key, label=label))
entity_key = self.normalize_entity_key(entity_key)
if not self.has_entity(entity_key):
self.add_entity(entity_key)
if isinstance(label, basestring):
label = self.vocab.strings[label]
elif label is None:
label = 0
spec = _convert_strings(token_specs, self.vocab.strings)
key = self._normalize_key(key)
self._patterns.pop(key)
self._callbacks.pop(key)
cdef int i = 0
while i < self.patterns.size():
pattern_key = get_pattern_key(self.patterns.at(i))
if pattern_key == key:
self.patterns.erase(self.patterns.begin()+i)
else:
i += 1
self.patterns.push_back(init_pattern(self.mem, entity_key, label, spec))
self._patterns[entity_key].append((label, token_specs))
def has_key(self, key):
"""Check whether the matcher has a rule with a given key.
def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None):
self.add_entity(entity_key, attrs=attrs, if_exists='update',
acceptor=acceptor, on_match=on_match)
for spec in specs:
self.add_pattern(entity_key, spec, label=label)
def normalize_entity_key(self, entity_key):
if isinstance(entity_key, basestring):
return self.vocab.strings[entity_key]
else:
return entity_key
def has_entity(self, entity_key):
key (string or int): The key to check.
RETURNS (bool): Whether the matcher has the rule.
"""
Check whether the matcher has an entity.
key = self._normalize_key(key)
return key in self._patterns
Arguments:
entity_key (string or int): The entity key to check.
Returns:
bool: Whether the matcher has the entity.
"""
entity_key = self.normalize_entity_key(entity_key)
return entity_key in self._entities
def get(self, key, default=None):
"""Retrieve the pattern stored for a key.
def get_entity(self, entity_key):
key (unicode or int): The key to retrieve.
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
"""
Retrieve the attributes stored for an entity.
key = self._normalize_key(key)
if key not in self._patterns:
return default
return (self._callbacks[key], self._patterns[key])
Arguments:
entity_key (unicode or int): The entity to retrieve.
Returns:
The entity attributes if present, otherwise None.
"""
entity_key = self.normalize_entity_key(entity_key)
if entity_key in self._entities:
return self._entities[entity_key]
else:
return None
def pipe(self, docs, batch_size=1000, n_threads=2):
"""Match a stream of documents, yielding them in turn.
def __call__(self, Doc doc, acceptor=None):
docs (iterable): A stream of documents.
batch_size (int): The number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the `Matcher` implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
"""
Find all token sequences matching the supplied patterns on the Doc.
for doc in docs:
self(doc)
yield doc
Arguments:
doc (Doc):
The document to match over.
Returns:
list
A list of (entity_key, label_id, start, end) tuples,
describing the matches. A match tuple describes a span doc[start:end].
The label_id and entity_key are both integers.
def __call__(self, Doc doc):
"""Find all token sequences matching the supplied patterns on the `Doc`.
doc (Doc): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples,
describing the matches. A match tuple describes a span
`doc[start:end]`. The `key` is an integer.
"""
if acceptor is not None:
raise ValueError(
"acceptor keyword argument to Matcher deprecated. Specify acceptor "
"functions when you add patterns instead.")
cdef vector[StateC] partials
cdef int n_partials = 0
cdef int q = 0
@ -388,13 +336,7 @@ cdef class Matcher:
end = token_i+1
ent_id = state.second[1].attrs[0].value
label = state.second[1].attrs[1].value
acceptor = self._acceptors.get(ent_id)
if acceptor is None:
matches.append((ent_id, label, start, end))
else:
match = acceptor(doc, ent_id, label, start, end)
if match:
matches.append(match)
matches.append((ent_id, start, end))
partials.resize(q)
# Check whether we open any new patterns on this token
for pattern in self.patterns:
@ -419,13 +361,7 @@ cdef class Matcher:
end = token_i+1
ent_id = pattern[1].attrs[0].value
label = pattern[1].attrs[1].value
acceptor = self._acceptors.get(ent_id)
if acceptor is None:
matches.append((ent_id, label, start, end))
else:
match = acceptor(doc, ent_id, label, start, end)
if match:
matches.append(match)
matches.append((ent_id, start, end))
# Look for open patterns that are actually satisfied
for state in partials:
while state.second.quantifier in (ZERO, ZERO_PLUS):
@ -435,36 +371,19 @@ cdef class Matcher:
end = len(doc)
ent_id = state.second.attrs[0].value
label = state.second.attrs[0].value
acceptor = self._acceptors.get(ent_id)
if acceptor is None:
matches.append((ent_id, label, start, end))
else:
match = acceptor(doc, ent_id, label, start, end)
if match:
matches.append(match)
for i, (ent_id, label, start, end) in enumerate(matches):
matches.append((ent_id, start, end))
for i, (ent_id, start, end) in enumerate(matches):
on_match = self._callbacks.get(ent_id)
if on_match is not None:
on_match(self, doc, i, matches)
# TODO: only return (match_id, start, end)
return matches
def pipe(self, docs, batch_size=1000, n_threads=2):
"""
Match a stream of documents, yielding them in turn.
Arguments:
docs: A stream of documents.
batch_size (int):
The number of documents to accumulate into a working set.
n_threads (int):
The number of threads with which to work on the buffer in parallel,
if the Matcher implementation supports multi-threading.
Yields:
Doc Documents, in order.
"""
for doc in docs:
self(doc)
yield doc
def _normalize_key(self, key):
if isinstance(key, basestring):
return self.vocab.strings.add(key)
else:
return key
def get_bilou(length):
@ -550,7 +469,7 @@ cdef class PhraseMatcher:
self(doc)
yield doc
def accept_match(self, Doc doc, int ent_id, int label, int start, int end):
def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end):
assert (end - start) < self.max_length
cdef int i, j
for i in range(self.max_length):

View File

@ -30,6 +30,7 @@ cdef class Morphology:
cdef public object n_tags
cdef public object reverse_index
cdef public object tag_names
cdef public object exc
cdef RichTagC* rich_tags
cdef PreshMapArray _cache

View File

@ -33,36 +33,43 @@ def _normalize_props(props):
cdef class Morphology:
def __init__(self, StringStore string_store, tag_map, lemmatizer):
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
self.mem = Pool()
self.strings = string_store
self.tag_map = {}
self.lemmatizer = lemmatizer
self.n_tags = len(tag_map) + 1
self.n_tags = len(tag_map)
self.tag_names = tuple(sorted(tag_map.keys()))
self.reverse_index = {}
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
attrs = _normalize_props(attrs)
self.tag_map[tag_str] = dict(attrs)
attrs = _normalize_props(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str]
self.rich_tags[i].name = self.strings.add(tag_str)
self.rich_tags[i].morph = 0
self.rich_tags[i].pos = attrs[POS]
self.reverse_index[self.rich_tags[i].name] = i
self._cache = PreshMapArray(self.n_tags)
self.exc = {}
if exc is not None:
for (tag_str, orth_str), attrs in exc.items():
self.add_special_case(tag_str, orth_str, attrs)
def __reduce__(self):
return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None)
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
self.exc), None, None)
cdef int assign_tag(self, TokenC* token, tag) except -1:
if isinstance(tag, basestring):
tag_id = self.reverse_index[self.strings[tag]]
else:
tag = self.strings.add(tag)
if tag in self.reverse_index:
tag_id = self.reverse_index[tag]
self.assign_tag_id(token, tag_id)
self.assign_tag_id(token, tag_id)
else:
token.tag = tag
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id >= self.n_tags:
@ -73,7 +80,7 @@ cdef class Morphology:
# the statistical model fails.
# Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings['SP']]
tag_id = self.reverse_index[self.strings.add('SP')]
rich_tag = self.rich_tags[tag_id]
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
if analysis is NULL:
@ -104,7 +111,8 @@ cdef class Morphology:
tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception.
"""
tag = self.strings[tag_str]
self.exc[(tag_str, orth_str)] = dict(attrs)
tag = self.strings.add(tag_str)
tag_id = self.reverse_index[tag]
orth = self.strings[orth_str]
cdef RichTagC rich_tag = self.rich_tags[tag_id]
@ -140,14 +148,14 @@ cdef class Morphology:
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
cdef unicode py_string = self.strings[orth]
if self.lemmatizer is None:
return self.strings[py_string.lower()]
return self.strings.add(py_string.lower())
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
return self.strings[py_string.lower()]
return self.strings.add(py_string.lower())
cdef set lemma_strings
cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
lemma_string = sorted(lemma_strings)[0]
lemma = self.strings[lemma_string]
lemma = self.strings.add(lemma_string)
return lemma

View File

@ -9,12 +9,18 @@ import numpy
cimport numpy as np
import cytoolz
import util
from collections import OrderedDict
import ujson
import msgpack
from thinc.api import add, layerize, chain, clone, concatenate
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed
from thinc.neural.util import to_categorical
from thinc.neural.pooling import Pooling, max_pool, mean_pool
from thinc.neural._classes.difference import Siamese, CauchySimilarity
from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.resnet import Residual
from thinc.neural._classes.batchnorm import BatchNorm as BN
@ -31,110 +37,194 @@ from .syntax.stateclass cimport StateClass
from .gold cimport GoldParse
from .morphology cimport Morphology
from .vocab cimport Vocab
from .syntax import nonproj
from .compat import json_dumps
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import Tok2Vec, flatten, get_col, doc2feats
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
from .parts_of_speech import X
class TokenVectorEncoder(object):
'''Assign position-sensitive vectors to tokens, using a CNN or RNN.'''
name = 'tok2vec'
"""Assign position-sensitive vectors to tokens, using a CNN or RNN."""
name = 'tensorizer'
@classmethod
def Model(cls, width=128, embed_size=5000, **cfg):
def Model(cls, width=128, embed_size=7500, **cfg):
"""Create a new statistical model for the class.
width (int): Output size of the model.
embed_size (int): Number of vectors in the embedding table.
**cfg: Config parameters.
RETURNS (Model): A `thinc.neural.Model` or similar instance.
"""
width = util.env_opt('token_vector_width', width)
embed_size = util.env_opt('embed_size', embed_size)
return Tok2Vec(width, embed_size, preprocess=None)
def __init__(self, vocab, model=True, **cfg):
"""Construct a new statistical model. Weights are not allocated on
initialisation.
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
instance with the `Doc` objects it will process.
model (Model): A `Model` instance or `True` to allocate one later.
**cfg: Config parameters.
EXAMPLE:
>>> from spacy.pipeline import TokenVectorEncoder
>>> tok2vec = TokenVectorEncoder(nlp.vocab)
>>> tok2vec.model = tok2vec.Model(128, 5000)
"""
self.vocab = vocab
self.doc2feats = doc2feats()
self.model = model
def __call__(self, docs, state=None):
if isinstance(docs, Doc):
docs = [docs]
tokvecs = self.predict(docs)
self.set_annotations(docs, tokvecs)
state = {} if state is None else state
state['tokvecs'] = tokvecs
return state
def __call__(self, doc):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
model. Vectors are set to the `Doc.tensor` attribute.
doc (Doc): The document to add vectors to.
RETURNS (Doc): The same document, with its `tensor` attribute set.
"""
tokvecses = self.predict([doc])
self.set_annotations([doc], tokvecses)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for batch in cytoolz.partition_all(batch_size, stream):
docs, states = zip(*batch)
tokvecs = self.predict(docs)
self.set_annotations(docs, tokvecs)
for state in states:
state['tokvecs'] = tokvecs
yield from zip(docs, states)
"""Process `Doc` objects as a stream.
stream (iterator): A sequence of `Doc` objects to process.
batch_size (int): Number of `Doc` objects to group.
n_threads (int): Number of threads.
YIELDS (iterator): A sequence of `Doc` objects, in order of input.
"""
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tokvecses = self.predict(docs)
self.set_annotations(docs, tokvecses)
yield from docs
def predict(self, docs):
"""Return a single tensor for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the documents.
"""
feats = self.doc2feats(docs)
tokvecs = self.model(feats)
return tokvecs
def set_annotations(self, docs, tokvecs):
start = 0
for doc in docs:
doc.tensor = tokvecs[start : start + len(doc)]
start += len(doc)
def set_annotations(self, docs, tokvecses):
"""Set the tensor attribute for a batch of documents.
def update(self, docs, golds, state=None,
drop=0., sgd=None):
docs (iterable): A sequence of `Doc` objects.
tokvecses (iterable): One array of token vectors per document, aligned with `docs`.
"""
for doc, tokvecs in zip(docs, tokvecses):
assert tokvecs.shape[0] == len(doc)
doc.tensor = tokvecs
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
"""Update the model.
docs (iterable): A batch of `Doc` objects.
golds (iterable): A batch of `GoldParse` objects.
drop (float): The dropout rate.
sgd (callable): An optimizer.
RETURNS (tuple): The token vectors and the callback to backpropagate through them.
"""
if isinstance(docs, Doc):
docs = [docs]
golds = [golds]
state = {} if state is None else state
feats = self.doc2feats(docs)
tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop)
state['feats'] = feats
state['tokvecs'] = tokvecs
state['bp_tokvecs'] = bp_tokvecs
return state
return tokvecs, bp_tokvecs
def get_loss(self, docs, golds, scores):
# TODO: implement
raise NotImplementedError
def begin_training(self, gold_tuples, pipeline=None):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer.
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
"""
self.doc2feats = doc2feats()
if self.model is True:
self.model = self.Model()
def use_params(self, params):
"""Replace weights of models in the pipeline with those provided in the
params dictionary.
params (dict): A dictionary of parameters keyed by model ID.
"""
with self.model.use_params(params):
yield
def to_bytes(self, **exclude):
serialize = OrderedDict((
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes())
))
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
if self.model is True:
self.model = self.Model()
deserialize = OrderedDict((
('model', lambda b: self.model.from_bytes(b)),
('vocab', lambda b: self.vocab.from_bytes(b))
))
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
serialize = OrderedDict((
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('vocab', lambda p: self.vocab.to_disk(p))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
if self.model is True:
self.model = self.Model()
deserialize = OrderedDict((
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
('vocab', lambda p: self.vocab.from_disk(p))
))
util.from_disk(path, deserialize, exclude)
return self
class NeuralTagger(object):
name = 'nn_tagger'
name = 'tagger'
def __init__(self, vocab, model=True):
self.vocab = vocab
self.model = model
def __call__(self, doc, state=None):
assert state is not None
assert 'tokvecs' in state
tokvecs = state['tokvecs']
tags = self.predict(tokvecs)
def __call__(self, doc):
tags = self.predict([doc.tensor])
self.set_annotations([doc], tags)
return state
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for batch in cytoolz.partition_all(batch_size, stream):
docs, states = zip(*batch)
tag_ids = self.predict(states[0]['tokvecs'])
for docs in cytoolz.partition_all(batch_size, stream):
tokvecs = [d.tensor for d in docs]
tag_ids = self.predict(tokvecs)
self.set_annotations(docs, tag_ids)
for state in states:
state['tag_ids'] = tag_ids
yield from zip(docs, states)
yield from docs
def predict(self, tokvecs):
scores = self.model(tokvecs)
scores = self.model.ops.flatten(scores)
guesses = scores.argmax(axis=1)
if not isinstance(guesses, numpy.ndarray):
guesses = guesses.get()
guesses = self.model.ops.unflatten(guesses,
[tv.shape[0] for tv in tokvecs])
return guesses
def set_annotations(self, docs, batch_tag_ids):
@ -142,46 +232,48 @@ class NeuralTagger(object):
docs = [docs]
cdef Doc doc
cdef int idx = 0
cdef int i, j, tag_id
cdef Vocab vocab = self.vocab
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[idx:idx+len(doc)]
doc_tag_ids = batch_tag_ids[i]
for j, tag_id in enumerate(doc_tag_ids):
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
# Don't clobber preset POS tags
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
idx += 1
doc.is_tagged = True
def update(self, docs, golds, state=None, drop=0., sgd=None):
state = {} if state is None else state
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
docs, tokvecs = docs_tokvecs
tokvecs = state['tokvecs']
bp_tokvecs = state['bp_tokvecs']
if self.model.nI is None:
self.model.nI = tokvecs.shape[1]
self.model.nI = tokvecs[0].shape[1]
tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
bp_tokvecs(d_tokvecs, sgd=sgd)
state['tag_scores'] = tag_scores
state['tag_loss'] = loss
return state
return d_tokvecs
def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores)
tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)}
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype='i')
guesses = scores.argmax(axis=1)
for gold in golds:
for tag in gold.tags:
correct[idx] = tag_index[tag]
if tag is None:
correct[idx] = guesses[idx]
else:
correct[idx] = tag_index[tag]
idx += 1
correct = self.model.ops.xp.array(correct, dtype='i')
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores /= d_scores.shape[0]
loss = (d_scores**2).sum()
d_scores = self.model.ops.asarray(d_scores, dtype='f')
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
def begin_training(self, gold_tuples, pipeline=None):
@ -195,22 +287,242 @@ class NeuralTagger(object):
new_tag_map[tag] = orig_tag_map[tag]
else:
new_tag_map[tag] = {POS: X}
if 'SP' not in new_tag_map:
new_tag_map['SP'] = orig_tag_map.get('SP', {POS: X})
cdef Vocab vocab = self.vocab
vocab.morphology = Morphology(vocab.strings, new_tag_map,
vocab.morphology.lemmatizer)
self.model = Softmax(self.vocab.morphology.n_tags)
print("Tagging", self.model.nO, "tags")
if new_tag_map:
vocab.morphology = Morphology(vocab.strings, new_tag_map,
vocab.morphology.lemmatizer,
exc=vocab.morphology.exc)
token_vector_width = pipeline[0].model.nO
if self.model is True:
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
@classmethod
def Model(cls, n_tags, token_vector_width):
return with_flatten(
chain(Maxout(token_vector_width, token_vector_width),
Softmax(n_tags, token_vector_width)))
def use_params(self, params):
with self.model.use_params(params):
yield
def to_bytes(self, **exclude):
serialize = OrderedDict((
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes()),
('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map,
use_bin_type=True,
encoding='utf8'))
))
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
def load_model(b):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width', 128)
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(b)
def load_tag_map(b):
tag_map = msgpack.loads(b, encoding='utf8')
self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer,
exc=self.vocab.morphology.exc)
deserialize = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('tag_map', load_tag_map),
('model', lambda b: load_model(b)),
))
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
serialize = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)),
('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
self.vocab.morphology.tag_map,
use_bin_type=True,
encoding='utf8'))),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
def load_model(p):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width', 128)
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(p.open('rb').read())
def load_tag_map(p):
with p.open('rb') as file_:
tag_map = msgpack.loads(file_.read(), encoding='utf8')
self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer,
exc=self.vocab.morphology.exc)
deserialize = OrderedDict((
('vocab', lambda p: self.vocab.from_disk(p)),
('tag_map', load_tag_map),
('model', load_model),
))
util.from_disk(path, deserialize, exclude)
return self
class NeuralLabeller(NeuralTagger):
    """Tagger-style component trained on dependency labels.

    Reuses the `NeuralTagger` machinery, but the classes are the dependency
    labels collected from the training data in `begin_training`, and
    `set_annotations` deliberately writes nothing back to the documents.
    """
    name = 'nn_labeller'

    def __init__(self, vocab, model=True):
        """Store the vocab and model; the label table starts out empty.

        vocab (Vocab): The shared vocabulary.
        model (Model or True): A model, or `True` to allocate one in
            `begin_training`.
        """
        self.vocab = vocab
        self.model = model
        # Maps each dependency label to its class index.
        self.labels = {}

    def set_annotations(self, docs, dep_ids):
        """Intentionally a no-op: predictions are not written to the docs."""
        pass

    def begin_training(self, gold_tuples, pipeline=None):
        """Collect the label set from the gold data and allocate the model.

        gold_tuples (iterable): Gold-standard training data.
        pipeline (list): The pipeline the model is part of. The first
            component's `model.nO` is used as the token vector width.
        """
        gold_tuples = nonproj.preprocess_training_data(gold_tuples)
        for _, annots_brackets in gold_tuples:
            for annots, _ in annots_brackets:
                _, _, _, _, deps, _ = annots
                for dep in deps:
                    # Class indices are assigned in first-seen order.
                    self.labels.setdefault(dep, len(self.labels))
        token_vector_width = pipeline[0].model.nO
        if self.model is True:
            self.model = self.Model(len(self.labels), token_vector_width)

    @classmethod
    def Model(cls, n_tags, token_vector_width):
        """Build the model: a Maxout layer followed by a Softmax layer,
        chained and wrapped in `with_flatten`.

        n_tags (int): Number of output classes.
        token_vector_width (int): Width of the input token vectors.
        RETURNS: The composed model.
        """
        hidden = Maxout(token_vector_width, token_vector_width)
        output = Softmax(n_tags, token_vector_width)
        return with_flatten(chain(hidden, output))

    def get_loss(self, docs, golds, scores):
        """Compute the loss and the gradient of the scores.

        docs (iterable): The batch of documents; their lengths are used to
            split the gradient back into per-document arrays.
        golds (iterable): Gold objects whose `labels` give the target label
            for each token.
        scores (object): Per-document score arrays from the model.
        RETURNS (tuple): `(loss, d_scores)`, with `loss` a float and
            `d_scores` unflattened to match `docs`.
        """
        flat = self.model.ops.flatten(scores)
        cdef int row = 0
        targets = numpy.zeros((flat.shape[0],), dtype='i')
        predicted = flat.argmax(axis=1)
        for gold in golds:
            for label in gold.labels:
                if label is not None and label in self.labels:
                    targets[row] = self.labels[label]
                else:
                    # Missing or unknown label: use the model's own guess
                    # as the target for this token.
                    targets[row] = predicted[row]
                row += 1
        targets = self.model.ops.xp.array(targets, dtype='i')
        one_hot = to_categorical(targets, nb_classes=flat.shape[1])
        d_scores = flat - one_hot
        d_scores /= d_scores.shape[0]
        loss = (d_scores**2).sum()
        doc_lengths = [len(d) for d in docs]
        d_scores = self.model.ops.unflatten(d_scores, doc_lengths)
        return float(loss), d_scores
class SimilarityHook(object):
    """Experimental pipeline component for supervised similarity.

    Installs a hook for supervised similarity into Doc objects. Requires a
    Tensorizer to pre-process documents (the hook reads `Doc.tensor`). The
    similarity model can be any object obeying the Thinc Model interface. By
    default, the model concatenates the elementwise mean and elementwise max
    of the two tensors, and compares them using the Cauchy-like similarity
    function from Chen (2013):

        similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())

    Where W is a vector of dimension weights, initialized to 1.
    """
    name = 'similarity'

    def __init__(self, vocab, model=True):
        """Store the vocab and model.

        vocab (Vocab): The shared vocabulary.
        model (Model or True): A model, or `True` to allocate one later.
        """
        self.vocab = vocab
        self.model = model

    @classmethod
    def Model(cls, length):
        """Build the default similarity model.

        length (int): Size passed to `CauchySimilarity`.
        RETURNS: A `Siamese` model over max/mean pooling.
        """
        return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))

    def __call__(self, doc):
        """Install the similarity hook on a document.

        doc (Doc): The document to modify.
        RETURNS (Doc): The same document.
        """
        doc.user_hooks['similarity'] = self.predict
        return doc

    def pipe(self, docs, **kwargs):
        """Install the hook on each document of a stream.

        docs (iterable): A stream of documents.
        **kwargs: Accepted for interface compatibility; unused.
        YIELDS (Doc): Documents, in order.
        """
        for doc in docs:
            yield self(doc)

    def predict(self, doc1, doc2):
        """Score one pair of documents from their tensors.

        doc1 (Doc): The first document.
        doc2 (Doc): The second document.
        RETURNS: Whatever `self.model.predict` returns for the single pair.
        """
        return self.model.predict([(doc1.tensor, doc2.tensor)])

    def update(self, doc1_tensor1_doc2_tensor2, golds, sgd=None, drop=0.):
        """Update the model on a batch of document pairs.

        doc1_tensor1_doc2_tensor2 (tuple): `(doc1s, tensor1s, doc2s,
            tensor2s)`; only the tensors are fed to the model.
        golds (iterable): Gold values handed to the backprop callback.
        sgd (callable): An optimizer.
        drop (float): The dropout rate.
        RETURNS (tuple): The `(d_tensor1s, d_tensor2s)` pair returned by the
            model's backprop callback.
        """
        doc1s, tensor1s, doc2s, tensor2s = doc1_tensor1_doc2_tensor2
        _, bp_sims = self.model.begin_update(zip(tensor1s, tensor2s),
                                             drop=drop)
        d_tensor1s, d_tensor2s = bp_sims(golds, sgd=sgd)
        return d_tensor1s, d_tensor2s

    def begin_training(self, _, pipeline=None):
        """Allocate model, using width from tensorizer in pipeline.

        _ : Unused; occupies the slot other components use for the
            gold-standard training data.
        pipeline (list): The pipeline the model is part of. The first
            component's `model.nO` is used as the model length.
        """
        if self.model is True:
            self.model = self.Model(pipeline[0].model.nO)

    def use_params(self, params):
        """Replace weights of models in the pipeline with those provided in the
        params dictionary.

        Written as a plain generator: it yields once inside the model's
        `use_params` context.

        params (dict): A dictionary of parameters keyed by model ID.
        """
        with self.model.use_params(params):
            yield

    def _ensure_model(self):
        """Allocate a model if only the `True` placeholder is set.

        No pipeline is available while deserializing, so the length comes
        from `util.env_opt('token_vector_width', 128)`, the same fallback
        `NeuralTagger` uses when loading. `Model` requires a length, so it
        cannot be called without arguments.
        """
        if self.model is True:
            length = util.env_opt('token_vector_width', 128)
            self.model = self.Model(length)

    def to_bytes(self, **exclude):
        """Serialize the model and vocab to a bytestring.

        **exclude: Named attributes to leave out.
        RETURNS (bytes): The serialized form.
        """
        serialize = OrderedDict((
            ('model', lambda: self.model.to_bytes()),
            ('vocab', lambda: self.vocab.to_bytes())
        ))
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, **exclude):
        """Load the model and vocab from a bytestring.

        bytes_data (bytes): The data to load from.
        **exclude: Named attributes to skip.
        RETURNS (SimilarityHook): The object itself.
        """
        self._ensure_model()
        deserialize = OrderedDict((
            ('model', lambda b: self.model.from_bytes(b)),
            ('vocab', lambda b: self.vocab.from_bytes(b))
        ))
        util.from_bytes(bytes_data, deserialize, exclude)
        return self

    def to_disk(self, path, **exclude):
        """Save the model and vocab to a directory.

        path: Location to write to.
        **exclude: Named attributes to leave out.
        """
        def save_model(p):
            # Close the handle deterministically instead of leaking it.
            with p.open('wb') as file_:
                file_.write(self.model.to_bytes())

        serialize = OrderedDict((
            ('model', save_model),
            ('vocab', lambda p: self.vocab.to_disk(p))
        ))
        util.to_disk(path, serialize, exclude)

    def from_disk(self, path, **exclude):
        """Load the model and vocab from a directory.

        path: Location to read from.
        **exclude: Named attributes to skip.
        RETURNS (SimilarityHook): The object itself.
        """
        self._ensure_model()

        def load_model(p):
            # Close the handle deterministically instead of leaking it.
            with p.open('rb') as file_:
                self.model.from_bytes(file_.read())

        deserialize = OrderedDict((
            ('model', load_model),
            ('vocab', lambda p: self.vocab.from_disk(p))
        ))
        util.from_disk(path, deserialize, exclude)
        return self
cdef class EntityRecognizer(LinearParser):
"""
Annotate named entities on Doc objects.
"""
"""Annotate named entities on Doc objects."""
TransitionSystem = BiluoPushDown
feature_templates = get_feature_templates('ner')
@ -222,9 +534,7 @@ cdef class EntityRecognizer(LinearParser):
cdef class BeamEntityRecognizer(BeamParser):
"""
Annotate named entities on Doc objects.
"""
"""Annotate named entities on Doc objects."""
TransitionSystem = BiluoPushDown
feature_templates = get_feature_templates('ner')
@ -249,32 +559,18 @@ cdef class NeuralDependencyParser(NeuralParser):
name = 'parser'
TransitionSystem = ArcEager
def __reduce__(self):
return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
cdef class NeuralEntityRecognizer(NeuralParser):
name = 'entity'
name = 'ner'
TransitionSystem = BiluoPushDown
nr_feature = 6
def get_token_ids(self, states):
cdef StateClass state
cdef int n_tokens = 6
ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c')
for i, state in enumerate(states):
ids[i, 0] = state.c.B(0)-1
ids[i, 1] = state.c.B(0)
ids[i, 2] = state.c.B(1)
ids[i, 3] = state.c.E(0)
ids[i, 4] = state.c.E(0)-1
ids[i, 5] = state.c.E(0)+1
for j in range(6):
if ids[i, j] >= state.c.length:
ids[i, j] = -1
if ids[i, j] != -1:
ids[i, j] += state.c.offset
return ids
def __reduce__(self):
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
cdef class BeamDependencyParser(BeamParser):

View File

@ -1,4 +1,5 @@
from libc.stdint cimport int64_t
from libcpp.vector cimport vector
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
@ -8,6 +9,9 @@ from .typedefs cimport attr_t, hash_t
cpdef hash_t hash_string(unicode string) except 0
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
cdef unicode decode_Utf8Str(const Utf8Str* string)
ctypedef union Utf8Str:
@ -17,13 +21,11 @@ ctypedef union Utf8Str:
cdef class StringStore:
cdef Pool mem
cdef Utf8Str* c
cdef int64_t size
cdef bint is_frozen
cdef vector[hash_t] keys
cdef public PreshMap _map
cdef public PreshMap _oov
cdef int64_t _resize_at
cdef const Utf8Str* intern_unicode(self, unicode py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)

View File

@ -7,11 +7,16 @@ from libc.string cimport memcpy
from libc.stdint cimport uint64_t, uint32_t
from murmurhash.mrmr cimport hash64, hash32
from preshed.maps cimport map_iter, key_t
from libc.stdint cimport uint32_t
import ujson
import dill
from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT
from .typedefs cimport hash_t
from libc.stdint cimport uint32_t
import ujson
from . import util
from .compat import json_dumps
cpdef hash_t hash_string(unicode string) except 0:
@ -27,7 +32,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
return hash32(utf8_string, length, 1)
cdef unicode _decode(const Utf8Str* string):
cdef unicode decode_Utf8Str(const Utf8Str* string):
cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1].decode('utf8')
@ -44,10 +49,10 @@ cdef unicode _decode(const Utf8Str* string):
return string.p[i:length + i].decode('utf8')
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
cdef int n_length_bytes
cdef int i
cdef Utf8Str string
cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
cdef uint32_t ulength = length
if length < sizeof(string.s):
string.s[0] = <unsigned char>length
@ -72,129 +77,166 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
cdef class StringStore:
"""
Map strings to and from integer IDs.
"""
"""Look up strings by 64-bit hashes."""
def __init__(self, strings=None, freeze=False):
"""
Create the StringStore.
"""Create the StringStore.
Arguments:
strings: A sequence of unicode strings to add to the store.
strings (iterable): A sequence of unicode strings to add to the store.
RETURNS (StringStore): The newly constructed object.
"""
self.mem = Pool()
self._map = PreshMap()
self._oov = PreshMap()
self._resize_at = 10000
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
self.size = 1
self.is_frozen = freeze
if strings is not None:
for string in strings:
_ = self[string]
property size:
def __get__(self):
return self.size -1
def __reduce__(self):
# TODO: OOV words, for the is_frozen stuff?
if self.is_frozen:
raise NotImplementedError(
"Currently missing support for pickling StringStore when "
"is_frozen=True")
return (StringStore, (list(self),))
def __len__(self):
"""
The number of strings in the store.
Returns:
int The number of strings in the store.
"""
return self.size-1
self.add(string)
def __getitem__(self, object string_or_id):
"""
Retrieve a string from a given integer ID, or vice versa.
"""Retrieve a string from a given hash, or vice versa.
Arguments:
string_or_id (bytes or unicode or int):
The value to encode.
Returns:
unicode or int: The value to retrieved.
string_or_id (bytes, unicode or uint64): The value to encode.
RETURNS (unicode or uint64): The value to be retrieved.
"""
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
return 0
elif string_or_id == 0:
return u''
elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id]
cdef bytes byte_string
cdef const Utf8Str* utf8str
cdef uint64_t int_id
cdef uint32_t oov_id
if isinstance(string_or_id, (int, long)):
int_id = string_or_id
oov_id = string_or_id
if int_id < <uint64_t>self.size:
return _decode(&self.c[int_id])
else:
utf8str = <Utf8Str*>self._oov.get(oov_id)
if utf8str is not NULL:
return _decode(utf8str)
else:
raise IndexError(string_or_id)
cdef hash_t key
if isinstance(string_or_id, unicode):
key = hash_string(string_or_id)
return key
elif isinstance(string_or_id, bytes):
key = hash_utf8(string_or_id, len(string_or_id))
return key
elif string_or_id < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[string_or_id]
else:
if isinstance(string_or_id, bytes):
byte_string = <bytes>string_or_id
elif isinstance(string_or_id, unicode):
byte_string = (<unicode>string_or_id).encode('utf8')
else:
raise TypeError(type(string_or_id))
utf8str = self._intern_utf8(byte_string, len(byte_string))
key = string_or_id
utf8str = <Utf8Str*>self._map.get(key)
if utf8str is NULL:
# TODO: We need to use 32 bit here, for compatibility with the
# vocabulary values. This makes birthday paradox probabilities
# pretty bad.
# We could also get unlucky here, and hash into a value that
# collides with the 'real' strings.
return hash32_utf8(byte_string, len(byte_string))
raise KeyError(string_or_id)
else:
return utf8str - self.c
return decode_Utf8Str(utf8str)
def __contains__(self, unicode string not None):
"""
Check whether a string is in the store.
def add(self, string):
"""Add a string to the StringStore.
Arguments:
string (unicode): The string to check.
Returns bool:
Whether the store contains the string.
string (unicode): The string to add.
RETURNS (uint64): The string's hash value.
"""
if len(string) == 0:
if isinstance(string, unicode):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_string(string)
self.intern_unicode(string)
elif isinstance(string, bytes):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_utf8(string, len(string))
self._intern_utf8(string, len(string))
else:
raise TypeError(
"Can only add unicode or bytes. Got type: %s" % type(string))
return key
def __len__(self):
    """Report how many strings the store currently holds.

    RETURNS (int): The size of the store's key list.
    """
    n_keys = self.keys.size()
    return n_keys
def __contains__(self, string not None):
"""Check whether a string is in the store.
string (unicode): The string to check.
RETURNS (bool): Whether the store contains the string.
"""
cdef hash_t key
if isinstance(string, int) or isinstance(string, long):
if string == 0:
return True
key = string
elif len(string) == 0:
return True
cdef hash_t key = hash_string(string)
return self._map.get(key) is not NULL
elif string in SYMBOLS_BY_STR:
return True
elif isinstance(string, unicode):
key = hash_string(string)
else:
string = string.encode('utf8')
key = hash_utf8(string, len(string))
if key < len(SYMBOLS_BY_INT):
return True
else:
return self._map.get(key) is not NULL
def __iter__(self):
"""
Iterate over the strings in the store, in order.
"""Iterate over the strings in the store, in order.
Yields: unicode A string in the store.
YIELDS (unicode): A string in the store.
"""
cdef int i
for i in range(self.size):
yield _decode(&self.c[i]) if i > 0 else u''
cdef hash_t key
for i in range(self.keys.size()):
key = self.keys[i]
utf8str = <Utf8Str*>self._map.get(key)
yield decode_Utf8Str(utf8str)
# TODO: Iterate OOV here?
def __reduce__(self):
strings = [""]
for i in range(1, self.size):
string = &self.c[i]
py_string = _decode(string)
strings.append(py_string)
strings = list(self)
return (StringStore, (strings,), None, None, None)
def to_disk(self, path):
    """Write every string in the store to `path` as JSON.

    path (unicode or Path): Where to write. May be a string or a
        `Path`-like object; it is normalised with `util.ensure_path` and
        then opened in text write mode.
    """
    target = util.ensure_path(path)
    contents = list(self)
    with target.open('w') as out_file:
        out_file.write(json_dumps(contents))
def from_disk(self, path):
    """Replace the store's contents with the JSON list read from `path`.

    The object is modified in place.

    path (unicode or Path): Where to read from. May be a string or a
        `Path`-like object; it is normalised with `util.ensure_path`.
    RETURNS (StringStore): This object, after reloading.
    """
    source = util.ensure_path(path)
    with source.open('r') as in_file:
        loaded = ujson.load(in_file)
    self._reset_and_load(loaded)
    return self
def to_bytes(self, **exclude):
    """Serialize the strings in the store as a JSON list.

    **exclude: Named attributes to prevent from being serialized. Not
        consulted by this method.
    RETURNS (bytes): Whatever `ujson.dumps` produces for the list of
        strings in the store.
    """
    contents = list(self)
    return ujson.dumps(contents)
def from_bytes(self, bytes_data, **exclude):
    """Replace the store's contents with a JSON-encoded list of strings.

    bytes_data (bytes): The JSON data to decode with `ujson.loads`.
    **exclude: Named attributes to prevent from being loaded. Not
        consulted by this method.
    RETURNS (StringStore): This object, after reloading.
    """
    decoded = ujson.loads(bytes_data)
    self._reset_and_load(decoded)
    return self
# Store `is_frozen` on the instance. Nothing else happens here; what the flag
# is meant to control is still open (see the TODO).
def set_frozen(self, bint is_frozen):
# TODO
self.is_frozen = is_frozen
@ -202,6 +244,15 @@ cdef class StringStore:
# Discard everything held in the `_oov` map by replacing it with a new, empty
# PreshMap.
def flush_oov(self):
self._oov = PreshMap()
# Throw away the current contents and repopulate the store from `strings`.
# Replaces the memory pool and both hash maps with fresh instances, clears the
# key list, passes every item of `strings` to `add`, and stores `freeze` in
# `is_frozen`.
def _reset_and_load(self, strings, freeze=False):
self.mem = Pool()
self._map = PreshMap()
self._oov = PreshMap()
self.keys.clear()
for string in strings:
self.add(string)
self.is_frozen = freeze
cdef const Utf8Str* intern_unicode(self, unicode py_string):
# 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode('utf8')
@ -223,73 +274,11 @@ cdef class StringStore:
key32 = hash32_utf8(utf8_string, length)
# Important: Make the OOV store own the memory. That way it's trivial
# to flush them all.
value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
value = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
self._oov.set(key32, value)
return NULL
if self.size == self._resize_at:
self._realloc()
self.c[self.size] = _allocate(self.mem, <unsigned char*>utf8_string, length)
self._map.set(key, <void*>&self.c[self.size])
self.size += 1
return &self.c[self.size-1]
def dump(self, file_):
"""
Save the strings to a JSON file.
Arguments:
file_ (buffer): The file to save the strings.
Returns:
None
"""
string_data = ujson.dumps(list(self))
if not isinstance(string_data, unicode):
string_data = string_data.decode('utf8')
# TODO: OOV?
file_.write(string_data)
def load(self, file_):
"""
Load the strings from a JSON file.
Arguments:
file_ (buffer): The file from which to load the strings.
Returns:
None
"""
strings = ujson.load(file_)
if strings == ['']:
return None
cdef unicode string
for string in strings:
# explicit None/len check instead of simple truth testing
# (bug in Cython <= 0.23.4)
if string is not None and len(string):
self.intern_unicode(string)
def _realloc(self):
# We want to map straight to pointers, but they'll be invalidated if
# we resize our array. So, first we remap to indices, then we resize,
# then we can acquire the new pointers.
cdef Pool tmp_mem = Pool()
keys = <key_t*>tmp_mem.alloc(self.size, sizeof(key_t))
cdef key_t key
cdef void* value
cdef const Utf8Str ptr
cdef int i = 0
cdef size_t offset
while map_iter(self._map.c_map, &i, &key, &value):
# Find array index with pointer arithmetic
offset = ((<Utf8Str*>value) - self.c)
keys[offset] = key
self._resize_at *= 2
cdef size_t new_size = self._resize_at * sizeof(Utf8Str)
self.c = <Utf8Str*>self.mem.realloc(self.c, new_size)
self._map = PreshMap(self.size)
for i in range(self.size):
if keys[i]:
self._map.set(keys[i], &self.c[i])
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
self._map.set(key, value)
self.keys.push_back(key)
return value

View File

@ -5,8 +5,6 @@ from .parts_of_speech cimport univ_pos_t
cdef struct LexemeC:
float* vector
flags_t flags
attr_t lang
@ -25,11 +23,10 @@ cdef struct LexemeC:
float prob
float sentiment
float l2_norm
cdef struct SerializedLexemeC:
unsigned char[4*13 + 8] data
unsigned char[8 + 8*10 + 4 + 4] data
# sizeof(flags_t) # flags
# + sizeof(attr_t) # lang
# + sizeof(attr_t) # id
@ -50,7 +47,7 @@ cdef struct Entity:
hash_t id
int start
int end
int label
attr_t label
cdef struct TokenC:
@ -58,12 +55,12 @@ cdef struct TokenC:
uint64_t morph
univ_pos_t pos
bint spacy
int tag
attr_t tag
int idx
int lemma
int sense
attr_t lemma
attr_t sense
int head
int dep
attr_t dep
bint sent_start
uint32_t l_kids
@ -72,5 +69,5 @@ cdef struct TokenC:
uint32_t r_edge
int ent_iob
int ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
hash_t ent_id

View File

@ -82,6 +82,7 @@ cpdef enum symbol_t:
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB

View File

@ -84,6 +84,7 @@ IDS = {
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY,
"PROB": PROB,

View File

@ -9,6 +9,7 @@ from ..structs cimport TokenC, Entity
from ..lexeme cimport Lexeme
from ..symbols cimport punct
from ..attrs cimport IS_SPACE
from ..typedefs cimport attr_t
cdef inline bint is_space_token(const TokenC* token) nogil:
@ -71,6 +72,45 @@ cdef cppclass StateC:
free(this._stack - PADDING)
free(this.shifted - PADDING)
# Fill `ids` with token indices describing the current state, for a feature
# template of `n` slots, then add `this.offset` to every non-negative entry.
# Two layouts are handled: n == 13 and n == 6. Any other `n` reaches the
# `TODO error` branch, which writes nothing.
# presumably B/S/L/R/E read the buffer, the stack, left/right children and the
# current entity, and return a negative value for "no token" -- confirm
# against the accessor definitions.
void set_context_tokens(int* ids, int n) nogil:
if n == 13:
ids[0] = this.B(0)
ids[1] = this.B(1)
ids[2] = this.S(0)
ids[3] = this.S(1)
ids[4] = this.S(2)
ids[5] = this.L(this.S(0), 1)
ids[6] = this.L(this.S(0), 2)
# NOTE(review): ids[6] is written again on the next line, so the value stored
# just above is overwritten: 14 lookups are squeezed into 13 slots. This looks
# unintended, but fixing it changes the feature layout -- confirm before
# touching it.
ids[6] = this.R(this.S(0), 1)
ids[7] = this.L(this.B(0), 1)
ids[8] = this.R(this.S(0), 2)
ids[9] = this.L(this.S(1), 1)
ids[10] = this.L(this.S(1), 2)
ids[11] = this.R(this.S(1), 1)
ids[12] = this.R(this.S(1), 2)
elif n == 6:
if this.B(0) >= 0:
ids[0] = this.B(0)
else:
ids[0] = -1
ids[1] = this.B(0)
ids[2] = this.B(1)
ids[3] = this.E(0)
# Slots 4 and 5 are the positions just before and just after ids[3], or -1
# when that position would fall outside [0, this.length).
if ids[3] >= 1:
ids[4] = this.E(0)-1
else:
ids[4] = -1
if (ids[3]+1) < this.length:
ids[5] = this.E(0)+1
else:
ids[5] = -1
else:
# TODO error =/
pass
# Negative entries are left untouched; all others are shifted by the offset.
for i in range(n):
if ids[i] >= 0:
ids[i] += this.offset
int S(int i) nogil const:
if i >= this._s_i:
return -1
@ -238,7 +278,7 @@ cdef cppclass StateC:
this._s_i -= 1
this.shifted[this.B(0)] = True
void add_arc(int head, int child, int label) nogil:
void add_arc(int head, int child, attr_t label) nogil:
if this.has_head(child):
this.del_arc(this.H(child), child)
@ -282,7 +322,7 @@ cdef cppclass StateC:
h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
h.l_kids -= 1
void open_ent(int label) nogil:
void open_ent(attr_t label) nogil:
this._ents[this._e_i].start = this.B(0)
this._ents[this._e_i].label = label
this._ents[this._e_i].end = -1
@ -294,7 +334,7 @@ cdef cppclass StateC:
this._ents[this._e_i-1].end = this.B(0)+1
this._sent[this.B(0)].ent_iob = 1
void set_ent_tag(int i, int ent_iob, int ent_type) nogil:
void set_ent_tag(int i, int ent_iob, attr_t ent_type) nogil:
if 0 <= i < this.length:
this._sent[i].ent_iob = ent_iob
this._sent[i].ent_type = ent_type
@ -305,16 +345,18 @@ cdef cppclass StateC:
this._break = this._b_i
void clone(const StateC* src) nogil:
this.length = src.length
memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
memcpy(this._stack, src._stack, this.length * sizeof(int))
memcpy(this._buffer, src._buffer, this.length * sizeof(int))
memcpy(this._ents, src._ents, this.length * sizeof(Entity))
memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
this.length = src.length
this._b_i = src._b_i
this._s_i = src._s_i
this._e_i = src._e_i
this._break = src._break
this.offset = src.offset
this._empty_token = src._empty_token
void fast_forward() nogil:
# space token attachment policy:

View File

@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from .stateclass cimport StateClass
from ..typedefs cimport attr_t
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParseC

View File

@ -9,10 +9,10 @@ import ctypes
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from cymem.cymem cimport Pool
from collections import OrderedDict
from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token
from .nonproj import PseudoProjectivity
from .nonproj import is_nonproj_tree
from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t
@ -60,7 +60,7 @@ cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) no
cost += 1
if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)):
cost += 1
cost += Break.is_valid(stcls.c, -1) and Break.move_cost(stcls, gold) == 0
cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0
return cost
@ -73,7 +73,7 @@ cdef weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nog
cost += gold.heads[target] == B_i
if gold.heads[B_i] == B_i or gold.heads[B_i] < target:
break
if Break.is_valid(stcls.c, -1) and Break.move_cost(stcls, gold) == 0:
if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0:
cost += 1
return cost
@ -84,14 +84,14 @@ cdef weight_t arc_cost(StateClass stcls, const GoldParseC* gold, int head, int c
elif stcls.H(child) == gold.heads[child]:
return 1
# Head in buffer
elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != -1:
elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != 0:
return 1
else:
return 0
cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
if gold.labels[child] == -1:
if not gold.has_dep[child]:
return True
elif gold.heads[child] == head:
return True
@ -99,10 +99,10 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
return False
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil:
if gold.labels[child] == -1:
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil:
if not gold.has_dep[child]:
return True
elif label == -1:
elif label == 0:
return True
elif gold.labels[child] == label:
return True
@ -111,21 +111,20 @@ cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label)
cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
return gold.labels[word] == -1 or gold.heads[word] == word
return gold.heads[word] == word or not gold.has_dep[word]
cdef class Shift:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.push()
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass st, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil:
return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
@staticmethod
@ -133,17 +132,17 @@ cdef class Shift:
return push_cost(s, gold, s.B(0))
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef class Reduce:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.stack_depth() >= 2
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
if st.has_head(st.S(0)):
st.pop()
else:
@ -151,7 +150,7 @@ cdef class Reduce:
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
@staticmethod
@ -165,28 +164,28 @@ cdef class Reduce:
cost -= 1
if gold.heads[S_i] == st.S(0):
cost -= 1
if Break.is_valid(st.c, -1) and Break.move_cost(st, gold) == 0:
if Break.is_valid(st.c, 0) and Break.move_cost(st, gold) == 0:
cost -= 1
return cost
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef class LeftArc:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.B(0), st.S(0), label)
st.pop()
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
@staticmethod
@ -204,23 +203,23 @@ cdef class LeftArc:
return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
cdef class RightArc:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.S(0), st.B(0), label)
st.push()
st.fast_forward()
@staticmethod
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
@staticmethod
@ -233,13 +232,13 @@ cdef class RightArc:
return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
@staticmethod
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
cdef class Break:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int i
if not USE_BREAK:
return False
@ -251,12 +250,12 @@ cdef class Break:
return True
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.set_break(st.B_(0).l_edge)
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
@staticmethod
@ -281,13 +280,13 @@ cdef class Break:
return cost + 1
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef int _get_root(int word, const GoldParseC* gold) nogil:
while gold.heads[word] != word and gold.labels[word] != -1 and word >= 0:
while gold.heads[word] != word and not gold.has_dep[word] and word >= 0:
word = gold.heads[word]
if gold.labels[word] == -1:
if not gold.has_dep[word]:
return -1
else:
return word
@ -295,9 +294,7 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil:
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
# Ensure sent_start is set to 0 throughout
for i in range(st.c.length):
st.c._sent[i].sent_start = False
st.c._sent[i].l_edge = i
st.c._sent[i].r_edge = i
st.fast_forward()
@ -313,21 +310,24 @@ cdef class ArcEager(TransitionSystem):
@classmethod
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
{
SHIFT: [''],
REDUCE: [''],
RIGHT: [],
LEFT: [],
BREAK: ['ROOT']})
OrderedDict((
(SHIFT, ['']),
(REDUCE, ['']),
(RIGHT, []),
(LEFT, []),
(BREAK, ['ROOT'])
)))
seen_actions = set()
for label in kwargs.get('left_labels', []):
if label.upper() != 'ROOT':
if (LEFT, label) not in seen_actions:
actions[LEFT].append(label)
seen_actions.add((LEFT, label))
for label in kwargs.get('right_labels', []):
if label.upper() != 'ROOT':
if (RIGHT, label) not in seen_actions:
actions[RIGHT].append(label)
seen_actions.add((RIGHT, label))
for raw_text, sents in kwargs.get('gold_parses', []):
for (ids, words, tags, heads, labels, iob), ctnts in sents:
@ -338,29 +338,39 @@ cdef class ArcEager(TransitionSystem):
if head < child:
if (RIGHT, label) not in seen_actions:
actions[RIGHT].append(label)
seen_actions.add((RIGHT, label))
elif head > child:
if (LEFT, label) not in seen_actions:
actions[LEFT].append(label)
seen_actions.add((LEFT, label))
return actions
property action_types:
def __get__(self):
return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)
cdef int preprocess_gold(self, GoldParse gold) except -1:
def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.heads)
if all([tag is None for tag in gold.heads[start:end]]):
return False
else:
return True
def preprocess_gold(self, GoldParse gold):
if not self.has_gold(gold):
return None
for i in range(gold.length):
if gold.heads[i] is None: # Missing values
gold.c.heads[i] = i
gold.c.labels[i] = -1
gold.c.has_dep[i] = False
else:
label = gold.labels[i]
gold.c.has_dep[i] = True
if label.upper() == 'ROOT':
label = 'ROOT'
gold.c.heads[i] = gold.heads[i]
gold.c.labels[i] = self.strings[label]
# Count frequencies, for use in encoder
self.freqs[HEAD][gold.c.heads[i] - i] += 1
self.freqs[DEP][gold.c.labels[i]] += 1
gold.c.labels[i] = self.strings.add(label)
return gold
cdef Transition lookup_transition(self, object name) except *:
if '-' in name:
@ -374,14 +384,14 @@ cdef class ArcEager(TransitionSystem):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
def move_name(self, int move, int label):
def move_name(self, int move, attr_t label):
label_str = self.strings[label]
if label_str:
return MOVE_NAMES[move] + '-' + label_str
else:
return MOVE_NAMES[move]
cdef Transition init_transition(self, int clas, int move, int label) except *:
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers
cdef Transition t
@ -414,9 +424,7 @@ cdef class ArcEager(TransitionSystem):
return t
cdef int initialize_state(self, StateC* st) nogil:
# Ensure sent_start is set to 0 throughout
for i in range(st.length):
st._sent[i].sent_start = False
st._sent[i].l_edge = i
st._sent[i].r_edge = i
st.fast_forward()
@ -432,18 +440,19 @@ cdef class ArcEager(TransitionSystem):
cdef int set_valid(self, int* output, const StateC* st) nogil:
cdef bint[N_MOVES] is_valid
is_valid[SHIFT] = Shift.is_valid(st, -1)
is_valid[REDUCE] = Reduce.is_valid(st, -1)
is_valid[LEFT] = LeftArc.is_valid(st, -1)
is_valid[RIGHT] = RightArc.is_valid(st, -1)
is_valid[BREAK] = Break.is_valid(st, -1)
is_valid[SHIFT] = Shift.is_valid(st, 0)
is_valid[REDUCE] = Reduce.is_valid(st, 0)
is_valid[LEFT] = LeftArc.is_valid(st, 0)
is_valid[RIGHT] = RightArc.is_valid(st, 0)
is_valid[BREAK] = Break.is_valid(st, 0)
cdef int i
for i in range(self.n_moves):
output[i] = is_valid[self.c[i].move]
cdef int set_costs(self, int* is_valid, weight_t* costs,
StateClass stcls, GoldParse gold) except -1:
cdef int i, move, label
cdef int i, move
cdef attr_t label
cdef label_cost_func_t[N_MOVES] label_cost_funcs
cdef move_cost_func_t[N_MOVES] move_cost_funcs
cdef weight_t[N_MOVES] move_costs
@ -461,7 +470,7 @@ cdef class ArcEager(TransitionSystem):
label_cost_funcs[RIGHT] = RightArc.label_cost
label_cost_funcs[BREAK] = Break.label_cost
cdef int* labels = gold.c.labels
cdef attr_t* labels = gold.c.labels
cdef int* heads = gold.c.heads
n_gold = 0

View File

@ -1,7 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
from ..parts_of_speech cimport NOUN, PROPN, PRON
from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX
def english_noun_chunks(obj):
@ -12,9 +12,9 @@ def english_noun_chunks(obj):
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
'attr', 'ROOT']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings['conj']
np_label = doc.vocab.strings['NP']
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
@ -48,9 +48,9 @@ def english_noun_chunks(obj):
def german_noun_chunks(obj):
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
doc = obj.doc # Ensure works on both Doc and Span.
np_label = doc.vocab.strings['NP']
np_deps = set(doc.vocab.strings[label] for label in labels)
close_app = doc.vocab.strings['nk']
np_label = doc.vocab.strings.add('NP')
np_deps = set(doc.vocab.strings.add(label) for label in labels)
close_app = doc.vocab.strings.add('nk')
rbracket = 0
for i, word in enumerate(obj):
@ -66,4 +66,79 @@ def german_noun_chunks(obj):
yield word.left_edge.i, rbracket, np_label
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
def es_noun_chunks(obj):
    """Detect base noun phrases in a Spanish dependency parse.

    The scan always covers the whole underlying document (`obj.doc`), so
    passing a `Span` does not restrict the result to that span.

    obj (Doc or Span): The parsed object to chunk.
    YIELDS (tuple): `(start, end, label)` triples, where `start` and `end`
        are token indices into the document (`end` is exclusive) and
        `label` is the string-store ID of 'NP'.
    """
    doc = obj.doc
    strings = doc.vocab.strings
    # Use add() rather than a plain lookup: both give the same ID, but add()
    # also interns the text, so the yielded label can be decoded again later.
    # This matches the English and German chunkers.
    np_label = strings.add('NP')
    left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
    right_labels = ['flat', 'fixed', 'compound', 'neg']
    stop_labels = ['punct']
    np_left_deps = [strings.add(label) for label in left_labels]
    np_right_deps = [strings.add(label) for label in right_labels]
    stop_deps = [strings.add(label) for label in stop_labels]

    def blocks_extension(token):
        # A verb, auxiliary or stop dependency inside the candidate span
        # means the material to the right is not part of this phrase.
        return token.pos in (VERB, AUX) or token.dep in stop_deps

    def noun_bounds(root):
        # Find the leftmost and rightmost tokens of the phrase headed by
        # `root`.
        left_bound = root
        # Walk the left children from nearest to farthest; the last match
        # (the farthest one) becomes the left edge.
        for token in reversed(list(root.lefts)):
            if token.dep in np_left_deps:
                left_bound = token
        right_bound = root
        for token in root.rights:
            if token.dep in np_right_deps:
                _, right = noun_bounds(token)
                if any(blocks_extension(t)
                       for t in doc[left_bound.i: right.i]):
                    break
                right_bound = right
        return left_bound, right_bound

    # Walk by index instead of chaining token.nbor() calls. This handles an
    # empty document, needs no exception to detect the end, and does not
    # depend on the truthiness of a Token.
    i = 0
    n_tokens = len(doc)
    while i < n_tokens:
        token = doc[i]
        if token.pos in (PROPN, NOUN, PRON):
            left, right = noun_bounds(token)
            yield left.i, right.i+1, np_label
            # Resume after the phrase just emitted.
            i = right.i
        i += 1
def french_noun_chunks(obj):
    """Detect base noun phrases in a French dependency parse.

    A nominal token (NOUN, PROPN or PRON) produces a chunk spanning its
    subtree when its dependency label is one of the noun-phrase labels, or
    when it is a `conj` whose chain of left-pointing `conj` heads ends in
    such a label. Chunks never overlap: a candidate is skipped if any token
    of its subtree is already part of an emitted chunk.

    obj (Doc or Span): The parsed object to chunk.
    YIELDS (tuple): `(start, end, label)` triples, where `start` and `end`
        are token indices into the document (`end` is exclusive) and
        `label` is the string-store ID of 'NP'.
    """
    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
    doc = obj.doc # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    # Intern every label with add(), as is already done for 'conj' and 'NP'
    # below and in the English chunker.
    np_deps = set(strings.add(label) for label in labels)
    conj = strings.add('conj')
    np_label = strings.add('NP')
    seen = set()
    for word in obj:
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.i in seen:
            continue
        if word.dep in np_deps:
            is_np = True
        elif word.dep == conj:
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            is_np = head.dep in np_deps
        else:
            is_np = False
        if not is_np:
            continue
        if any(w.i in seen for w in word.subtree):
            continue
        start = word.left_edge.i
        end = word.right_edge.i+1
        seen.update(range(start, end))
        yield start, end, np_label
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
'es': es_noun_chunks, 'fr': french_noun_chunks}

View File

@ -1,6 +1,7 @@
from .transition_system cimport TransitionSystem
from .transition_system cimport Transition
from ..gold cimport GoldParseC
from ..typedefs cimport attr_t
cdef class BiluoPushDown(TransitionSystem):

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
from thinc.typedefs cimport weight_t
from collections import OrderedDict
from .stateclass cimport StateClass
from ._state cimport StateC
@ -51,17 +52,29 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil:
cdef class BiluoPushDown(TransitionSystem):
def __init__(self, *args, **kwargs):
TransitionSystem.__init__(self, *args, **kwargs)
# Pickle support: describe this object as a call to
# BiluoPushDown(self.strings, labels_by_action).
# `labels_by_action` maps each move ID to the list of label strings of the
# transitions using that move, taken from the first `n_moves` entries of
# `self.c`; OrderedDict keeps the moves in first-seen order.
def __reduce__(self):
labels_by_action = OrderedDict()
# NOTE(review): `t` is declared but never used; the loop binds `trans`.
cdef Transition t
for trans in self.c[:self.n_moves]:
label_str = self.strings[trans.label]
labels_by_action.setdefault(trans.move, []).append(label_str)
return (BiluoPushDown, (self.strings, labels_by_action),
None, None)
@classmethod
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
{
MISSING: [''],
BEGIN: [],
IN: [],
LAST: [],
UNIT: [],
OUT: ['']
})
OrderedDict((
(MISSING, ['']),
(BEGIN, []),
(IN, []),
(LAST, []),
(UNIT, []),
(OUT, [''])
)))
seen_entities = set()
for entity_type in kwargs.get('entity_types', []):
if entity_type in seen_entities:
@ -87,32 +100,30 @@ cdef class BiluoPushDown(TransitionSystem):
def __get__(self):
return (BEGIN, IN, LAST, UNIT, OUT)
def move_name(self, int move, int label):
def move_name(self, int move, attr_t label):
if move == OUT:
return 'O'
elif move == 'MISSING':
elif move == MISSING:
return 'M'
else:
return MOVE_NAMES[move] + '-' + self.strings[label]
cdef int preprocess_gold(self, GoldParse gold) except -1:
def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.ner)
if all([tag == '-' for tag in gold.ner[start:end]]):
return False
else:
return True
def preprocess_gold(self, GoldParse gold):
if not self.has_gold(gold):
return None
for i in range(gold.length):
gold.c.ner[i] = self.lookup_transition(gold.ner[i])
# Count frequencies, for use in encoder
if gold.c.ner[i].move in (BEGIN, UNIT):
self.freqs[ENT_IOB][3] += 1
self.freqs[ENT_TYPE][gold.c.ner[i].label] += 1
elif gold.c.ner[i].move in (IN, LAST):
self.freqs[ENT_IOB][2] += 1
self.freqs[ENT_TYPE][0] += 1
elif gold.c.ner[i].move == OUT:
self.freqs[ENT_IOB][1] += 1
self.freqs[ENT_TYPE][0] += 1
else:
self.freqs[ENT_IOB][1] += 1
self.freqs[ENT_TYPE][0] += 1
return gold
cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name == None:
move_str = 'M'
label = 0
@ -122,7 +133,7 @@ cdef class BiluoPushDown(TransitionSystem):
if label_str.startswith('!'):
label_str = label_str[1:]
move_str = 'x'
label = self.strings[label_str]
label = self.strings.add(label_str)
else:
move_str = name
label = 0
@ -135,7 +146,7 @@ cdef class BiluoPushDown(TransitionSystem):
else:
raise KeyError(name)
cdef Transition init_transition(self, int clas, int move, int label) except *:
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers
cdef Transition t
@ -184,21 +195,21 @@ cdef class BiluoPushDown(TransitionSystem):
cdef class Missing:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return False
@staticmethod
cdef int transition(StateC* s, int label) nogil:
cdef int transition(StateC* s, attr_t label) nogil:
pass
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 9000
cdef class Begin:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
# Ensure we don't clobber preset entities. If no entity preset,
# ent_iob is 0
cdef int preset_ent_iob = st.B_(0).ent_iob
@ -222,16 +233,16 @@ cdef class Begin:
return label != 0 and not st.entity_is_open()
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label)
st.set_ent_tag(st.B(0), 3, label)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING:
return 0
@ -251,7 +262,7 @@ cdef class Begin:
cdef class In:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2:
return False
@ -267,17 +278,17 @@ cdef class In:
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.set_ent_tag(st.B(0), 1, label)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
move = IN
cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
if g_act == MISSING:
@ -303,24 +314,24 @@ cdef class In:
cdef class Last:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.B_(1).ent_iob == 1:
return False
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.close_ent()
st.set_ent_tag(st.B(0), 1, label)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
move = LAST
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING:
return 0
@ -345,7 +356,7 @@ cdef class Last:
cdef class Unit:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2:
return False
@ -358,7 +369,7 @@ cdef class Unit:
return label != 0 and not st.entity_is_open()
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label)
st.close_ent()
st.set_ent_tag(st.B(0), 3, label)
@ -366,9 +377,9 @@ cdef class Unit:
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING:
return 0
@ -388,7 +399,7 @@ cdef class Unit:
cdef class Out:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 3:
return False
@ -397,15 +408,15 @@ cdef class Out:
return not st.entity_is_open()
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.set_ent_tag(st.B(0), 2, 0)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING or g_act == ISNT:
return 0

View File

@ -14,4 +14,8 @@ cdef class Parser:
cdef readonly TransitionSystem moves
cdef readonly object cfg
cdef void _parse_step(self, StateC* state,
const float* feat_weights,
int nr_class, int nr_feat, int nr_piece) nogil
#cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil

View File

@ -5,7 +5,7 @@
# coding: utf-8
from __future__ import unicode_literals, print_function
from collections import Counter
from collections import Counter, OrderedDict
import ujson
import contextlib
@ -18,6 +18,7 @@ import dill
import numpy.random
cimport numpy as np
from libcpp.vector cimport vector
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals
from libc.stdint cimport uint32_t, uint64_t
@ -28,26 +29,29 @@ from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
from thinc.extra.eg cimport Example
from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
from preshed.maps cimport MapStruct
from preshed.maps cimport map_get
from thinc.api import layerize, chain
from thinc.api import layerize, chain, noop, clone
from thinc.neural import Model, Affine, ELU, ReLu, Maxout
from thinc.neural.ops import NumpyOps
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
from .. import util
from ..util import get_async, get_cuda_stream
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
from .._ml import Tok2Vec, doc2feats
from .._ml import Tok2Vec, doc2feats, rebatch
from ..compat import json_dumps
from . import _parse_features
from ._parse_features cimport CONTEXT_SIZE
from ._parse_features cimport fill_context
from .stateclass cimport StateClass
from ._state cimport StateC
from .nonproj import PseudoProjectivity
from . import nonproj
from .transition_system import OracleError
from .transition_system cimport TransitionSystem, Transition
from ..structs cimport TokenC
@ -104,68 +108,77 @@ cdef class precompute_hiddens:
cached = gpu_cached
self.nF = cached.shape[1]
self.nO = cached.shape[2]
self.nP = cached.shape[3]
self.nP = getattr(lower_model, 'nP', 1)
self.ops = lower_model.ops
self._features = numpy.zeros((batch_size, self.nO, self.nP), dtype='f')
self._features = numpy.zeros((batch_size, self.nO*self.nP), dtype='f')
self._is_synchronized = False
self._cuda_stream = cuda_stream
self._cached = cached
self._bp_hiddens = bp_features
cdef const float* get_feat_weights(self) except NULL:
if not self._is_synchronized \
and self._cuda_stream is not None:
self._cuda_stream.synchronize()
self._is_synchronized = True
return <float*>self._cached.data
def __call__(self, X):
return self.begin_update(X)[0]
def begin_update(self, token_ids, drop=0.):
self._features.fill(0)
if not self._is_synchronized \
and self._cuda_stream is not None:
self._cuda_stream.synchronize()
self._is_synchronized = True
# This is tricky, but (assuming GPU available);
# - Input to forward on CPU
# - Output from forward on CPU
# - Input to backward on GPU!
# - Output from backward on GPU
cdef np.ndarray state_vector = self._features[:len(token_ids)]
cdef np.ndarray hiddens = self._cached
bp_hiddens = self._bp_hiddens
feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids
self._sum_features(<float*>state_vector.data,
<float*>hiddens.data, &ids[0,0],
sum_state_features(<float*>state_vector.data,
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
output, bp_output = self._apply_nonlinearity(state_vector)
def backward(d_output, sgd=None):
def backward(d_state_vector, sgd=None):
if bp_nonlinearity is not None:
d_state_vector = bp_nonlinearity(d_state_vector, sgd)
# This will usually be on GPU
if isinstance(d_output, numpy.ndarray):
d_output = self.ops.xp.array(d_output)
d_state_vector = bp_output(d_output, sgd)
if isinstance(d_state_vector, numpy.ndarray):
d_state_vector = self.ops.xp.array(d_state_vector)
d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
return d_tokens
return output, backward
return state_vector, backward
def _apply_nonlinearity(self, X):
if self.nP < 2:
return X.reshape(X.shape[:2]), lambda dX, sgd=None: dX.reshape(X.shape)
best, which = self.ops.maxout(X)
return best, lambda dX, sgd=None: self.ops.backprop_maxout(dX, which, self.nP)
def _nonlinearity(self, state_vector):
if self.nP == 1:
return state_vector, None
state_vector = state_vector.reshape(
(state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
best, which = self.ops.maxout(state_vector)
def backprop(d_best, sgd=None):
return self.ops.backprop_maxout(d_best, which, self.nP)
return best, backprop
cdef void _sum_features(self, float* output,
const float* cached, const int* token_ids, int B, int F, int O) nogil:
cdef int idx, b, f, i
cdef const float* feature
for b in range(B):
for f in range(F):
if token_ids[f] < 0:
continue
idx = token_ids[f] * F * O + f*O
feature = &cached[idx]
for i in range(O):
output[i] += feature[i]
output += O
token_ids += F
cdef void sum_state_features(float* output,
        const float* cached, const int* token_ids, int B, int F, int O) nogil:
    # Accumulate cached feature rows into `output`, one output row per state.
    #
    # `token_ids` is read as B rows of F ids, `output` as B rows of O floats,
    # and `cached` is indexed as [token_id][feature_slot][O]. Values are added
    # with `+=`, so `output` keeps whatever it held on entry. A negative
    # token id marks an absent feature and contributes nothing.
    cdef int row, slot, k, tok
    cdef float* dst
    cdef const float* src
    for row in range(B):
        dst = &output[row * O]
        for slot in range(F):
            tok = token_ids[row * F + slot]
            if tok >= 0:
                src = &cached[tok * F * O + slot * O]
                for k in range(O):
                    dst[k] += src[k]
cdef void cpu_log_loss(float* d_scores,
@ -220,25 +233,43 @@ cdef class Parser:
Base class of the DependencyParser and EntityRecognizer.
"""
@classmethod
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg):
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg):
depth = util.env_opt('parser_hidden_depth', depth)
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
hidden_width = util.env_opt('hidden_width', hidden_width)
maxout_pieces = util.env_opt('parser_maxout_pieces', 1)
lower = PrecomputableMaxouts(hidden_width,
nF=cls.nr_feature,
nI=token_vector_width,
pieces=maxout_pieces)
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
if parser_maxout_pieces == 1:
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature,
nI=token_vector_width)
else:
lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature,
nP=parser_maxout_pieces,
nI=token_vector_width)
with Model.use_device('cpu'):
upper = chain(
Maxout(hidden_width),
zero_init(Affine(nr_class))
)
if depth == 0:
upper = chain()
upper.is_noop = True
else:
upper = chain(
clone(Maxout(hidden_width), (depth-1)),
zero_init(Affine(nr_class, drop_factor=0.0))
)
upper.is_noop = False
# TODO: This is an unfortunate hack atm!
# Used to set input dimensions in network.
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
upper.begin_training(upper.ops.allocate((500, hidden_width)))
return lower, upper
cfg = {
'nr_class': nr_class,
'depth': depth,
'token_vector_width': token_vector_width,
'hidden_width': hidden_width,
'maxout_pieces': parser_maxout_pieces
}
return (lower, upper), cfg
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
"""
@ -274,7 +305,7 @@ cdef class Parser:
def __reduce__(self):
return (Parser, (self.vocab, self.moves, self.model), None, None)
def __call__(self, Doc tokens, state=None):
def __call__(self, Doc doc):
"""
Apply the parser or entity recognizer, setting the annotations onto the Doc object.
@ -283,10 +314,11 @@ cdef class Parser:
Returns:
None
"""
self.parse_batch([tokens], state['tokvecs'])
return state
states = self.parse_batch([doc], [doc.tensor])
self.set_annotations([doc], states)
return doc
def pipe(self, stream, int batch_size=1000, int n_threads=2):
def pipe(self, docs, int batch_size=1000, int n_threads=2):
"""
Process a stream of documents.
@ -301,93 +333,217 @@ cdef class Parser:
cdef StateClass parse_state
cdef Doc doc
queue = []
for batch in cytoolz.partition_all(batch_size, stream):
batch = list(batch)
docs, states = zip(*batch)
parse_states = self.parse_batch(docs, states[0]['tokvecs'])
for docs in cytoolz.partition_all(batch_size, docs):
docs = list(docs)
tokvecs = [d.tensor for d in docs]
parse_states = self.parse_batch(docs, tokvecs)
self.set_annotations(docs, parse_states)
yield from zip(docs, states)
yield from docs
def parse_batch(self, docs, tokvecses):
cdef:
precompute_hiddens state2vec
StateClass state
Pool mem
const float* feat_weights
StateC* st
vector[StateC*] next_step, this_step
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
if isinstance(docs, Doc):
docs = [docs]
tokvecs = self.model[0].ops.flatten(tokvecses)
nr_state = len(docs)
nr_class = self.moves.n_moves
nr_dim = tokvecs.shape[1]
nr_feat = self.nr_feature
def parse_batch(self, docs, tokvecs):
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs,
cuda_stream, 0.0)
nr_piece = state2vec.nP
states = self.moves.init_batch(docs)
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs,
cuda_stream, 0.0)
for state in states:
if not state.c.is_final():
next_step.push_back(state.c)
todo = [st for st in states if not st.is_final()]
while todo:
token_ids = self.get_token_ids(states)
vectors = state2vec(token_ids)
scores = vec2scores(vectors)
self.transition_batch(states, scores)
todo = [st for st in states if not st.is_final()]
feat_weights = state2vec.get_feat_weights()
cdef int i
cdef np.ndarray token_ids = numpy.zeros((nr_state, nr_feat), dtype='i')
cdef np.ndarray is_valid = numpy.zeros((nr_state, nr_class), dtype='i')
cdef np.ndarray scores
c_token_ids = <int*>token_ids.data
c_is_valid = <int*>is_valid.data
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
while not next_step.empty():
if not has_hidden:
for i in cython.parallel.prange(
next_step.size(), num_threads=6, nogil=True):
self._parse_step(next_step[i],
feat_weights, nr_class, nr_feat, nr_piece)
else:
for i in range(next_step.size()):
st = next_step[i]
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
self.moves.set_valid(&c_is_valid[i*nr_class], st)
vectors = state2vec(token_ids[:next_step.size()])
scores = vec2scores(vectors)
c_scores = <float*>scores.data
for i in range(next_step.size()):
st = next_step[i]
guess = arg_max_if_valid(
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
action = self.moves.c[guess]
action.do(st, action.label)
this_step, next_step = next_step, this_step
next_step.clear()
for st in this_step:
if not st.is_final():
next_step.push_back(st)
return states
def update(self, docs, golds, state=None, drop=0., sgd=None):
assert state is not None
assert 'tokvecs' in state
assert 'bp_tokvecs' in state
cdef void _parse_step(self, StateC* state,
        const float* feat_weights,
        int nr_class, int nr_feat, int nr_piece) nogil:
    # Apply one greedy transition to `state` using only the precomputed
    # feature weights. This only works with no hidden layers -- fast but
    # inaccurate.
    #
    # NOTE(review): the calloc results are not checked for NULL -- confirm
    # whether an out-of-memory path needs handling here.
    cdef int* context_ids = <int*>calloc(nr_feat, sizeof(int))
    cdef float* class_scores = <float*>calloc(nr_class * nr_piece, sizeof(float))
    cdef int* valid_mask = <int*>calloc(nr_class, sizeof(int))
    cdef int guess

    # Score every (class, piece) cell from the state's context tokens.
    state.set_context_tokens(context_ids, nr_feat)
    sum_state_features(class_scores,
        feat_weights, context_ids, 1, nr_feat, nr_class * nr_piece)

    # Pick the best-scoring class among the currently valid moves and apply it.
    self.moves.set_valid(valid_mask, state)
    guess = arg_maxout_if_valid(class_scores, valid_mask, nr_class, nr_piece)
    action = self.moves.c[guess]
    action.do(state, action.label)

    free(valid_mask)
    free(class_scores)
    free(context_ids)
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvec_lists = docs_tokvecs
tokvecs = self.model[0].ops.flatten(tokvec_lists)
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs]
golds = [golds]
cuda_stream = get_cuda_stream()
for gold in golds:
self.moves.preprocess_gold(gold)
tokvecs = state['tokvecs']
bp_tokvecs = state['bp_tokvecs']
states = self.moves.init_batch(docs)
states, golds, max_steps = self._init_gold_batch(docs, golds)
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
drop)
todo = [(s, g) for s, g in zip(states, golds) if not s.is_final()]
0.0)
todo = [(s, g) for (s, g) in zip(states, golds)
if not s.is_final() and g is not None]
if not todo:
return None
backprops = []
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
cdef float loss = 0.
cutoff = max(1, len(todo) // 10)
while len(todo) >= cutoff:
n_steps = 0
while todo:
states, golds = zip(*todo)
token_ids = self.get_token_ids(states)
vector, bp_vector = state2vec.begin_update(token_ids, drop=drop)
vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
if drop != 0:
mask = vec2scores.ops.get_dropout_mask(vector.shape, drop)
vector *= mask
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
d_scores = self.get_batch_loss(states, golds, scores)
d_vector = bp_scores(d_scores, sgd=sgd)
loss += (d_scores**2).sum()
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
if drop != 0:
d_vector *= mask
if not isinstance(tokvecs, state2vec.ops.xp.ndarray):
backprops.append((token_ids, d_vector, bp_vector))
else:
if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to CPU, asynchronously
backprops.append((
get_async(cuda_stream, token_ids),
get_async(cuda_stream, d_vector),
bp_vector
))
else:
backprops.append((token_ids, d_vector, bp_vector))
self.transition_batch(states, scores)
todo = [st for st in todo if not st[0].is_final()]
if losses is not None:
losses[self.name] += (d_scores**2).sum()
n_steps += 1
if n_steps >= max_steps:
break
self._make_updates(d_tokvecs,
backprops, sgd, cuda_stream)
return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
def _init_gold_batch(self, whole_docs, whole_golds):
    """Split the docs into windowed training states.

    The window length is the length of the shortest doc, clamped to the
    range [5, 50]. A doc longer than the window yields several states: for
    each window start, the previous state is copied and advanced along the
    oracle action sequence until its buffer front reaches that start.
    A state is kept only if it is not final and `self.moves.has_gold`
    reports gold annotation for the window.

    Docs whose gold is `None` after `self.moves.preprocess_gold` are
    skipped.

    Returns:
        (states, golds, max_moves): the kept states, the gold object paired
        with each kept state, and an upper bound on steps, taken as the
        maximum over the oracle moves consumed to reach a kept state and the
        number of oracle actions left over per doc.
    """
    cdef:
        StateClass state
        Transition action
    whole_states = self.moves.init_batch(whole_docs)
    max_length = max(5, min(50, min([len(doc) for doc in whole_docs])))
    max_moves = 0
    states = []
    golds = []
    for doc, state, gold in zip(whole_docs, whole_states, whole_golds):
        gold = self.moves.preprocess_gold(gold)
        if gold is None:
            continue
        oracle_actions = self.moves.get_oracle_sequence(doc, gold)
        # Walk the oracle sequence with a cursor instead of pop(0), which
        # shifts the whole list on every call.
        next_action = 0
        start = 0
        while start < len(doc):
            # Each window continues from a copy of the previous window's state.
            state = state.copy()
            n_moves = 0
            while state.B(0) < start and not state.is_final():
                action = self.moves.c[oracle_actions[next_action]]
                next_action += 1
                action.do(state.c, action.label)
                n_moves += 1
            has_gold = self.moves.has_gold(gold, start=start,
                                           end=start+max_length)
            if not state.is_final() and has_gold:
                states.append(state)
                golds.append(gold)
                max_moves = max(max_moves, n_moves)
            start += min(max_length, len(doc)-start)
        # Oracle actions not yet consumed for this doc.
        max_moves = max(max_moves, len(oracle_actions) - next_action)
    return states, golds, max_moves
def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
# Tells CUDA to block, so our async copies complete.
if cuda_stream is not None:
cuda_stream.synchronize()
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
xp = state2vec.ops.xp # Handle for numpy/cupy
for token_ids, d_vector, bp_vector in backprops:
xp = get_array_module(d_tokvecs)
for ids, d_vector, bp_vector in backprops:
d_state_features = bp_vector(d_vector, sgd=sgd)
active_feats = token_ids * (token_ids >= 0)
active_feats = active_feats.reshape((token_ids.shape[0], token_ids.shape[1], 1))
active_feats = ids * (ids >= 0)
active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1))
if hasattr(xp, 'scatter_add'):
xp.scatter_add(d_tokvecs,
token_ids, d_state_features * active_feats)
ids, d_state_features * active_feats)
else:
xp.add.at(d_tokvecs,
token_ids, d_state_features * active_feats)
bp_tokvecs(d_tokvecs, sgd)
state['parser_loss'] = loss
return state
ids, d_state_features * active_feats)
@property
def move_names(self):
    """List the name of every move in `self.moves`, in index order.

    Each name comes from `self.moves.move_name`, called with the move and
    label of the transition at that index.
    """
    return [
        self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
        for i in range(self.moves.n_moves)
    ]
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
lower, upper = self.model
@ -400,9 +556,12 @@ cdef class Parser:
def get_token_ids(self, states):
cdef StateClass state
cdef int n_tokens = self.nr_feature
ids = numpy.zeros((len(states), n_tokens), dtype='i', order='C')
cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
dtype='i', order='C')
c_ids = <int*>ids.data
for i, state in enumerate(states):
state.set_context_tokens(ids[i])
state.c.set_context_tokens(c_ids, n_tokens)
c_ids += ids.shape[1]
return ids
def transition_batch(self, states, float[:, ::1] scores):
@ -445,7 +604,6 @@ cdef class Parser:
self.moves.finalize_doc(doc)
def add_label(self, label):
# Doesn't set label into serializer -- subclasses override it to do that.
for action in self.moves.action_types:
added = self.moves.add_action(action, label)
if added:
@ -456,12 +614,18 @@ cdef class Parser:
def begin_training(self, gold_tuples, **cfg):
if 'model' in cfg:
self.model = cfg['model']
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
actions = self.moves.get_actions(gold_parses=gold_tuples)
for action, labels in actions.items():
for label in labels:
self.moves.add_action(action, label)
if self.model is True:
self.model = self.Model(self.moves.n_moves, **cfg)
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
self.cfg.update(cfg)
def preprocess_gold(self, docs_golds):
    """Yield each (doc, gold) pair from `docs_golds` unchanged.

    Every item is unpacked into exactly two values and re-emitted as a
    2-tuple.
    """
    for pair in docs_golds:
        doc, gold = pair
        yield (doc, gold)
def use_params(self, params):
# Can't decorate cdef class :(. Workaround.
@ -469,21 +633,75 @@ cdef class Parser:
with self.model[1].use_params(params):
yield
def to_disk(self, path):
path = util.ensure_path(path)
with (path / 'model.bin').open('wb') as file_:
dill.dump(self.model, file_)
def to_disk(self, path, **exclude):
    """Write the parser's components under `path`.

    One writer is registered per component (`lower_model`, `upper_model`,
    `vocab`, `moves`, `cfg`) and handed to `util.to_disk` together with
    `exclude`; each writer receives the location it should write to.

    Args:
        path: Destination passed through to `util.to_disk`.
        **exclude: Component names to leave out, passed through to
            `util.to_disk`.
    """
    def write_bytes(p, data):
        # Close the handle deterministically instead of relying on
        # garbage collection.
        with p.open('wb') as file_:
            file_.write(data)

    def write_text(p, text):
        with p.open('w') as file_:
            file_.write(text)

    serializers = {
        'lower_model': lambda p: write_bytes(p, self.model[0].to_bytes()),
        'upper_model': lambda p: write_bytes(p, self.model[1].to_bytes()),
        'vocab': lambda p: self.vocab.to_disk(p),
        'moves': lambda p: self.moves.to_disk(p, strings=False),
        'cfg': lambda p: write_text(p, json_dumps(self.cfg))
    }
    util.to_disk(path, serializers, exclude)
def from_disk(self, path):
path = util.ensure_path(path)
with (path / 'model.bin').open('wb') as file_:
self.model = dill.load(file_)
def from_disk(self, path, **exclude):
    """Load the parser's components from `path` and return `self`.

    `vocab`, `moves` and `cfg` are read through `util.from_disk`. Unless
    `model` is excluded, the model weights are then read from the
    `lower_model` and `upper_model` files. When `self.model` is still the
    placeholder `True`, the model is first built from the loaded cfg.

    Args:
        path: Directory to read from.
        **exclude: Component names to skip.

    Returns:
        The parser itself.
    """
    def read_cfg(p):
        # Close the handle deterministically after parsing.
        with p.open() as file_:
            self.cfg.update(ujson.load(file_))

    deserializers = {
        'vocab': lambda p: self.vocab.from_disk(p),
        'moves': lambda p: self.moves.from_disk(p, strings=False),
        'cfg': read_cfg,
        'model': lambda p: None
    }
    util.from_disk(path, deserializers, exclude)
    if 'model' not in exclude:
        path = util.ensure_path(path)
        if self.model is True:
            # Build from the saved settings; fall back to the loaded move
            # count when the cfg carries no class count.
            model_cfg = dict(self.cfg)
            nr_class = model_cfg.pop('nr_class', self.moves.n_moves)
            self.model, cfg = self.Model(nr_class, **model_cfg)
        else:
            cfg = {}
        with (path / 'lower_model').open('rb') as file_:
            bytes_data = file_.read()
        self.model[0].from_bytes(bytes_data)
        with (path / 'upper_model').open('rb') as file_:
            bytes_data = file_.read()
        self.model[1].from_bytes(bytes_data)
        self.cfg.update(cfg)
    return self
def to_bytes(self):
pass
def to_bytes(self, **exclude):
    """Serialize the parser's components via `util.to_bytes`.

    Components are registered in a fixed order: `lower_model`,
    `upper_model`, `vocab`, `moves`, `cfg`. Excluding `model` is treated as
    excluding both model halves.

    Args:
        **exclude: Component names to leave out.

    Returns:
        Whatever `util.to_bytes` returns for the registered serializers.
    """
    serializers = OrderedDict()
    serializers['lower_model'] = lambda: self.model[0].to_bytes()
    serializers['upper_model'] = lambda: self.model[1].to_bytes()
    serializers['vocab'] = lambda: self.vocab.to_bytes()
    serializers['moves'] = lambda: self.moves.to_bytes(strings=False)
    serializers['cfg'] = lambda: ujson.dumps(self.cfg)
    if 'model' in exclude:
        # Translate the umbrella key into the two concrete component names.
        exclude['lower_model'] = True
        exclude['upper_model'] = True
        del exclude['model']
    return util.to_bytes(serializers, exclude)
def from_bytes(self, data):
pass
def from_bytes(self, bytes_data, **exclude):
    """Load the parser's components from `bytes_data` and return `self`.

    `vocab`, `moves` and `cfg` are restored while `util.from_bytes` runs.
    The model halves are loaded afterwards from the returned message, so
    that a placeholder model (`self.model is True`) can first be built from
    the restored cfg.

    Args:
        bytes_data: Serialized data handed to `util.from_bytes`.
        **exclude: Component names to skip; `model` skips both model halves.

    Returns:
        The parser itself.
    """
    deserializers = OrderedDict((
        ('vocab', lambda b: self.vocab.from_bytes(b)),
        ('moves', lambda b: self.moves.from_bytes(b, strings=False)),
        ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
        ('lower_model', lambda b: None),
        ('upper_model', lambda b: None)
    ))
    msg = util.from_bytes(bytes_data, deserializers, exclude)
    if 'model' not in exclude:
        if self.model is True:
            # Build the model from the restored cfg (as from_disk does) so
            # saved widths and depth are honoured; fall back to the loaded
            # move count when the cfg carries no class count.
            model_cfg = dict(self.cfg)
            nr_class = model_cfg.pop('nr_class', self.moves.n_moves)
            self.model, cfg = self.Model(nr_class, **model_cfg)
        else:
            cfg = {}
        if 'lower_model' in msg:
            self.model[0].from_bytes(msg['lower_model'])
        if 'upper_model' in msg:
            self.model[1].from_bytes(msg['upper_model'])
        self.cfg.update(cfg)
    return self
class ParserStateError(ValueError):
@ -521,6 +739,19 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
return best
cdef int arg_maxout_if_valid(const weight_t* scores, const int* is_valid,
                             int n, int nP) nogil:
    # Return the index of the valid class holding the highest score in any
    # of its nP pieces, or -1 when no class is valid. `scores` is read as
    # n rows of nP values; a class counts as valid when is_valid[i] >= 1.
    # Ties keep the earliest class.
    cdef int winner = -1
    cdef float top_score = 0
    cdef int clas, piece
    cdef const weight_t* row
    for clas in range(n):
        if is_valid[clas] < 1:
            continue
        row = &scores[clas * nP]
        for piece in range(nP):
            if winner == -1 or row[piece] > top_score:
                winner = clas
                top_score = row[piece]
    return winner
cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
int nr_class) except -1:
cdef weight_t score = 0

View File

@ -1,10 +1,17 @@
# coding: utf-8
"""
Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
for doing pseudo-projective parsing. The implementation uses the HEAD
decoration scheme.
"""
from __future__ import unicode_literals
from copy import copy
from ..tokens.doc cimport Doc
from ..attrs import DEP, HEAD
DELIMITER = '||'
def ancestors(tokenid, heads):
# returns all words going from the word up the path to the root
@ -60,145 +67,124 @@ def is_nonproj_tree(heads):
return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
class PseudoProjectivity:
# implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
# for doing pseudo-projective parsing
# implementation uses the HEAD decoration scheme
delimiter = '||'
@classmethod
def decompose(cls, label):
return label.partition(cls.delimiter)[::2]
@classmethod
def is_decorated(cls, label):
return label.find(cls.delimiter) != -1
@classmethod
def preprocess_training_data(cls, gold_tuples, label_freq_cutoff=30):
preprocessed = []
freqs = {}
for raw_text, sents in gold_tuples:
prepro_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
proj_heads,deco_labels = cls.projectivize(heads,labels)
# set the label to ROOT for each root dependent
deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
# count label frequencies
if label_freq_cutoff > 0:
for label in deco_labels:
if cls.is_decorated(label):
freqs[label] = freqs.get(label,0) + 1
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
preprocessed.append((raw_text, prepro_sents))
if label_freq_cutoff > 0:
return cls._filter_labels(preprocessed,label_freq_cutoff,freqs)
return preprocessed
def decompose(label):
    """Split `label` at the first DELIMITER.

    Returns a 2-tuple of the text before and after the delimiter; the
    second element is empty when the label contains no delimiter.
    """
    before, _, after = label.partition(DELIMITER)
    return (before, after)
@classmethod
def projectivize(cls, heads, labels):
# use the algorithm by Nivre & Nilsson 2005
# assumes heads to be a proper tree, i.e. connected and cycle-free
# returns a new pair (heads,labels) which encode
# a projective and decorated tree
proj_heads = copy(heads)
smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads)
if smallest_np_arc == None: # this sentence is already projective
return proj_heads, copy(labels)
while smallest_np_arc != None:
cls._lift(smallest_np_arc, proj_heads)
smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads)
deco_labels = cls._decorate(heads, proj_heads, labels)
return proj_heads, deco_labels
def is_decorated(label):
    """Return True when `label` contains DELIMITER."""
    return DELIMITER in label
@classmethod
def deprojectivize(cls, tokens):
# reattach arcs with decorated labels (following HEAD scheme)
# for each decorated arc X||Y, search top-down, left-to-right,
# breadth-first until hitting a Y then make this the new head
#parse = tokens.to_array([HEAD, DEP])
for token in tokens:
if cls.is_decorated(token.dep_):
newlabel,headlabel = cls.decompose(token.dep_)
newhead = cls._find_new_head(token,headlabel)
token.head = newhead
token.dep_ = newlabel
def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
    """Projectivize every sentence in `gold_tuples`.

    Each sentence's heads and labels are replaced by the output of
    `projectivize`, and any token that is its own head gets the label
    'ROOT'. When `label_freq_cutoff` is positive, decorated labels are
    counted and the result is passed through `_filter_labels` with that
    cutoff; otherwise the projectivized data is returned as is.
    """
    count_decorated = label_freq_cutoff > 0
    freqs = {}
    preprocessed = []
    for raw_text, sents in gold_tuples:
        prepro_sents = []
        for (ids, words, tags, heads, labels, iob), ctnts in sents:
            proj_heads, deco_labels = projectivize(heads, labels)
            # set the label to ROOT for each root dependent
            deco_labels = [
                'ROOT' if head == i else deco_labels[i]
                for i, head in enumerate(proj_heads)
            ]
            # count label frequencies
            if count_decorated:
                for label in filter(is_decorated, deco_labels):
                    freqs[label] = freqs.get(label, 0) + 1
            annotation = (ids, words, tags, proj_heads, deco_labels, iob)
            prepro_sents.append((annotation, ctnts))
        preprocessed.append((raw_text, prepro_sents))

    if not count_decorated:
        return preprocessed
    return _filter_labels(preprocessed, label_freq_cutoff, freqs)
@classmethod
def _decorate(cls, heads, proj_heads, labels):
# uses decoration scheme HEAD from Nivre & Nilsson 2005
assert(len(heads) == len(proj_heads) == len(labels))
deco_labels = []
for tokenid,head in enumerate(heads):
if head != proj_heads[tokenid]:
deco_labels.append('%s%s%s' % (labels[tokenid],cls.delimiter,labels[head]))
else:
deco_labels.append(labels[tokenid])
return deco_labels
def projectivize(heads, labels):
    """Return a projective, decorated version of a dependency tree.

    Uses the algorithm by Nivre & Nilsson 2005. Assumes `heads` is a proper
    tree, i.e. connected and cycle-free. The inputs are not modified: the
    lifting is done on a copy of `heads`.

    Returns:
        (proj_heads, deco_labels): the new heads and the labels that encode
        the lifted arcs. For a sentence that is already projective these
        are plain copies of the inputs.
    """
    proj_heads = copy(heads)
    smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
    if smallest_np_arc is None:  # this sentence is already projective
        return proj_heads, copy(labels)
    # Lift the smallest non-projective arc until none is left.
    while smallest_np_arc is not None:
        _lift(smallest_np_arc, proj_heads)
        smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
    deco_labels = _decorate(heads, proj_heads, labels)
    return proj_heads, deco_labels
@classmethod
def _get_smallest_nonproj_arc(cls, heads):
# return the smallest non-proj arc or None
# where size is defined as the distance between dep and head
# and ties are broken left to right
smallest_size = float('inf')
smallest_np_arc = None
for tokenid,head in enumerate(heads):
size = abs(tokenid-head)
if size < smallest_size and is_nonproj_arc(tokenid,heads):
smallest_size = size
smallest_np_arc = tokenid
return smallest_np_arc
def deprojectivize(tokens):
    """Reattach arcs with decorated labels (following the HEAD scheme).

    For each token whose label is decorated as X||Y, the new head is looked
    up with `_find_new_head` using Y, and the token's label is reset to X.
    Tokens are modified in place and `tokens` is returned.
    """
    for token in tokens:
        if not is_decorated(token.dep_):
            continue
        newlabel, headlabel = decompose(token.dep_)
        token.head = _find_new_head(token, headlabel)
        token.dep_ = newlabel
    return tokens
def _decorate(heads, proj_heads, labels):
    """Build labels that record lifted arcs (HEAD scheme, Nivre & Nilsson 2005).

    A token whose head differs between `heads` and `proj_heads` gets its
    own label joined by DELIMITER to the label of its original head; every
    other token keeps its label. Returns a new list.
    """
    assert(len(heads) == len(proj_heads) == len(labels))
    return [
        '%s%s%s' % (labels[tokenid], DELIMITER, labels[head])
        if head != proj_heads[tokenid]
        else labels[tokenid]
        for tokenid, head in enumerate(heads)
    ]
@classmethod
def _lift(cls, tokenid, heads):
# reattaches a word to it's grandfather
head = heads[tokenid]
ghead = heads[head]
# attach to ghead if head isn't attached to root else attach to root
heads[tokenid] = ghead if head != ghead else tokenid
def _get_smallest_nonproj_arc(heads):
    """Return the dependent of the smallest non-projective arc, or None.

    Size is the distance between dependent and head; ties are broken left
    to right because only a strictly smaller arc replaces the current best.
    """
    best_size = float('inf')
    best_arc = None
    for dependent, head in enumerate(heads):
        span = abs(dependent - head)
        # Only test arcs that could beat the current best.
        if not span < best_size:
            continue
        if is_nonproj_arc(dependent, heads):
            best_size = span
            best_arc = dependent
    return best_arc
@classmethod
def _find_new_head(cls, token, headlabel):
# search through the tree starting from the head of the given token
# returns the id of the first descendant with the given label
# if there is none, return the current head (no change)
queue = [token.head]
while queue:
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child.is_space: continue
if child == token: continue
if child.dep_ == headlabel:
return child
next_queue.append(child)
queue = next_queue
return token.head
def _lift(tokenid, heads):
# reattaches a word to it's grandfather
head = heads[tokenid]
ghead = heads[head]
# attach to ghead if head isn't attached to root else attach to root
heads[tokenid] = ghead if head != ghead else tokenid
@classmethod
def _filter_labels(cls, gold_tuples, cutoff, freqs):
# throw away infrequent decorated labels
# can't learn them reliably anyway and keeps label set smaller
filtered = []
for raw_text, sents in gold_tuples:
filtered_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
filtered_labels = [ cls.decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
filtered.append((raw_text, filtered_sents))
return filtered
def _find_new_head(token, headlabel):
# search through the tree starting from the head of the given token
# returns the id of the first descendant with the given label
# if there is none, return the current head (no change)
queue = [token.head]
while queue:
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child.is_space: continue
if child == token: continue
if child.dep_ == headlabel:
return child
next_queue.append(child)
queue = next_queue
return token.head
def _filter_labels(gold_tuples, cutoff, freqs):
    """Strip the decoration from labels that occur fewer than ``cutoff`` times.

    Infrequent decorated labels cannot be learned reliably, and removing
    them keeps the label set smaller.

    gold_tuples: iterable of ``(raw_text, sents)`` pairs; each sentence is
        ``((ids, words, tags, heads, labels, iob), ctnts)``.
    cutoff: frequency threshold below which a label is replaced by
        ``decompose(label)[0]``.
    freqs: mapping from label to frequency. A label with no entry defaults
        to ``cutoff`` and is kept as is.

    Returns a newly built list of the same shape as ``gold_tuples``.
    """
    def prune(labels):
        return [decompose(label)[0] if freqs.get(label, cutoff) < cutoff else label
                for label in labels]

    return [
        (raw_text,
         [((ids, words, tags, heads, prune(labels), iob), ctnts)
          for (ids, words, tags, heads, labels, iob), ctnts in sents])
        for raw_text, sents in gold_tuples
    ]

View File

@ -33,7 +33,6 @@ from ._parse_features cimport CONTEXT_SIZE
from ._parse_features cimport fill_context
from .stateclass cimport StateClass
from ._state cimport StateC
from .nonproj import PseudoProjectivity
from .transition_system import OracleError
from .transition_system cimport TransitionSystem, Transition
from ..structs cimport TokenC

View File

@ -4,6 +4,7 @@ from cymem.cymem cimport Pool
cimport cython
from ..structs cimport TokenC, Entity
from ..typedefs cimport attr_t
from ..vocab cimport EMPTY_LEXEME
from ._state cimport StateC
@ -105,19 +106,19 @@ cdef class StateClass:
cdef inline void unshift(self) nogil:
self.c.unshift()
cdef inline void add_arc(self, int head, int child, int label) nogil:
cdef inline void add_arc(self, int head, int child, attr_t label) nogil:
self.c.add_arc(head, child, label)
cdef inline void del_arc(self, int head, int child) nogil:
self.c.del_arc(head, child)
cdef inline void open_ent(self, int label) nogil:
cdef inline void open_ent(self, attr_t label) nogil:
self.c.open_ent(label)
cdef inline void close_ent(self) nogil:
self.c.close_ent()
cdef inline void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil:
cdef inline void set_ent_tag(self, int i, int ent_iob, attr_t ent_type) nogil:
self.c.set_ent_tag(i, ent_iob, ent_type)
cdef inline void set_break(self, int i) nogil:

View File

@ -41,6 +41,11 @@ cdef class StateClass:
def is_final(self):
return self.c.is_final()
def copy(self):
cdef StateClass new_state = StateClass.init(self.c._sent, self.c.length)
new_state.c.clone(self.c)
return new_state
def print_state(self, words):
words = list(words) + ['_']
top = words[self.S(0)] + '_%d' % self.S_(0).head

View File

@ -1,6 +1,7 @@
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from ..typedefs cimport attr_t
from ..structs cimport TokenC
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
@ -13,20 +14,22 @@ from ._state cimport StateC
cdef struct Transition:
int clas
int move
int label
attr_t label
weight_t score
bint (*is_valid)(const StateC* state, int label) nogil
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, int label) nogil
int (*do)(StateC* state, int label) nogil
bint (*is_valid)(const StateC* state, attr_t label) nogil
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil
int (*do)(StateC* state, attr_t label) nogil
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
# Pointer to a cost function taking the parser state, the gold-standard
# parse and a label of type attr_t. The last parameter previously read
# `attr_tlabel` (missing space), which does not declare an attr_t parameter
# the way the Transition.get_cost member and label_cost_func_t do.
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold,
attr_t label) nogil
ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC*
gold, attr_t label) nogil
ctypedef int (*do_func_t)(StateC* state, int label) nogil
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
@ -36,18 +39,16 @@ cdef class TransitionSystem:
cdef Transition* c
cdef readonly int n_moves
cdef int _size
cdef public int root_label
cdef public attr_t root_label
cdef public freqs
cdef init_state_t init_beam_state
cdef int initialize_state(self, StateC* state) nogil
cdef int finalize_state(self, StateC* state) nogil
cdef int preprocess_gold(self, GoldParse gold) except -1
cdef Transition lookup_transition(self, object name) except *
cdef Transition init_transition(self, int clas, int move, int label) except *
cdef Transition init_transition(self, int clas, int move, attr_t label) except *
cdef int set_valid(self, int* output, const StateC* st) nogil

View File

@ -5,11 +5,14 @@ from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from collections import defaultdict
from collections import defaultdict, OrderedDict
import ujson
from .. import util
from ..structs cimport TokenC
from .stateclass cimport StateClass
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
from ..typedefs cimport attr_t
cdef weight_t MIN_SCORE = -90000
@ -26,7 +29,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef class TransitionSystem:
def __init__(self, StringStore string_table, dict labels_by_action, _freqs=None):
def __init__(self, StringStore string_table, labels_by_action):
self.mem = Pool()
self.strings = string_table
self.n_moves = 0
@ -34,28 +37,20 @@ cdef class TransitionSystem:
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
for action, label_strs in sorted(labels_by_action.items()):
for action, label_strs in labels_by_action.items():
for label_str in label_strs:
self.add_action(int(action), label_str)
self.root_label = self.strings['ROOT']
self.freqs = {} if _freqs is None else _freqs
for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
self.freqs[attr] = defaultdict(int)
self.freqs[attr][0] = 1
# Ensure we've seen heads. Need an official dependency length limit...
for i in range(10024):
self.freqs[HEAD][i] = 1
self.freqs[HEAD][-i] = 1
self.root_label = self.strings.add('ROOT')
self.init_beam_state = _init_state
def __reduce__(self):
labels_by_action = {}
labels_by_action = OrderedDict()
cdef Transition t
for trans in self.c[:self.n_moves]:
label_str = self.strings[trans.label]
labels_by_action.setdefault(trans.move, []).append(label_str)
return (self.__class__,
(self.strings, labels_by_action, self.freqs),
(self.strings, labels_by_action),
None, None)
def init_batch(self, docs):
@ -69,6 +64,29 @@ cdef class TransitionSystem:
offset += len(doc)
return states
# Build and return the list of transition indices ("history") obtained by
# starting from a fresh state over `doc` and repeatedly applying the first
# move that is valid and has cost <= 0 with respect to `gold`.
# NOTE(review): indentation is not preserved in this view. The `else:` below
# presumably belongs to the `for` loop (for/else), i.e. it runs only when no
# move qualified -- confirm against the source file.
def get_oracle_sequence(self, doc, GoldParse gold):
# Scratch buffers with one slot per move, allocated from a local Pool.
cdef Pool mem = Pool()
costs = <float*>mem.alloc(self.n_moves, sizeof(float))
is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
cdef StateClass state = StateClass(doc, offset=0)
self.initialize_state(state.c)
# Indices into self.c of the moves applied so far.
history = []
while not state.is_final():
# Refresh validity flags and costs for the current state.
self.set_costs(is_valid, costs, state, gold)
for i in range(self.n_moves):
if is_valid[i] and costs[i] <= 0:
action = self.c[i]
history.append(i)
# Apply the move to the state, then recompute costs for the new state.
action.do(state.c, action.label)
break
else:
# Dump the gold annotations and the moves taken so far before raising.
print(gold.words)
print(gold.ner)
print(history)
raise ValueError("Could not find gold move")
return history
cdef int initialize_state(self, StateC* state) nogil:
pass
@ -78,13 +96,13 @@ cdef class TransitionSystem:
def finalize_doc(self, doc):
pass
cdef int preprocess_gold(self, GoldParse gold) except -1:
def preprocess_gold(self, GoldParse gold):
raise NotImplementedError
cdef Transition lookup_transition(self, object name) except *:
raise NotImplementedError
cdef Transition init_transition(self, int clas, int move, int label) except *:
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
raise NotImplementedError
def is_valid(self, StateClass stcls, move_name):
@ -100,24 +118,76 @@ cdef class TransitionSystem:
StateClass stcls, GoldParse gold) except -1:
cdef int i
self.set_valid(is_valid, stcls.c)
cdef int n_gold = 0
for i in range(self.n_moves):
if is_valid[i]:
costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
n_gold += costs[i] <= 0
else:
costs[i] = 9000
if n_gold <= 0:
print(gold.words)
print(gold.ner)
print([gold.c.ner[i].clas for i in range(gold.length)])
print([gold.c.ner[i].move for i in range(gold.length)])
print([gold.c.ner[i].label for i in range(gold.length)])
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
raise ValueError(
"Could not find a gold-standard action to supervise "
"the entity recognizer\n"
"The transition system has %d actions." % (self.n_moves))
def add_action(self, int action, label):
if not isinstance(label, int):
label = self.strings[label]
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, int):
label_id = self.strings.add(label_name)
else:
label_id = label_name
# Check we're not creating a move we already have, so that this is
# idempotent
for trans in self.c[:self.n_moves]:
if trans.move == action and trans.label == label:
if trans.move == action and trans.label == label_id:
return 0
if self.n_moves >= self._size:
self._size *= 2
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label)
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
assert self.c[self.n_moves].label == label_id
self.n_moves += 1
return 1
def to_disk(self, path, **exclude):
    """Write the output of ``to_bytes`` to ``path``.

    path: object providing ``open('wb')`` as a context manager.
    **exclude: forwarded unchanged to ``to_bytes``.
    """
    with path.open('wb') as out_file:
        payload = self.to_bytes(**exclude)
        out_file.write(payload)
def from_disk(self, path, **exclude):
    """Read ``path`` and load its contents through ``from_bytes``.

    path: object providing ``open('rb')`` as a context manager.
    **exclude: forwarded unchanged to ``from_bytes``.

    Returns ``self``.
    """
    with path.open('rb') as in_file:
        raw = in_file.read()
    self.from_bytes(raw, **exclude)
    return self
def to_bytes(self, **exclude):
    """Serialize the transitions and the string store.

    Every transition in ``self.c[:self.n_moves]`` is recorded with its
    ``clas``, ``move``, label string and move name. The records are dumped
    with ``ujson`` under the key ``'transitions'``; the string store is
    serialized under ``'strings'``. Both are handed to ``util.to_bytes``
    together with ``exclude``.
    """
    records = []
    for trans in self.c[:self.n_moves]:
        record = {
            'clas': trans.clas,
            'move': trans.move,
            'label': self.strings[trans.label],
            'name': self.move_name(trans.move, trans.label)
        }
        records.append(record)

    def dump_transitions():
        return ujson.dumps(records)

    def dump_strings():
        return self.strings.to_bytes()

    serializers = {
        'transitions': dump_transitions,
        'strings': dump_strings
    }
    return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude):
    """Restore transitions and strings from ``bytes_data``.

    ``util.from_bytes`` is called with two deserializers: ``'transitions'``
    collects the decoded JSON records, and ``'strings'`` loads the string
    store. Each collected record is then registered through ``add_action``
    using its ``'move'`` and ``'label'`` fields.

    Returns ``self``.
    """
    transitions = []
    deserializers = {
        'transitions': lambda b: transitions.extend(ujson.loads(b)),
        'strings': lambda b: self.strings.from_bytes(b)
    }
    # The return value was previously bound to an unused local (`msg`).
    util.from_bytes(bytes_data, deserializers, exclude)
    for trans in transitions:
        self.add_action(trans['move'], trans['label'])
    return self

View File

@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import ujson
from collections import defaultdict
from cymem.cymem cimport Pool
@ -15,7 +14,6 @@ from .tokens.doc cimport Doc
from .attrs cimport TAG
from .gold cimport GoldParse
from .attrs cimport *
from . import util
cpdef enum:
@ -108,55 +106,15 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
cdef class Tagger:
"""
Annotate part-of-speech tags on Doc objects.
"""
@classmethod
def load(cls, path, vocab, require=False):
    """Load the statistical model from the supplied path.

    path (Path): The directory to load from.
    vocab (Vocab): The vocabulary. Must be shared by the documents to be
        processed.
    require (bool): Whether to raise an error if the files are not found.
    RETURNS (Tagger): The newly created object.

    Raises IOError when ``require`` is set and ``templates.json`` or
    ``model`` is missing under ``path``.
    """
    # TODO: Change this to expect config.json when we don't have to
    # support old data.
    path = util.ensure_path(path)
    templates_path = path / 'templates.json'
    model_path = path / 'model'

    if templates_path.exists():
        with templates_path.open('r', encoding='utf8') as file_:
            templates = ujson.load(file_)
    elif require:
        raise IOError(
            "Required file %s/templates.json not found when loading Tagger" % str(path))
    else:
        # No templates on disk: fall back to the class-level defaults.
        templates = cls.feature_templates

    self = cls(vocab, model=None, feature_templates=templates)

    if model_path.exists():
        self.model.load(str(model_path))
    elif require:
        raise IOError(
            "Required file %s/model not found when loading Tagger" % str(path))
    return self
"""Annotate part-of-speech tags on Doc objects."""
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
"""
Create a Tagger.
"""Create a Tagger.
Arguments:
vocab (Vocab):
The vocabulary object. Must be shared with documents to be processed.
model (thinc.linear.AveragedPerceptron):
The statistical model.
Returns (Tagger):
The newly constructed object.
vocab (Vocab): The vocabulary object. Must be shared with documents to
be processed.
model (thinc.linear.AveragedPerceptron): The statistical model.
RETURNS (Tagger): The newly constructed object.
"""
if model is None:
model = TaggerModel(cfg.get('features', self.feature_templates),
@ -186,13 +144,9 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length
def __call__(self, Doc tokens):
"""
Apply the tagger, setting the POS tags onto the Doc object.
"""Apply the tagger, setting the POS tags onto the Doc object.
Arguments:
doc (Doc): The tokens to be tagged.
Returns:
None
doc (Doc): The tokens to be tagged.
"""
if tokens.length == 0:
return 0
@ -215,34 +169,25 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length
def pipe(self, stream, batch_size=1000, n_threads=2):
"""
Tag a stream of documents.
"""Tag a stream of documents.
Arguments:
stream: The sequence of documents to tag.
batch_size (int):
The number of documents to accumulate into a working set.
n_threads (int):
The number of threads with which to work on the buffer in parallel,
if the Matcher implementation supports multi-threading.
Yields:
Doc Documents, in order.
stream: The sequence of documents to tag.
batch_size (int): The number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the Matcher implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
"""
for doc in stream:
self(doc)
yield doc
def update(self, Doc tokens, GoldParse gold, itn=0):
"""
Update the statistical model, with tags supplied for the given document.
"""Update the statistical model, with tags supplied for the given document.
Arguments:
doc (Doc):
The document to update on.
gold (GoldParse):
Manager for the gold-standard tags.
Returns (int):
Number of tags correct.
doc (Doc): The document to update on.
gold (GoldParse): Manager for the gold-standard tags.
RETURNS (int): Number of tags predicted correctly.
"""
gold_tag_strs = gold.tags
assert len(tokens) == len(gold_tag_strs)

View File

@ -13,21 +13,32 @@ Tests for spaCy modules and classes live in their own directories of the same na
2. [Dos and don'ts](#dos-and-donts)
3. [Parameters](#parameters)
4. [Fixtures](#fixtures)
5. [Helpers and utilities](#helpers-and-utilities)
6. [Contributing to the tests](#contributing-to-the-tests)
5. [Testing models](#testing-models)
6. [Helpers and utilities](#helpers-and-utilities)
7. [Contributing to the tests](#contributing-to-the-tests)
## Running the tests
To show print statements, run the tests with `py.test -s`. To abort after the
first failure, run them with `py.test -x`.
```bash
py.test spacy # run basic tests
py.test spacy --models # run basic and model tests
py.test spacy --slow # run basic and slow tests
py.test spacy --models --slow # run all tests
py.test spacy # run basic tests
py.test spacy --models --en # run basic and English model tests
py.test spacy --models --all # run basic and all model tests
py.test spacy --slow # run basic and slow tests
py.test spacy --models --all --slow # run all tests
```
To show print statements, run the tests with `py.test -s`. To abort after the first failure, run them with `py.test -x`.
You can also run tests in a specific file or directory, or even only one
specific test:
```bash
py.test spacy/tests/tokenizer # run all tests in directory
py.test spacy/tests/tokenizer/test_exceptions.py # run all tests in file
py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji # run specific test
```
## Dos and don'ts
@ -83,14 +94,9 @@ These are the main fixtures that are currently available:
| Fixture | Description |
| --- | --- |
| `tokenizer` | Creates **all available** language tokenizers and runs the test for **each of them**. |
| `en_tokenizer` | Creates an English `Tokenizer` object. |
| `de_tokenizer` | Creates a German `Tokenizer` object. |
| `hu_tokenizer` | Creates a Hungarian `Tokenizer` object. |
| `en_vocab` | Creates an English `Vocab` object. |
| `en_entityrecognizer` | Creates an English `EntityRecognizer` object. |
| `lemmatizer` | Creates a `Lemmatizer` object from the installed language data (`None` if no data is found).
| `EN` | Creates an instance of `English`. Only use for tests that require the models. |
| `DE` | Creates an instance of `German`. Only use for tests that require the models. |
| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. |
| `en_vocab`, `en_entityrecognizer`, ... | Creates an instance of the English `Vocab`, `EntityRecognizer` object etc. |
| `EN`, `DE`, ... | Creates a language class with a loaded model. For more info, see [Testing models](#testing-models). |
| `text_file` | Creates an instance of `StringIO` to simulate reading from and writing to files. |
| `text_file_b` | Creates an instance of `BytesIO` to simulate reading from and writing to files. |
@ -103,6 +109,48 @@ def test_module_do_something(en_tokenizer):
If all tests in a file require a specific configuration, or use the same complex example, it can be helpful to create a separate fixture. This fixture should be added at the top of each file. Make sure to use descriptive names for these fixtures and don't override any of the global fixtures listed above. **From looking at a test, it should immediately be clear which fixtures are used, and where they are coming from.**
## Testing models
Models should only be loaded and tested **if absolutely necessary** – for example, if you're specifically testing a model's performance, or if your test is related to model loading. If you only need an annotated `Doc`, you should use the `get_doc()` helper function to create it manually instead.
To specify which language models a test is related to, set the language ID as an argument of `@pytest.mark.models`. This allows you to later run the tests with `--models --en`. You can then use the `EN` [fixture](#fixtures) to get a language
class with a loaded model.
```python
@pytest.mark.models('en')
def test_english_model(EN):
doc = EN(u'This is a test')
```
> ⚠️ **Important note:** In order to test models, they need to be installed as a package. The [conftest.py](conftest.py) includes a list of all available models, mapped to their IDs, e.g. `en`. Unless otherwise specified, each model that's installed in your environment will be imported and tested. If you don't have a model installed, **the test will be skipped**.
Under the hood, `pytest.importorskip` is used to import a model package and skip the test if the package is not installed. The `EN` fixture for example gets all
available models for `en`, [parametrizes](#parameters) them to run the test for *each of them*, and uses `load_test_model()` to import the model and run the test, or skip it if the model is not installed.
### Testing specific models
Using the `load_test_model()` helper function, you can also write tests for specific models, or combinations of them:
```python
from .util import load_test_model
@pytest.mark.models('en')
def test_en_md_only():
nlp = load_test_model('en_core_web_md')
# test something specific to en_core_web_md
@pytest.mark.models('en', 'fr')
@pytest.mark.parametrize('model', ['en_core_web_md', 'fr_depvec_web_lg'])
def test_different_models(model):
nlp = load_test_model(model)
# test something specific to the parametrized models
```
### Known issues and future improvements
Using `importorskip` on a list of model packages is not ideal and we're looking to improve this in the future. But at the moment, it's the best way to ensure that tests are performed on specific model packages only, and that you'll always be able to run the tests, even if you don't have *all available models* installed. (If the tests made a call to `spacy.load('en')` instead, this would load whichever model you've created an `en` shortcut for. This may be one of spaCy's default models, but it could just as easily be your own custom English model.)
The current setup also doesn't provide an easy way to only run tests on specific model versions. The `minversion` keyword argument on `pytest.importorskip` can take care of this, but it currently only checks for the package's `__version__` attribute. An alternative solution would be to load a model package's meta.json and skip if the model's version does not match the one specified in the test.
## Helpers and utilities
@ -152,11 +200,11 @@ print([token.dep_ for token in doc])
**Note:** There's currently no way of setting the serializer data for the parser without loading the models. If this is relevant to your test, constructing the `Doc` via `get_doc()` won't work.
### Other utilities
| Name | Description |
| --- | --- |
| `load_test_model` | Load a model if it's installed as a package, otherwise skip test. |
| `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state. |
| `add_vecs_to_vocab(vocab, vectors)` | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. |
| `get_cosine(vec1, vec2)` | Get cosine for two given vectors. |

View File

@ -1,25 +1,50 @@
# coding: utf-8
from __future__ import unicode_literals
from ..tokens import Doc
from ..strings import StringStore
from ..lemmatizer import Lemmatizer
from ..attrs import ORTH, TAG, HEAD, DEP
from .. import util
from io import StringIO, BytesIO
from pathlib import Path
import pytest
from .util import load_test_model
from ..tokens import Doc
from ..strings import StringStore
from .. import util
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
'nl', 'pl', 'pt', 'sv']
'nl', 'pl', 'pt', 'sv', 'xx']
_models = {'en': ['en_depent_web_sm', 'en_core_web_md'],
'de': ['de_core_news_md'],
'fr': ['fr_depvec_web_lg'],
'xx': ['xx_ent_web_md']}
@pytest.fixture(params=_languages)
def tokenizer(request):
lang = util.get_lang_class(request.param)
return lang.Defaults.create_tokenizer()
# only used for tests that require loading the models
# in all other cases, use specific instances
@pytest.fixture(params=_models['en'])
def EN(request):
return load_test_model(request.param)
@pytest.fixture(params=_models['de'])
def DE(request):
return load_test_model(request.param)
@pytest.fixture(params=_models['fr'])
def FR(request):
return load_test_model(request.param)
#@pytest.fixture(params=_languages)
#def tokenizer(request):
#lang = util.get_lang_class(request.param)
#return lang.Defaults.create_tokenizer()
@pytest.fixture
def tokenizer():
return util.get_lang_class('xx').Defaults.create_tokenizer()
@pytest.fixture
@ -47,7 +72,7 @@ def de_tokenizer():
return util.get_lang_class('de').Defaults.create_tokenizer()
@pytest.fixture(scope='module')
@pytest.fixture
def fr_tokenizer():
return util.get_lang_class('fr').Defaults.create_tokenizer()
@ -91,11 +116,6 @@ def en_entityrecognizer():
return util.get_lang_class('en').Defaults.create_entity()
@pytest.fixture
def lemmatizer():
return util.get_lang_class('en').Defaults.create_lemmatizer()
@pytest.fixture
def text_file():
return StringIO()
@ -105,22 +125,6 @@ def text_file_b():
return BytesIO()
# only used for tests that require loading the models
# in all other cases, use specific instances
@pytest.fixture(scope="session")
def EN():
return English()
@pytest.fixture(scope="session")
def DE():
return German()
@pytest.fixture(scope="session")
def FR():
return French()
def pytest_addoption(parser):
parser.addoption("--models", action="store_true",
help="include tests that require full models")
@ -129,8 +133,18 @@ def pytest_addoption(parser):
parser.addoption("--slow", action="store_true",
help="include slow tests")
for lang in _languages + ['all']:
parser.addoption("--%s" % lang, action="store_true", help="Use %s models" % lang)
def pytest_runtest_setup(item):
    """Skip tests whose required command-line flags were not passed.

    A test carrying the ``models``, ``vectors`` or ``slow`` keyword is
    skipped unless the matching ``--<keyword>`` option is set. A test marked
    with ``models`` and one or more arguments (e.g. ``'en'``) is also
    skipped unless ``--<argument>`` or ``--all`` is set for every argument.
    """
    for opt in ['models', 'vectors', 'slow']:
        if opt in item.keywords and not item.config.getoption("--%s" % opt):
            pytest.skip("need --%s option to run" % opt)

    # `get_marker` was deprecated in pytest 3.6 and removed in 4.0 in favour
    # of `get_closest_marker`; use whichever the running pytest provides and
    # look the marker up only once.
    lookup = getattr(item, 'get_closest_marker', None) or item.get_marker
    models_marker = lookup('models')
    if models_marker is None:
        return

    # The marker's arguments are language IDs. Skip if neither the language
    # flag nor --all was given.
    for arg in models_marker.args:
        if not (item.config.getoption("--%s" % arg) or item.config.getoption("--all")):
            pytest.skip("need --%s or --all option to run" % arg)

View File

@ -102,7 +102,7 @@ def test_doc_api_getitem(en_tokenizer):
def test_doc_api_serialize(en_tokenizer, text):
tokens = en_tokenizer(text)
new_tokens = get_doc(tokens.vocab).from_bytes(tokens.to_bytes())
assert tokens.string == new_tokens.string
assert tokens.text == new_tokens.text
assert [t.text for t in tokens] == [t.text for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
@ -204,6 +204,7 @@ def test_doc_api_right_edge(en_tokenizer):
assert doc[6].right_edge.text == ','
@pytest.mark.xfail
@pytest.mark.parametrize('text,vectors', [
("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
])

View File

@ -68,6 +68,7 @@ def test_doc_token_api_is_properties(en_vocab):
assert doc[5].like_email
@pytest.mark.xfail
@pytest.mark.parametrize('text,vectors', [
("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"])
])
@ -99,8 +100,8 @@ def test_doc_token_api_ancestors(en_tokenizer):
assert [t.text for t in doc[1].ancestors] == ["saw"]
assert [t.text for t in doc[2].ancestors] == []
assert doc[2].is_ancestor_of(doc[7])
assert not doc[6].is_ancestor_of(doc[2])
assert doc[2].is_ancestor(doc[7])
assert not doc[6].is_ancestor(doc[2])
def test_doc_token_api_head_setter(en_tokenizer):
@ -155,3 +156,15 @@ def test_doc_token_api_head_setter(en_tokenizer):
assert doc[3].left_edge.i == 0
assert doc[4].left_edge.i == 0
assert doc[2].left_edge.i == 0
def test_sent_start(en_tokenizer):
doc = en_tokenizer(u'This is a sentence. This is another.')
assert not doc[0].sent_start
assert not doc[5].sent_start
doc[5].sent_start = True
assert doc[5].sent_start
assert not doc[0].sent_start
doc.is_parsed = True
assert len(list(doc.sents)) == 2

View File

@ -1,72 +0,0 @@
# coding: utf-8
import pytest
import numpy
@pytest.mark.models
class TestModelSanity:
    """
    This is to make sure the model works as expected. The tests make sure that
    values are properly set.
    Tests are not meant to evaluate the content of the output, only make sure
    the output is formally okay.
    """

    @pytest.fixture(scope='class', params=['en','de'])
    def example(self, request, EN, DE):
        """Return one processed example document per parametrized language."""
        # Identity comparison is the idiomatic (and safe) check against None.
        assert EN.entity is not None
        assert DE.entity is not None
        if request.param == 'en':
            doc = EN(u'There was a stranger standing at the big ' +
                     u'street talking to herself.')
        elif request.param == 'de':
            doc = DE(u'An der großen Straße stand eine merkwürdige ' +
                     u'Gestalt und führte Selbstgespräche.')
        return doc

    def test_tokenization(self, example):
        # tokenization should split the document into tokens
        assert len(example) > 1

    def test_tagging(self, example):
        # if tagging was done properly, pos tags shouldn't be empty
        assert example.is_tagged
        assert all(t.pos != 0 for t in example)
        assert all(t.tag != 0 for t in example)

    def test_parsing(self, example):
        # if parsing was done properly
        # - dependency labels shouldn't be empty
        # - the head of some tokens should not be root
        assert example.is_parsed
        assert all(t.dep != 0 for t in example)
        # Compare each token's head index with its own index. The previous
        # check compared the dependency label id (t.dep) with the index,
        # which says nothing about heads.
        assert any(t.head.i != i for i, t in enumerate(example))

    def test_ner(self, example):
        # if ner was done properly, ent_iob shouldn't be empty
        assert all(t.ent_iob != 0 for t in example)

    def test_vectors(self, example):
        # if vectors are available, they should differ on different words
        # this isn't a perfect test since this could in principle fail
        # in a sane model as well,
        # but that's very unlikely and a good indicator if something is wrong
        vector0 = example[0].vector
        vector1 = example[1].vector
        vector2 = example[2].vector
        assert not numpy.array_equal(vector0, vector1)
        assert not numpy.array_equal(vector0, vector2)
        assert not numpy.array_equal(vector1, vector2)

    def test_probs(self, example):
        # if frequencies/probabilities are okay, they should differ for
        # different words
        # this isn't a perfect test since this could in principle fail
        # in a sane model as well,
        # but that's very unlikely and a good indicator if something is wrong
        prob0 = example[0].prob
        prob1 = example[1].prob
        prob2 = example[2].prob
        assert not prob0 == prob1
        assert not prob0 == prob2
        assert not prob1 == prob2

View File

@ -8,20 +8,33 @@ import pytest
@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
def test_tokenizer_splits_contractions(de_tokenizer, text):
def test_de_tokenizer_splits_contractions(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
def test_tokenizer_handles_abbr(de_tokenizer, text):
def test_de_tokenizer_handles_abbr(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 1
def test_tokenizer_handles_exc_in_text(de_tokenizer):
def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
text = "Ich bin z.Zt. im Urlaub."
tokens = de_tokenizer(text)
assert len(tokens) == 6
assert tokens[2].text == "z.Zt."
assert tokens[2].lemma_ == "zur Zeit"
@pytest.mark.parametrize('text,norms', [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])])
def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
tokens = de_tokenizer(text)
assert [token.norm_ for token in tokens] == norms
@pytest.mark.xfail
@pytest.mark.parametrize('text,norm', [("daß", "dass")])
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
tokens = de_tokenizer(text)
assert tokens[0].norm_ == norm

View File

@ -0,0 +1,77 @@
# coding: utf-8
from __future__ import unicode_literals
import numpy
import pytest
@pytest.fixture
def example(DE):
    """
    This is to make sure the model works as expected. The tests make sure that
    values are properly set. Tests are not meant to evaluate the content of the
    output, only make sure the output is formally okay.

    Returns the document produced by calling the ``DE`` fixture on a fixed
    German sentence.
    """
    # Identity comparison is the idiomatic (and safe) check against None.
    assert DE.entity is not None
    return DE('An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.')
@pytest.mark.models('de')
def test_de_models_tokenization(example):
    """The example sentence must be split into more than one token."""
    n_tokens = len(example)
    assert n_tokens > 1
@pytest.mark.xfail
@pytest.mark.models('de')
def test_de_models_tagging(example):
    """A tagged document has no empty coarse or fine-grained tags."""
    assert example.is_tagged
    # Check both attributes in the same order as before: first every
    # token's `pos`, then every token's `tag`.
    for attr in ('pos', 'tag'):
        assert all(getattr(t, attr) != 0 for t in example)
@pytest.mark.models('de')
def test_de_models_parsing(example):
    """Sanity-check the dependency parse of the example document."""
    # if parsing was done properly
    # - dependency labels shouldn't be empty
    # - the head of some tokens should not be root
    assert example.is_parsed
    assert all(t.dep != 0 for t in example)
    # Compare each token's head index with its own index. The previous check
    # compared the dependency label id (t.dep) with the index, which says
    # nothing about heads.
    assert any(t.head.i != i for i, t in enumerate(example))
@pytest.mark.models('de')
def test_de_models_ner(example):
    """If NER ran, no token is left with an empty ``ent_iob`` value."""
    missing_iob = [t for t in example if t.ent_iob == 0]
    assert not missing_iob
@pytest.mark.models('de')
def test_de_models_vectors(example):
    """The vectors of the first three tokens must be pairwise different.

    This is not a perfect test, since a sane model could in principle fail
    it too, but that is very unlikely and a failure is a good indicator
    that something is wrong.
    """
    first, second, third = (example[i].vector for i in range(3))
    for left, right in ((first, second), (first, third), (second, third)):
        assert not numpy.array_equal(left, right)
@pytest.mark.xfail
@pytest.mark.models('de')
def test_de_models_probs(example):
    """The ``prob`` values of the first three tokens must be pairwise different.

    This is not a perfect test, since a sane model could in principle fail
    it too, but that is very unlikely and a failure is a good indicator
    that something is wrong.
    """
    first, second, third = (example[i].prob for i in range(3))
    for left, right in ((first, second), (first, third), (second, third)):
        assert not left == right

View File

@ -0,0 +1,35 @@
# coding: utf-8
from __future__ import unicode_literals
from ...util import get_doc
import pytest
def test_de_parser_noun_chunks_standard_de(de_tokenizer):
    """Check the noun chunks of a simple German sentence with manual annotation."""
    text = "Eine Tasse steht auf dem Tisch."
    annotations = {
        'heads': [1, 1, 0, -1, 1, -2, -4],
        'tags': ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.'],
        'deps': ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct'],
    }

    tokens = de_tokenizer(text)
    words = [t.text for t in tokens]
    doc = get_doc(tokens.vocab, words, **annotations)

    chunk_texts = [chunk.text_with_ws for chunk in doc.noun_chunks]
    assert chunk_texts == ["Eine Tasse ", "dem Tisch "]
def test_de_extended_chunk(de_tokenizer):
    """Check the noun chunks of a German sentence with a multi-noun chunk."""
    text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
    annotations = {
        'heads': [1, 1, 0, -1, 1, -2, -1, -5, -6],
        'tags': ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.'],
        'deps': ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct'],
    }

    tokens = de_tokenizer(text)
    words = [t.text for t in tokens]
    doc = get_doc(tokens.vocab, words, **annotations)

    chunk_texts = [chunk.text_with_ws for chunk in doc.noun_chunks]
    assert chunk_texts == ["Die Sängerin ", "einer Tasse Kaffee ", "Arien "]

View File

@ -1,87 +0,0 @@
# coding: utf-8
"""Test that tokens are created correctly for contractions."""
from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_basic_contraction(en_tokenizer):
    """Check token counts and selected token texts for inputs containing "don't"."""
    # (input text, expected token count, index to inspect, expected token text)
    cases = [
        ("don't giggle", 3, 1, "n't"),
        ("i said don't!", 5, 4, "!"),
    ]
    for text, n_tokens, index, expected in cases:
        doc = en_tokenizer(text)
        assert len(doc) == n_tokens
        assert doc[index].text == expected
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
    """Check that a contraction next to one punctuation mark yields three tokens."""
    assert len(en_tokenizer(text)) == 3
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
    """Check that a possessive is split into the bare word followed by "'s"."""
    pieces = [token.text for token in en_tokenizer(text_poss)]
    assert pieces == [text, "'s"]
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
def test_tokenizer_splits_trailing_apos(en_tokenizer, text):
    """Check that a trailing apostrophe becomes a separate token."""
    # Everything before the first apostrophe is the expected first token.
    stem = text.partition("'")[0]
    pieces = [token.text for token in en_tokenizer(text)]
    assert pieces == [stem, "'"]
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
def test_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
    """Check that apostrophe exceptions are kept as one unchanged token.

    This function used to be named ``text_tokenizer_doesnt_split_apos_exc``;
    without the ``test_`` prefix, pytest's default collection never ran it.
    """
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].text == text


# Keep the old (misspelled) name available. It lacks the ``test_`` prefix, so
# pytest's default collection does not pick it up a second time.
text_tokenizer_doesnt_split_apos_exc = test_tokenizer_doesnt_split_apos_exc
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
    """Check that "'ll" is split off as a token whose lemma is "will"."""
    doc = en_tokenizer(text)
    assert len(doc) == 2
    stem, suffix = doc[0], doc[1]
    # Everything before the apostrophe is the expected first token.
    assert stem.text == text.partition("'")[0]
    assert suffix.text == "'ll"
    assert suffix.lemma_ == "will"
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
    """Check that lower- and title-cased contractions split the same way."""
    lower_doc = en_tokenizer(text_lower)
    title_doc = en_tokenizer(text_title)
    lower_head = lower_doc[0].text
    title_head = title_doc[0].text
    # The first tokens differ only in casing; the second tokens are identical.
    assert title_head == lower_head.title()
    assert lower_head == title_head.lower()
    assert lower_doc[1].text == title_doc[1].text
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
    """Check that pronoun and contraction keep their original text when split."""
    doc = en_tokenizer(pron + contraction)
    first = doc[0].text
    assert first == pron
    second = doc[1].text
    assert second == contraction
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
    """Check that words resembling apostrophe-less contractions stay whole."""
    assert len(en_tokenizer(exc)) == 1
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
    """Check that attached punctuation adds exactly one token to a contraction."""
    for text, n_tokens in ((wo_punct, 2), (w_punct, 3)):
        assert len(en_tokenizer(text)) == n_tokens

View File

@ -1,19 +1,96 @@
# coding: utf-8
"""Test that tokenizer exceptions are handled correctly."""
from __future__ import unicode_literals
import pytest
def test_en_tokenizer_handles_basic_contraction(en_tokenizer):
    """Check token counts and selected token texts for inputs containing "don't"."""
    # (input text, expected token count, index to inspect, expected token text)
    cases = [
        ("don't giggle", 3, 1, "n't"),
        ("i said don't!", 5, 4, "!"),
    ]
    for text, n_tokens, index, expected in cases:
        doc = en_tokenizer(text)
        assert len(doc) == n_tokens
        assert doc[index].text == expected
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_en_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
    """Check that a contraction next to one punctuation mark yields three tokens."""
    assert len(en_tokenizer(text)) == 3
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
    """Check that a possessive is split into the bare word followed by "'s"."""
    pieces = [token.text for token in en_tokenizer(text_poss)]
    assert pieces == [text, "'s"]
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text):
    """Check that a trailing apostrophe becomes a separate token."""
    # Everything before the first apostrophe is the expected first token.
    stem = text.partition("'")[0]
    pieces = [token.text for token in en_tokenizer(text)]
    assert pieces == [stem, "'"]
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
    """Check that apostrophe exceptions are kept as one unchanged token.

    This function used to be named ``text_tokenizer_doesnt_split_apos_exc``;
    without the ``test_`` prefix, pytest's default collection never ran it.
    """
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].text == text


# Keep the old (misspelled) name available. It lacks the ``test_`` prefix, so
# pytest's default collection does not pick it up a second time.
text_tokenizer_doesnt_split_apos_exc = test_en_tokenizer_doesnt_split_apos_exc
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
    """Check that "'ll" is split off as a token whose lemma is "will"."""
    doc = en_tokenizer(text)
    assert len(doc) == 2
    stem, suffix = doc[0], doc[1]
    # Everything before the apostrophe is the expected first token.
    assert stem.text == text.partition("'")[0]
    assert suffix.text == "'ll"
    assert suffix.lemma_ == "will"
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
    """Check that lower- and title-cased contractions split the same way."""
    lower_doc = en_tokenizer(text_lower)
    title_doc = en_tokenizer(text_title)
    lower_head = lower_doc[0].text
    title_head = title_doc[0].text
    # The first tokens differ only in casing; the second tokens are identical.
    assert title_head == lower_head.title()
    assert lower_head == title_head.lower()
    assert lower_doc[1].text == title_doc[1].text
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
def test_en_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
    """Check that pronoun and contraction keep their original text when split."""
    doc = en_tokenizer(pron + contraction)
    first = doc[0].text
    assert first == pron
    second = doc[1].text
    assert second == contraction
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_en_tokenizer_excludes_ambiguous(en_tokenizer, exc):
    """Check that words resembling apostrophe-less contractions stay whole."""
    assert len(en_tokenizer(exc)) == 1
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
    """Check that attached punctuation adds exactly one token to a contraction."""
    for text, n_tokens in ((wo_punct, 2), (w_punct, 3)):
        assert len(en_tokenizer(text)) == n_tokens
@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
def test_en_tokenizer_handles_abbr(en_tokenizer, text):
    """Check that each abbreviation is kept as a single token."""
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
def test_en_tokenizer_handles_exc_in_text(en_tokenizer):
    """Check the token count of a sentence that contains "i.e." in running text."""
    text = "It's mediocre i.e. bad."
    tokens = en_tokenizer(text)
    assert len(tokens) == 6
@ -21,7 +98,19 @@ def test_tokenizer_handles_exc_in_text(en_tokenizer):
@pytest.mark.parametrize('text', ["1am", "12a.m.", "11p.m.", "4pm"])
def test_en_tokenizer_handles_times(en_tokenizer, text):
    """Check that a time is split in two and the second token's lemma is "a.m." or "p.m."."""
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[1].lemma_ in ["a.m.", "p.m."]
@pytest.mark.parametrize('text,norms', [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])])
def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
    """Check the `norm_` of every token produced for a contraction."""
    found = [token.norm_ for token in en_tokenizer(text)]
    assert found == norms
@pytest.mark.parametrize('text,norm', [("radicalised", "radicalized"), ("cuz", "because")])
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
    """Check the `norm_` of the first token produced for a single word."""
    first_token = en_tokenizer(text)[0]
    assert first_token.norm_ == norm

View File

@ -7,7 +7,7 @@ from __future__ import unicode_literals
import pytest
def test_en_simple_punct(en_tokenizer):
    """Check the `idx` offsets of tokens in a text with simple punctuation."""
    text = "to walk, do foo"
    tokens = en_tokenizer(text)
    assert tokens[0].idx == 0
@ -17,7 +17,7 @@ def test_simple_punct(en_tokenizer):
assert tokens[4].idx == 12
def test_en_complex_punct(en_tokenizer):
    """Check the `idx` offsets of tokens in a text with nested punctuation."""
    text = "Tom (D., Ill.)!"
    tokens = en_tokenizer(text)
    assert tokens[0].idx == 0

Some files were not shown because too many files have changed in this diff Show More