Merge branch 'develop' into develop-irish

Jim O'Regan 2017-08-08 17:21:27 +01:00
commit 95921d7d4c
278 changed files with 38598 additions and 25044 deletions

.appveyor.yml (new file)

@@ -0,0 +1 @@
+build: off

.gitignore (vendored)

@@ -30,6 +30,7 @@ Profile.prof
 __pycache__/
 *.py[cod]
 .env/
+.env*
 .~env/
 .venv
 venv/

View File

@@ -4,12 +4,10 @@ spaCy: Industrial-strength NLP
 spaCy is a library for advanced natural language processing in Python and
 Cython. spaCy is built on the very latest research, but it isn't researchware.
 It was designed from day one to be used in real products. spaCy currently supports
-English, German and French, as well as tokenization for Spanish, Italian,
-Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew,
-Chinese and Japanese. It's commercial open-source software, released under the
-MIT license.
+English, German, French and Spanish, as well as tokenization for Italian,
+Portuguese, Dutch, Swedish, Finnish, Norwegian, Danish, Hungarian, Polish,
+Bengali, Hebrew, Chinese and Japanese. It's commercial open-source software,
+released under the MIT license.

-📊 **Help us improve the library!** `Take the spaCy user survey <https://survey.spacy.io>`_.

 💫 **Version 1.8 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_

@@ -85,7 +83,7 @@ Features
 * GIL-free **multi-threading**
 * Efficient binary serialization
 * Easy **deep learning** integration
-* Statistical models for **English** and **German**
+* Statistical models for **English**, **German**, **French** and **Spanish**
 * State-of-the-art speed
 * Robust, rigorously evaluated accuracy

@@ -197,7 +195,7 @@ To load a model, use ``spacy.load()`` with the model's shortcut link:
 .. code:: python

     import spacy
-    nlp = spacy.load('en_default')
+    nlp = spacy.load('en')
     doc = nlp(u'This is a sentence.')

 If you've installed a model via pip, you can also ``import`` it directly and

@@ -313,7 +311,7 @@ and ``--model`` are optional and enable additional tests:
     # make sure you are using recent pytest version
     python -m pip install -U pytest
-    python -m pytest <spacy-directory> --vectors --models --slow
+    python -m pytest <spacy-directory>

 🛠 Changelog
 ============
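The truncated sentence above refers to the direct-import pattern for pip-installed models. A minimal sketch of what that pattern looks like (the package name is only an illustrative placeholder, not part of this diff):

    # Hypothetical example: an installed model package exposes a load() helper.
    import en_core_web_sm
    nlp = en_core_web_sm.load()
    doc = nlp(u'This is a sentence.')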

View File

@@ -1,68 +1,27 @@
 from __future__ import unicode_literals, print_function
-import json
-import pathlib
 import random
-import spacy
-from spacy.pipeline import EntityRecognizer
-from spacy.gold import GoldParse
-from spacy.tagger import Tagger
+from spacy.lang.en import English
+from spacy.gold import GoldParse, biluo_tags_from_offsets

-try:
-    unicode
-except:
-    unicode = str
+def reformat_train_data(tokenizer, examples):
+    """Reformat data to match JSON format"""
+    output = []
+    for i, (text, entity_offsets) in enumerate(examples):
+        doc = tokenizer(text)
+        ner_tags = biluo_tags_from_offsets(tokenizer(text), entity_offsets)
+        words = [w.text for w in doc]
+        tags = ['-'] * len(doc)
+        heads = [0] * len(doc)
+        deps = [''] * len(doc)
+        sentence = (range(len(doc)), words, tags, heads, deps, ner_tags)
+        output.append((text, [(sentence, [])]))
+    return output

-def train_ner(nlp, train_data, entity_types):
-    # Add new words to vocab.
-    for raw_text, _ in train_data:
-        doc = nlp.make_doc(raw_text)
-        for word in doc:
-            _ = nlp.vocab[word.orth]
-
-    # Train NER.
-    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
-    for itn in range(5):
-        random.shuffle(train_data)
-        for raw_text, entity_offsets in train_data:
-            doc = nlp.make_doc(raw_text)
-            gold = GoldParse(doc, entities=entity_offsets)
-            ner.update(doc, gold)
-    return ner
-
-def save_model(ner, model_dir):
-    model_dir = pathlib.Path(model_dir)
-    if not model_dir.exists():
-        model_dir.mkdir()
-    assert model_dir.is_dir()
-    with (model_dir / 'config.json').open('wb') as file_:
-        data = json.dumps(ner.cfg)
-        if isinstance(data, unicode):
-            data = data.encode('utf8')
-        file_.write(data)
-    ner.model.dump(str(model_dir / 'model'))
-    if not (model_dir / 'vocab').exists():
-        (model_dir / 'vocab').mkdir()
-    ner.vocab.dump(str(model_dir / 'vocab' / 'lexemes.bin'))
-    with (model_dir / 'vocab' / 'strings.json').open('w', encoding='utf8') as file_:
-        ner.vocab.strings.dump(file_)

 def main(model_dir=None):
-    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
-    # v1.1.2 onwards
-    if nlp.tagger is None:
-        print('---- WARNING ----')
-        print('Data directory not found')
-        print('please run: `python -m spacy.en.download --force all` for better performance')
-        print('Using feature templates for tagging')
-        print('-----------------')
-        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)
     train_data = [
         (
             'Who is Shaka Khan?',
@@ -74,23 +33,35 @@ def main(model_dir=None):
             (len('I like London and '), len('I like London and Berlin'), 'LOC')]
         )
     ]
-    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])
-
-    doc = nlp.make_doc('Who is Shaka Khan?')
-    nlp.tagger(doc)
-    ner(doc)
-    for word in doc:
-        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
-
-    if model_dir is not None:
-        save_model(ner, model_dir)
+    nlp = English(pipeline=['tensorizer', 'ner'])
+    get_data = lambda: reformat_train_data(nlp.tokenizer, train_data)
+    optimizer = nlp.begin_training(get_data)
+    for itn in range(100):
+        random.shuffle(train_data)
+        losses = {}
+        for raw_text, entity_offsets in train_data:
+            doc = nlp.make_doc(raw_text)
+            gold = GoldParse(doc, entities=entity_offsets)
+            nlp.update(
+                [doc],  # Batch of Doc objects
+                [gold],  # Batch of GoldParse objects
+                drop=0.5,  # Dropout -- make it harder to memorise data
+                sgd=optimizer,  # Callable to update weights
+                losses=losses)
+        print(losses)
+    print("Save to", model_dir)
+    nlp.to_disk(model_dir)
+    print("Load from", model_dir)
+    nlp = spacy.lang.en.English(pipeline=['tensorizer', 'ner'])
+    nlp.from_disk(model_dir)
+    for raw_text, _ in train_data:
+        doc = nlp(raw_text)
+        for word in doc:
+            print(word.text, word.ent_type_, word.ent_iob_)

 if __name__ == '__main__':
-    main('ner')
+    import plac
+    plac.call(main)

     # Who "" 2
     # is "" 2
     # Shaka "" PERSON 3
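For orientation, biluo_tags_from_offsets converts the character offsets in train_data into one BILUO tag per token. A rough sketch of what it yields for the first example, assuming the default English tokenization (illustration only, not part of the diff):

    # text:            'Who is Shaka Khan?'
    # entity offsets:  [(7, 17, 'PERSON')]
    # tokens:          ['Who', 'is', 'Shaka', 'Khan', '?']
    # ner_tags:        ['O', 'O', 'B-PERSON', 'L-PERSON', 'O']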

View File

@ -0,0 +1,109 @@
from __future__ import unicode_literals
import plac
import random
import tqdm
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
import thinc.extra.datasets
import spacy.lang.en
from spacy.gold import GoldParse, minibatch
from spacy.util import compounding
from spacy.pipeline import TextCategorizer
def train_textcat(tokenizer, textcat,
train_texts, train_cats, dev_texts, dev_cats,
n_iter=20):
'''
Train the TextCategorizer without associated pipeline.
'''
textcat.begin_training()
optimizer = Adam(NumpyOps(), 0.001)
train_docs = [tokenizer(text) for text in train_texts]
train_gold = [GoldParse(doc, cats=cats) for doc, cats in
zip(train_docs, train_cats)]
train_data = zip(train_docs, train_gold)
batch_sizes = compounding(4., 128., 1.001)
for i in range(n_iter):
losses = {}
train_data = tqdm.tqdm(train_data, leave=False) # Progress bar
for batch in minibatch(train_data, size=batch_sizes):
docs, golds = zip(*batch)
textcat.update((docs, None), golds, sgd=optimizer, drop=0.2,
losses=losses)
with textcat.model.use_params(optimizer.averages):
scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
yield losses['textcat'], scores
def evaluate(tokenizer, textcat, texts, cats):
docs = (tokenizer(text) for text in texts)
tp = 1e-8 # True positives
fp = 1e-8 # False positives
fn = 1e-8 # False negatives
tn = 1e-8 # True negatives
for i, doc in enumerate(textcat.pipe(docs)):
gold = cats[i]
for label, score in doc.cats.items():
if score >= 0.5 and label in gold:
tp += 1.
elif score >= 0.5 and label not in gold:
fp += 1.
elif score < 0.5 and label not in gold:
tn += 1
if score < 0.5 and label in gold:
fn += 1
precis = tp / (tp + fp)
recall = tp / (tp + fn)
fscore = 2 * (precis * recall) / (precis + recall)
return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
def load_data():
# Partition off part of the train data --- avoid running experiments
# against test.
train_data, _ = thinc.extra.datasets.imdb()
random.shuffle(train_data)
texts, labels = zip(*train_data)
cats = [(['POSITIVE'] if y else []) for y in labels]
split = int(len(train_data) * 0.8)
train_texts = texts[:split]
train_cats = cats[:split]
dev_texts = texts[split:]
dev_cats = cats[split:]
return (train_texts, train_cats), (dev_texts, dev_cats)
def main(model_loc=None):
nlp = spacy.lang.en.English()
tokenizer = nlp.tokenizer
textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
print("Load IMDB data")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()
print("Itn.\tLoss\tP\tR\tF")
progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
for i, (loss, scores) in enumerate(train_textcat(tokenizer, textcat,
train_texts, train_cats,
dev_texts, dev_cats, n_iter=20)):
print(progress.format(i=i, loss=loss, **scores))
# How to save, load and use
nlp.pipeline.append(textcat)
if model_loc is not None:
nlp.to_disk(model_loc)
nlp = spacy.load(model_loc)
doc = nlp(u'This movie sucked!')
print(doc.cats)
if __name__ == '__main__':
plac.call(main)
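Once the saved pipeline is reloaded, doc.cats maps each label to a score between 0 and 1; for the single 'POSITIVE' label used here, the final print would look roughly like this (the score itself is illustrative, not taken from the diff):

    # print(doc.cats)  ->  {'POSITIVE': 0.08}   # low score for a negative review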

View File

@@ -3,8 +3,8 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.6.0,<6.7.0
-murmurhash>=0.26,<0.27
+thinc>=6.8.0,<6.9.0
+murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six
 ujson>=1.35
@@ -14,3 +14,6 @@ regex==2017.4.5
 ftfy>=4.4.2,<5.0.0
 pytest>=3.0.6,<4.0.0
 pip>=9.0.0,<10.0.0
+mock>=2.0.0,<3.0.0
+msgpack-python
+msgpack-numpy

View File

@@ -44,7 +44,8 @@ MOD_NAMES = [
     'spacy.matcher',
     'spacy.syntax.ner',
     'spacy.symbols',
-    'spacy.syntax.iterators']
+    'spacy.vectors',
+]


 COMPILE_OPTIONS = {
@@ -188,10 +189,10 @@ def setup_package():
         ext_modules=ext_modules,
         install_requires=[
             'numpy>=1.7',
-            'murmurhash>=0.26,<0.27',
+            'murmurhash>=0.28,<0.29',
             'cymem>=1.30,<1.32',
             'preshed>=1.0.0,<2.0.0',
-            'thinc>=6.6.0,<6.7.0',
+            'thinc>=6.8.0,<6.9.0',
             'plac<1.0.0,>=0.9.6',
             'pip>=9.0.0,<10.0.0',
             'six',
@@ -200,7 +201,9 @@ def setup_package():
             'dill>=0.2,<0.3',
             'requests>=2.13.0,<3.0.0',
             'regex==2017.4.5',
-            'ftfy>=4.4.2,<5.0.0'],
+            'ftfy>=4.4.2,<5.0.0',
+            'msgpack-python',
+            'msgpack-numpy'],
         classifiers=[
             'Development Status :: 5 - Production/Stable',
             'Environment :: Console',

View File

@@ -1,22 +1,22 @@
 # coding: utf8
 from __future__ import unicode_literals

-import importlib
-from .compat import basestring_
-from .cli.info import info
+from .cli.info import info as cli_info
 from .glossary import explain
 from .deprecated import resolve_load_name
+from .about import __version__
 from . import util


 def load(name, **overrides):
     name = resolve_load_name(name, **overrides)
-    model_path = util.resolve_model_path(name)
-    meta = util.parse_package_meta(model_path)
-    if 'lang' not in meta:
-        raise IOError('No language setting found in model meta.')
-    cls = util.get_lang_class(meta['lang'])
-    overrides['meta'] = meta
-    overrides['path'] = model_path
-    return cls(**overrides)
+    return util.load_model(name, **overrides)


+def blank(name, **kwargs):
+    LangClass = util.get_lang_class(name)
+    return LangClass(**kwargs)


+def info(model=None, markdown=False):
+    return cli_info(None, model, markdown)

View File

@@ -3,135 +3,21 @@ from __future__ import print_function
# NB! This breaks in plac on Python 2!!
#from __future__ import unicode_literals
import plac
from spacy.cli import download as cli_download
from spacy.cli import link as cli_link
from spacy.cli import info as cli_info
from spacy.cli import package as cli_package
from spacy.cli import train as cli_train
from spacy.cli import model as cli_model
from spacy.cli import convert as cli_convert
class CLI(object):
"""
Command-line interface for spaCy
"""
commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')
@plac.annotations(
model=("model to download (shortcut or model name)", "positional", None, str),
direct=("force direct download. Needs model name with version and won't "
"perform compatibility check", "flag", "d", bool)
)
def download(self, model, direct=False):
"""
Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name
with version.
"""
cli_download(model, direct)
@plac.annotations(
origin=("package name or local path to model", "positional", None, str),
link_name=("name of shortuct link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
)
def link(self, origin, link_name, force=False):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""
cli_link(origin, link_name, force)
@plac.annotations(
model=("optional: shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
)
def info(self, model=None, markdown=False):
"""
Print info about spaCy installation. If a model shortcut link is
speficied as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues.
"""
cli_info(model, markdown)
@plac.annotations(
input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str),
meta=("path to meta.json", "option", "m", str),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
)
def package(self, input_dir, output_dir, meta=None, force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
output directory, and model data will be copied over.
"""
cli_package(input_dir, output_dir, meta, force)
@plac.annotations(
lang=("model language", "positional", None, str),
output_dir=("output directory to store model in", "positional", None, str),
train_data=("location of JSON-formatted training data", "positional", None, str),
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
n_iter=("number of iterations", "option", "n", int),
nsents=("number of sentences", "option", None, int),
parser_L1=("L1 regularization penalty for parser", "option", "L", float),
use_gpu=("Use GPU", "flag", "g", bool),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_ner=("Don't train NER", "flag", "N", bool)
)
def train(self, lang, output_dir, train_data, dev_data=None, n_iter=15,
nsents=0, parser_L1=0.0, use_gpu=False,
no_tagger=False, no_parser=False, no_ner=False):
"""
Train a model. Expects data in spaCy's JSON format.
"""
nsents = nsents or None
cli_train(lang, output_dir, train_data, dev_data, n_iter, nsents,
use_gpu, not no_tagger, not no_parser, not no_ner, parser_L1)
@plac.annotations(
lang=("model language", "positional", None, str),
model_dir=("output directory to store model in", "positional", None, str),
freqs_data=("tab-separated frequencies file", "positional", None, str),
clusters_data=("Brown clusters file", "positional", None, str),
vectors_data=("word vectors file", "positional", None, str)
)
def model(self, lang, model_dir, freqs_data, clusters_data=None, vectors_data=None):
"""
Initialize a new model and its data directory.
"""
cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
@plac.annotations(
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", float),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(self, input_file, output_dir, n_sents=10, morphology=False):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
"""
cli_convert(input_file, output_dir, n_sents, morphology)
def __missing__(self, name):
print("\n Command %r does not exist."
"\n Use the --help flag for a list of available commands.\n" % name)
 if __name__ == '__main__':
     import plac
     import sys
-    sys.argv[0] = 'spacy'
-    plac.Interpreter.call(CLI)
+    from spacy.cli import download, link, info, package, train, convert
+    from spacy.util import prints
+    commands = {'download': download, 'link': link, 'info': info, 'train': train,
+                'convert': convert, 'package': package}
+    if len(sys.argv) == 1:
+        prints(', '.join(commands), title="Available commands", exits=1)
+    command = sys.argv.pop(1)
+    sys.argv[0] = 'spacy %s' % command
+    if command in commands:
+        plac.call(commands[command])
+    else:
+        prints("Available: %s" % ', '.join(commands),
+               title="Unknown command: %s" % command, exits=1)

View File

@ -1,20 +1,107 @@
+import ujson
 from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
 from thinc.neural import Model, Maxout, Softmax, Affine
 from thinc.neural._classes.hash_embed import HashEmbed
 from thinc.neural.ops import NumpyOps, CupyOps
+from thinc.neural.util import get_array_module
+import random
+import cytoolz

 from thinc.neural._classes.convolution import ExtractWindow
 from thinc.neural._classes.static_vectors import StaticVectors
 from thinc.neural._classes.batchnorm import BatchNorm
+from thinc.neural._classes.layernorm import LayerNorm as LN
 from thinc.neural._classes.resnet import Residual
+from thinc.neural import ReLu
+from thinc.neural._classes.selu import SELU
 from thinc import describe
 from thinc.describe import Dimension, Synapses, Biases, Gradient
 from thinc.neural._classes.affine import _set_dimensions_if_needed
+from thinc.api import FeatureExtracter, with_getitem
+from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
+from thinc.neural._classes.attention import ParametricAttention
+from thinc.linear.linear import LinearModel
+from thinc.api import uniqued, wrap, flatten_add_lengths

-from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP
+from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
 from .tokens.doc import Doc
 import numpy
+import io
@layerize
def _flatten_add_lengths(seqs, pad=0, drop=0.):
ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=pad)
X = ops.flatten(seqs, pad=pad)
return (X, lengths), finish_update
@layerize
def _logistic(X, drop=0.):
xp = get_array_module(X)
if not isinstance(X, xp.ndarray):
X = xp.asarray(X)
# Clip to range (-10, 10)
X = xp.minimum(X, 10., X)
X = xp.maximum(X, -10., X)
Y = 1. / (1. + xp.exp(-X))
def logistic_bwd(dY, sgd=None):
dX = dY * (Y * (1-Y))
return dX
return Y, logistic_bwd
@layerize
def add_tuples(X, drop=0.):
"""Give inputs of sequence pairs, where each sequence is (vals, length),
sum the values, returning a single sequence.
If input is:
((vals1, length), (vals2, length)
Output is:
(vals1+vals2, length)
vals are a single tensor for the whole batch.
"""
(vals1, length1), (vals2, length2) = X
assert length1 == length2
def add_tuples_bwd(dY, sgd=None):
return (dY, dY)
return (vals1+vals2, length), add_tuples_bwd
def _zero_init(model):
def _zero_init_impl(self, X, y):
self.W.fill(0)
model.on_data_hooks.append(_zero_init_impl)
if model.W is not None:
model.W.fill(0.)
return model
@layerize
def _preprocess_doc(docs, drop=0.):
keys = [doc.to_array([LOWER]) for doc in docs]
keys = [a[:, 0] for a in keys]
ops = Model.ops
lengths = ops.asarray([arr.shape[0] for arr in keys])
keys = ops.xp.concatenate(keys)
vals = ops.allocate(keys.shape[0]) + 1
return (keys, vals, lengths), None
def _init_for_precomputed(W, ops):
if (W**2).sum() != 0.:
return
reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
ops.xavier_uniform_init(reshaped)
W[:] = reshaped.reshape(W.shape)
 @describe.on_data(_set_dimensions_if_needed)
@@ -23,8 +110,8 @@ import numpy
     nF=Dimension("Number of features"),
     nO=Dimension("Output size"),
     W=Synapses("Weights matrix",
-        lambda obj: (obj.nO, obj.nF, obj.nI),
-        lambda W, ops: ops.xavier_uniform_init(W)),
+        lambda obj: (obj.nF, obj.nO, obj.nI),
+        lambda W, ops: _init_for_precomputed(W, ops)),
     b=Biases("Bias vector",
         lambda obj: (obj.nO,)),
     d_W=Gradient("W"),
@ -39,25 +126,25 @@ class PrecomputableAffine(Model):
     def begin_update(self, X, drop=0.):
         # X: (b, i)
-        # Xf: (b, f, i)
+        # Yf: (b, f, i)
         # dY: (b, o)
         # dYf: (b, f, o)
-        #Yf = numpy.einsum('bi,ofi->bfo', X, self.W)
+        #Yf = numpy.einsum('bi,foi->bfo', X, self.W)
         Yf = self.ops.xp.tensordot(
-            X, self.W, axes=[[1], [2]]).transpose((0, 2, 1))
+            X, self.W, axes=[[1], [2]])
         Yf += self.b
         def backward(dY_ids, sgd=None):
+            tensordot = self.ops.xp.tensordot
             dY, ids = dY_ids
             Xf = X[ids]
+            #dXf = numpy.einsum('bo,foi->bfi', dY, self.W)
+            dXf = tensordot(dY, self.W, axes=[[1], [1]])
             #dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
-            dW = self.ops.xp.tensordot(dY, Xf, axes=[[0], [0]])
-            db = dY.sum(axis=0)
-            #dXf = numpy.einsum('bo,ofi->bfi', dY, self.W)
-            dXf = self.ops.xp.tensordot(dY, self.W, axes=[[1], [0]])
-            self.d_W += dW
-            self.d_b += db
+            dW = tensordot(dY, Xf, axes=[[0], [0]])
+            # ofi -> foi
+            self.d_W += dW.transpose((1, 0, 2))
+            self.d_b += dY.sum(axis=0)
             if sgd is not None:
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
@ -80,10 +167,10 @@ class PrecomputableAffine(Model):
     d_b=Gradient("b")
 )
 class PrecomputableMaxouts(Model):
-    def __init__(self, nO=None, nI=None, nF=None, pieces=3, **kwargs):
+    def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs):
         Model.__init__(self, **kwargs)
         self.nO = nO
-        self.nP = pieces
+        self.nP = nP
         self.nI = nI
         self.nF = nF
@ -120,38 +207,105 @@ class PrecomputableMaxouts(Model):
         return dXf
     return Yfp, backward
 def Tok2Vec(width, embed_size, preprocess=None):
-    cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE]
+    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
-        lower = get_col(cols.index(LOWER)) >> HashEmbed(width, embed_size)
-        prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2)
-        suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2)
-        shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2)
+        norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
+        prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
+        suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
+        shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')
+        embed = (norm | prefix | suffix | shape )
         tok2vec = (
-            flatten
-            >> (lower | prefix | suffix | shape )
-            >> Maxout(width, width*4, pieces=3)
-            >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-            >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-            >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
-            >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
+            with_flatten(
+                asarray(Model.ops, dtype='uint64')
+                >> uniqued(embed, column=5)
+                >> LN(Maxout(width, width*4, pieces=3))
+                >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3))
+                >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3))
+                >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3))
+                >> Residual(ExtractWindow(nW=1) >> SELU(width, width*3)),
+                pad=4)
         )
         if preprocess not in (False, None):
             tok2vec = preprocess >> tok2vec
         # Work around thinc API limitations :(. TODO: Revise in Thinc 7
         tok2vec.nO = width
+        tok2vec.embed = embed
         return tok2vec


-def get_col(idx):
+def asarray(ops, dtype):
     def forward(X, drop=0.):
+        return ops.asarray(X, dtype=dtype), None
+    return layerize(forward)
def foreach(layer):
def forward(Xs, drop=0.):
results = []
backprops = []
for X in Xs:
result, bp = layer.begin_update(X, drop=drop)
results.append(result)
backprops.append(bp)
def backward(d_results, sgd=None):
dXs = []
for d_result, backprop in zip(d_results, backprops):
dXs.append(backprop(d_result, sgd))
return dXs
return results, backward
model = layerize(forward)
model._layers.append(layer)
return model
def rebatch(size, layer):
ops = layer.ops
def forward(X, drop=0.):
if X.shape[0] < size:
return layer.begin_update(X)
parts = _divide_array(X, size)
results, bp_results = zip(*[layer.begin_update(p, drop=drop)
for p in parts])
y = ops.flatten(results)
def backward(dy, sgd=None):
d_parts = [bp(y, sgd=sgd) for bp, y in
zip(bp_results, _divide_array(dy, size))]
try:
dX = ops.flatten(d_parts)
except TypeError:
dX = None
except ValueError:
dX = None
return dX
return y, backward
model = layerize(forward)
model._layers.append(layer)
return model
def _divide_array(X, size):
parts = []
index = 0
while index < len(X):
parts.append(X[index : index + size])
index += size
return parts
def get_col(idx):
assert idx >= 0, idx
def forward(X, drop=0.):
assert idx >= 0, idx
if isinstance(X, numpy.ndarray): if isinstance(X, numpy.ndarray):
ops = NumpyOps() ops = NumpyOps()
else: else:
ops = CupyOps() ops = CupyOps()
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype) output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
def backward(y, sgd=None): def backward(y, sgd=None):
assert idx >= 0, idx
dX = ops.allocate(X.shape) dX = ops.allocate(X.shape)
dX[:, idx] += y dX[:, idx] += y
return dX return dX
@@ -167,21 +321,17 @@ def zero_init(model):
 def doc2feats(cols=None):
-    cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE]
+    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     def forward(docs, drop=0.):
         feats = []
         for doc in docs:
-            if 'cached_feats' not in doc.user_data:
-                doc.user_data['cached_feats'] = model.ops.asarray(
-                    doc.to_array(cols),
-                    dtype='uint64')
-            feats.append(doc.user_data['cached_feats'])
-            assert feats[-1].dtype == 'uint64'
+            feats.append(doc.to_array(cols))
         return feats, None
     model = layerize(forward)
     model.cols = cols
     return model

 def print_shape(prefix):
     def forward(X, drop=0.):
         return X, lambda dX, **kwargs: dX
@ -197,6 +347,29 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
return vectors, backward return vectors, backward
def fine_tune(embedding, combine=None):
if combine is not None:
raise NotImplementedError(
"fine_tune currently only supports addition. Set combine=None")
def fine_tune_fwd(docs_tokvecs, drop=0.):
docs, tokvecs = docs_tokvecs
lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
output = embedding.ops.unflatten(
embedding.ops.flatten(tokvecs)
+ embedding.ops.flatten(vecs),
lengths)
def fine_tune_bwd(d_output, sgd=None):
bp_vecs(d_output, sgd=sgd)
return d_output
return output, fine_tune_bwd
model = wrap(fine_tune_fwd, embedding)
return model
@layerize @layerize
def flatten(seqs, drop=0.): def flatten(seqs, drop=0.):
if isinstance(seqs[0], numpy.ndarray): if isinstance(seqs[0], numpy.ndarray):
@ -210,3 +383,95 @@ def flatten(seqs, drop=0.):
return ops.unflatten(d_X, lengths) return ops.unflatten(d_X, lengths)
X = ops.xp.vstack(seqs) X = ops.xp.vstack(seqs)
return X, finish_update return X, finish_update
@layerize
def logistic(X, drop=0.):
xp = get_array_module(X)
if not isinstance(X, xp.ndarray):
X = xp.asarray(X)
# Clip to range (-10, 10)
X = xp.minimum(X, 10., X)
X = xp.maximum(X, -10., X)
Y = 1. / (1. + xp.exp(-X))
def logistic_bwd(dY, sgd=None):
dX = dY * (Y * (1-Y))
return dX
return Y, logistic_bwd
def zero_init(model):
def _zero_init_impl(self, X, y):
self.W.fill(0)
model.on_data_hooks.append(_zero_init_impl)
return model
@layerize
def preprocess_doc(docs, drop=0.):
keys = [doc.to_array([LOWER]) for doc in docs]
keys = [a[:, 0] for a in keys]
ops = Model.ops
lengths = ops.asarray([arr.shape[0] for arr in keys])
keys = ops.xp.concatenate(keys)
vals = ops.allocate(keys.shape[0]) + 1
return (keys, vals, lengths), None
def getitem(i):
def getitem_fwd(X, drop=0.):
return X[i], None
return layerize(getitem_fwd)
def build_tagger_model(nr_class, token_vector_width, **cfg):
with Model.define_operators({'>>': chain, '+': add}):
# Input: (doc, tensor) tuples
private_tok2vec = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())
model = (
fine_tune(private_tok2vec)
>> with_flatten(
Maxout(token_vector_width, token_vector_width)
>> Softmax(nr_class, token_vector_width)
)
)
model.nI = None
return model
def build_text_classifier(nr_class, width=64, **cfg):
nr_vector = cfg.get('nr_vector', 200)
with Model.define_operators({'>>': chain, '+': add, '|': concatenate, '**': clone}):
embed_lower = HashEmbed(width, nr_vector, column=1)
embed_prefix = HashEmbed(width//2, nr_vector, column=2)
embed_suffix = HashEmbed(width//2, nr_vector, column=3)
embed_shape = HashEmbed(width//2, nr_vector, column=4)
cnn_model = (
FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE])
>> _flatten_add_lengths
>> with_getitem(0,
uniqued(
(embed_lower | embed_prefix | embed_suffix | embed_shape)
>> Maxout(width, width+(width//2)*3))
>> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
>> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
>> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3))
)
>> ParametricAttention(width,)
>> Pooling(sum_pool)
>> ReLu(width, width)
>> zero_init(Affine(nr_class, width, drop_factor=0.0))
)
linear_model = (
_preprocess_doc
>> LinearModel(nr_class, drop_factor=0.)
)
model = (
(linear_model | cnn_model)
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
>> logistic
)
model.lsuv = False
return model

View File

@@ -2,16 +2,16 @@
 # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

-__title__ = 'spacy'
-__version__ = '1.8.2'
+__title__ = 'spacy-nightly'
+__version__ = '2.0.0a7'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
-__author__ = 'Matthew Honnibal'
-__email__ = 'matt@explosion.ai'
+__author__ = 'Explosion AI'
+__email__ = 'contact@explosion.ai'
 __license__ = 'MIT'
 __docs_models__ = 'https://spacy.io/docs/usage/models'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
 __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
 __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'
-__model_files__ = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
+__model_files__ = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/develop/templates/model/'

View File

@@ -83,6 +83,7 @@ cpdef enum attr_id_t:
     ENT_IOB
     ENT_TYPE
     HEAD
+    SENT_START
     SPACY
     PROB

View File

@@ -85,6 +85,7 @@ IDS = {
     "ENT_IOB": ENT_IOB,
     "ENT_TYPE": ENT_TYPE,
     "HEAD": HEAD,
+    "SENT_START": SENT_START,
     "SPACY": SPACY,
     "PROB": PROB,
     "LANG": LANG,
@@ -149,6 +150,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
         else:
             int_key = IDS[name.upper()]
         if strings_map is not None and isinstance(value, basestring):
-            value = strings_map[value]
+            if hasattr(strings_map, 'add'):
+                value = strings_map.add(value)
+            else:
+                value = strings_map[value]
         inty_attrs[int_key] = value
     return inty_attrs
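The hasattr check keeps intify_attrs compatible with both string stores: the newer StringStore exposes an add() method that interns the value and returns its ID, while the older store did the same through item lookup. Roughly (a sketch, not part of the diff):

    # newer store:  key = strings_map.add(u'PERSON')   # interns and returns the ID
    # older store:  key = strings_map[u'PERSON']       # __getitem__ did the interning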

View File

@@ -2,6 +2,5 @@ from .download import download
 from .info import info
 from .link import link
 from .package import package
-from .train import train, train_config
-from .model import model
+from .train import train
 from .convert import convert

View File

@@ -1,31 +1,43 @@
 # coding: utf8
 from __future__ import unicode_literals

+import plac
 from pathlib import Path
-from .converters import conllu2json
+from .converters import conllu2json, iob2json
 from ..util import prints

 # Converters are matched by file extension. To add a converter, add a new entry
 # to this dict with the file extension mapped to the converter function imported
 # from /converters.
 CONVERTERS = {
     '.conllu': conllu2json,
-    '.conll': conllu2json
+    '.conll': conllu2json,
+    '.iob': iob2json
 }


-def convert(input_file, output_dir, *args):
+@plac.annotations(
+    input_file=("input file", "positional", None, str),
+    output_dir=("output directory for converted file", "positional", None, str),
+    n_sents=("Number of sentences per doc", "option", "n", float),
+    morphology=("Enable appending morphology to tags", "flag", "m", bool)
+)
+def convert(cmd, input_file, output_dir, n_sents, morphology):
+    """
+    Convert files into JSON format for use with train command and other
+    experiment management functions.
+    """
     input_path = Path(input_file)
     output_path = Path(output_dir)
     if not input_path.exists():
-        prints(input_path, title="Input file not found", exits=True)
+        prints(input_path, title="Input file not found", exits=1)
     if not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=True)
+        prints(output_path, title="Output directory not found", exits=1)
     file_ext = input_path.suffix
     if not file_ext in CONVERTERS:
         prints("Can't find converter for %s" % input_path.parts[-1],
-               title="Unknown format", exits=True)
-    CONVERTERS[file_ext](input_path, output_path, *args)
+               title="Unknown format", exits=1)
+    CONVERTERS[file_ext](input_path, output_path,
+                         n_sents=n_sents, use_morphology=morphology)

View File

@@ -1 +1,2 @@
 from .conllu2json import conllu2json
+from .iob2json import iob2json

View File

@@ -73,10 +73,10 @@ def generate_sentence(sent):
     tokens = []
     for i, id in enumerate(id_):
         token = {}
-        token["orth"] = word[id]
-        token["tag"] = tag[id]
-        token["head"] = head[id] - i
-        token["dep"] = dep[id]
+        token["orth"] = word[i]
+        token["tag"] = tag[i]
+        token["head"] = head[i] - id
+        token["dep"] = dep[i]
         tokens.append(token)
     sentence["tokens"] = tokens
     return sentence
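The fix indexes the per-sentence columns with the enumeration position rather than the 1-based CoNLL-U token ID, and computes the head as an offset relative to that ID. A small sketch of why the old indexing was off (values are illustrative):

    # id_  = [1, 2, 3]              # CoNLL-U IDs are 1-based
    # word = ['I', 'like', 'London']
    # old:  word[id] skips word[0] and runs past the end on the last token
    # new:  word[i] stays in step with enumerate(id_)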

View File

@ -0,0 +1,45 @@
# coding: utf8
from __future__ import unicode_literals
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo
def iob2json(input_path, output_path, n_sents=10, *a, **k):
"""
Convert IOB files into JSON format for use with train cli.
"""
# TODO: This isn't complete yet -- need to map from IOB to
# BILUO
with input_path.open('r', encoding='utf8') as file_:
docs = read_iob(file_)
output_filename = input_path.parts[-1].replace(".iob", ".json")
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints("Created %d documents" % len(docs),
title="Generated output file %s" % path2str(output_file))
def read_iob(file_):
sentences = []
for line in file_:
if not line.strip():
continue
tokens = [t.split('|') for t in line.split()]
if len(tokens[0]) == 3:
words, pos, iob = zip(*tokens)
else:
words, iob = zip(*tokens)
pos = ['-'] * len(words)
biluo = iob_to_biluo(iob)
sentences.append([
{'orth': w, 'tag': p, 'ner': ent}
for (w, p, ent) in zip(words, pos, biluo)
])
sentences = [{'tokens': sent} for sent in sentences]
paragraphs = [{'sentences': [sent]} for sent in sentences]
docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
return docs
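read_iob expects one sentence per line, with tokens separated by spaces and fields separated by '|' (either word|tag|iob or word|iob); iob_to_biluo then rewrites single-token entities as U- tags. A hypothetical input line and the records it would produce (illustration only, not part of the diff):

    # input line:  I|PRP|O like|VBP|O London|NNP|B-GPE
    # biluo tags:  ['O', 'O', 'U-GPE']
    # tokens:      [{'orth': 'I', 'tag': 'PRP', 'ner': 'O'},
    #               {'orth': 'like', 'tag': 'VBP', 'ner': 'O'},
    #               {'orth': 'London', 'tag': 'NNP', 'ner': 'U-GPE'}]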

View File

@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+import plac
 import requests
 import os
 import subprocess
@@ -11,7 +12,17 @@ from ..util import prints
 from .. import about


-def download(model, direct=False):
+@plac.annotations(
+    model=("model to download (shortcut or model name)", "positional", None, str),
+    direct=("force direct download. Needs model name with version and won't "
+            "perform compatibility check", "flag", "d", bool)
+)
+def download(cmd, model, direct=False):
+    """
+    Download compatible model from default download path using pip. Model
+    can be shortcut, model name or, if --direct flag is set, full model name
+    with version.
+    """
     if direct:
         download_model('{m}/{m}.tar.gz'.format(m=model))
     else:
@@ -20,7 +31,17 @@ def download(model, direct=False):
         compatibility = get_compatibility()
         version = get_version(model_name, compatibility)
         download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
-        link(model_name, model, force=True)
+        try:
+            link(None, model_name, model, force=True)
+        except:
+            # Dirty, but since spacy.download and the auto-linking is mostly
+            # a convenience wrapper, it's best to show a success message and
+            # loading instructions, even if linking fails.
+            prints("Creating a shortcut link for 'en' didn't work (maybe you "
+                   "don't have admin permissions?), but you can still load "
+                   "the model via its full package name:",
+                   "nlp = spacy.load('%s')" % model_name,
+                   title="Download successful")


 def get_json(url, desc):
@@ -28,7 +49,7 @@ def get_json(url, desc):
     if r.status_code != 200:
         prints("Couldn't fetch %s. Please find a model for your spaCy installation "
                "(v%s), and download it manually." % (desc, about.__version__),
-               about.__docs_models__, title="Server error (%d)" % r.status_code, exits=True)
+               about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1)
     return r.json()

@@ -38,7 +59,7 @@ def get_compatibility():
     comp = comp_table['spacy']
     if version not in comp:
         prints("No compatible models found for v%s of spaCy." % version,
-               title="Compatibility error", exits=True)
+               title="Compatibility error", exits=1)
     return comp[version]

@@ -46,7 +67,7 @@ def get_version(model, comp):
     if model not in comp:
         version = about.__version__
         prints("No compatible model found for '%s' (spaCy v%s)." % (model, version),
-               title="Compatibility error", exits=True)
+               title="Compatibility error", exits=1)
     return comp[model][0]

View File

@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+import plac
 import platform
 from pathlib import Path
@@ -9,17 +10,30 @@ from .. import about
 from .. import util


-def info(model=None, markdown=False):
+@plac.annotations(
+    model=("optional: shortcut link of model", "positional", None, str),
+    markdown=("generate Markdown for GitHub issues", "flag", "md", str)
+)
+def info(cmd, model=None, markdown=False):
+    """Print info about spaCy installation. If a model shortcut link is
+    speficied as an argument, print model information. Flag --markdown
+    prints details in Markdown for easy copy-pasting to GitHub issues.
+    """
     if model:
-        data_path = util.get_data_path()
-        data = util.parse_package_meta(data_path / model, require=True)
-        model_path = Path(__file__).parent / data_path / model
-        if model_path.resolve() != model_path:
-            data['link'] = path2str(model_path)
-            data['source'] = path2str(model_path.resolve())
+        if util.is_package(model):
+            model_path = util.get_package_path(model)
         else:
-            data['source'] = path2str(model_path)
-        print_info(data, 'model %s' % model, markdown)
+            model_path = util.get_data_path() / model
+        meta_path = model_path / 'meta.json'
+        if not meta_path.is_file():
+            util.prints(meta_path, title="Can't find model meta.json", exits=1)
+        meta = util.read_json(meta_path)
+        if model_path.resolve() != model_path:
+            meta['link'] = path2str(model_path)
+            meta['source'] = path2str(model_path.resolve())
+        else:
+            meta['source'] = path2str(model_path)
+        print_info(meta, 'model %s' % model, markdown)
     else:
         data = {'spaCy version': about.__version__,
                 'Location': path2str(Path(__file__).parent.parent),

View File

@@ -1,24 +1,36 @@
 # coding: utf8
 from __future__ import unicode_literals

+import plac
 from pathlib import Path
 from ..compat import symlink_to, path2str
 from ..util import prints
 from .. import util


-def link(origin, link_name, force=False):
+@plac.annotations(
+    origin=("package name or local path to model", "positional", None, str),
+    link_name=("name of shortuct link to create", "positional", None, str),
+    force=("force overwriting of existing link", "flag", "f", bool)
+)
+def link(cmd, origin, link_name, force=False):
+    """
+    Create a symlink for models within the spacy/data directory. Accepts
+    either the name of a pip package, or the local path to the model data
+    directory. Linking models allows loading them via spacy.load(link_name).
+    """
     if util.is_package(origin):
-        model_path = util.get_model_package_path(origin)
+        model_path = util.get_package_path(origin)
     else:
         model_path = Path(origin)
     if not model_path.exists():
         prints("The data should be located in %s" % path2str(model_path),
-               title="Can't locate model data", exits=True)
+               title="Can't locate model data", exits=1)
     link_path = util.get_data_path() / link_name
     if link_path.exists() and not force:
         prints("To overwrite an existing link, use the --force flag.",
-               title="Link %s already exists" % link_name, exits=True)
+               title="Link %s already exists" % link_name, exits=1)
     elif link_path.exists():
         link_path.unlink()
     try:
@@ -33,5 +45,5 @@ def link(origin, link_name, force=False):
                title="Error: Couldn't link model to '%s'" % link_name)
         raise
     prints("%s --> %s" % (path2str(model_path), path2str(link_path)),
-           "You can now load the model via spacy.load('%s')." % link_name,
+           "You can now load the model via spacy.load('%s')" % link_name,
            title="Linking successful")

View File

@ -1,122 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
import gzip
import math
from ast import literal_eval
from preshed.counter import PreshCounter
from ..vocab import write_binary_vectors
from ..compat import fix_text, path2str
from ..util import prints
from .. import util
def model(lang, model_dir, freqs_data, clusters_data, vectors_data):
model_path = util.ensure_path(model_dir)
freqs_path = util.ensure_path(freqs_data)
clusters_path = util.ensure_path(clusters_data)
vectors_path = util.ensure_path(vectors_data)
if not freqs_path.is_file():
prints(freqs_path, title="No frequencies file found", exits=True)
if clusters_path and not clusters_path.is_file():
prints(clusters_path, title="No Brown clusters file found", exits=True)
if vectors_path and not vectors_path.is_file():
prints(vectors_path, title="No word vectors file found", exits=True)
vocab = util.get_lang_class(lang).Defaults.create_vocab()
probs, oov_prob = read_probs(freqs_path)
clusters = read_clusters(clusters_path) if clusters_path else {}
populate_vocab(vocab, clusters, probs, oov_prob)
create_model(model_path, vectors_path, vocab, oov_prob)
def create_model(model_path, vectors_path, vocab, oov_prob):
vocab_path = model_path / 'vocab'
lexemes_path = vocab_path / 'lexemes.bin'
strings_path = vocab_path / 'strings.json'
oov_path = vocab_path / 'oov_prob'
if not model_path.exists():
model_path.mkdir()
if not vocab_path.exists():
vocab_path.mkdir()
vocab.dump(path2str(lexemes_path))
with strings_path.open('w') as f:
vocab.strings.dump(f)
with oov_path.open('w') as f:
f.write('%f' % oov_prob)
if vectors_path:
vectors_dest = vocab_path / 'vec.bin'
write_binary_vectors(path2str(vectors_path), path2str(vectors_dest))
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
counts = PreshCounter()
total = 0
freqs_file = check_unzip(freqs_path)
for i, line in enumerate(freqs_file):
freq, doc_freq, key = line.rstrip().split('\t', 2)
freq = int(freq)
counts.inc(i+1, freq)
total += freq
counts.smooth()
log_total = math.log(total)
freqs_file = check_unzip(freqs_path)
probs = {}
for line in freqs_file:
freq, doc_freq, key = line.rstrip().split('\t', 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
word = literal_eval(key)
smooth_count = counts.smoother(int(freq))
probs[word] = math.log(smooth_count) - log_total
oov_prob = math.log(counts.smoother(0)) - log_total
return probs, oov_prob
def read_clusters(clusters_path):
clusters = {}
with clusters_path.open() as f:
for line in f:
try:
cluster, word, freq = line.split()
word = fix_text(word)
except ValueError:
continue
# If the clusterer has only seen the word a few times, its
# cluster is unreliable.
if int(freq) >= 3:
clusters[word] = cluster
else:
clusters[word] = '0'
# Expand clusters with re-casing
for word, cluster in list(clusters.items()):
if word.lower() not in clusters:
clusters[word.lower()] = cluster
if word.title() not in clusters:
clusters[word.title()] = cluster
if word.upper() not in clusters:
clusters[word.upper()] = cluster
return clusters
def populate_vocab(vocab, clusters, probs, oov_prob):
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
lexeme = vocab[word]
lexeme.prob = prob
lexeme.is_oov = False
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
lexeme.cluster = int(clusters[word][::-1], 2)
else:
lexeme.cluster = 0
def check_unzip(file_path):
file_path_str = path2str(file_path)
if file_path_str.endswith('gz'):
return gzip.open(file_path_str)
else:
return file_path.open()

View File

@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+import plac
 import shutil
 import requests
 from pathlib import Path
@@ -11,27 +12,38 @@ from .. import util
 from .. import about


-def package(input_dir, output_dir, meta_path, force):
+@plac.annotations(
+    input_dir=("directory with model data", "positional", None, str),
+    output_dir=("output parent directory", "positional", None, str),
+    meta=("path to meta.json", "option", "m", str),
+    force=("force overwriting of existing folder in output directory", "flag", "f", bool)
+)
+def package(cmd, input_dir, output_dir, meta=None, force=False):
+    """
+    Generate Python package for model data, including meta and required
+    installation files. A new directory will be created in the specified
+    output directory, and model data will be copied over.
+    """
     input_path = util.ensure_path(input_dir)
     output_path = util.ensure_path(output_dir)
-    meta_path = util.ensure_path(meta_path)
+    meta_path = util.ensure_path(meta)
     if not input_path or not input_path.exists():
-        prints(input_path, title="Model directory not found", exits=True)
+        prints(input_path, title="Model directory not found", exits=1)
     if not output_path or not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=True)
+        prints(output_path, title="Output directory not found", exits=1)
     if meta_path and not meta_path.exists():
-        prints(meta_path, title="meta.json not found", exits=True)
+        prints(meta_path, title="meta.json not found", exits=1)

     template_setup = get_template('setup.py')
     template_manifest = get_template('MANIFEST.in')
-    template_init = get_template('en_model_name/__init__.py')
+    template_init = get_template('xx_model_name/__init__.py')
     meta_path = meta_path or input_path / 'meta.json'
     if meta_path.is_file():
         prints(meta_path, title="Reading meta.json from file")
         meta = util.read_json(meta_path)
     else:
         meta = generate_meta()
-    validate_meta(meta, ['lang', 'name', 'version'])
+    meta = validate_meta(meta, ['lang', 'name', 'version'])
     model_name = meta['lang'] + '_' + meta['name']
     model_name_v = model_name + '-' + meta['version']
@@ -55,7 +67,7 @@ def create_dirs(package_path, force):
     else:
         prints(package_path, "Please delete the directory and try again, or "
                "use the --force flag to overwrite existing directories.",
-               title="Package directory already exists", exits=True)
+               title="Package directory already exists", exits=1)
     Path.mkdir(package_path, parents=True)
@@ -68,31 +80,45 @@ def generate_meta():
     settings = [('lang', 'Model language', 'en'),
                 ('name', 'Model name', 'model'),
                 ('version', 'Model version', '0.0.0'),
-                ('spacy_version', 'Required spaCy version', '>=2.0.0,<3.0.0'),
+                ('spacy_version', 'Required spaCy version', '>=%s,<3.0.0' % about.__version__),
                 ('description', 'Model description', False),
                 ('author', 'Author', False),
                 ('email', 'Author email', False),
                 ('url', 'Author website', False),
                 ('license', 'License', 'CC BY-NC 3.0')]
     prints("Enter the package settings for your model.", title="Generating meta.json")
     meta = {}
     for setting, desc, default in settings:
         response = util.get_raw_input(desc, default)
         meta[setting] = default if response == '' and default else response
+    meta['pipeline'] = generate_pipeline()
+    if about.__title__ != 'spacy':
+        meta['parent_package'] = about.__title__
     return meta


+def generate_pipeline():
+    prints("If set to 'True', the default pipeline is used. If set to 'False', "
+           "the pipeline will be disabled. Components should be specified as a "
+           "comma-separated list of component names, e.g. vectorizer, tagger, "
+           "parser, ner. For more information, see the docs on processing pipelines.",
+           title="Enter your model's pipeline components")
+    pipeline = util.get_raw_input("Pipeline components", True)
+    replace = {'True': True, 'False': False}
+    return replace[pipeline] if pipeline in replace else pipeline.split(', ')


 def validate_meta(meta, keys):
     for key in keys:
         if key not in meta or meta[key] == '':
             prints("This setting is required to build your package.",
-                   title='No "%s" setting found in meta.json' % key, exits=True)
+                   title='No "%s" setting found in meta.json' % key, exits=1)
+    return meta


 def get_template(filepath):
     r = requests.get(about.__model_files__ + filepath)
     if r.status_code != 200:
         prints("Couldn't fetch template files from GitHub.",
-               title="Server error (%d)" % r.status_code, exits=True)
+               title="Server error (%d)" % r.status_code, exits=1)
     return r.text



@ -1,132 +1,153 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals, division, print_function from __future__ import unicode_literals, division, print_function
import plac
import json import json
from collections import defaultdict from collections import defaultdict
import cytoolz import cytoolz
from pathlib import Path from pathlib import Path
import dill import dill
import tqdm
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer
from ..tokens.doc import Doc from ..tokens.doc import Doc
from ..scorer import Scorer from ..scorer import Scorer
from ..gold import GoldParse, merge_sents from ..gold import GoldParse, merge_sents
from ..gold import read_json_file as read_gold_json from ..gold import GoldCorpus, minibatch
from ..util import prints from ..util import prints
from .. import util from .. import util
from .. import displacy from .. import displacy
from ..compat import json_dumps
def train(language, output_dir, train_data, dev_data, n_iter, n_sents, @plac.annotations(
use_gpu, tagger, parser, ner, parser_L1): lang=("model language", "positional", None, str),
output_dir=("output directory to store model in", "positional", None, str),
train_data=("location of JSON-formatted training data", "positional", None, str),
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
n_iter=("number of iterations", "option", "n", int),
n_sents=("number of sentences", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int),
resume=("Whether to resume training", "flag", "R", bool),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool)
)
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False):
"""
Train a model. Expects data in spaCy's JSON format.
"""
util.set_env_log(True)
n_sents = n_sents or None
output_path = util.ensure_path(output_dir) output_path = util.ensure_path(output_dir)
train_path = util.ensure_path(train_data) train_path = util.ensure_path(train_data)
dev_path = util.ensure_path(dev_data) dev_path = util.ensure_path(dev_data)
if not output_path.exists(): if not output_path.exists():
prints(output_path, title="Output directory not found", exits=True) output_path.mkdir()
if not train_path.exists(): if not train_path.exists():
prints(train_path, title="Training data not found", exits=True) prints(train_path, title="Training data not found", exits=1)
if dev_path and not dev_path.exists(): if dev_path and not dev_path.exists():
prints(dev_path, title="Development data not found", exits=True) prints(dev_path, title="Development data not found", exits=1)
lang = util.get_lang_class(language) lang_class = util.get_lang_class(lang)
parser_cfg = {
'pseudoprojective': True,
'L1': parser_L1,
'n_iter': n_iter,
'lang': language,
'features': lang.Defaults.parser_features}
entity_cfg = {
'n_iter': n_iter,
'lang': language,
'features': lang.Defaults.entity_features}
tagger_cfg = {
'n_iter': n_iter,
'lang': language,
'features': lang.Defaults.tagger_features}
gold_train = list(read_gold_json(train_path, limit=n_sents))
gold_dev = list(read_gold_json(dev_path, limit=n_sents)) if dev_path else None
train_model(lang, gold_train, gold_dev, output_path, n_iter, use_gpu=use_gpu) pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
if gold_dev: if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
scorer = evaluate(lang, gold_dev, output_path) if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies')
print_results(scorer) if no_entities and 'entities' in pipeline: pipeline.remove('entities')
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
# Batch size starts at 1 and grows, so that we make updates quickly
# at the beginning of training.
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
util.env_opt('dropout_to', 0.2),
util.env_opt('dropout_decay', 0.0))
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
util.env_opt('batch_to', 64),
util.env_opt('batch_compound', 1.001))
if resume:
prints(output_path / 'model19.pickle', title="Resuming training")
nlp = dill.load((output_path / 'model19.pickle').open('rb'))
else:
nlp = lang_class(pipeline=pipeline)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
try:
for i in range(n_iter):
if resume:
i += 20
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
train_docs = corpus.train_docs(nlp, projectivize=True,
gold_preproc=False, max_length=0)
losses = {}
for batch in minibatch(train_docs, size=batch_sizes):
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses,
update_tensors=True)
pbar.update(sum(len(doc) for doc in docs))
with nlp.use_params(optimizer.averages):
util.set_env_log(False)
epoch_model_path = output_path / ('model%d' % i)
nlp.to_disk(epoch_model_path)
with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
dill.dump(nlp, file_, -1)
nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
scorer = nlp_loaded.evaluate(
corpus.dev_docs(
nlp_loaded,
gold_preproc=False))
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_:
file_.write(json_dumps(scorer.scores))
util.set_env_log(True)
print_progress(i, losses, scorer.scores)
finally:
print("Saving model...")
with (output_path / 'model-final.pickle').open('wb') as file_:
with nlp.use_params(optimizer.averages):
dill.dump(nlp, file_, -1)
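The dropout and batch-size schedules above are plain generators. A minimal sketch of what such schedules can look like; these are illustrative re-implementations, not spaCy's own util.decaying / util.compounding, which may differ in detail:

.. code:: python

    import itertools

    def decaying(start, stop, decay):
        # start high, decay linearly, never go below `stop`
        value = start
        while True:
            yield max(value, stop)
            value -= decay

    def compounding(start, stop, compound):
        # start small, grow geometrically, never exceed `stop`
        value = start
        while True:
            yield min(value, stop)
            value *= compound

    batch_sizes = compounding(1., 64., 1.001)
    print(list(itertools.islice(batch_sizes, 3)))   # grows slowly from 1.0 towards the cap of 64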
def train_config(config): def _render_parses(i, to_render):
config_path = util.ensure_path(config) to_render[0].user_data['title'] = "Batch %d" % i
if not config_path.is_file(): with Path('/tmp/entities.html').open('w') as file_:
prints(config_path, title="Config file not found", exits=True) html = displacy.render(to_render[:5], style='ent', page=True)
config = json.load(config_path) file_.write(html)
for setting in []: with Path('/tmp/parses.html').open('w') as file_:
if setting not in config.keys(): html = displacy.render(to_render[:5], style='dep', page=True)
prints("%s not found in config file." % setting, title="Missing setting") file_.write(html)
def train_model(Language, train_data, dev_data, output_path, n_iter, **cfg): def print_progress(itn, losses, dev_scores, wps=0.0):
print("Itn.\tDep. Loss\tUAS\tNER F.\tTag %\tToken %")
nlp = Language(pipeline=['token_vectors', 'tags', 'dependencies'])
dropout = util.env_opt('dropout', 0.0)
# TODO: Get spaCy using Thinc's trainer and optimizer
with nlp.begin_training(train_data, **cfg) as (trainer, optimizer):
for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=True)):
losses = defaultdict(float)
to_render = []
for i, (docs, golds) in enumerate(epoch):
state = nlp.update(docs, golds, drop=dropout, sgd=optimizer)
losses['dep_loss'] += state.get('parser_loss', 0.0)
losses['tag_loss'] += state.get('tag_loss', 0.0)
to_render.insert(0, nlp(docs[-1].text))
to_render[0].user_data['title'] = "Batch %d" % i
with Path('/tmp/entities.html').open('w') as file_:
html = displacy.render(to_render[:5], style='ent', page=True)
file_.write(html)
with Path('/tmp/parses.html').open('w') as file_:
html = displacy.render(to_render[:5], style='dep', page=True)
file_.write(html)
if dev_data:
with nlp.use_params(optimizer.averages):
dev_scores = trainer.evaluate(dev_data).scores
else:
dev_scores = defaultdict(float)
print_progress(itn, losses, dev_scores)
with (output_path / 'model.bin').open('wb') as file_:
dill.dump(nlp, file_, -1)
#nlp.to_disk(output_path, tokenizer=False)
def evaluate(Language, gold_tuples, path):
with (path / 'model.bin').open('rb') as file_:
nlp = dill.load(file_)
# TODO:
# 1. This code is duplicate with spacy.train.Trainer.evaluate
# 2. There's currently a semantic difference between pipe and
# not pipe! It matters whether we batch the inputs. Must fix!
all_docs = []
all_golds = []
for raw_text, paragraph_tuples in dev_sents:
if gold_preproc:
raw_text = None
else:
paragraph_tuples = merge_sents(paragraph_tuples)
docs = self.make_docs(raw_text, paragraph_tuples)
golds = self.make_golds(docs, paragraph_tuples)
all_docs.extend(docs)
all_golds.extend(golds)
scorer = Scorer()
for doc, gold in zip(self.nlp.pipe(all_docs), all_golds):
scorer.score(doc, gold)
return scorer
def print_progress(itn, losses, dev_scores):
# TODO: Fix!
scores = {} scores = {}
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', 'ents_f']: for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
'ents_p', 'ents_r', 'ents_f', 'wps']:
scores[col] = 0.0 scores[col] = 0.0
scores.update(losses) scores['dep_loss'] = losses.get('parser', 0.0)
scores['tag_loss'] = losses.get('tagger', 0.0)
scores.update(dev_scores) scores.update(dev_scores)
tpl = '{:d}\t{dep_loss:.3f}\t{tag_loss:.3f}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}' scores['wps'] = wps
tpl = '\t'.join((
'{:d}',
'{dep_loss:.3f}',
'{uas:.3f}',
'{ents_p:.3f}',
'{ents_r:.3f}',
'{ents_f:.3f}',
'{tags_acc:.3f}',
'{token_acc:.3f}',
'{wps:.1f}'))
print(tpl.format(itn, **scores)) print(tpl.format(itn, **scores))


@ -5,6 +5,9 @@ import six
import ftfy import ftfy
import sys import sys
import ujson import ujson
import itertools
from thinc.neural.util import copy_array
try: try:
import cPickle as pickle import cPickle as pickle
@ -32,6 +35,8 @@ copy_reg = copy_reg
CudaStream = CudaStream CudaStream = CudaStream
cupy = cupy cupy = cupy
fix_text = ftfy.fix_text fix_text = ftfy.fix_text
copy_array = copy_array
izip = getattr(itertools, 'izip', zip)
is_python2 = six.PY2 is_python2 = six.PY2
is_python3 = six.PY3 is_python3 = six.PY3
@ -57,6 +62,19 @@ elif is_python3:
path2str = lambda path: str(path) path2str = lambda path: str(path)
def b_to_str(b_str):
if is_python2:
return b_str
# important: if no encoding is set, string becomes "b'...'"
return str(b_str, encoding='utf8')
def getattr_(obj, name, *default):
if is_python3 and isinstance(name, bytes):
name = name.decode('utf8')
return getattr(obj, name, *default)
def symlink_to(orig, dest): def symlink_to(orig, dest):
if is_python2 and is_windows: if is_python2 and is_windows:
import subprocess import subprocess
@ -71,3 +89,16 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
(windows == None or windows == is_windows) and (windows == None or windows == is_windows) and
(linux == None or linux == is_linux) and (linux == None or linux == is_linux) and
(osx == None or osx == is_osx)) (osx == None or osx == is_osx))
def normalize_string_keys(old):
'''Given a dictionary, make sure keys are unicode strings, not bytes.'''
new = {}
for key, value in old.items():
if isinstance(key, bytes_):
new[key.decode('utf8')] = value
else:
new[key] = value
return new


@ -3,6 +3,7 @@ from __future__ import unicode_literals
from .render import DependencyRenderer, EntityRenderer from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc from ..tokens import Doc
from ..compat import b_to_str
from ..util import prints, is_in_jupyter from ..util import prints, is_in_jupyter
@ -10,27 +11,28 @@ _html = {}
IS_JUPYTER = is_in_jupyter() IS_JUPYTER = is_in_jupyter()
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, options={}): def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
options={}, manual=False):
"""Render displaCy visualisation. """Render displaCy visualisation.
docs (list or Doc): Document(s) to visualise. docs (list or Doc): Document(s) to visualise.
style (unicode): Visualisation style, 'dep' or 'ent'. style (unicode): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page. page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup. minify (bool): Minify HTML markup.
jupyter (bool): Experimental, use Jupyter's display() to output markup. jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
options (dict): Visualiser-specific options, e.g. colors. options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
RETURNS (unicode): Rendered HTML markup. RETURNS (unicode): Rendered HTML markup.
""" """
if isinstance(docs, Doc): factories = {'dep': (DependencyRenderer, parse_deps),
docs = [docs] 'ent': (EntityRenderer, parse_ents)}
if style == 'dep': if style not in factories:
renderer = DependencyRenderer(options=options)
parsed = [parse_deps(doc, options) for doc in docs]
elif style == 'ent':
renderer = EntityRenderer(options=options)
parsed = [parse_ents(doc, options) for doc in docs]
else:
raise ValueError("Unknown style: %s" % style) raise ValueError("Unknown style: %s" % style)
if isinstance(docs, Doc) or isinstance(docs, dict):
docs = [docs]
renderer, converter = factories[style]
renderer = renderer(options=options)
parsed = [converter(doc, options) for doc in docs] if not manual else docs
_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip() _html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
html = _html['parsed'] html = _html['parsed']
if jupyter: # return HTML rendered by IPython display() if jupyter: # return HTML rendered by IPython display()
@ -39,7 +41,8 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, opti
return html return html
def serve(docs, style='dep', page=True, minify=False, options={}, port=5000): def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
port=5000):
"""Serve displaCy visualisation. """Serve displaCy visualisation.
docs (list or Doc): Document(s) to visualise. docs (list or Doc): Document(s) to visualise.
@ -47,27 +50,36 @@ def serve(docs, style='dep', page=True, minify=False, options={}, port=5000):
page (bool): Render markup as full HTML page. page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup. minify (bool): Minify HTML markup.
options (dict): Visualiser-specific options, e.g. colors. options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
port (int): Port to serve visualisation. port (int): Port to serve visualisation.
""" """
from wsgiref import simple_server from wsgiref import simple_server
render(docs, style=style, page=page, minify=minify, options=options) render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server('0.0.0.0', port, app) httpd = simple_server.make_server('0.0.0.0', port, app)
prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port) prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
httpd.serve_forever() try:
httpd.serve_forever()
except KeyboardInterrupt:
prints("Shutting down server on port %d." % port)
finally:
httpd.server_close()
def app(environ, start_response): def app(environ, start_response):
start_response('200 OK', [('Content-type', 'text/html; charset=utf-8')]) # headers and status need to be bytes in Python 2, see #1227
headers = [(b_to_str(b'Content-type'), b_to_str(b'text/html; charset=utf-8'))]
start_response(b_to_str(b'200 OK'), headers)
res = _html['parsed'].encode(encoding='utf-8') res = _html['parsed'].encode(encoding='utf-8')
return [res] return [res]
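With the new manual flag, render and serve accept pre-built dicts instead of Doc objects. A small sketch for the 'ent' style; the key layout inside 'ents' (character offsets plus a label) is an assumption for this example:

.. code:: python

    from spacy import displacy

    # Hand-built input for the 'ent' visualizer; the keys inside 'ents'
    # (start/end character offsets plus a label) are assumed for this sketch.
    doc = {'text': 'But Google is starting from behind.',
           'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
           'title': None}
    html = displacy.render(doc, style='ent', manual=True, page=True)
    # displacy.serve(doc, style='ent', manual=True)  # Ctrl+C now shuts the server down cleanly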
def parse_deps(doc, options={}): def parse_deps(orig_doc, options={}):
"""Generate dependency parse in {'words': [], 'arcs': []} format. """Generate dependency parse in {'words': [], 'arcs': []} format.
doc (Doc): Document to parse. doc (Doc): Document to parse.
RETURNS (dict): Generated dependency parse keyed by words and arcs. RETURNS (dict): Generated dependency parse keyed by words and arcs.
""" """
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
if options.get('collapse_punct', True): if options.get('collapse_punct', True):
spans = [] spans = []
for word in doc[:-1]: for word in doc[:-1]:


@ -18,12 +18,11 @@ class DependencyRenderer(object):
offset_x, color, bg, font) offset_x, color, bg, font)
""" """
self.compact = options.get('compact', False) self.compact = options.get('compact', False)
distance, arrow_width = (85, 8) if self.compact else (175, 10)
self.word_spacing = options.get('word_spacing', 45) self.word_spacing = options.get('word_spacing', 45)
self.arrow_spacing = options.get('arrow_spacing', 20) self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
self.arrow_width = options.get('arrow_width', arrow_width) self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
self.arrow_stroke = options.get('arrow_stroke', 2) self.arrow_stroke = options.get('arrow_stroke', 2)
self.distance = options.get('distance', distance) self.distance = options.get('distance', 150 if self.compact else 175)
self.offset_x = options.get('offset_x', 50) self.offset_x = options.get('offset_x', 50)
self.color = options.get('color', '#000000') self.color = options.get('color', '#000000')
self.bg = options.get('bg', '#ffffff') self.bg = options.get('bg', '#ffffff')
@ -99,6 +98,8 @@ class DependencyRenderer(object):
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
-self.arrow_spacing*(self.highest_level-level)/4) -self.arrow_spacing*(self.highest_level-level)/4)
y_curve = self.offset_y-level*self.distance/2 y_curve = self.offset_y-level*self.distance/2
if self.compact:
y_curve = self.offset_y-level*self.distance/6
if y_curve == 0 and len(self.levels) > 5: if y_curve == 0 and len(self.levels) > 5:
y_curve = -self.distance y_curve = -self.distance
arrowhead = self.get_arrowhead(direction, x_start, y, x_end) arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
@ -175,7 +176,7 @@ class EntityRenderer(object):
minify (bool): Minify HTML markup. minify (bool): Minify HTML markup.
RETURNS (unicode): Rendered HTML markup. RETURNS (unicode): Rendered HTML markup.
""" """
rendered = [self.render_ents(p['text'], p['ents'], p['title']) for p in parsed] rendered = [self.render_ents(p['text'], p['ents'], p.get('title', None)) for p in parsed]
if page: if page:
docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered]) docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered])
markup = TPL_PAGE.format(content=docs) markup = TPL_PAGE.format(content=docs)
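The renderer options touched above can be passed straight through displacy.render. A usage sketch, assuming an English model is linked under the 'en' shortcut:

.. code:: python

    import spacy
    from spacy import displacy

    nlp = spacy.load('en')   # assumes an English model is installed and linked as 'en'
    doc = nlp(u'This is a sentence.')
    options = {'compact': True, 'distance': 120, 'color': '#ffffff', 'bg': '#000000'}
    html = displacy.render(doc, style='dep', options=options, page=True)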


@ -21,7 +21,7 @@ TPL_DEP_WORDS = """
TPL_DEP_ARCS = """ TPL_DEP_ARCS = """
<g class="displacy-arrow"> <g class="displacy-arrow">
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/> <path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
<text dy="1.25em" style="font-size: 0.8em"> <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
<textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath> <textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath>
</text> </text>
<path class="displacy-arrowhead" d="{head}" fill="currentColor"/> <path class="displacy-arrowhead" d="{head}" fill="currentColor"/>


@ -1,13 +1,15 @@
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from .structs cimport TokenC from .structs cimport TokenC
from .typedefs cimport attr_t
from .syntax.transition_system cimport Transition from .syntax.transition_system cimport Transition
cdef struct GoldParseC: cdef struct GoldParseC:
int* tags int* tags
int* heads int* heads
int* labels int* has_dep
attr_t* labels
int** brackets int** brackets
Transition* ner Transition* ner
@ -18,15 +20,16 @@ cdef class GoldParse:
cdef GoldParseC c cdef GoldParseC c
cdef int length cdef int length
cdef readonly int loss cdef public int loss
cdef readonly list words cdef public list words
cdef readonly list tags cdef public list tags
cdef readonly list heads cdef public list heads
cdef readonly list labels cdef public list labels
cdef readonly dict orths cdef public dict orths
cdef readonly list ner cdef public list ner
cdef readonly list ents cdef public list ents
cdef readonly dict brackets cdef public dict brackets
cdef public object cats
cdef readonly list cand_to_gold cdef readonly list cand_to_gold
cdef readonly list gold_to_cand cdef readonly list gold_to_cand


@ -5,10 +5,13 @@ from __future__ import unicode_literals, print_function
import io import io
import re import re
import ujson import ujson
import random
import cytoolz
from .syntax import nonproj from .syntax import nonproj
from .util import ensure_path from .util import ensure_path
from . import util from . import util
from .tokens import Doc
def tags_to_entities(tags): def tags_to_entities(tags):
@ -86,8 +89,8 @@ def _min_edit_path(cand_words, gold_words):
# TODO: Fix this --- just do it properly, make the full edit matrix and # TODO: Fix this --- just do it properly, make the full edit matrix and
# then walk back over it... # then walk back over it...
# Preprocess inputs # Preprocess inputs
cand_words = [punct_re.sub('', w) for w in cand_words] cand_words = [punct_re.sub('', w).lower() for w in cand_words]
gold_words = [punct_re.sub('', w) for w in gold_words] gold_words = [punct_re.sub('', w).lower() for w in gold_words]
if cand_words == gold_words: if cand_words == gold_words:
return 0, ''.join(['M' for _ in gold_words]) return 0, ''.join(['M' for _ in gold_words])
@ -139,8 +142,164 @@ def _min_edit_path(cand_words, gold_words):
return prev_costs[n_gold], previous_row[-1] return prev_costs[n_gold], previous_row[-1]
def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None): def minibatch(items, size=8):
make_supertags = util.env_opt('make_supertags', make_supertags) '''Iterate over batches of items. `size` may be an iterator,
so that batch-size can vary on each step.
'''
items = iter(items)
while True:
batch_size = next(size) #if hasattr(size, '__next__') else size
batch = list(cytoolz.take(int(batch_size), items))
if len(batch) == 0:
break
yield list(batch)
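minibatch pulls the next batch size from the size iterator on every step, which is how the training loop above grows its batches over time. A small usage sketch with a fixed size (minibatch is importable from spacy.gold at this point in the tree):

.. code:: python

    import itertools
    from spacy.gold import minibatch

    items = list(range(10))                 # stand-in for (doc, gold) pairs
    for batch in minibatch(items, size=itertools.repeat(4)):
        print(batch)
    # [0, 1, 2, 3]
    # [4, 5, 6, 7]
    # [8, 9]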
class GoldCorpus(object):
"""An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER."""
def __init__(self, train_path, dev_path, gold_preproc=True, limit=None):
"""Create a GoldCorpus.
train_path (unicode or Path): File or directory of training data.
dev_path (unicode or Path): File or directory of development data.
"""
self.train_path = util.ensure_path(train_path)
self.dev_path = util.ensure_path(dev_path)
self.limit = limit
self.train_locs = self.walk_corpus(self.train_path)
self.dev_locs = self.walk_corpus(self.dev_path)
@property
def train_tuples(self):
i = 0
for loc in self.train_locs:
gold_tuples = read_json_file(loc)
for item in gold_tuples:
yield item
i += len(item[1])
if self.limit and i >= self.limit:
break
@property
def dev_tuples(self):
i = 0
for loc in self.dev_locs:
gold_tuples = read_json_file(loc)
for item in gold_tuples:
yield item
i += 1
if self.limit and i >= self.limit:
break
def count_train(self):
n = 0
i = 0
for raw_text, paragraph_tuples in self.train_tuples:
n += sum([len(s[0][1]) for s in paragraph_tuples])
if self.limit and i >= self.limit:
break
i += len(paragraph_tuples)
return n
def train_docs(self, nlp, gold_preproc=False,
projectivize=False, max_length=None,
noise_level=0.0):
train_tuples = self.train_tuples
if projectivize:
train_tuples = nonproj.preprocess_training_data(
self.train_tuples)
random.shuffle(train_tuples)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
max_length=max_length,
noise_level=noise_level)
yield from gold_docs
def dev_docs(self, nlp, gold_preproc=False):
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
#gold_docs = nlp.preprocess_gold(gold_docs)
yield from gold_docs
@classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
noise_level=0.0):
for raw_text, paragraph_tuples in tuples:
if gold_preproc:
raw_text = None
else:
paragraph_tuples = merge_sents(paragraph_tuples)
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
gold_preproc, noise_level=noise_level)
golds = cls._make_golds(docs, paragraph_tuples)
for doc, gold in zip(docs, golds):
if (not max_length) or len(doc) < max_length:
yield doc, gold
@classmethod
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
noise_level=0.0):
if raw_text is not None:
raw_text = add_noise(raw_text, noise_level)
return [nlp.make_doc(raw_text)]
else:
return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples]
@classmethod
def _make_golds(cls, docs, paragraph_tuples):
assert len(docs) == len(paragraph_tuples)
if len(docs) == 1:
return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
else:
return [GoldParse.from_annot_tuples(doc, sent_tuples)
for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]
@staticmethod
def walk_corpus(path):
if not path.is_dir():
return [path]
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts[-1].startswith('.'):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif path.parts[-1].endswith('.json'):
locs.append(path)
return locs
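Putting GoldCorpus together with a blank pipeline looks roughly like the following; the file paths are hypothetical, and the data must already be in spaCy's JSON training format:

.. code:: python

    from spacy.gold import GoldCorpus
    from spacy.lang.en import English

    nlp = English()
    corpus = GoldCorpus('train.json', 'dev.json')     # hypothetical paths
    print(corpus.count_train())                       # number of training words
    for doc, gold in corpus.train_docs(nlp, projectivize=True):
        pass                                          # feed (doc, gold) pairs into nlp.update()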
def add_noise(orig, noise_level):
if random.random() >= noise_level:
return orig
elif type(orig) == list:
corrupted = [_corrupt(word, noise_level) for word in orig]
corrupted = [w for w in corrupted if w]
return corrupted
else:
return ''.join(_corrupt(c, noise_level) for c in orig)
def _corrupt(c, noise_level):
if random.random() >= noise_level:
return c
elif c == ' ':
return '\n'
elif c == '\n':
return ' '
elif c in ['.', "'", "!", "?"]:
return ''
else:
return c.lower()
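With noise_level=1.0 every character goes through _corrupt, which makes the corruption rules easy to see; for lower values the corruption is applied at random. Using the helper defined above:

.. code:: python

    from spacy.gold import add_noise   # module-level helper defined above

    print(add_noise('Hello World.', 1.0))
    # hello
    # world
    # (space becomes newline, trailing '.' is dropped, letters are lower-cased)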
def read_json_file(loc, docs_filter=None, limit=None):
loc = ensure_path(loc) loc = ensure_path(loc)
if loc.is_dir(): if loc.is_dir():
for filename in loc.iterdir(): for filename in loc.iterdir():
@ -173,16 +332,14 @@ def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
if labels[-1].lower() == 'root': if labels[-1].lower() == 'root':
labels[-1] = 'ROOT' labels[-1] = 'ROOT'
ner.append(token.get('ner', '-')) ner.append(token.get('ner', '-'))
if make_supertags:
tags[-1] = '-'.join((tags[-1], labels[-1], ner[-1]))
sents.append([ sents.append([
[ids, words, tags, heads, labels, ner], [ids, words, tags, heads, labels, ner],
sent.get('brackets', [])]) sent.get('brackets', [])])
if sents: if sents:
yield [paragraph.get('raw', None), sents] yield [paragraph.get('raw', None), sents]
def _iob_to_biluo(tags): def iob_to_biluo(tags):
out = [] out = []
curr_label = None curr_label = None
tags = list(tags) tags = list(tags)
@ -224,26 +381,25 @@ cdef class GoldParse:
make_projective=make_projective) make_projective=make_projective)
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None, def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
deps=None, entities=None, make_projective=False): deps=None, entities=None, make_projective=False,
""" cats=tuple()):
Create a GoldParse. """Create a GoldParse.
Arguments: doc (Doc): The document the annotations refer to.
doc (Doc): words (iterable): A sequence of unicode word strings.
The document the annotations refer to. tags (iterable): A sequence of strings, representing tag annotations.
words: heads (iterable): A sequence of integers, representing syntactic head offsets.
A sequence of unicode word strings. deps (iterable): A sequence of strings, representing the syntactic relation types.
tags: entities (iterable): A sequence of named entity annotations, either as
A sequence of strings, representing tag annotations. BILUO tag strings, or as `(start_char, end_char, label)` tuples,
heads: representing the entity positions.
A sequence of integers, representing syntactic head offsets. cats (iterable): A sequence of labels for text classification. Each
deps: label may be a string or an int, or a `(start_char, end_char, label)`
A sequence of strings, representing the syntactic relation types. tuple, indicating that the label is applied to only part of the
entities: document (usually a sentence). Unlike entity annotations, label
A sequence of named entity annotations, either as BILUO tag strings, annotations can overlap, i.e. a single word can be covered by
or as (start_char, end_char, label) tuples, representing the entity multiple labelled spans.
positions. RETURNS (GoldParse): The newly constructed object.
Returns (GoldParse): The newly constructed object.
""" """
if words is None: if words is None:
words = [token.text for token in doc] words = [token.text for token in doc]
@ -268,9 +424,11 @@ cdef class GoldParse:
# These are filled by the tagger/parser/entity recogniser # These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int)) self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int)) self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int)) self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition)) self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.cats = list(cats)
self.words = [None] * len(doc) self.words = [None] * len(doc)
self.tags = [None] * len(doc) self.tags = [None] * len(doc)
self.heads = [None] * len(doc) self.heads = [None] * len(doc)
@ -295,7 +453,10 @@ cdef class GoldParse:
else: else:
self.words[i] = words[gold_i] self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i] self.tags[i] = tags[gold_i]
self.heads[i] = self.gold_to_cand[heads[gold_i]] if heads[gold_i] is None:
self.heads[i] = None
else:
self.heads[i] = self.gold_to_cand[heads[gold_i]]
self.labels[i] = deps[gold_i] self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i] self.ner[i] = entities[gold_i]
@ -304,59 +465,49 @@ cdef class GoldParse:
raise Exception("Cycle found: %s" % cycle) raise Exception("Cycle found: %s" % cycle)
if make_projective: if make_projective:
proj_heads,_ = nonproj.PseudoProjectivity.projectivize(self.heads, self.labels) proj_heads,_ = nonproj.projectivize(self.heads, self.labels)
self.heads = proj_heads self.heads = proj_heads
def __len__(self): def __len__(self):
""" """Get the number of gold-standard tokens.
Get the number of gold-standard tokens.
Returns (int): The number of gold-standard tokens. RETURNS (int): The number of gold-standard tokens.
""" """
return self.length return self.length
@property @property
def is_projective(self): def is_projective(self):
""" """Whether the provided syntactic annotations form a projective
Whether the provided syntactic annotations form a projective dependency dependency tree.
tree.
""" """
return not nonproj.is_nonproj_tree(self.heads) return not nonproj.is_nonproj_tree(self.heads)
def biluo_tags_from_offsets(doc, entities): def biluo_tags_from_offsets(doc, entities, missing='O'):
""" """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out scheme (BILUO).
scheme (biluo).
Arguments: doc (Doc): The document that the entity offsets refer to. The output tags
doc (Doc): will refer to the token boundaries within the document.
The document that the entity offsets refer to. The output tags will entities (iterable): A sequence of `(start, end, label)` triples. `start` and
refer to the token boundaries within the document. `end` should be character-offset integers denoting the slice into the
original string.
entities (sequence): RETURNS (list): A list of unicode strings, describing the tags. Each tag
A sequence of (start, end, label) triples. start and end should be string will be of the form either "", "O" or "{action}-{label}", where
character-offset integers denoting the slice into the original string. action is one of "B", "I", "L", "U". The string "-" is used where the
entity offsets don't align with the tokenization in the `Doc` object. The
training algorithm will view these as missing values. "O" denotes a
non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
Returns: EXAMPLE:
tags (list): >>> text = 'I like London.'
A list of unicode strings, describing the tags. Each tag string will >>> entities = [(len('I like '), len('I like London'), 'LOC')]
be of the form either "", "O" or "{action}-{label}", where action is one >>> doc = nlp.tokenizer(text)
of "B", "I", "L", "U". The string "-" is used where the entity >>> tags = biluo_tags_from_offsets(doc, entities)
offsets don't align with the tokenization in the Doc object. The >>> assert tags == ['O', 'O', 'U-LOC', 'O']
training algorithm will view these as missing values. "O" denotes
a non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
Example:
text = 'I like London.'
entities = [(len('I like '), len('I like London'), 'LOC')]
doc = nlp.tokenizer(text)
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'U-LOC', 'O']
""" """
starts = {token.idx: token.i for token in doc} starts = {token.idx: token.i for token in doc}
ends = {token.idx+len(token): token.i for token in doc} ends = {token.idx+len(token): token.i for token in doc}
@ -384,7 +535,7 @@ def biluo_tags_from_offsets(doc, entities):
if i in entity_chars: if i in entity_chars:
break break
else: else:
biluo[token.i] = 'O' biluo[token.i] = missing
return biluo return biluo
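A round-trip example in the new style: offsets that line up with the tokenization come back as clean BILUO tags, while misaligned offsets leave '-' on the affected tokens, which the training code treats as missing values:

.. code:: python

    from spacy.lang.en import English
    from spacy.gold import biluo_tags_from_offsets

    nlp = English()
    doc = nlp.make_doc(u'I like London and Berlin.')
    entities = [(7, 13, 'LOC'), (18, 24, 'LOC')]
    tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ['O', 'O', 'U-LOC', 'O', 'U-LOC', 'O']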


@ -13,21 +13,23 @@ from ...attrs import LANG
from ...util import update_exc from ...util import update_exc
class BengaliDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'bn'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP
stop_words = STOP_WORDS
lemma_rules = LEMMA_RULES
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
class Bengali(Language): class Bengali(Language):
lang = 'bn' lang = 'bn'
Defaults = BengaliDefaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'bn'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP
stop_words = STOP_WORDS
lemma_rules = LEMMA_RULES
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
__all__ = ['Bengali'] __all__ = ['Bengali']
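The same refactor is applied to every language in this commit: the nested class Defaults becomes a named, module-level *Defaults class that is then assigned onto the Language subclass. A minimal sketch of the new layout for a hypothetical language 'xx':

.. code:: python

    from spacy.language import Language
    from spacy.attrs import LANG

    class XxxxDefaults(Language.Defaults):              # hypothetical example language
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'xx'

    class Xxxx(Language):
        lang = 'xx'
        Defaults = XxxxDefaults

    __all__ = ['Xxxx']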


@ -1,8 +1,8 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
_currency = r"\$|¢|£|€|¥|฿|৳" _currency = r"\$|¢|£|€|¥|฿|৳"
@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
_list_punct = LIST_PUNCT + '। ॥'.strip().split() _list_punct = LIST_PUNCT + '। ॥'.strip().split()
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES) _prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + _suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
[r'(?<=[0-9])\+', [r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.', r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(_currency), r'(?<=[0-9])(?:{})'.format(_currency),
r'(?<=[0-9])(?:{})'.format(UNITS), r'(?<=[0-9])(?:{})'.format(UNITS),
r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)]) r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
_infixes = (LIST_ELLIPSES + _infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),


@ -20,7 +20,6 @@ _upper = [_latin_upper]
_lower = [_latin_lower] _lower = [_latin_lower]
_uncased = [_bengali, _hebrew] _uncased = [_bengali, _hebrew]
ALPHA = merge_char_classes(_upper + _lower + _uncased) ALPHA = merge_char_classes(_upper + _lower + _uncased)
ALPHA_LOWER = merge_char_classes(_lower + _uncased) ALPHA_LOWER = merge_char_classes(_lower + _uncased)
ALPHA_UPPER = merge_char_classes(_upper + _uncased) ALPHA_UPPER = merge_char_classes(_upper + _uncased)
@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &' _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
_quotes = r'\' \'\' " ” “ `` ` ´ , „ » «' _quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'
_hyphens = '- — -- ---' _hyphens = '- — -- ---'
_other_symbols = r'[\p{So}]'
UNITS = merge_chars(_units) UNITS = merge_chars(_units)
CURRENCY = merge_chars(_currency) CURRENCY = merge_chars(_currency)
QUOTES = merge_chars(_quotes) QUOTES = merge_chars(_quotes)
PUNCT = merge_chars(_punct) PUNCT = merge_chars(_punct)
HYPHENS = merge_chars(_hyphens) HYPHENS = merge_chars(_hyphens)
ICONS = _other_symbols
LIST_UNITS = split_chars(_units) LIST_UNITS = split_chars(_units)
LIST_CURRENCY = split_chars(_currency) LIST_CURRENCY = split_chars(_currency)
@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
LIST_PUNCT = split_chars(_punct) LIST_PUNCT = split_chars(_punct)
LIST_HYPHENS = split_chars(_hyphens) LIST_HYPHENS = split_chars(_hyphens)
LIST_ELLIPSES = [r'\.\.+', '…'] LIST_ELLIPSES = [r'\.\.+', '…']
LIST_ICONS = [_other_symbols]
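The new ICONS class is built from the Unicode "Symbol, other" category (\p{So}), which covers pictographs and emoji. The stdlib re module doesn't understand \p{...} classes, so this quick check uses the third-party regex package:

.. code:: python

    import regex   # pip install regex

    icon_re = regex.compile(r'[\p{So}]')
    assert icon_re.search(u'I am 😀 today') is not None
    assert icon_re.search(u'plain ascii text') is None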


@ -5,20 +5,24 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...attrs import LANG, NORM
from ...util import update_exc from ...util import update_exc, add_lookups
class DanishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'da'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Danish(Language): class Danish(Language):
lang = 'da' lang = 'da'
Defaults = DanishDefaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'da'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
__all__ = ['Danish'] __all__ = ['Danish']


@ -2,33 +2,39 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .tag_map import TAG_MAP from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...lemmatizerlookup import Lemmatizer from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG from ...attrs import LANG, NORM
from ...util import update_exc from ...util import update_exc, add_lookups
class GermanDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'de'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
NORM_EXCEPTIONS, BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class German(Language): class German(Language):
lang = 'de' lang = 'de'
Defaults = GermanDefaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'de'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
__all__ = ['German'] __all__ = ['German']
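The NORM getter is now wrapped with add_lookups, layering the language's NORM_EXCEPTIONS and the shared BASE_NORMS over the default getter. A rough sketch of that layering; this is an illustrative stand-in, not spacy.util.add_lookups itself, which may differ in detail:

.. code:: python

    def add_lookups(default_getter, *tables):
        # Check each exception table in order, then fall back to the default getter.
        def get_norm(string):
            for table in tables:
                if string in table:
                    return table[string]
            return default_getter(string)
        return get_norm

    base = lambda string: string.lower()
    norm = add_lookups(base, {'daß': 'dass'})
    assert norm('daß') == 'dass'
    assert norm('Haus') == 'haus'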


@ -0,0 +1,17 @@
# coding: utf8
from __future__ import unicode_literals
# Here we only want to include the absolute most common words. Otherwise,
# this list would get impossibly long for German especially considering the
# old vs. new spelling rules, and all possible cases.
_exc = {
"daß": "dass"
}
NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
NORM_EXCEPTIONS[string.title()] = norm


@ -15,9 +15,9 @@ def noun_chunks(obj):
# and not just "eine Tasse", same for "das Thema Familie". # and not just "eine Tasse", same for "das Thema Familie".
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app'] labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
doc = obj.doc # Ensure works on both Doc and Span. doc = obj.doc # Ensure works on both Doc and Span.
np_label = doc.vocab.strings['NP'] np_label = doc.vocab.strings.add('NP')
np_deps = set(doc.vocab.strings[label] for label in labels) np_deps = set(doc.vocab.strings.add(label) for label in labels)
close_app = doc.vocab.strings['nk'] close_app = doc.vocab.strings.add('nk')
rbracket = 0 rbracket = 0
for i, word in enumerate(obj): for i, word in enumerate(obj):
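Both noun-chunk iterators now call strings.add(...) instead of indexing into the store, so the label strings are guaranteed to be interned before their hashes are used. A small sketch of the StringStore behaviour this relies on (an assumption about the v2 API, shown for illustration):

.. code:: python

    from spacy.strings import StringStore

    strings = StringStore()
    np_label = strings.add('NP')        # interns the string, returns its hash
    assert strings[np_label] == 'NP'    # the hash resolves back to the string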


@ -8,7 +8,7 @@ from ...deprecated import PRON_LEMMA
_exc = { _exc = {
"auf'm": [ "auf'm": [
{ORTH: "auf", LEMMA: "auf"}, {ORTH: "auf", LEMMA: "auf"},
{ORTH: "'m", LEMMA: "der", NORM: "dem" }], {ORTH: "'m", LEMMA: "der", NORM: "dem"}],
"du's": [ "du's": [
{ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "du", LEMMA: PRON_LEMMA, TAG: "PPER"},
@ -53,97 +53,97 @@ _exc = {
for exc_data in [ for exc_data in [
{ORTH: "'S", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "'S", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "S'", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "S'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "s'", LEMMA: PRON_LEMMA, TAG: "PPER"}, {ORTH: "s'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
{ORTH: "'n", LEMMA: "ein", NORM: "ein"}, {ORTH: "'n", LEMMA: "ein", NORM: "ein"},
{ORTH: "'ne", LEMMA: "eine", NORM: "eine"}, {ORTH: "'ne", LEMMA: "eine", NORM: "eine"},
{ORTH: "'nen", LEMMA: "ein", NORM: "einen"}, {ORTH: "'nen", LEMMA: "ein", NORM: "einen"},
{ORTH: "'nem", LEMMA: "ein", NORM: "einem"}, {ORTH: "'nem", LEMMA: "ein", NORM: "einem"},
{ORTH: "Abb.", LEMMA: "Abbildung"}, {ORTH: "Abb.", LEMMA: "Abbildung", NORM: "Abbildung"},
{ORTH: "Abk.", LEMMA: "Abkürzung"}, {ORTH: "Abk.", LEMMA: "Abkürzung", NORM: "Abkürzung"},
{ORTH: "Abt.", LEMMA: "Abteilung"}, {ORTH: "Abt.", LEMMA: "Abteilung", NORM: "Abteilung"},
{ORTH: "Apr.", LEMMA: "April"}, {ORTH: "Apr.", LEMMA: "April", NORM: "April"},
{ORTH: "Aug.", LEMMA: "August"}, {ORTH: "Aug.", LEMMA: "August", NORM: "August"},
{ORTH: "Bd.", LEMMA: "Band"}, {ORTH: "Bd.", LEMMA: "Band", NORM: "Band"},
{ORTH: "Betr.", LEMMA: "Betreff"}, {ORTH: "Betr.", LEMMA: "Betreff", NORM: "Betreff"},
{ORTH: "Bf.", LEMMA: "Bahnhof"}, {ORTH: "Bf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
{ORTH: "Bhf.", LEMMA: "Bahnhof"}, {ORTH: "Bhf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
{ORTH: "Bsp.", LEMMA: "Beispiel"}, {ORTH: "Bsp.", LEMMA: "Beispiel", NORM: "Beispiel"},
{ORTH: "Dez.", LEMMA: "Dezember"}, {ORTH: "Dez.", LEMMA: "Dezember", NORM: "Dezember"},
{ORTH: "Di.", LEMMA: "Dienstag"}, {ORTH: "Di.", LEMMA: "Dienstag", NORM: "Dienstag"},
{ORTH: "Do.", LEMMA: "Donnerstag"}, {ORTH: "Do.", LEMMA: "Donnerstag", NORM: "Donnerstag"},
{ORTH: "Fa.", LEMMA: "Firma"}, {ORTH: "Fa.", LEMMA: "Firma", NORM: "Firma"},
{ORTH: "Fam.", LEMMA: "Familie"}, {ORTH: "Fam.", LEMMA: "Familie", NORM: "Familie"},
{ORTH: "Feb.", LEMMA: "Februar"}, {ORTH: "Feb.", LEMMA: "Februar", NORM: "Februar"},
{ORTH: "Fr.", LEMMA: "Frau"}, {ORTH: "Fr.", LEMMA: "Frau", NORM: "Frau"},
{ORTH: "Frl.", LEMMA: "Fräulein"}, {ORTH: "Frl.", LEMMA: "Fräulein", NORM: "Fräulein"},
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof"}, {ORTH: "Hbf.", LEMMA: "Hauptbahnhof", NORM: "Hauptbahnhof"},
{ORTH: "Hr.", LEMMA: "Herr"}, {ORTH: "Hr.", LEMMA: "Herr", NORM: "Herr"},
{ORTH: "Hrn.", LEMMA: "Herr"}, {ORTH: "Hrn.", LEMMA: "Herr", NORM: "Herrn"},
{ORTH: "Jan.", LEMMA: "Januar"}, {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
{ORTH: "Jh.", LEMMA: "Jahrhundert"}, {ORTH: "Jh.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
{ORTH: "Jhd.", LEMMA: "Jahrhundert"}, {ORTH: "Jhd.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
{ORTH: "Jul.", LEMMA: "Juli"}, {ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"},
{ORTH: "Jun.", LEMMA: "Juni"}, {ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"},
{ORTH: "Mi.", LEMMA: "Mittwoch"}, {ORTH: "Mi.", LEMMA: "Mittwoch", NORM: "Mittwoch"},
{ORTH: "Mio.", LEMMA: "Million"}, {ORTH: "Mio.", LEMMA: "Million", NORM: "Million"},
{ORTH: "Mo.", LEMMA: "Montag"}, {ORTH: "Mo.", LEMMA: "Montag", NORM: "Montag"},
{ORTH: "Mrd.", LEMMA: "Milliarde"}, {ORTH: "Mrd.", LEMMA: "Milliarde", NORM: "Milliarde"},
{ORTH: "Mrz.", LEMMA: "März"}, {ORTH: "Mrz.", LEMMA: "März", NORM: "März"},
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"}, {ORTH: "MwSt.", LEMMA: "Mehrwertsteuer", NORM: "Mehrwertsteuer"},
{ORTH: "Mär.", LEMMA: "März"}, {ORTH: "Mär.", LEMMA: "März", NORM: "März"},
{ORTH: "Nov.", LEMMA: "November"}, {ORTH: "Nov.", LEMMA: "November", NORM: "November"},
{ORTH: "Nr.", LEMMA: "Nummer"}, {ORTH: "Nr.", LEMMA: "Nummer", NORM: "Nummer"},
{ORTH: "Okt.", LEMMA: "Oktober"}, {ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"},
{ORTH: "Orig.", LEMMA: "Original"}, {ORTH: "Orig.", LEMMA: "Original", NORM: "Original"},
{ORTH: "Pkt.", LEMMA: "Punkt"}, {ORTH: "Pkt.", LEMMA: "Punkt", NORM: "Punkt"},
{ORTH: "Prof.", LEMMA: "Professor"}, {ORTH: "Prof.", LEMMA: "Professor", NORM: "Professor"},
{ORTH: "Red.", LEMMA: "Redaktion"}, {ORTH: "Red.", LEMMA: "Redaktion", NORM: "Redaktion"},
{ORTH: "Sa.", LEMMA: "Samstag"}, {ORTH: "Sa.", LEMMA: "Samstag", NORM: "Samstag"},
{ORTH: "Sep.", LEMMA: "September"}, {ORTH: "Sep.", LEMMA: "September", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September"}, {ORTH: "Sept.", LEMMA: "September", NORM: "September"},
{ORTH: "So.", LEMMA: "Sonntag"}, {ORTH: "So.", LEMMA: "Sonntag", NORM: "Sonntag"},
{ORTH: "Std.", LEMMA: "Stunde"}, {ORTH: "Std.", LEMMA: "Stunde", NORM: "Stunde"},
{ORTH: "Str.", LEMMA: "Straße"}, {ORTH: "Str.", LEMMA: "Straße", NORM: "Straße"},
{ORTH: "Tel.", LEMMA: "Telefon"}, {ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
{ORTH: "Tsd.", LEMMA: "Tausend"}, {ORTH: "Tsd.", LEMMA: "Tausend", NORM: "Tausend"},
{ORTH: "Univ.", LEMMA: "Universität"}, {ORTH: "Univ.", LEMMA: "Universität", NORM: "Universität"},
{ORTH: "abzgl.", LEMMA: "abzüglich"}, {ORTH: "abzgl.", LEMMA: "abzüglich", NORM: "abzüglich"},
{ORTH: "allg.", LEMMA: "allgemein"}, {ORTH: "allg.", LEMMA: "allgemein", NORM: "allgemein"},
{ORTH: "bspw.", LEMMA: "beispielsweise"}, {ORTH: "bspw.", LEMMA: "beispielsweise", NORM: "beispielsweise"},
{ORTH: "bzgl.", LEMMA: "bezüglich"}, {ORTH: "bzgl.", LEMMA: "bezüglich", NORM: "bezüglich"},
{ORTH: "bzw.", LEMMA: "beziehungsweise"}, {ORTH: "bzw.", LEMMA: "beziehungsweise", NORM: "beziehungsweise"},
{ORTH: "d.h.", LEMMA: "das heißt"}, {ORTH: "d.h.", LEMMA: "das heißt"},
{ORTH: "dgl.", LEMMA: "dergleichen"}, {ORTH: "dgl.", LEMMA: "dergleichen", NORM: "dergleichen"},
{ORTH: "ebd.", LEMMA: "ebenda"}, {ORTH: "ebd.", LEMMA: "ebenda", NORM: "ebenda"},
{ORTH: "eigtl.", LEMMA: "eigentlich"}, {ORTH: "eigtl.", LEMMA: "eigentlich", NORM: "eigentlich"},
{ORTH: "engl.", LEMMA: "englisch"}, {ORTH: "engl.", LEMMA: "englisch", NORM: "englisch"},
{ORTH: "evtl.", LEMMA: "eventuell"}, {ORTH: "evtl.", LEMMA: "eventuell", NORM: "eventuell"},
{ORTH: "frz.", LEMMA: "französisch"}, {ORTH: "frz.", LEMMA: "französisch", NORM: "französisch"},
{ORTH: "gegr.", LEMMA: "gegründet"}, {ORTH: "gegr.", LEMMA: "gegründet", NORM: "gegründet"},
{ORTH: "ggf.", LEMMA: "gegebenenfalls"}, {ORTH: "ggf.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
{ORTH: "ggfs.", LEMMA: "gegebenenfalls"}, {ORTH: "ggfs.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
{ORTH: "ggü.", LEMMA: "gegenüber"}, {ORTH: "ggü.", LEMMA: "gegenüber", NORM: "gegenüber"},
{ORTH: "i.O.", LEMMA: "in Ordnung"}, {ORTH: "i.O.", LEMMA: "in Ordnung"},
{ORTH: "i.d.R.", LEMMA: "in der Regel"}, {ORTH: "i.d.R.", LEMMA: "in der Regel"},
{ORTH: "incl.", LEMMA: "inklusive"}, {ORTH: "incl.", LEMMA: "inklusive", NORM: "inklusive"},
{ORTH: "inkl.", LEMMA: "inklusive"}, {ORTH: "inkl.", LEMMA: "inklusive", NORM: "inklusive"},
{ORTH: "insb.", LEMMA: "insbesondere"}, {ORTH: "insb.", LEMMA: "insbesondere", NORM: "insbesondere"},
{ORTH: "kath.", LEMMA: "katholisch"}, {ORTH: "kath.", LEMMA: "katholisch", NORM: "katholisch"},
{ORTH: "lt.", LEMMA: "laut"}, {ORTH: "lt.", LEMMA: "laut", NORM: "laut"},
{ORTH: "max.", LEMMA: "maximal"}, {ORTH: "max.", LEMMA: "maximal", NORM: "maximal"},
{ORTH: "min.", LEMMA: "minimal"}, {ORTH: "min.", LEMMA: "minimal", NORM: "minimal"},
{ORTH: "mind.", LEMMA: "mindestens"}, {ORTH: "mind.", LEMMA: "mindestens", NORM: "mindestens"},
{ORTH: "mtl.", LEMMA: "monatlich"}, {ORTH: "mtl.", LEMMA: "monatlich", NORM: "monatlich"},
{ORTH: "n.Chr.", LEMMA: "nach Christus"}, {ORTH: "n.Chr.", LEMMA: "nach Christus"},
{ORTH: "orig.", LEMMA: "original"}, {ORTH: "orig.", LEMMA: "original", NORM: "original"},
{ORTH: "röm.", LEMMA: "römisch"}, {ORTH: "röm.", LEMMA: "römisch", NORM: "römisch"},
{ORTH: "s.o.", LEMMA: "siehe oben"}, {ORTH: "s.o.", LEMMA: "siehe oben"},
{ORTH: "sog.", LEMMA: "so genannt"}, {ORTH: "sog.", LEMMA: "so genannt"},
{ORTH: "stellv.", LEMMA: "stellvertretend"}, {ORTH: "stellv.", LEMMA: "stellvertretend"},
{ORTH: "tägl.", LEMMA: "täglich"}, {ORTH: "tägl.", LEMMA: "täglich", NORM: "täglich"},
{ORTH: "u.U.", LEMMA: "unter Umständen"}, {ORTH: "u.U.", LEMMA: "unter Umständen"},
{ORTH: "u.s.w.", LEMMA: "und so weiter"}, {ORTH: "u.s.w.", LEMMA: "und so weiter"},
{ORTH: "u.v.m.", LEMMA: "und vieles mehr"}, {ORTH: "u.v.m.", LEMMA: "und vieles mehr"},
@ -153,9 +153,9 @@ for exc_data in [
{ORTH: "v.Chr.", LEMMA: "vor Christus"}, {ORTH: "v.Chr.", LEMMA: "vor Christus"},
{ORTH: "v.a.", LEMMA: "vor allem"}, {ORTH: "v.a.", LEMMA: "vor allem"},
{ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"}, {ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"},
{ORTH: "vgl.", LEMMA: "vergleiche"}, {ORTH: "vgl.", LEMMA: "vergleiche", NORM: "vergleiche"},
{ORTH: "vllt.", LEMMA: "vielleicht"}, {ORTH: "vllt.", LEMMA: "vielleicht", NORM: "vielleicht"},
{ORTH: "vlt.", LEMMA: "vielleicht"}, {ORTH: "vlt.", LEMMA: "vielleicht", NORM: "vielleicht"},
{ORTH: "z.B.", LEMMA: "zum Beispiel"}, {ORTH: "z.B.", LEMMA: "zum Beispiel"},
{ORTH: "z.Bsp.", LEMMA: "zum Beispiel"}, {ORTH: "z.Bsp.", LEMMA: "zum Beispiel"},
{ORTH: "z.T.", LEMMA: "zum Teil"}, {ORTH: "z.T.", LEMMA: "zum Teil"},
@ -163,7 +163,7 @@ for exc_data in [
{ORTH: "z.Zt.", LEMMA: "zur Zeit"}, {ORTH: "z.Zt.", LEMMA: "zur Zeit"},
{ORTH: "z.b.", LEMMA: "zum Beispiel"}, {ORTH: "z.b.", LEMMA: "zum Beispiel"},
{ORTH: "zzgl.", LEMMA: "zuzüglich"}, {ORTH: "zzgl.", LEMMA: "zuzüglich"},
{ORTH: "österr.", LEMMA: "österreichisch"}]: {ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [dict(exc_data)]


@ -2,6 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .tag_map import TAG_MAP from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
@ -10,27 +11,32 @@ from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...attrs import LANG, NORM
from ...util import update_exc from ...util import update_exc, add_lookups
class EnglishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: 'en'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
BASE_NORMS, NORM_EXCEPTIONS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
morph_rules = dict(MORPH_RULES)
lemma_rules = dict(LEMMA_RULES)
lemma_index = dict(LEMMA_INDEX)
lemma_exc = dict(LEMMA_EXC)
syntax_iterators = dict(SYNTAX_ITERATORS)
class English(Language): class English(Language):
lang = 'en' lang = 'en'
Defaults = EnglishDefaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'en'
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
morph_rules = dict(MORPH_RULES)
lemma_rules = dict(LEMMA_RULES)
lemma_index = dict(LEMMA_INDEX)
lemma_exc = dict(LEMMA_EXC)
sytax_iterators = dict(SYNTAX_ITERATORS)
__all__ = ['English'] __all__ = ['English']

File diff suppressed because it is too large


@ -11,9 +11,9 @@ def noun_chunks(obj):
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
'attr', 'ROOT'] 'attr', 'ROOT']
doc = obj.doc # Ensure works on both Doc and Span. doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels] np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings['conj'] conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings['NP'] np_label = doc.vocab.strings.add('NP')
seen = set() seen = set()
for i, word in enumerate(obj): for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON): if word.pos not in (NOUN, PROPN, PRON):


@ -15,20 +15,20 @@ _exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
for pron in ["i"]: for pron in ["i"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'m"] = [ _exc[orth + "'m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}] {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP", "tenspect": 1, "number": 1}]
_exc[orth + "m"] = [ _exc[orth + "m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }] {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
_exc[orth + "'ma"] = [ _exc[orth + "'ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'m", LEMMA: "be", NORM: "am"}, {ORTH: "'m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}] {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
_exc[orth + "ma"] = [ _exc[orth + "ma"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "m", LEMMA: "be", NORM: "am"}, {ORTH: "m", LEMMA: "be", NORM: "am"},
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}] {ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
@ -36,72 +36,72 @@ for pron in ["i"]:
for pron in ["i", "you", "he", "she", "it", "we", "they"]: for pron in ["i", "you", "he", "she", "it", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'ll"] = [ _exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}] {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "ll"] = [ _exc[orth + "ll"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}] {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "'ll've"] = [ _exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}, {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "llve"] = [ _exc[orth + "llve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}, {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}] {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}]
_exc[orth + "d"] = [ _exc[orth + "d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}] {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}]
_exc[orth + "'d've"] = [ _exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "dve"] = [ _exc[orth + "dve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for pron in ["i", "you", "we", "they"]: for pron in ["i", "you", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'ve"] = [ _exc[orth + "'ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "ve"] = [ _exc[orth + "ve"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for pron in ["you", "we", "they"]: for pron in ["you", "we", "they"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'re"] = [ _exc[orth + "'re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}] {ORTH: "'re", LEMMA: "be", NORM: "are"}]
_exc[orth + "re"] = [ _exc[orth + "re"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}] {ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
for pron in ["he", "she", "it"]: for pron in ["he", "she", "it"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'s"] = [ _exc[orth + "'s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'s"}] {ORTH: "'s", NORM: "'s"}]
_exc[orth + "s"] = [ _exc[orth + "s"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "s"}] {ORTH: "s"}]
@ -110,111 +110,111 @@ for pron in ["he", "she", "it"]:
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
for orth in [word, word.title()]: for orth in [word, word.title()]:
_exc[orth + "'s"] = [ _exc[orth + "'s"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'s"}] {ORTH: "'s", NORM: "'s"}]
_exc[orth + "s"] = [ _exc[orth + "s"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "s"}] {ORTH: "s"}]
_exc[orth + "'ll"] = [ _exc[orth + "'ll"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}] {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "ll"] = [ _exc[orth + "ll"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}] {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
_exc[orth + "'ll've"] = [ _exc[orth + "'ll've"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}, {ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "llve"] = [ _exc[orth + "llve"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "ll", LEMMA: "will", TAG: "MD"}, {ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'re"] = [ _exc[orth + "'re"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'re", LEMMA: "be", NORM: "are"}] {ORTH: "'re", LEMMA: "be", NORM: "are"}]
_exc[orth + "re"] = [ _exc[orth + "re"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "re", LEMMA: "be", NORM: "are"}] {ORTH: "re", LEMMA: "be", NORM: "are"}]
_exc[orth + "'ve"] = [ _exc[orth + "'ve"] = [
{ORTH: orth}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
_exc[orth + "ve"] = [ _exc[orth + "ve"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'d"}] {ORTH: "'d", NORM: "'d"}]
_exc[orth + "d"] = [ _exc[orth + "d"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d"}] {ORTH: "d"}]
_exc[orth + "'d've"] = [ _exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "'d", LEMMA: "would", TAG: "MD"}, {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[orth + "dve"] = [ _exc[orth + "dve"] = [
{ORTH: orth, LEMMA: word}, {ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d", LEMMA: "would", TAG: "MD"}, {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
# Verbs # Verbs
for verb_data in [ for verb_data in [
{ORTH: "ca", LEMMA: "can", TAG: "MD"}, {ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "could", TAG: "MD"}, {ORTH: "could", NORM: "could", TAG: "MD"},
{ORTH: "do", LEMMA: "do"}, {ORTH: "do", LEMMA: "do", NORM: "do"},
{ORTH: "does", LEMMA: "do"}, {ORTH: "does", LEMMA: "do", NORM: "does"},
{ORTH: "did", LEMMA: "do", TAG: "VBD"}, {ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"},
{ORTH: "had", LEMMA: "have", TAG: "VBD"}, {ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"},
{ORTH: "may", TAG: "MD"}, {ORTH: "may", NORM: "may", TAG: "MD"},
{ORTH: "might", TAG: "MD"}, {ORTH: "might", NORM: "might", TAG: "MD"},
{ORTH: "must", TAG: "MD"}, {ORTH: "must", NORM: "must", TAG: "MD"},
{ORTH: "need"}, {ORTH: "need", NORM: "need"},
{ORTH: "ought"}, {ORTH: "ought", NORM: "ought", TAG: "MD"},
{ORTH: "sha", LEMMA: "shall", TAG: "MD"}, {ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"},
{ORTH: "should", TAG: "MD"}, {ORTH: "should", NORM: "should", TAG: "MD"},
{ORTH: "wo", LEMMA: "will", TAG: "MD"}, {ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"},
{ORTH: "would", TAG: "MD"}]: {ORTH: "would", NORM: "would", TAG: "MD"}]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [ _exc[data[ORTH] + "n't"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}] {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "nt"] = [ _exc[data[ORTH] + "nt"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}] {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "n't've"] = [ _exc[data[ORTH] + "n't've"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}, {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}] {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
_exc[data[ORTH] + "ntve"] = [ _exc[data[ORTH] + "ntve"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}, {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}] {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
for verb_data in [ for verb_data in [
{ORTH: "could", TAG: "MD"}, {ORTH: "could", NORM: "could", TAG: "MD"},
{ORTH: "might"}, {ORTH: "might", NORM: "might", TAG: "MD"},
{ORTH: "must"}, {ORTH: "must", NORM: "must", TAG: "MD"},
{ORTH: "should"}]: {ORTH: "should", NORM: "should", TAG: "MD"}]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
@ -228,21 +228,21 @@ for verb_data in [
for verb_data in [ for verb_data in [
{ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, {ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2},
{ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, {ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
{ORTH: "is", LEMMA: "be", TAG: "VBZ"}, {ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
{ORTH: "was", LEMMA: "be"}, {ORTH: "was", LEMMA: "be", NORM: "was"},
{ORTH: "were", LEMMA: "be"}]: {ORTH: "were", LEMMA: "be", NORM: "were"}]:
verb_data_tc = dict(verb_data) verb_data_tc = dict(verb_data)
verb_data_tc[ORTH] = verb_data_tc[ORTH].title() verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
for data in [verb_data, verb_data_tc]: for data in [verb_data, verb_data_tc]:
_exc[data[ORTH] + "n't"] = [ _exc[data[ORTH] + "n't"] = [
dict(data), dict(data),
{ORTH: "n't", LEMMA: "not", TAG: "RB"}] {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
_exc[data[ORTH] + "nt"] = [ _exc[data[ORTH] + "nt"] = [
dict(data), dict(data),
{ORTH: "nt", LEMMA: "not", TAG: "RB"}] {ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
# Other contractions with trailing apostrophe # Other contractions with trailing apostrophe
@ -250,10 +250,10 @@ for verb_data in [
for exc_data in [ for exc_data in [
{ORTH: "doin", LEMMA: "do", NORM: "doing"}, {ORTH: "doin", LEMMA: "do", NORM: "doing"},
{ORTH: "goin", LEMMA: "go", NORM: "going"}, {ORTH: "goin", LEMMA: "go", NORM: "going"},
{ORTH: "nothin", LEMMA: "nothing"}, {ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"},
{ORTH: "nuthin", LEMMA: "nothing"}, {ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"},
{ORTH: "ol", LEMMA: "old"}, {ORTH: "ol", LEMMA: "old", NORM: "old"},
{ORTH: "somethin", LEMMA: "something"}]: {ORTH: "somethin", LEMMA: "something", NORM: "something"}]:
exc_data_tc = dict(exc_data) exc_data_tc = dict(exc_data)
exc_data_tc[ORTH] = exc_data_tc[ORTH].title() exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
for data in [exc_data, exc_data_tc]: for data in [exc_data, exc_data_tc]:
@ -266,10 +266,10 @@ for exc_data in [
# Other contractions with leading apostrophe # Other contractions with leading apostrophe
for exc_data in [ for exc_data in [
{ORTH: "cause", LEMMA: "because"}, {ORTH: "cause", LEMMA: "because", NORM: "because"},
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}, {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
{ORTH: "ll", LEMMA: "will"}, {ORTH: "ll", LEMMA: "will", NORM: "will"},
{ORTH: "nuff", LEMMA: "enough"}]: {ORTH: "nuff", LEMMA: "enough", NORM: "enough"}]:
exc_data_apos = dict(exc_data) exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
for data in [exc_data, exc_data_apos]: for data in [exc_data, exc_data_apos]:
@ -282,11 +282,11 @@ for h in range(1, 12 + 1):
for period in ["a.m.", "am"]: for period in ["a.m.", "am"]:
_exc["%d%s" % (h, period)] = [ _exc["%d%s" % (h, period)] = [
{ORTH: "%d" % h}, {ORTH: "%d" % h},
{ORTH: period, LEMMA: "a.m."}] {ORTH: period, LEMMA: "a.m.", NORM: "a.m."}]
for period in ["p.m.", "pm"]: for period in ["p.m.", "pm"]:
_exc["%d%s" % (h, period)] = [ _exc["%d%s" % (h, period)] = [
{ORTH: "%d" % h}, {ORTH: "%d" % h},
{ORTH: period, LEMMA: "p.m."}] {ORTH: period, LEMMA: "p.m.", NORM: "p.m."}]
# Rest # Rest
@ -306,56 +306,56 @@ _other_exc = {
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}], {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"How'd'y": [ "How'd'y": [
{ORTH: "How", LEMMA: "how"}, {ORTH: "How", LEMMA: "how", NORM: "how"},
{ORTH: "'d", LEMMA: "do"}, {ORTH: "'d", LEMMA: "do"},
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}], {ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
"not've": [ "not've": [
{ORTH: "not", LEMMA: "not", TAG: "RB"}, {ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}], {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"notve": [ "notve": [
{ORTH: "not", LEMMA: "not", TAG: "RB"}, {ORTH: "not", LEMMA: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}], {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"Not've": [ "Not've": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"}, {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}], {ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"Notve": [ "Notve": [
{ORTH: "Not", LEMMA: "not", TAG: "RB"}, {ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
{ORTH: "ve", LEMMA: "have", TAG: "VB"}], {ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
"cannot": [ "cannot": [
{ORTH: "can", LEMMA: "can", TAG: "MD"}, {ORTH: "can", LEMMA: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}], {ORTH: "not", LEMMA: "not", TAG: "RB"}],
"Cannot": [ "Cannot": [
{ORTH: "Can", LEMMA: "can", TAG: "MD"}, {ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
{ORTH: "not", LEMMA: "not", TAG: "RB"}], {ORTH: "not", LEMMA: "not", TAG: "RB"}],
"gonna": [ "gonna": [
{ORTH: "gon", LEMMA: "go", NORM: "going"}, {ORTH: "gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}], {ORTH: "na", LEMMA: "to", NORM: "to"}],
"Gonna": [ "Gonna": [
{ORTH: "Gon", LEMMA: "go", NORM: "going"}, {ORTH: "Gon", LEMMA: "go", NORM: "going"},
{ORTH: "na", LEMMA: "to"}], {ORTH: "na", LEMMA: "to", NORM: "to"}],
"gotta": [ "gotta": [
{ORTH: "got"}, {ORTH: "got"},
{ORTH: "ta", LEMMA: "to"}], {ORTH: "ta", LEMMA: "to", NORM: "to"}],
"Gotta": [ "Gotta": [
{ORTH: "Got"}, {ORTH: "Got", NORM: "got"},
{ORTH: "ta", LEMMA: "to"}], {ORTH: "ta", LEMMA: "to", NORM: "to"}],
"let's": [ "let's": [
{ORTH: "let"}, {ORTH: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}], {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
"Let's": [ "Let's": [
{ORTH: "Let", LEMMA: "let"}, {ORTH: "Let", LEMMA: "let", NORM: "let"},
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}] {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
} }
@ -363,72 +363,80 @@ _exc.update(_other_exc)
for exc_data in [ for exc_data in [
{ORTH: "'S", LEMMA: "'s"}, {ORTH: "'S", LEMMA: "'s", NORM: "'s"},
{ORTH: "'s", LEMMA: "'s"}, {ORTH: "'s", LEMMA: "'s", NORM: "'s"},
{ORTH: "\u2018S", LEMMA: "'s"}, {ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"},
{ORTH: "\u2018s", LEMMA: "'s"}, {ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"},
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"}, {ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"},
{ORTH: "w/o", LEMMA: "without", NORM: "without"},
{ORTH: "'re", LEMMA: "be", NORM: "are"}, {ORTH: "'re", LEMMA: "be", NORM: "are"},
{ORTH: "'Cause", LEMMA: "because"}, {ORTH: "'Cause", LEMMA: "because", NORM: "because"},
{ORTH: "'cause", LEMMA: "because"}, {ORTH: "'cause", LEMMA: "because", NORM: "because"},
{ORTH: "ma'am", LEMMA: "madam"}, {ORTH: "'cos", LEMMA: "because", NORM: "because"},
{ORTH: "Ma'am", LEMMA: "madam"}, {ORTH: "'Cos", LEMMA: "because", NORM: "because"},
{ORTH: "o'clock", LEMMA: "o'clock"}, {ORTH: "'coz", LEMMA: "because", NORM: "because"},
{ORTH: "O'clock", LEMMA: "o'clock"}, {ORTH: "'Coz", LEMMA: "because", NORM: "because"},
{ORTH: "'cuz", LEMMA: "because", NORM: "because"},
{ORTH: "'Cuz", LEMMA: "because", NORM: "because"},
{ORTH: "'bout", LEMMA: "about", NORM: "about"},
{ORTH: "ma'am", LEMMA: "madam", NORM: "madam"},
{ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"},
{ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"},
{ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
{ORTH: "Mt.", LEMMA: "Mount"}, {ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
{ORTH: "Ak.", LEMMA: "Alaska"}, {ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
{ORTH: "Ala.", LEMMA: "Alabama"}, {ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"},
{ORTH: "Apr.", LEMMA: "April"}, {ORTH: "Apr.", LEMMA: "April", NORM: "April"},
{ORTH: "Ariz.", LEMMA: "Arizona"}, {ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"},
{ORTH: "Ark.", LEMMA: "Arkansas"}, {ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"},
{ORTH: "Aug.", LEMMA: "August"}, {ORTH: "Aug.", LEMMA: "August", NORM: "August"},
{ORTH: "Calif.", LEMMA: "California"}, {ORTH: "Calif.", LEMMA: "California", NORM: "California"},
{ORTH: "Colo.", LEMMA: "Colorado"}, {ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"},
{ORTH: "Conn.", LEMMA: "Connecticut"}, {ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"},
{ORTH: "Dec.", LEMMA: "December"}, {ORTH: "Dec.", LEMMA: "December", NORM: "December"},
{ORTH: "Del.", LEMMA: "Delaware"}, {ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"},
{ORTH: "Feb.", LEMMA: "February"}, {ORTH: "Feb.", LEMMA: "February", NORM: "February"},
{ORTH: "Fla.", LEMMA: "Florida"}, {ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"},
{ORTH: "Ga.", LEMMA: "Georgia"}, {ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"},
{ORTH: "Ia.", LEMMA: "Iowa"}, {ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"},
{ORTH: "Id.", LEMMA: "Idaho"}, {ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"},
{ORTH: "Ill.", LEMMA: "Illinois"}, {ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"},
{ORTH: "Ind.", LEMMA: "Indiana"}, {ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"},
{ORTH: "Jan.", LEMMA: "January"}, {ORTH: "Jan.", LEMMA: "January", NORM: "January"},
{ORTH: "Jul.", LEMMA: "July"}, {ORTH: "Jul.", LEMMA: "July", NORM: "July"},
{ORTH: "Jun.", LEMMA: "June"}, {ORTH: "Jun.", LEMMA: "June", NORM: "June"},
{ORTH: "Kan.", LEMMA: "Kansas"}, {ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"},
{ORTH: "Kans.", LEMMA: "Kansas"}, {ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"},
{ORTH: "Ky.", LEMMA: "Kentucky"}, {ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"},
{ORTH: "La.", LEMMA: "Louisiana"}, {ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"},
{ORTH: "Mar.", LEMMA: "March"}, {ORTH: "Mar.", LEMMA: "March", NORM: "March"},
{ORTH: "Mass.", LEMMA: "Massachusetts"}, {ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"},
{ORTH: "May.", LEMMA: "May"}, {ORTH: "May.", LEMMA: "May", NORM: "May"},
{ORTH: "Mich.", LEMMA: "Michigan"}, {ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"},
{ORTH: "Minn.", LEMMA: "Minnesota"}, {ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"},
{ORTH: "Miss.", LEMMA: "Mississippi"}, {ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"},
{ORTH: "N.C.", LEMMA: "North Carolina"}, {ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"},
{ORTH: "N.D.", LEMMA: "North Dakota"}, {ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"},
{ORTH: "N.H.", LEMMA: "New Hampshire"}, {ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"},
{ORTH: "N.J.", LEMMA: "New Jersey"}, {ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"},
{ORTH: "N.M.", LEMMA: "New Mexico"}, {ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"},
{ORTH: "N.Y.", LEMMA: "New York"}, {ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"},
{ORTH: "Neb.", LEMMA: "Nebraska"}, {ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"},
{ORTH: "Nebr.", LEMMA: "Nebraska"}, {ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"},
{ORTH: "Nev.", LEMMA: "Nevada"}, {ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"},
{ORTH: "Nov.", LEMMA: "November"}, {ORTH: "Nov.", LEMMA: "November", NORM: "November"},
{ORTH: "Oct.", LEMMA: "October"}, {ORTH: "Oct.", LEMMA: "October", NORM: "October"},
{ORTH: "Okla.", LEMMA: "Oklahoma"}, {ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"},
{ORTH: "Ore.", LEMMA: "Oregon"}, {ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"},
{ORTH: "Pa.", LEMMA: "Pennsylvania"}, {ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"},
{ORTH: "S.C.", LEMMA: "South Carolina"}, {ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"},
{ORTH: "Sep.", LEMMA: "September"}, {ORTH: "Sep.", LEMMA: "September", NORM: "September"},
{ORTH: "Sept.", LEMMA: "September"}, {ORTH: "Sept.", LEMMA: "September", NORM: "September"},
{ORTH: "Tenn.", LEMMA: "Tennessee"}, {ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"},
{ORTH: "Va.", LEMMA: "Virginia"}, {ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington"}, {ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin"}]: {ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
_exc[exc_data[ORTH]] = [dict(exc_data)] _exc[exc_data[ORTH]] = [dict(exc_data)]
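Taken together, the entries above mean contractions and abbreviations are still split the same way, but each piece now carries an explicit NORM. A hedged usage sketch (the exact output depends on the tokenizer data that is actually loaded):

import spacy

nlp = spacy.load('en')
doc = nlp(u"Who's gonna help, ma'am?")
print([(t.text, t.norm_) for t in doc])
# With the exceptions above this should yield pairs along the lines of:
# ("Who", "who"), ("'s", "'s"), ("gon", "going"), ("na", "to"), ("ma'am", "madam"), ...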


@@ -5,21 +5,25 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
+from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
 from ...language import Language
 from ...lemmatizerlookup import Lemmatizer
-from ...attrs import LANG
+from ...attrs import LANG, NORM
-from ...util import update_exc
+from ...util import update_exc, add_lookups

 class SpanishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'es'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)
+    sytax_iterators = dict(SYNTAX_ITERATORS)

     @classmethod
     def create_lemmatizer(cls, nlp=None):
@@ -28,7 +32,7 @@ class SpanishDefaults(Language.Defaults):
 class Spanish(Language):
     lang = 'es'
     Defaults = SpanishDefaults

 __all__ = ['Spanish']
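Several of the language packages in this commit converge on the same layout: a module-level `<Lang>Defaults` class assigned to `Language.Defaults`, rather than a class nested inside the language. A minimal, hypothetical sketch of that pattern (the language code 'zz' and its data are made up for illustration):

from spacy.language import Language
from spacy.attrs import LANG

class ZzzDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'zz'
    stop_words = set(['foo', 'bar'])   # placeholder stop words

class Zzz(Language):
    lang = 'zz'
    Defaults = ZzzDefaults

__all__ = ['Zzz']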


@ -0,0 +1,55 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
def noun_chunks(obj):
doc = obj.doc
np_label = doc.vocab.strings['NP']
left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
right_labels = ['flat', 'fixed', 'compound', 'neg']
stop_labels = ['punct']
np_left_deps = [doc.vocab.strings[label] for label in left_labels]
np_right_deps = [doc.vocab.strings[label] for label in right_labels]
stop_deps = [doc.vocab.strings[label] for label in stop_labels]
token = doc[0]
while token and token.i < len(doc):
if token.pos in [PROPN, NOUN, PRON]:
left, right = noun_bounds(token)
yield left.i, right.i+1, np_label
token = right
token = next_token(token)
def is_verb_token(token):
return token.pos in [VERB, AUX]
def next_token(token):
try:
return token.nbor()
except:
return None
def noun_bounds(root):
left_bound = root
for token in reversed(list(root.lefts)):
if token.dep in np_left_deps:
left_bound = token
right_bound = root
for token in root.rights:
if (token.dep in np_right_deps):
left, right = noun_bounds(token)
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
doc[left_bound.i: right.i])):
break
else:
right_bound = right
return left_bound, right_bound
SYNTAX_ITERATORS = {
'noun_chunks': noun_chunks
}


@ -6,37 +6,13 @@ from ...deprecated import PRON_LEMMA
_exc = { _exc = {
"al": [
{ORTH: "a", LEMMA: "a", TAG: ADP},
{ORTH: "l", LEMMA: "el", TAG: DET}],
"consigo": [
{ORTH: "con", LEMMA: "con"},
{ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: ""}],
"conmigo": [
{ORTH: "con", LEMMA: "con"},
{ORTH: "migo", LEMMA: PRON_LEMMA, NORM: ""}],
"contigo": [
{ORTH: "con", LEMMA: "con"},
{ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}],
"del": [
{ORTH: "de", LEMMA: "de", TAG: ADP},
{ORTH: "l", LEMMA: "el", TAG: DET}],
"pel": [
{ORTH: "pe", LEMMA: "per", TAG: ADP},
{ORTH: "l", LEMMA: "el", TAG: DET}],
"pal": [ "pal": [
{ORTH: "pa", LEMMA: "para"}, {ORTH: "pa", LEMMA: "para"},
{ORTH: "l", LEMMA: "el"}], {ORTH: "l", LEMMA: "el", NORM: "el"}],
"pala": [ "pala": [
{ORTH: "pa", LEMMA: "para"}, {ORTH: "pa", LEMMA: "para"},
{ORTH: "la"}] {ORTH: "la", LEMMA: "la", NORM: "la"}]
} }


@ -5,20 +5,24 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...attrs import LANG, NORM
from ...util import update_exc from ...util import update_exc, add_lookups
class FinnishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fi'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Finnish(Language): class Finnish(Language):
lang = 'fi' lang = 'fi'
Defaults = FinnishDefaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fi'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
__all__ = ['Finnish'] __all__ = ['Finnish']


@ -5,30 +5,36 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP from .lemmatizer import LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...lemmatizerlookup import Lemmatizer from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG from ...attrs import LANG, NORM
from ...util import update_exc from ...util import update_exc, add_lookups
class FrenchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fr'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
infixes = tuple(TOKENIZER_INFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
token_match = TOKEN_MATCH
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class French(Language): class French(Language):
lang = 'fr' lang = 'fr'
Defaults = FrenchDefaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fr'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
infixes = tuple(TOKENIZER_INFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
token_match = TOKEN_MATCH
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
__all__ = ['French'] __all__ = ['French']

File diff suppressed because it is too large.


@ -0,0 +1,42 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON
def noun_chunks(obj):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
"""
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
SYNTAX_ITERATORS = {
'noun_chunks': noun_chunks
}


@@ -9,15 +9,17 @@ from ...attrs import LANG
 from ...util import update_exc

+class HebrewDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'he'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)

 class Hebrew(Language):
     lang = 'he'
+    Defaults = HebrewDefaults
-
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'he'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)

 __all__ = ['Hebrew']


@ -7,29 +7,33 @@ from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...lemmatizerlookup import Lemmatizer from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG from ...attrs import LANG, NORM
from ...util import update_exc from ...util import update_exc, add_lookups
class HungarianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'hu'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
token_match = TOKEN_MATCH
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Hungarian(Language): class Hungarian(Language):
lang = 'hu' lang = 'hu'
Defaults = HungarianDefaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'hu'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
token_match = TOKEN_MATCH
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
__all__ = ['Hungarian'] __all__ = ['Hungarian']


@ -1,18 +1,18 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ..punctuation import TOKENIZER_INFIXES from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
LIST_ICONS = [r'[\p{So}--[°]]']
_currency = r'\$|¢|£|€|¥|฿' _currency = r'\$|¢|£|€|¥|฿'
_quotes = QUOTES.replace("'", '') _quotes = QUOTES.replace("'", '')
_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
[r'[,.:](?=[{a}])'.format(a=ALPHA)])
_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES) _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
[r'(?<=[0-9])\+', [r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.', r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(_currency), r'(?<=[0-9])(?:{})'.format(_currency),
@ -20,16 +20,14 @@ _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
r'(?<=[{}{}{}(?:{})])\.'.format(ALPHA_LOWER, r'%²\-\)\]\+', QUOTES, _currency), r'(?<=[{}{}{}(?:{})])\.'.format(ALPHA_LOWER, r'%²\-\)\]\+', QUOTES, _currency),
r'(?<=[{})])-e'.format(ALPHA_LOWER)]) r'(?<=[{})])-e'.format(ALPHA_LOWER)])
_infixes = (LIST_ELLIPSES + LIST_ICONS +
_infixes = (LIST_ELLIPSES +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)]) r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)])
TOKENIZER_PREFIXES = _prefixes TOKENIZER_PREFIXES = _prefixes
TOKENIZER_SUFFIXES = _suffixes TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes TOKENIZER_INFIXES = _infixes


@ -5,25 +5,29 @@ from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...lemmatizerlookup import Lemmatizer from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG from ...attrs import LANG, NORM
from ...util import update_exc from ...util import update_exc, add_lookups
class ItalianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'it'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Italian(Language): class Italian(Language):
lang = 'it' lang = 'it'
Defaults = ItalianDefaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'it'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
__all__ = ['Italian'] __all__ = ['Italian']


@@ -125,7 +125,7 @@ def word_shape(text):
 LEX_ATTRS = {
     attrs.LOWER: lambda string: string.lower(),
-    attrs.NORM: lambda string: string,
+    attrs.NORM: lambda string: string.lower(),
     attrs.PREFIX: lambda string: string[0],
     attrs.SUFFIX: lambda string: string[-3:],
     attrs.CLUSTER: lambda string: 0,
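In other words, the default NORM getter now lower-cases instead of returning the string unchanged. A tiny check, assuming this module layout:

from spacy import attrs
from spacy.lang.lex_attrs import LEX_ATTRS

norm = LEX_ATTRS[attrs.NORM]
assert norm('Apple') == 'apple'   # previously this returned 'Apple' unchanged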


@ -6,20 +6,24 @@ from .stop_words import STOP_WORDS
from .morph_rules import MORPH_RULES from .morph_rules import MORPH_RULES
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...attrs import LANG, NORM
from ...util import update_exc from ...util import update_exc, add_lookups
class NorwegianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nb'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Norwegian(Language): class Norwegian(Language):
lang = 'nb' lang = 'nb'
Defaults = NorwegianDefaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nb'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
__all__ = ['Norwegian'] __all__ = ['Norwegian']


@ -4,21 +4,24 @@ from __future__ import unicode_literals
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...attrs import LANG, NORM
from ...util import update_exc from ...util import update_exc, add_lookups
class DutchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nl'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Dutch(Language): class Dutch(Language):
lang = 'nl' lang = 'nl'
Defaults = DutchDefaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nl'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
__all__ = ['Dutch'] __all__ = ['Dutch']


@ -0,0 +1,46 @@
# coding: utf8
from __future__ import unicode_literals
# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.
# Norms are only set if no alternative is provided in the tokenizer exceptions.
# Note that this does not change any other token attributes. Its main purpose
# is to normalise the word representations so that equivalent tokens receive
# similar representations. For example: $ and € are very different, but they're
# both currency symbols. By normalising currency symbols to $, all symbols are
# seen as similar, no matter how common they are in the training data.
BASE_NORMS = {
    "'s": "'s",
    "'S": "'s",
    "’s": "'s",
    "’S": "'s",
    "’": "'",
    "‘": "'",
    "´": "'",
    "`": "'",
    "”": '"',
    "“": '"',
    "''": '"',
    "``": '"',
    "´´": '"',
    "„": '"',
    "»": '"',
    "«": '"',
    "…": "...",
    "—": "-",
    "–": "-",
    "--": "-",
    "---": "-",
    "€": "$",
    "£": "$",
    "¥": "$",
    "฿": "$",
    "US$": "$",
    "C$": "$",
    "A$": "$"
}
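A short sketch of the intended effect: the table is a plain dict keyed by surface form, and once it is wired into the NORM getter (as in the language `__init__` diffs above), visually different but equivalent symbols share one norm.

from spacy.lang.norm_exceptions import BASE_NORMS

assert BASE_NORMS["£"] == "$"     # currency symbols all normalise to "$"
assert BASE_NORMS["…"] == "..."   # typographic ellipsis
# With the NORM getter wired up, token.norm_ for a "£" token is then expected
# to come out as "$", though that depends on the language defaults above.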


@ -1,23 +1,28 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...attrs import LANG from ...attrs import LANG, NORM
from ...util import update_exc from ...util import update_exc, add_lookups
class PolishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pl'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Polish(Language): class Polish(Language):
lang = 'pl' lang = 'pl'
Defaults = PolishDefaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pl'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
__all__ = ['Polish'] __all__ = ['Polish']


@@ -0,0 +1,23 @@
# encoding: utf8
from __future__ import unicode_literals

from ...symbols import ORTH, LEMMA, POS, ADJ, ADV, NOUN

_exc = {}

# Abbreviations that expand to a multi-word lemma
for exc_data in [
    {ORTH: "m.in.", LEMMA: "między innymi", POS: ADV},
    {ORTH: "inż.", LEMMA: "inżynier", POS: NOUN},
    {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
    {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
    {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
    {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
    _exc[exc_data[ORTH]] = [dict(exc_data)]

# Abbreviations kept as single tokens
for orth in [
    "w.", "r."]:
    _exc[orth] = [{ORTH: orth}]

TOKENIZER_EXCEPTIONS = dict(_exc)


@ -7,26 +7,30 @@ from .lex_attrs import LEX_ATTRS
from .lemmatizer import LOOKUP from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...lemmatizerlookup import Lemmatizer from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG from ...attrs import LANG, NORM
from ...util import update_exc from ...util import update_exc, add_lookups
class PortugueseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pt'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Portuguese(Language): class Portuguese(Language):
lang = 'pt' lang = 'pt'
Defaults = PortugueseDefaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pt'
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
__all__ = ['Portuguese'] __all__ = ['Portuguese']


@ -2,15 +2,16 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from .char_classes import CURRENCY, UNITS from .char_classes import QUOTES, CURRENCY, UNITS
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + _prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
LIST_CURRENCY) LIST_CURRENCY + LIST_ICONS)
_suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
["'s", "'S", "s", "S"] +
[r'(?<=[0-9])\+', [r'(?<=[0-9])\+',
r'(?<=°[FfCcKk])\.', r'(?<=°[FfCcKk])\.',
r'(?<=[0-9])(?:{})'.format(CURRENCY), r'(?<=[0-9])(?:{})'.format(CURRENCY),
@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "s", "S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)]) r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
_infixes = (LIST_ELLIPSES + _infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[0-9])[+\-\*^](?=[0-9-])', [r'(?<=[0-9])[+\-\*^](?=[0-9-])',
r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),


@ -7,25 +7,29 @@ from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LOOKUP from .lemmatizer import LEMMA_RULES, LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language from ...language import Language
from ...lemmatizerlookup import Lemmatizer from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG from ...attrs import LANG, NORM
from ...util import update_exc from ...util import update_exc, add_lookups
class SwedishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'sv'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Swedish(Language): class Swedish(Language):
lang = 'sv' lang = 'sv'
Defaults = SwedishDefaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'sv'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
__all__ = ['Swedish'] __all__ = ['Swedish']

spacy/lang/xx/__init__.py (new file)

@ -0,0 +1,28 @@
# coding: utf8
from __future__ import unicode_literals
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class MultiLanguageDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'xx'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
class MultiLanguage(Language):
"""Language class to be used for models that support multiple languages.
This module allows models to specify their language ID as 'xx'.
"""
lang = 'xx'
Defaults = MultiLanguageDefaults
__all__ = ['MultiLanguage']
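A hedged usage sketch for the new multi-language class (with no statistical components loaded, calling it mainly exercises the shared tokenizer rules):

from spacy.lang.xx import MultiLanguage

nlp = MultiLanguage()
doc = nlp(u"Models that aren't tied to one language can declare lang = 'xx'.")
print([t.text for t in doc])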


@@ -15,6 +15,7 @@ class Chinese(Language):
             raise ImportError("The Chinese tokenizer requires the Jieba library: "
                               "https://github.com/fxsjy/jieba")
         words = list(jieba.cut(text, cut_all=True))
+        words=[x for x in words if x]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))
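The extra filter matters because `jieba.cut(..., cut_all=True)` can yield empty strings (typically around punctuation), and `Doc(words=...)` does not accept empty tokens. A hedged illustration:

import jieba

words = list(jieba.cut(u'这是一个句子。', cut_all=True))
print(words)                       # may contain '' entries, e.g. around the full stop
words = [x for x in words if x]    # the same filter the diff adds above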


@ -6,23 +6,34 @@ import dill
import numpy import numpy
from thinc.neural import Model from thinc.neural import Model
from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.optimizers import Adam, SGD
import random
import ujson
from collections import OrderedDict
import itertools
from .tokenizer import Tokenizer from .tokenizer import Tokenizer
from .vocab import Vocab from .vocab import Vocab
from .tagger import Tagger from .tagger import Tagger
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .train import Trainer
from .syntax.parser import get_templates from .syntax.parser import get_templates
from .syntax.nonproj import PseudoProjectivity from .syntax import nonproj
from .pipeline import NeuralDependencyParser, EntityRecognizer from .pipeline import NeuralDependencyParser, EntityRecognizer
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
from .compat import json_dumps from .pipeline import NeuralLabeller
from .pipeline import SimilarityHook
from .pipeline import TextCategorizer
from . import about
from .compat import json_dumps, izip
from .attrs import IS_STOP from .attrs import IS_STOP
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP from .lang.tag_map import TAG_MAP
from .lang.lex_attrs import LEX_ATTRS from .lang.lex_attrs import LEX_ATTRS
from . import util from . import util
from .scorer import Scorer
class BaseDefaults(object): class BaseDefaults(object):
@ -80,21 +91,35 @@ class BaseDefaults(object):
return NeuralEntityRecognizer(nlp.vocab, **cfg) return NeuralEntityRecognizer(nlp.vocab, **cfg)
@classmethod @classmethod
def create_pipeline(cls, nlp=None): def create_pipeline(cls, nlp=None, disable=tuple()):
meta = nlp.meta if nlp is not None else {} meta = nlp.meta if nlp is not None else {}
# Resolve strings, like "cnn", "lstm", etc # Resolve strings, like "cnn", "lstm", etc
pipeline = [] pipeline = []
for entry in cls.pipeline: for entry in cls.pipeline:
if entry in disable or getattr(entry, 'name', entry) in disable:
continue
factory = cls.Defaults.factories[entry] factory = cls.Defaults.factories[entry]
pipeline.append(factory(nlp, **meta.get(entry, {}))) pipeline.append(factory(nlp, **meta.get(entry, {})))
return pipeline return pipeline
factories = { factories = {
'make_doc': create_tokenizer, 'make_doc': create_tokenizer,
'token_vectors': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg), 'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
'tags': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg), 'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
'dependencies': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg), 'parser': lambda nlp, **cfg: [
'entities': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg), NeuralDependencyParser(nlp.vocab, **cfg),
nonproj.deprojectivize],
'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)],
'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)],
# Temporary compatibility -- delete after pivot
'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
'dependencies': lambda nlp, **cfg: [
NeuralDependencyParser(nlp.vocab, **cfg),
nonproj.deprojectivize,
],
'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
} }
token_match = TOKEN_MATCH token_match = TOKEN_MATCH
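The factory table above now keys components by the names used in model meta data ('tensorizer', 'tagger', 'parser', 'ner', 'similarity', 'textcat', plus the temporary legacy names), and each factory returns a list so that post-processing steps like `nonproj.deprojectivize` can ride along with the parser. A hedged sketch of how those names combine with the new `disable` argument, assuming `spacy.load` forwards keyword arguments to the Language constructor:

import spacy

# Build a pipeline without the parser and NER components; the names here are
# the factory keys from the table above.
nlp = spacy.load('en', disable=['parser', 'ner'])
doc = nlp(u'This text is tokenized, tensorized and tagged, but not parsed.')
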
@ -112,19 +137,39 @@ class BaseDefaults(object):
lemma_index = {} lemma_index = {}
morph_rules = {} morph_rules = {}
lex_attr_getters = LEX_ATTRS lex_attr_getters = LEX_ATTRS
syntax_iterators = {}
class Language(object): class Language(object):
""" """A text-processing pipeline. Usually you'll load this once per process,
A text-processing pipeline. Usually you'll load this once per process, and and pass the instance around your application.
pass the instance around your program.
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
lang (unicode): Two-letter language ID, i.e. ISO code.
""" """
Defaults = BaseDefaults Defaults = BaseDefaults
lang = None lang = None
def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}): def __init__(self, vocab=True, make_doc=True, pipeline=None,
self.meta = dict(meta) meta={}, disable=tuple(), **kwargs):
"""Initialise a Language object.
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
`Language.Defaults.create_vocab`.
make_doc (callable): A function that takes text and returns a `Doc`
object. Usually a `Tokenizer`.
pipeline (list): A list of annotation processes or IDs of annotation,
processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
up in `Language.Defaults.factories`.
disable (list): A list of component names to exclude from the pipeline.
The disable list has priority over the pipeline list -- if the same
string occurs in both, the component is not loaded.
meta (dict): Custom meta data for the Language class. Is written to by
models to add model meta data.
RETURNS (Language): The newly constructed object.
"""
self._meta = dict(meta)
if vocab is True: if vocab is True:
factory = self.Defaults.create_vocab factory = self.Defaults.create_vocab
vocab = factory(self, **meta.get('vocab', {})) vocab = factory(self, **meta.get('vocab', {}))
@ -132,11 +177,15 @@ class Language(object):
if make_doc is True: if make_doc is True:
factory = self.Defaults.create_tokenizer factory = self.Defaults.create_tokenizer
make_doc = factory(self, **meta.get('tokenizer', {})) make_doc = factory(self, **meta.get('tokenizer', {}))
self.make_doc = make_doc self.tokenizer = make_doc
if pipeline is True: if pipeline is True:
self.pipeline = self.Defaults.create_pipeline(self) self.pipeline = self.Defaults.create_pipeline(self, disable)
elif pipeline: elif pipeline:
self.pipeline = list(pipeline) # Careful not to do getattr(p, 'name', None) here
# If we had disable=[None], we'd disable everything!
self.pipeline = [p for p in pipeline
if p not in disable
and getattr(p, 'name', p) not in disable]
# Resolve strings, like "cnn", "lstm", etc # Resolve strings, like "cnn", "lstm", etc
for i, entry in enumerate(self.pipeline): for i, entry in enumerate(self.pipeline):
if entry in self.Defaults.factories: if entry in self.Defaults.factories:
@ -144,82 +193,224 @@ class Language(object):
self.pipeline[i] = factory(self, **meta.get(entry, {})) self.pipeline[i] = factory(self, **meta.get(entry, {}))
else: else:
self.pipeline = [] self.pipeline = []
flat_list = []
for pipe in self.pipeline:
if isinstance(pipe, list):
flat_list.extend(pipe)
else:
flat_list.append(pipe)
self.pipeline = flat_list
def __call__(self, text, state=None, **disabled): @property
""" def meta(self):
Apply the pipeline to some text. The text can span multiple sentences, self._meta.setdefault('lang', self.vocab.lang)
and can contain arbtrary whitespace. Alignment into the original string self._meta.setdefault('name', '')
self._meta.setdefault('version', '0.0.0')
self._meta.setdefault('spacy_version', about.__version__)
self._meta.setdefault('description', '')
self._meta.setdefault('author', '')
self._meta.setdefault('email', '')
self._meta.setdefault('url', '')
self._meta.setdefault('license', '')
pipeline = []
for component in self.pipeline:
if hasattr(component, 'name'):
pipeline.append(component.name)
self._meta['pipeline'] = pipeline
return self._meta
@meta.setter
def meta(self, value):
self._meta = value
# Conveniences to access pipeline components
@property
def tensorizer(self):
return self.get_component('tensorizer')
@property
def tagger(self):
return self.get_component('tagger')
@property
def parser(self):
return self.get_component('parser')
@property
def entity(self):
return self.get_component('ner')
@property
def matcher(self):
return self.get_component('matcher')
def get_component(self, name):
if self.pipeline in (True, None):
return None
for proc in self.pipeline:
if hasattr(proc, 'name') and proc.name.endswith(name):
return proc
return None
def __call__(self, text, disable=[]):
"""'Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbtrary whitespace. Alignment into the original string
is preserved. is preserved.
Args: text (unicode): The text to be processed.
text (unicode): The text to be processed. disable (list): Names of the pipeline components to disable.
state: Arbitrary RETURNS (Doc): A container for accessing the annotations.
Returns: EXAMPLE:
doc (Doc): A container for accessing the annotations.
Example:
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp('An example sentence. Another example sentence.') >>> tokens = nlp('An example sentence. Another example sentence.')
>>> tokens[0].orth_, tokens[0].head.tag_ >>> tokens[0].text, tokens[0].head.tag_
('An', 'NN') ('An', 'NN')
""" """
doc = self.make_doc(text) doc = self.make_doc(text)
for proc in self.pipeline: for proc in self.pipeline:
name = getattr(proc, 'name', None) name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]: if name in disable:
continue continue
state = proc(doc, state=state) doc = proc(doc)
return doc return doc
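The ``disable`` keyword is new here. A minimal sketch of how it might be used, assuming an ``nlp`` pipeline has already been loaded (e.g. via ``spacy.load('en')``) and actually contains a component named ``'parser'``:

.. code:: python

    # Run the pipeline on one text, but skip the parser for this call
    doc = nlp(u'This is a sentence.', disable=['parser'])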
def update(self, docs, golds, state=None, drop=0., sgd=None): def make_doc(self, text):
return self.tokenizer(text)
def update(self, docs, golds, drop=0., sgd=None, losses=None,
update_tensors=False):
"""Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects.
golds (iterable): A batch of `GoldParse` objects.
drop (float): The dropout rate.
sgd (callable): An optimizer.
losses (dict): Optional dictionary, updated in place with the loss per component.
EXAMPLE:
>>> optimizer = nlp.begin_training(get_gold_tuples)
>>> for docs, golds in batches:
>>>     nlp.update(docs, golds, sgd=optimizer)
"""
if len(docs) != len(golds):
raise IndexError("Update expects same number of docs and golds "
"Got: %d, %d" % (len(docs), len(golds)))
if len(docs) == 0:
return
tok2vec = self.pipeline[0]
feats = tok2vec.doc2feats(docs)
grads = {} grads = {}
def get_grads(W, dW, key=None): def get_grads(W, dW, key=None):
grads[key] = (W, dW) grads[key] = (W, dW)
state = {} if state is None else state pipes = list(self.pipeline[1:])
for process in self.pipeline: random.shuffle(pipes)
if hasattr(process, 'update'): for proc in pipes:
state = process.update(docs, golds, if not hasattr(proc, 'update'):
state=state, continue
drop=drop, tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
sgd=get_grads) d_tokvecses = proc.update((docs, tokvecses), golds,
else: drop=drop, sgd=get_grads, losses=losses)
process(docs, state=state) if update_tensors and d_tokvecses is not None:
if sgd is not None: bp_tokvecses(d_tokvecses, sgd=sgd)
for key, (W, dW) in grads.items(): for key, (W, dW) in grads.items():
# TODO: Unhack this when thinc improves sgd(W, dW, key=key)
if isinstance(W, numpy.ndarray): # Clear the tensor variable, to free GPU memory.
sgd.ops = NumpyOps() # If we don't do this, the memory leak gets pretty
else: # bad, because we may be holding part of a batch.
sgd.ops = CupyOps() for doc in docs:
sgd(W, dW, key=key) doc.tensor = None
return state
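A rough sketch of how the reworked ``update`` might be driven from a training loop. ``get_gold_tuples`` and ``train_data`` (a list of ``(Doc, GoldParse)`` pairs) are assumed helpers, and the batch size, dropout rate and epoch count are arbitrary illustrative choices:

.. code:: python

    import random

    optimizer = nlp.begin_training(get_gold_tuples)   # get_gold_tuples: assumed data loader
    for epoch in range(10):
        random.shuffle(train_data)                    # train_data: assumed (Doc, GoldParse) pairs
        losses = {}
        for i in range(0, len(train_data), 32):
            docs, golds = zip(*train_data[i:i + 32])
            nlp.update(list(docs), list(golds), drop=0.2, sgd=optimizer, losses=losses)
        print(epoch, losses)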
@contextmanager def preprocess_gold(self, docs_golds):
def begin_training(self, gold_tuples, **cfg): """Can be called before training to pre-process gold data. By default,
it handles nonprojectivity and adds missing tags to the tag map.
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
"""
for proc in self.pipeline:
if hasattr(proc, 'preprocess_gold'):
docs_golds = proc.preprocess_gold(docs_golds)
for doc, gold in docs_golds:
yield doc, gold
def begin_training(self, get_gold_tuples, **cfg):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer. Used as a contextmanager.
gold_tuples (iterable): Gold-standard training data.
**cfg: Config parameters.
YIELDS (tuple): A trainer and an optimizer.
EXAMPLE:
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
>>> for epoch in trainer.epochs(gold):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
"""
if self.parser:
self.pipeline.append(NeuralLabeller(self.vocab))
# Populate vocab # Populate vocab
for _, annots_brackets in gold_tuples: for _, annots_brackets in get_gold_tuples():
for annots, _ in annots_brackets: for annots, _ in annots_brackets:
for word in annots[1]: for word in annots[1]:
_ = self.vocab[word] _ = self.vocab[word]
# Handle crossing dependencies
gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
contexts = [] contexts = []
if cfg.get('use_gpu'): if cfg.get('device', -1) >= 0:
import cupy.cuda.device
device = cupy.cuda.device.Device(cfg['device'])
device.use()
Model.ops = CupyOps() Model.ops = CupyOps()
Model.Ops = CupyOps Model.Ops = CupyOps
print("Use GPU") else:
device = None
for proc in self.pipeline: for proc in self.pipeline:
if hasattr(proc, 'begin_training'): if hasattr(proc, 'begin_training'):
context = proc.begin_training(gold_tuples, context = proc.begin_training(get_gold_tuples(),
pipeline=self.pipeline) pipeline=self.pipeline)
contexts.append(context) contexts.append(context)
trainer = Trainer(self, gold_tuples, **cfg) learn_rate = util.env_opt('learn_rate', 0.001)
yield trainer, trainer.optimizer beta1 = util.env_opt('optimizer_B1', 0.9)
beta2 = util.env_opt('optimizer_B2', 0.999)
eps = util.env_opt('optimizer_eps', 1e-08)
L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
optimizer.max_grad_norm = max_grad_norm
optimizer.device = device
return optimizer
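The optimizer hyper-parameters above are read through ``util.env_opt``, so they can be tweaked without touching the code. A hedged sketch, assuming ``env_opt`` falls back to plain environment variables of the same name (check ``spacy.util.env_opt`` in your version for the exact lookup rules):

.. code:: python

    import os

    # Assumption: env_opt('learn_rate', ...) reads this value from the environment
    os.environ['learn_rate'] = '0.0005'
    optimizer = nlp.begin_training(get_gold_tuples)   # get_gold_tuples: assumed data loader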
def evaluate(self, docs_golds):
scorer = Scorer()
docs, golds = zip(*docs_golds)
docs = list(docs)
golds = list(golds)
for pipe in self.pipeline:
if not hasattr(pipe, 'pipe'):
for doc in docs:
pipe(doc)
else:
docs = list(pipe.pipe(docs))
assert len(docs) == len(golds)
for doc, gold in zip(docs, golds):
scorer.score(doc, gold)
doc.tensor = None
return scorer
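``evaluate`` expects ``(Doc, GoldParse)`` pairs built from raw, unprocessed docs, since it runs the pipeline itself. A small sketch, where ``dev_data`` is an assumed list of ``(text, annotations)`` pairs whose keys match the ``GoldParse`` keyword arguments:

.. code:: python

    from spacy.gold import GoldParse

    docs_golds = []
    for text, annot in dev_data:                      # dev_data: assumed evaluation set
        doc = nlp.make_doc(text)                      # unprocessed Doc
        docs_golds.append((doc, GoldParse(doc, **annot)))
    scorer = nlp.evaluate(docs_golds)
    print(scorer.scores)                              # aggregate accuracy figures from the Scorer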
@contextmanager @contextmanager
def use_params(self, params, **cfg): def use_params(self, params, **cfg):
"""Replace weights of models in the pipeline with those provided in the
params dictionary. Can be used as a contextmanager, in which case,
models go back to their original weights after the block.
params (dict): A dictionary of parameters keyed by model ID.
**cfg: Config parameters.
EXAMPLE:
>>> with nlp.use_params(optimizer.averages):
>>> nlp.to_disk('/tmp/checkpoint')
"""
contexts = [pipe.use_params(params) for pipe contexts = [pipe.use_params(params) for pipe
in self.pipeline if hasattr(pipe, 'use_params')] in self.pipeline if hasattr(pipe, 'use_params')]
# TODO: Having trouble with contextlib # TODO: Having trouble with contextlib
@ -236,98 +427,149 @@ class Language(object):
except StopIteration: except StopIteration:
pass pass
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled): def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]):
""" """Process texts as a stream, and yield `Doc` objects in order. Supports
Process texts as a stream, and yield Doc objects in order. GIL-free multi-threading.
Supports GIL-free multi-threading. texts (iterator): A sequence of texts to process.
n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer.
disable (list): Names of the pipeline components to disable.
YIELDS (Doc): Documents in the order of the original text.
Arguments: EXAMPLE:
texts (iterator) >>> texts = [u'One document.', u'...', u'Lots of documents']
tag (bool) >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
parse (bool) >>> assert doc.is_parsed
entity (bool)
""" """
#stream = ((self.make_doc(text), None) for text in texts) if tuples:
stream = ((doc, {}) for doc in texts) text_context1, text_context2 = itertools.tee(texts)
texts = (tc[0] for tc in text_context1)
contexts = (tc[1] for tc in text_context2)
docs = self.pipe(texts, n_threads=n_threads, batch_size=batch_size,
disable=disable)
for doc, context in izip(docs, contexts):
yield (doc, context)
return
docs = (self.make_doc(text) for text in texts)
for proc in self.pipeline: for proc in self.pipeline:
name = getattr(proc, 'name', None) name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]: if name in disable:
continue continue
if hasattr(proc, 'pipe'): if hasattr(proc, 'pipe'):
stream = proc.pipe(stream, n_threads=n_threads, batch_size=batch_size) docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
else: else:
stream = (proc(doc, state) for doc, state in stream) # Apply the function, but yield the doc
for doc, state in stream: docs = _pipe(proc, docs)
for doc in docs:
yield doc yield doc
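With ``tuples=True``, ``pipe`` accepts ``(text, context)`` pairs and yields ``(doc, context)`` pairs in the same order, which is convenient for carrying IDs or metadata through the stream. A minimal sketch with made-up data:

.. code:: python

    data = [(u'First text.', {'id': 1}), (u'Second text.', {'id': 2})]
    for doc, context in nlp.pipe(data, tuples=True, batch_size=50):
        print(context['id'], len(doc))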
def to_disk(self, path, **exclude): def to_disk(self, path, disable=tuple()):
"""Save the current state to a directory. """Save the current state to a directory. If a model is loaded, this
will include the model.
Args: path (unicode or Path): A path to a directory, which will be created if
path: A path to a directory, which will be created if it doesn't it doesn't exist. Paths may be either strings or `Path`-like objects.
exist. Paths may be either strings or pathlib.Path-like disable (list): Names of pipeline components to disable and prevent
objects. from being saved.
**exclude: Prevent named attributes from being saved.
EXAMPLE:
>>> nlp.to_disk('/path/to/models')
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
if not path.exists(): serializers = OrderedDict((
path.mkdir() ('vocab', lambda p: self.vocab.to_disk(p)),
if not path.is_dir(): ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
raise IOError("Output path must be a directory") ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
props = {} ))
for name, value in self.__dict__.items(): for proc in self.pipeline:
if name in exclude: if not hasattr(proc, 'name'):
continue continue
if hasattr(value, 'to_disk'): if proc.name in disable:
value.to_disk(path / name) continue
else: if not hasattr(proc, 'to_disk'):
props[name] = value continue
with (path / 'props.pickle').open('wb') as file_: serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
dill.dump(props, file_) util.to_disk(path, serializers, {p: False for p in disable})
def from_disk(self, path, **exclude): def from_disk(self, path, disable=tuple()):
"""Load the current state from a directory. """Loads state from a directory. Modifies the object in place and
returns it. If the saved `Language` object contains a model, the
model will be loaded.
Args: path (unicode or Path): A path to a directory. Paths may be either
path: A path to a directory. Paths may be either strings or strings or `Path`-like objects.
pathlib.Path-like objects. disable (list): Names of the pipeline components to disable.
**exclude: Prevent named attributes from being saved. RETURNS (Language): The modified `Language` object.
EXAMPLE:
>>> from spacy.language import Language
>>> nlp = Language().from_disk('/path/to/models')
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
for name in path.iterdir(): deserializers = OrderedDict((
if name not in exclude and hasattr(self, str(name)): ('vocab', lambda p: self.vocab.from_disk(p)),
getattr(self, name).from_disk(path / name) ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
with (path / 'props.pickle').open('rb') as file_: ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
bytes_data = file_.read() ))
self.from_bytes(bytes_data, **exclude) for proc in self.pipeline:
if not hasattr(proc, 'name'):
continue
if proc.name in disable:
continue
if not hasattr(proc, 'to_disk'):
continue
deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
exclude = {p: False for p in disable}
if not (path / 'vocab').exists():
exclude['vocab'] = True
util.from_disk(path, deserializers, exclude)
return self return self
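A sketch of the disk round trip with the new ``disable`` argument; the path is arbitrary and ``'my_component'`` stands in for whatever component you want to leave out:

.. code:: python

    from spacy.language import Language

    nlp.to_disk('/tmp/spacy_checkpoint', disable=['my_component'])   # hypothetical component name
    nlp2 = Language().from_disk('/tmp/spacy_checkpoint')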
def to_bytes(self, **exclude): def to_bytes(self, disable=[]):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
Args: disable (list): Names of pipeline components to disable and prevent
path: A path to a directory. Paths may be either strings or from being serialized.
pathlib.Path-like objects. RETURNS (bytes): The serialized form of the `Language` object.
**exclude: Prevent named attributes from being serialized.
""" """
props = dict(self.__dict__) serializers = OrderedDict((
for key in exclude: ('vocab', lambda: self.vocab.to_bytes()),
if key in props: ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
props.pop(key) ('meta', lambda: ujson.dumps(self.meta))
return dill.dumps(props, -1) ))
for i, proc in enumerate(self.pipeline):
if getattr(proc, 'name', None) in disable:
continue
if not hasattr(proc, 'to_bytes'):
continue
serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False)
return util.to_bytes(serializers, {})
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, disable=[]):
"""Load state from a binary string. """Load state from a binary string.
Args: bytes_data (bytes): The data to load from.
bytes_data (bytes): The data to load from. disable (list): Names of the pipeline components to disable.
**exclude: Prevent named attributes from being loaded. RETURNS (Language): The `Language` object.
""" """
props = dill.loads(bytes_data) deserializers = OrderedDict((
for key, value in props.items(): ('vocab', lambda b: self.vocab.from_bytes(b)),
if key not in exclude: ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
setattr(self, key, value) ('meta', lambda b: self.meta.update(ujson.loads(b)))
))
for i, proc in enumerate(self.pipeline):
if getattr(proc, 'name', None) in disable:
continue
if not hasattr(proc, 'from_bytes'):
continue
deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
msg = util.from_bytes(bytes_data, deserializers, {})
return self return self
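The byte-level API mirrors the disk one and is what the disk helpers build on; a minimal round trip, reusing the current object's class so the language defaults match:

.. code:: python

    data = nlp.to_bytes()
    nlp2 = nlp.__class__().from_bytes(data)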
def _pipe(func, docs):
for doc in docs:
func(doc)
yield doc
@ -27,7 +27,7 @@ cdef class Lexeme:
cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil: cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil:
cdef SerializedLexemeC lex_data cdef SerializedLexemeC lex_data
buff = <const unsigned char*>&lex.flags buff = <const unsigned char*>&lex.flags
end = <const unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm) end = <const unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
for i in range(sizeof(lex_data.data)): for i in range(sizeof(lex_data.data)):
lex_data.data[i] = buff[i] lex_data.data[i] = buff[i]
return lex_data return lex_data
@ -35,7 +35,7 @@ cdef class Lexeme:
@staticmethod @staticmethod
cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil: cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
buff = <unsigned char*>&lex.flags buff = <unsigned char*>&lex.flags
end = <unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm) end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
for i in range(sizeof(lex_data.data)): for i in range(sizeof(lex_data.data)):
buff[i] = lex_data.data[i] buff[i] = lex_data.data[i]
@ -30,19 +30,16 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef class Lexeme: cdef class Lexeme:
""" """An entry in the vocabulary. A `Lexeme` has no string context it's a
An entry in the vocabulary. A Lexeme has no string context --- it's a
word-type, as opposed to a word token. It therefore has no part-of-speech word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
tag). tag).
""" """
def __init__(self, Vocab vocab, int orth): def __init__(self, Vocab vocab, attr_t orth):
""" """Create a Lexeme object.
Create a Lexeme object.
Arguments: vocab (Vocab): The parent vocabulary
vocab (Vocab): The parent vocabulary orth (uint64): The orth id of the lexeme.
orth (int): The orth id of the lexeme.
Returns (Lexeme): The newly constructed object. Returns (Lexeme): The newly constructed object.
""" """
self.vocab = vocab self.vocab = vocab
@ -54,7 +51,7 @@ cdef class Lexeme:
if isinstance(other, Lexeme): if isinstance(other, Lexeme):
a = self.orth a = self.orth
b = other.orth b = other.orth
elif isinstance(other, int): elif isinstance(other, long):
a = self.orth a = self.orth
b = other b = other
elif isinstance(other, str): elif isinstance(other, str):
@ -82,35 +79,28 @@ cdef class Lexeme:
return self.c.orth return self.c.orth
def set_flag(self, attr_id_t flag_id, bint value): def set_flag(self, attr_id_t flag_id, bint value):
""" """Change the value of a boolean flag.
Change the value of a boolean flag.
Arguments: flag_id (int): The attribute ID of the flag to set.
flag_id (int): The attribute ID of the flag to set. value (bool): The new value of the flag.
value (bool): The new value of the flag.
""" """
Lexeme.c_set_flag(self.c, flag_id, value) Lexeme.c_set_flag(self.c, flag_id, value)
def check_flag(self, attr_id_t flag_id): def check_flag(self, attr_id_t flag_id):
""" """Check the value of a boolean flag.
Check the value of a boolean flag.
Arguments: flag_id (int): The attribute ID of the flag to query.
flag_id (int): The attribute ID of the flag to query. RETURNS (bool): The value of the flag.
Returns (bool): The value of the flag.
""" """
return True if Lexeme.c_check_flag(self.c, flag_id) else False return True if Lexeme.c_check_flag(self.c, flag_id) else False
def similarity(self, other): def similarity(self, other):
""" """Compute a semantic similarity estimate. Defaults to cosine over
Compute a semantic similarity estimate. Defaults to cosine over vectors. vectors.
Arguments: other (object): The object to compare with. By default, accepts `Doc`,
other: `Span`, `Token` and `Lexeme` objects.
The object to compare with. By default, accepts Doc, Span, RETURNS (float): A scalar similarity score. Higher is more similar.
Token and Lexeme objects.
Returns:
score (float): A scalar similarity score. Higher is more similar.
""" """
if self.vector_norm == 0 or other.vector_norm == 0: if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0 return 0.0
@ -119,7 +109,7 @@ cdef class Lexeme:
def to_bytes(self): def to_bytes(self):
lex_data = Lexeme.c_to_bytes(self.c) lex_data = Lexeme.c_to_bytes(self.c)
start = <const char*>&self.c.flags start = <const char*>&self.c.flags
end = <const char*>&self.c.l2_norm + sizeof(self.c.l2_norm) end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data)) assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data))
byte_string = b'\0' * sizeof(lex_data.data) byte_string = b'\0' * sizeof(lex_data.data)
byte_chars = <char*>byte_string byte_chars = <char*>byte_string
@ -140,22 +130,29 @@ cdef class Lexeme:
self.orth = self.c.orth self.orth = self.c.orth
property has_vector: property has_vector:
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self): def __get__(self):
cdef int i return self.vocab.has_vector(self.c.orth)
for i in range(self.vocab.vectors_length):
if self.c.vector[i] != 0:
return True
else:
return False
property vector_norm: property vector_norm:
def __get__(self): """The L2 norm of the lexeme's vector representation.
return self.c.l2_norm
def __set__(self, float value): RETURNS (float): The L2 norm of the vector representation.
self.c.l2_norm = value """
def __get__(self):
vector = self.vector
return numpy.sqrt((vector**2).sum())
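The norm is no longer cached on the underlying C struct; it is recomputed from the vector on access using the plain L2 formula, e.g.:

.. code:: python

    import numpy

    vector = numpy.asarray([3.0, 4.0], dtype='float32')
    norm = numpy.sqrt((vector ** 2).sum())    # 5.0, the same computation as Lexeme.vector_norm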
property vector: property vector:
"""A real-valued meaning representation.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the lexeme's semantics.
"""
def __get__(self): def __get__(self):
cdef int length = self.vocab.vectors_length cdef int length = self.vocab.vectors_length
if length == 0: if length == 0:
@ -165,27 +162,16 @@ cdef class Lexeme:
"model doesn't include word vectors. For more info, see " "model doesn't include word vectors. For more info, see "
"the documentation: \n%s\n" % about.__docs_models__ "the documentation: \n%s\n" % about.__docs_models__
) )
return self.vocab.get_vector(self.c.orth)
vector_view = <float[:length,]>self.c.vector
return numpy.asarray(vector_view)
def __set__(self, vector): def __set__(self, vector):
assert len(vector) == self.vocab.vectors_length assert len(vector) == self.vocab.vectors_length
cdef float value self.vocab.set_vector(self.c.orth, vector)
cdef double norm = 0.0
for i, value in enumerate(vector):
self.c.vector[i] = value
norm += value * value
self.c.l2_norm = sqrt(norm)
property rank: property rank:
def __get__(self): def __get__(self):
return self.c.id return self.c.id
property repvec:
def __get__(self):
raise AttributeError("lex.repvec has been renamed to lex.vector")
property sentiment: property sentiment:
def __get__(self): def __get__(self):
return self.c.sentiment return self.c.sentiment
@ -196,33 +182,41 @@ cdef class Lexeme:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.orth] return self.vocab.strings[self.c.orth]
property text:
"""A unicode representation of the token text.
RETURNS (unicode): The original verbatim text of the token.
"""
def __get__(self):
return self.orth_
property lower: property lower:
def __get__(self): return self.c.lower def __get__(self): return self.c.lower
def __set__(self, int x): self.c.lower = x def __set__(self, attr_t x): self.c.lower = x
property norm: property norm:
def __get__(self): return self.c.norm def __get__(self): return self.c.norm
def __set__(self, int x): self.c.norm = x def __set__(self, attr_t x): self.c.norm = x
property shape: property shape:
def __get__(self): return self.c.shape def __get__(self): return self.c.shape
def __set__(self, int x): self.c.shape = x def __set__(self, attr_t x): self.c.shape = x
property prefix: property prefix:
def __get__(self): return self.c.prefix def __get__(self): return self.c.prefix
def __set__(self, int x): self.c.prefix = x def __set__(self, attr_t x): self.c.prefix = x
property suffix: property suffix:
def __get__(self): return self.c.suffix def __get__(self): return self.c.suffix
def __set__(self, int x): self.c.suffix = x def __set__(self, attr_t x): self.c.suffix = x
property cluster: property cluster:
def __get__(self): return self.c.cluster def __get__(self): return self.c.cluster
def __set__(self, int x): self.c.cluster = x def __set__(self, attr_t x): self.c.cluster = x
property lang: property lang:
def __get__(self): return self.c.lang def __get__(self): return self.c.lang
def __set__(self, int x): self.c.lang = x def __set__(self, attr_t x): self.c.lang = x
property prob: property prob:
def __get__(self): return self.c.prob def __get__(self): return self.c.prob
@ -230,27 +224,27 @@ cdef class Lexeme:
property lower_: property lower_:
def __get__(self): return self.vocab.strings[self.c.lower] def __get__(self): return self.vocab.strings[self.c.lower]
def __set__(self, unicode x): self.c.lower = self.vocab.strings[x] def __set__(self, unicode x): self.c.lower = self.vocab.strings.add(x)
property norm_: property norm_:
def __get__(self): return self.vocab.strings[self.c.norm] def __get__(self): return self.vocab.strings[self.c.norm]
def __set__(self, unicode x): self.c.norm = self.vocab.strings[x] def __set__(self, unicode x): self.c.norm = self.vocab.strings.add(x)
property shape_: property shape_:
def __get__(self): return self.vocab.strings[self.c.shape] def __get__(self): return self.vocab.strings[self.c.shape]
def __set__(self, unicode x): self.c.shape = self.vocab.strings[x] def __set__(self, unicode x): self.c.shape = self.vocab.strings.add(x)
property prefix_: property prefix_:
def __get__(self): return self.vocab.strings[self.c.prefix] def __get__(self): return self.vocab.strings[self.c.prefix]
def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x] def __set__(self, unicode x): self.c.prefix = self.vocab.strings.add(x)
property suffix_: property suffix_:
def __get__(self): return self.vocab.strings[self.c.suffix] def __get__(self): return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x] def __set__(self, unicode x): self.c.suffix = self.vocab.strings.add(x)
property lang_: property lang_:
def __get__(self): return self.vocab.strings[self.c.lang] def __get__(self): return self.vocab.strings[self.c.lang]
def __set__(self, unicode x): self.c.lang = self.vocab.strings[x] def __set__(self, unicode x): self.c.lang = self.vocab.strings.add(x)
property flags: property flags:
def __get__(self): return self.c.flags def __get__(self): return self.c.flags
@ -258,7 +252,7 @@ cdef class Lexeme:
property is_oov: property is_oov:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV) def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_OOV, x) def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x)
property is_stop: property is_stop:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP) def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)
@ -308,7 +302,6 @@ cdef class Lexeme:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
property like_url: property like_url:
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL) def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x) def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
@ -87,7 +87,7 @@ ctypedef TokenPatternC* TokenPatternC_ptr
ctypedef pair[int, TokenPatternC_ptr] StateC ctypedef pair[int, TokenPatternC_ptr] StateC
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label, cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
object token_specs) except NULL: object token_specs) except NULL:
pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC)) pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
cdef int i cdef int i
@ -99,15 +99,21 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
pattern[i].attrs[j].attr = attr pattern[i].attrs[j].attr = attr
pattern[i].attrs[j].value = value pattern[i].attrs[j].value = value
i = len(token_specs) i = len(token_specs)
pattern[i].attrs = <AttrValueC*>mem.alloc(3, sizeof(AttrValueC)) pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
pattern[i].attrs[0].attr = ID pattern[i].attrs[0].attr = ID
pattern[i].attrs[0].value = entity_id pattern[i].attrs[0].value = entity_id
pattern[i].attrs[1].attr = ENT_TYPE
pattern[i].attrs[1].value = label
pattern[i].nr_attr = 0 pattern[i].nr_attr = 0
return pattern return pattern
cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
while pattern.nr_attr != 0:
pattern += 1
id_attr = pattern[0].attrs[0]
assert id_attr.attr == ID
return id_attr.value
cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil: cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
for attr in pattern.attrs[:pattern.nr_attr]: for attr in pattern.attrs[:pattern.nr_attr]:
if get_token_attr(token, attr.attr) != attr.value: if get_token_attr(token, attr.attr) != attr.value:
@ -148,7 +154,7 @@ def _convert_strings(token_specs, string_store):
if isinstance(attr, basestring): if isinstance(attr, basestring):
attr = attrs.IDS.get(attr.upper()) attr = attrs.IDS.get(attr.upper())
if isinstance(value, basestring): if isinstance(value, basestring):
value = string_store[value] value = string_store.add(value)
if isinstance(value, bool): if isinstance(value, bool):
value = int(value) value = int(value)
if attr is not None: if attr is not None:
@ -159,14 +165,14 @@ def _convert_strings(token_specs, string_store):
def merge_phrase(matcher, doc, i, matches): def merge_phrase(matcher, doc, i, matches):
'''Callback to merge a phrase on match''' """Callback to merge a phrase on match."""
ent_id, label, start, end = matches[i] ent_id, label, start, end = matches[i]
span = doc[start : end] span = doc[start : end]
span.merge(ent_type=label, ent_id=ent_id) span.merge(ent_type=label, ent_id=ent_id)
cdef class Matcher: cdef class Matcher:
'''Match sequences of tokens, based on pattern rules.''' """Match sequences of tokens, based on pattern rules."""
cdef Pool mem cdef Pool mem
cdef vector[TokenPatternC*] patterns cdef vector[TokenPatternC*] patterns
cdef readonly Vocab vocab cdef readonly Vocab vocab
@ -175,37 +181,12 @@ cdef class Matcher:
cdef public object _callbacks cdef public object _callbacks
cdef public object _acceptors cdef public object _acceptors
@classmethod def __init__(self, vocab):
def load(cls, path, vocab): """Create the Matcher.
"""
Load the matcher and patterns from a file path.
Arguments: vocab (Vocab): The vocabulary object, which must be shared with the
path (Path): documents the matcher will operate on.
Path to a JSON-formatted patterns file. RETURNS (Matcher): The newly constructed object.
vocab (Vocab):
The vocabulary that the documents to match over will refer to.
Returns:
Matcher: The newly constructed object.
"""
if (path / 'gazetteer.json').exists():
with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
patterns = ujson.load(file_)
else:
patterns = {}
return cls(vocab, patterns)
def __init__(self, vocab, patterns={}):
"""
Create the Matcher.
Arguments:
vocab (Vocab):
The vocabulary object, which must be shared with the documents
the matcher will operate on.
patterns (dict): Patterns to add to the matcher.
Returns:
The newly constructed object.
""" """
self._patterns = {} self._patterns = {}
self._entities = {} self._entities = {}
@ -213,144 +194,111 @@ cdef class Matcher:
self._callbacks = {} self._callbacks = {}
self.vocab = vocab self.vocab = vocab
self.mem = Pool() self.mem = Pool()
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
self.add_entity(entity_key, attrs)
for spec in specs:
self.add_pattern(entity_key, spec, label=etype)
def __reduce__(self): def __reduce__(self):
return (self.__class__, (self.vocab, self._patterns), None, None) return (self.__class__, (self.vocab, self._patterns), None, None)
property n_patterns: def __len__(self):
def __get__(self): return self.patterns.size() """Get the number of rules added to the matcher. Note that this only
returns the number of rules (identical with the number of IDs), not the
number of individual patterns.
def add_entity(self, entity_key, attrs=None, if_exists='raise', RETURNS (int): The number of rules.
acceptor=None, on_match=None):
""" """
Add an entity to the matcher. return len(self._patterns)
Arguments: def __contains__(self, key):
entity_key (unicode or int): """Check whether the matcher contains rules for a match ID.
An ID for the entity.
attrs: key (unicode): The match ID.
Attributes to associate with the Matcher. RETURNS (bool): Whether the matcher contains rules for this match ID.
if_exists ('raise', 'ignore' or 'update'):
Controls what happens if the entity ID already exists. Defaults to 'raise'.
acceptor:
Callback function to filter matches of the entity.
on_match:
Callback function to act on matches of the entity.
Returns:
None
""" """
if if_exists not in ('raise', 'ignore', 'update'): return len(self._patterns)
raise ValueError(
"Unexpected value for if_exists: %s.\n"
"Expected one of: ['raise', 'ignore', 'update']" % if_exists)
if attrs is None:
attrs = {}
entity_key = self.normalize_entity_key(entity_key)
if self.has_entity(entity_key):
if if_exists == 'raise':
raise KeyError(
"Tried to add entity %s. Entity exists, and if_exists='raise'.\n"
"Set if_exists='ignore' or if_exists='update', or check with "
"matcher.has_entity()")
elif if_exists == 'ignore':
return
self._entities[entity_key] = dict(attrs)
self._patterns.setdefault(entity_key, [])
self._acceptors[entity_key] = acceptor
self._callbacks[entity_key] = on_match
def add_pattern(self, entity_key, token_specs, label=""): def add(self, key, on_match, *patterns):
"""Add a match-rule to the matcher.
A match-rule consists of: an ID key, an on_match callback, and one or
more patterns. If the key exists, the patterns are appended to the
previous ones, and the previous on_match callback is replaced. The
`on_match` callback will receive the arguments `(matcher, doc, i,
matches)`. You can also set `on_match` to `None` to not perform any
actions. A pattern consists of one or more `token_specs`, where a
`token_spec` is a dictionary mapping attribute IDs to values. Token
descriptors can also include quantifiers. There are currently important
known problems with the quantifiers – see the docs.
""" """
Add a pattern to the matcher. for pattern in patterns:
if len(pattern) == 0:
msg = ("Cannot add pattern for zero tokens to matcher.\n"
"key: {key}\n")
raise ValueError(msg.format(key=key))
key = self._normalize_key(key)
self._patterns.setdefault(key, [])
self._callbacks[key] = on_match
Arguments: for pattern in patterns:
entity_key (unicode or int): specs = _convert_strings(pattern, self.vocab.strings)
An ID for the entity. self.patterns.push_back(init_pattern(self.mem, key, specs))
token_specs: self._patterns[key].append(specs)
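A usage sketch for the new ``add`` API, assuming a loaded ``nlp`` object; the ``'HelloWorld'`` key, the callback and the pattern are made up for illustration:

.. code:: python

    from spacy.matcher import Matcher

    matcher = Matcher(nlp.vocab)

    def on_match(matcher, doc, i, matches):
        print('Matched:', matches[i])

    matcher.add('HelloWorld', on_match, [{'LOWER': 'hello'}, {'LOWER': 'world'}])
    matches = matcher(nlp(u'Hello world!'))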
Description of the pattern to be matched.
label: def remove(self, key):
Label to assign to the matched pattern. Defaults to "". """Remove a rule from the matcher. A KeyError is raised if the key does
Returns: not exist.
None
key (unicode): The ID of the match rule.
""" """
token_specs = list(token_specs) key = self._normalize_key(key)
if len(token_specs) == 0: self._patterns.pop(key)
msg = ("Cannot add pattern for zero tokens to matcher.\n" self._callbacks.pop(key)
"entity_key: {entity_key}\n" cdef int i = 0
"label: {label}") while i < self.patterns.size():
raise ValueError(msg.format(entity_key=entity_key, label=label)) pattern_key = get_pattern_key(self.patterns.at(i))
entity_key = self.normalize_entity_key(entity_key) if pattern_key == key:
if not self.has_entity(entity_key): self.patterns.erase(self.patterns.begin()+i)
self.add_entity(entity_key) else:
if isinstance(label, basestring): i += 1
label = self.vocab.strings[label]
elif label is None:
label = 0
spec = _convert_strings(token_specs, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, entity_key, label, spec)) def has_key(self, key):
self._patterns[entity_key].append((label, token_specs)) """Check whether the matcher has a rule with a given key.
def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None): key (string or int): The key to check.
self.add_entity(entity_key, attrs=attrs, if_exists='update', RETURNS (bool): Whether the matcher has the rule.
acceptor=acceptor, on_match=on_match)
for spec in specs:
self.add_pattern(entity_key, spec, label=label)
def normalize_entity_key(self, entity_key):
if isinstance(entity_key, basestring):
return self.vocab.strings[entity_key]
else:
return entity_key
def has_entity(self, entity_key):
""" """
Check whether the matcher has an entity. key = self._normalize_key(key)
return key in self._patterns
Arguments: def get(self, key, default=None):
entity_key (string or int): The entity key to check. """Retrieve the pattern stored for a key.
Returns:
bool: Whether the matcher has the entity.
"""
entity_key = self.normalize_entity_key(entity_key)
return entity_key in self._entities
def get_entity(self, entity_key): key (unicode or int): The key to retrieve.
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
""" """
Retrieve the attributes stored for an entity. key = self._normalize_key(key)
if key not in self._patterns:
return default
return (self._callbacks[key], self._patterns[key])
Arguments: def pipe(self, docs, batch_size=1000, n_threads=2):
entity_key (unicode or int): The entity to retrieve. """Match a stream of documents, yielding them in turn.
Returns:
The entity attributes if present, otherwise None.
"""
entity_key = self.normalize_entity_key(entity_key)
if entity_key in self._entities:
return self._entities[entity_key]
else:
return None
def __call__(self, Doc doc, acceptor=None): docs (iterable): A stream of documents.
batch_size (int): The number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the `Matcher` implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
""" """
Find all token sequences matching the supplied patterns on the Doc. for doc in docs:
self(doc)
yield doc
Arguments: def __call__(self, Doc doc):
doc (Doc): """Find all token sequences matching the supplied patterns on the `Doc`.
The document to match over.
Returns: doc (Doc): The document to match over.
list RETURNS (list): A list of `(key, start, end)` tuples,
A list of (entity_key, label_id, start, end) tuples, describing the matches. A match tuple describes a span
describing the matches. A match tuple describes a span doc[start:end]. `doc[start:end]`. The `key` is an integer ID.
The label_id and entity_key are both integers.
""" """
if acceptor is not None:
raise ValueError(
"acceptor keyword argument to Matcher deprecated. Specify acceptor "
"functions when you add patterns instead.")
cdef vector[StateC] partials cdef vector[StateC] partials
cdef int n_partials = 0 cdef int n_partials = 0
cdef int q = 0 cdef int q = 0
@ -388,13 +336,7 @@ cdef class Matcher:
end = token_i+1 end = token_i+1
ent_id = state.second[1].attrs[0].value ent_id = state.second[1].attrs[0].value
label = state.second[1].attrs[1].value label = state.second[1].attrs[1].value
acceptor = self._acceptors.get(ent_id) matches.append((ent_id, start, end))
if acceptor is None:
matches.append((ent_id, label, start, end))
else:
match = acceptor(doc, ent_id, label, start, end)
if match:
matches.append(match)
partials.resize(q) partials.resize(q)
# Check whether we open any new patterns on this token # Check whether we open any new patterns on this token
for pattern in self.patterns: for pattern in self.patterns:
@ -419,13 +361,7 @@ cdef class Matcher:
end = token_i+1 end = token_i+1
ent_id = pattern[1].attrs[0].value ent_id = pattern[1].attrs[0].value
label = pattern[1].attrs[1].value label = pattern[1].attrs[1].value
acceptor = self._acceptors.get(ent_id) matches.append((ent_id, start, end))
if acceptor is None:
matches.append((ent_id, label, start, end))
else:
match = acceptor(doc, ent_id, label, start, end)
if match:
matches.append(match)
# Look for open patterns that are actually satisfied # Look for open patterns that are actually satisfied
for state in partials: for state in partials:
while state.second.quantifier in (ZERO, ZERO_PLUS): while state.second.quantifier in (ZERO, ZERO_PLUS):
@ -435,36 +371,19 @@ cdef class Matcher:
end = len(doc) end = len(doc)
ent_id = state.second.attrs[0].value ent_id = state.second.attrs[0].value
label = state.second.attrs[0].value label = state.second.attrs[0].value
acceptor = self._acceptors.get(ent_id) matches.append((ent_id, start, end))
if acceptor is None: for i, (ent_id, start, end) in enumerate(matches):
matches.append((ent_id, label, start, end))
else:
match = acceptor(doc, ent_id, label, start, end)
if match:
matches.append(match)
for i, (ent_id, label, start, end) in enumerate(matches):
on_match = self._callbacks.get(ent_id) on_match = self._callbacks.get(ent_id)
if on_match is not None: if on_match is not None:
on_match(self, doc, i, matches) on_match(self, doc, i, matches)
# TODO: only return (match_id, start, end)
return matches return matches
def pipe(self, docs, batch_size=1000, n_threads=2): def _normalize_key(self, key):
""" if isinstance(key, basestring):
Match a stream of documents, yielding them in turn. return self.vocab.strings.add(key)
else:
Arguments: return key
docs: A stream of documents.
batch_size (int):
The number of documents to accumulate into a working set.
n_threads (int):
The number of threads with which to work on the buffer in parallel,
if the Matcher implementation supports multi-threading.
Yields:
Doc Documents, in order.
"""
for doc in docs:
self(doc)
yield doc
def get_bilou(length): def get_bilou(length):
@ -550,7 +469,7 @@ cdef class PhraseMatcher:
self(doc) self(doc)
yield doc yield doc
def accept_match(self, Doc doc, int ent_id, int label, int start, int end): def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end):
assert (end - start) < self.max_length assert (end - start) < self.max_length
cdef int i, j cdef int i, j
for i in range(self.max_length): for i in range(self.max_length):
@ -30,6 +30,7 @@ cdef class Morphology:
cdef public object n_tags cdef public object n_tags
cdef public object reverse_index cdef public object reverse_index
cdef public object tag_names cdef public object tag_names
cdef public object exc
cdef RichTagC* rich_tags cdef RichTagC* rich_tags
cdef PreshMapArray _cache cdef PreshMapArray _cache
@ -33,36 +33,43 @@ def _normalize_props(props):
cdef class Morphology: cdef class Morphology:
def __init__(self, StringStore string_store, tag_map, lemmatizer): def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
self.mem = Pool() self.mem = Pool()
self.strings = string_store self.strings = string_store
self.tag_map = {} self.tag_map = {}
self.lemmatizer = lemmatizer self.lemmatizer = lemmatizer
self.n_tags = len(tag_map) + 1 self.n_tags = len(tag_map)
self.tag_names = tuple(sorted(tag_map.keys())) self.tag_names = tuple(sorted(tag_map.keys()))
self.reverse_index = {} self.reverse_index = {}
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC)) self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
attrs = _normalize_props(attrs)
self.tag_map[tag_str] = dict(attrs) self.tag_map[tag_str] = dict(attrs)
attrs = _normalize_props(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.rich_tags[i].id = i self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str] self.rich_tags[i].name = self.strings.add(tag_str)
self.rich_tags[i].morph = 0 self.rich_tags[i].morph = 0
self.rich_tags[i].pos = attrs[POS] self.rich_tags[i].pos = attrs[POS]
self.reverse_index[self.rich_tags[i].name] = i self.reverse_index[self.rich_tags[i].name] = i
self._cache = PreshMapArray(self.n_tags) self._cache = PreshMapArray(self.n_tags)
self.exc = {}
if exc is not None:
for (tag_str, orth_str), attrs in exc.items():
self.add_special_case(tag_str, orth_str, attrs)
def __reduce__(self): def __reduce__(self):
return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None) return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
self.exc), None, None)
cdef int assign_tag(self, TokenC* token, tag) except -1: cdef int assign_tag(self, TokenC* token, tag) except -1:
if isinstance(tag, basestring): if isinstance(tag, basestring):
tag_id = self.reverse_index[self.strings[tag]] tag = self.strings.add(tag)
else: if tag in self.reverse_index:
tag_id = self.reverse_index[tag] tag_id = self.reverse_index[tag]
self.assign_tag_id(token, tag_id) self.assign_tag_id(token, tag_id)
else:
token.tag = tag
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id >= self.n_tags: if tag_id >= self.n_tags:
@ -73,7 +80,7 @@ cdef class Morphology:
# the statistical model fails. # the statistical model fails.
# Related to Issue #220 # Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE): if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings['SP']] tag_id = self.reverse_index[self.strings.add('SP')]
rich_tag = self.rich_tags[tag_id] rich_tag = self.rich_tags[tag_id]
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth) analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
if analysis is NULL: if analysis is NULL:
@ -104,7 +111,8 @@ cdef class Morphology:
tag (unicode): The part-of-speech tag to key the exception. tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception. orth (unicode): The word-form to key the exception.
""" """
tag = self.strings[tag_str] self.exc[(tag_str, orth_str)] = dict(attrs)
tag = self.strings.add(tag_str)
tag_id = self.reverse_index[tag] tag_id = self.reverse_index[tag]
orth = self.strings[orth_str] orth = self.strings[orth_str]
cdef RichTagC rich_tag = self.rich_tags[tag_id] cdef RichTagC rich_tag = self.rich_tags[tag_id]
@ -140,14 +148,14 @@ cdef class Morphology:
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
cdef unicode py_string = self.strings[orth] cdef unicode py_string = self.strings[orth]
if self.lemmatizer is None: if self.lemmatizer is None:
return self.strings[py_string.lower()] return self.strings.add(py_string.lower())
if univ_pos not in (NOUN, VERB, ADJ, PUNCT): if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
return self.strings[py_string.lower()] return self.strings.add(py_string.lower())
cdef set lemma_strings cdef set lemma_strings
cdef unicode lemma_string cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
lemma_string = sorted(lemma_strings)[0] lemma_string = sorted(lemma_strings)[0]
lemma = self.strings[lemma_string] lemma = self.strings.add(lemma_string)
return lemma return lemma
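Stripped of the ``StringStore`` bookkeeping, the lemmatization fallback above boils down to roughly the following pure-Python logic; the function name and the string POS labels are just for illustration (the real code compares against the ``univ_pos_t`` constants):

.. code:: python

    def lemmatize_string(lemmatizer, word, univ_pos, morphology=None):
        # No lemmatizer, or a POS we don't lemmatize: fall back to the lowercased form
        if lemmatizer is None or univ_pos not in ('NOUN', 'VERB', 'ADJ', 'PUNCT'):
            return word.lower()
        lemmas = lemmatizer(word, univ_pos, morphology)
        return sorted(lemmas)[0]   # deterministic choice among the candidate lemmas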
@ -9,12 +9,18 @@ import numpy
cimport numpy as np cimport numpy as np
import cytoolz import cytoolz
import util import util
from collections import OrderedDict
import ujson
import msgpack
from thinc.api import add, layerize, chain, clone, concatenate from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine from thinc.neural import Model, Maxout, Softmax, Affine
from thinc.neural._classes.hash_embed import HashEmbed from thinc.neural._classes.hash_embed import HashEmbed
from thinc.neural.util import to_categorical from thinc.neural.util import to_categorical
from thinc.neural.pooling import Pooling, max_pool, mean_pool
from thinc.neural._classes.difference import Siamese, CauchySimilarity
from thinc.neural._classes.convolution import ExtractWindow from thinc.neural._classes.convolution import ExtractWindow
from thinc.neural._classes.resnet import Residual from thinc.neural._classes.resnet import Residual
from thinc.neural._classes.batchnorm import BatchNorm as BN from thinc.neural._classes.batchnorm import BatchNorm as BN
@ -31,110 +37,243 @@ from .syntax.stateclass cimport StateClass
from .gold cimport GoldParse from .gold cimport GoldParse
from .morphology cimport Morphology from .morphology cimport Morphology
from .vocab cimport Vocab from .vocab cimport Vocab
from .syntax import nonproj
from .compat import json_dumps
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import Tok2Vec, flatten, get_col, doc2feats from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
from ._ml import build_text_classifier, build_tagger_model
from .parts_of_speech import X from .parts_of_speech import X
class TokenVectorEncoder(object): class BaseThincComponent(object):
'''Assign position-sensitive vectors to tokens, using a CNN or RNN.''' name = None
name = 'tok2vec'
@classmethod @classmethod
def Model(cls, width=128, embed_size=5000, **cfg): def Model(cls, *shape, **kwargs):
width = util.env_opt('token_vector_width', width) raise NotImplementedError
embed_size = util.env_opt('embed_size', embed_size)
return Tok2Vec(width, embed_size, preprocess=None)
def __init__(self, vocab, model=True, **cfg): def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab raise NotImplementedError
self.doc2feats = doc2feats()
self.model = model
def __call__(self, docs, state=None): def __call__(self, doc):
if isinstance(docs, Doc): scores = self.predict([doc])
docs = [docs] self.set_annotations([doc], scores)
tokvecs = self.predict(docs) return doc
self.set_annotations(docs, tokvecs)
state = {} if state is None else state
state['tokvecs'] = tokvecs
return state
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128, n_threads=-1):
for batch in cytoolz.partition_all(batch_size, stream): for docs in cytoolz.partition_all(batch_size, stream):
docs, states = zip(*batch) docs = list(docs)
tokvecs = self.predict(docs) scores = self.predict(docs)
self.set_annotations(docs, tokvecs) self.set_annotations(docs, scores)
for state in states: yield from docs
state['tokvecs'] = tokvecs
yield from zip(docs, states)
def predict(self, docs): def predict(self, docs):
feats = self.doc2feats(docs) raise NotImplementedError
tokvecs = self.model(feats)
return tokvecs
def set_annotations(self, docs, tokvecs): def set_annotations(self, docs, scores):
start = 0 raise NotImplementedError
for doc in docs:
doc.tensor = tokvecs[start : start + len(doc)]
start += len(doc)
def update(self, docs, golds, state=None, def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
drop=0., sgd=None): raise NotImplementedError
if isinstance(docs, Doc):
docs = [docs]
golds = [golds]
state = {} if state is None else state
feats = self.doc2feats(docs)
tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop)
state['feats'] = feats
state['tokvecs'] = tokvecs
state['bp_tokvecs'] = bp_tokvecs
return state
def get_loss(self, docs, golds, scores): def get_loss(self, docs, golds, scores):
raise NotImplementedError raise NotImplementedError
def begin_training(self, gold_tuples, pipeline=None): def begin_training(self, gold_tuples=tuple(), pipeline=None):
self.doc2feats = doc2feats() token_vector_width = pipeline[0].model.nO
if self.model is True: if self.model is True:
self.model = self.Model() self.model = self.Model(1, token_vector_width)
def use_params(self, params): def use_params(self, params):
with self.model.use_params(params): with self.model.use_params(params):
yield yield
def to_bytes(self, **exclude):
serialize = OrderedDict((
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes())
))
return util.to_bytes(serialize, exclude)
class NeuralTagger(object): def from_bytes(self, bytes_data, **exclude):
name = 'nn_tagger' if self.model is True:
def __init__(self, vocab, model=True): self.model = self.Model()
deserialize = OrderedDict((
('model', lambda b: self.model.from_bytes(b)),
('vocab', lambda b: self.vocab.from_bytes(b))
))
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
serialize = OrderedDict((
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('vocab', lambda p: self.vocab.to_disk(p)),
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
if self.model is True:
self.model = self.Model()
deserialize = OrderedDict((
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
('vocab', lambda p: self.vocab.from_disk(p)),
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
))
util.from_disk(path, deserialize, exclude)
return self
def _load_cfg(path):
if path.exists():
return ujson.load(path.open())
else:
return {}
class TokenVectorEncoder(BaseThincComponent):
"""Assign position-sensitive vectors to tokens, using a CNN or RNN."""
name = 'tensorizer'
@classmethod
def Model(cls, width=128, embed_size=7500, **cfg):
"""Create a new statistical model for the class.
width (int): Output size of the model.
embed_size (int): Number of vectors in the embedding table.
**cfg: Config parameters.
RETURNS (Model): A `thinc.neural.Model` or similar instance.
"""
width = util.env_opt('token_vector_width', width)
embed_size = util.env_opt('embed_size', embed_size)
return Tok2Vec(width, embed_size, preprocess=None)
def __init__(self, vocab, model=True, **cfg):
"""Construct a new statistical model. Weights are not allocated on
initialisation.
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
instance with the `Doc` objects it will process.
model (Model): A `Model` instance, or `True` to allocate one later.
**cfg: Config parameters.
EXAMPLE:
>>> from spacy.pipeline import TokenVectorEncoder
>>> tok2vec = TokenVectorEncoder(nlp.vocab)
>>> tok2vec.model = tok2vec.Model(128, 5000)
"""
self.vocab = vocab self.vocab = vocab
self.doc2feats = doc2feats()
self.model = model self.model = model
self.cfg = dict(cfg)
def __call__(self, doc, state=None): def __call__(self, doc):
assert state is not None """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
assert 'tokvecs' in state model. Vectors are set to the `Doc.tensor` attribute.
tokvecs = state['tokvecs']
tags = self.predict(tokvecs) doc (Doc): The document to add vectors to.
self.set_annotations([doc], tags) RETURNS (Doc): The document, with `Doc.tensor` set.
return state """
tokvecses = self.predict([doc])
self.set_annotations([doc], tokvecses)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128, n_threads=-1):
for batch in cytoolz.partition_all(batch_size, stream): """Process `Doc` objects as a stream.
docs, states = zip(*batch)
tag_ids = self.predict(states[0]['tokvecs'])
self.set_annotations(docs, tag_ids)
for state in states:
state['tag_ids'] = tag_ids
yield from zip(docs, states)
def predict(self, tokvecs): stream (iterator): A sequence of `Doc` objects to process.
scores = self.model(tokvecs) batch_size (int): Number of `Doc` objects to group.
n_threads (int): Number of threads.
YIELDS (iterator): A sequence of `Doc` objects, in order of input.
"""
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tokvecses = self.predict(docs)
self.set_annotations(docs, tokvecses)
yield from docs
def predict(self, docs):
"""Return a single tensor for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the documents.
"""
feats = self.doc2feats(docs)
tokvecs = self.model(feats)
return tokvecs
def set_annotations(self, docs, tokvecses):
"""Set the tensor attribute for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
tokvecs (object): Vector representation for each token in the documents.
"""
for doc, tokvecs in zip(docs, tokvecses):
assert tokvecs.shape[0] == len(doc)
doc.tensor = tokvecs
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
"""Update the model.
docs (iterable): A batch of `Doc` objects.
golds (iterable): A batch of `GoldParse` objects.
drop (float): The dropout rate.
sgd (callable): An optimizer.
RETURNS (tuple): The token vectors and their backprop callback.
"""
if isinstance(docs, Doc):
docs = [docs]
feats = self.doc2feats(docs)
tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop)
return tokvecs, bp_tokvecs
def get_loss(self, docs, golds, scores):
# TODO: implement
raise NotImplementedError
def begin_training(self, gold_tuples=tuple(), pipeline=None):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer.
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
"""
self.doc2feats = doc2feats()
if self.model is True:
self.model = self.Model()
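A minimal usage sketch of the tensorizer component documented above, following the EXAMPLE in its docstring; this is an illustration only, and the pipeline setup and sizes are assumptions rather than code from this commit:

from spacy.lang.en import English
from spacy.pipeline import TokenVectorEncoder

nlp = English()  # tokenizer-only pipeline
doc = nlp(u'This is a sentence.')
tok2vec = TokenVectorEncoder(nlp.vocab)
tok2vec.model = tok2vec.Model(128, 5000)  # width, embed_size
tok2vec(doc)  # sets doc.tensor, one row per token
assert doc.tensor.shape[0] == len(doc)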
class NeuralTagger(BaseThincComponent):
name = 'tagger'
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
def __call__(self, doc):
tags = self.predict(([doc], [doc.tensor]))
self.set_annotations([doc], tags)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tokvecs = [d.tensor for d in docs]
tag_ids = self.predict((docs, tokvecs))
self.set_annotations(docs, tag_ids)
yield from docs
def predict(self, docs_tokvecs):
scores = self.model(docs_tokvecs)
scores = self.model.ops.flatten(scores)
guesses = scores.argmax(axis=1)
if not isinstance(guesses, numpy.ndarray):
guesses = guesses.get()
tokvecs = docs_tokvecs[1]
guesses = self.model.ops.unflatten(guesses,
[tv.shape[0] for tv in tokvecs])
return guesses
def set_annotations(self, docs, batch_tag_ids):
@ -142,49 +281,49 @@ class NeuralTagger(object):
docs = [docs]
cdef Doc doc
cdef int idx = 0
cdef Vocab vocab = self.vocab
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber preset POS tags
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
idx += 1
doc.is_tagged = True
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
docs, tokvecs = docs_tokvecs
if self.model.nI is None:
self.model.nI = tokvecs[0].shape[1]
tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
return d_tokvecs
def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores)
tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)}
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype='i')
guesses = scores.argmax(axis=1)
for gold in golds:
for tag in gold.tags:
if tag is None:
correct[idx] = guesses[idx]
else:
correct[idx] = tag_index[tag]
idx += 1
correct = self.model.ops.xp.array(correct, dtype='i')
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores /= d_scores.shape[0]
loss = (d_scores**2).sum()
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
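For readers following the tagger loss above: the gradient is simply the flattened score matrix minus a one-hot encoding of the gold tag indices, scaled by the number of tokens in the batch. A tiny standalone numpy illustration, with made-up numbers (not part of this commit):

import numpy

scores = numpy.asarray([[0.7, 0.2, 0.1],
                        [0.1, 0.8, 0.1]], dtype='f')  # 2 tokens, 3 tags
correct = numpy.asarray([0, 2], dtype='i')            # gold tag indices
one_hot = numpy.eye(scores.shape[1], dtype='f')[correct]
d_scores = (scores - one_hot) / scores.shape[0]       # gradient, as in get_loss
loss = (d_scores ** 2).sum()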
def begin_training(self, gold_tuples=tuple(), pipeline=None):
orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = {}
for raw_text, annots_brackets in gold_tuples:
@ -195,22 +334,277 @@ class NeuralTagger(object):
new_tag_map[tag] = orig_tag_map[tag]
else:
new_tag_map[tag] = {POS: X}
if 'SP' not in new_tag_map:
new_tag_map['SP'] = orig_tag_map.get('SP', {POS: X})
cdef Vocab vocab = self.vocab
if new_tag_map:
vocab.morphology = Morphology(vocab.strings, new_tag_map,
vocab.morphology.lemmatizer,
exc=vocab.morphology.exc)
token_vector_width = pipeline[0].model.nO
if self.model is True:
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
@classmethod
def Model(cls, n_tags, token_vector_width):
return build_tagger_model(n_tags, token_vector_width)
def use_params(self, params):
with self.model.use_params(params):
yield
def to_bytes(self, **exclude):
serialize = OrderedDict((
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes()),
('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map,
use_bin_type=True,
encoding='utf8'))
))
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
def load_model(b):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width', 128)
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(b)
def load_tag_map(b):
tag_map = msgpack.loads(b, encoding='utf8')
self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer,
exc=self.vocab.morphology.exc)
deserialize = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('tag_map', load_tag_map),
('model', lambda b: load_model(b)),
))
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(self, path, **exclude):
serialize = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)),
('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
self.vocab.morphology.tag_map,
use_bin_type=True,
encoding='utf8'))),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
def load_model(p):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width', 128)
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(p.open('rb').read())
def load_tag_map(p):
with p.open('rb') as file_:
tag_map = msgpack.loads(file_.read(), encoding='utf8')
self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer,
exc=self.vocab.morphology.exc)
deserialize = OrderedDict((
('vocab', lambda p: self.vocab.from_disk(p)),
('tag_map', load_tag_map),
('model', load_model),
('cfg', lambda p: self.cfg.update(_load_cfg(p)))
))
util.from_disk(path, deserialize, exclude)
return self
class NeuralLabeller(NeuralTagger):
name = 'nn_labeller'
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
@property
def labels(self):
return self.cfg.setdefault('labels', {})
@labels.setter
def labels(self, value):
self.cfg['labels'] = value
def set_annotations(self, docs, dep_ids):
pass
def begin_training(self, gold_tuples=tuple(), pipeline=None):
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
for raw_text, annots_brackets in gold_tuples:
for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots
for dep in deps:
if dep not in self.labels:
self.labels[dep] = len(self.labels)
token_vector_width = pipeline[0].model.nO
if self.model is True:
self.model = self.Model(len(self.labels), token_vector_width)
@classmethod
def Model(cls, n_tags, token_vector_width):
return build_tagger_model(n_tags, token_vector_width)
def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores)
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype='i')
guesses = scores.argmax(axis=1)
for gold in golds:
for tag in gold.labels:
if tag is None or tag not in self.labels:
correct[idx] = guesses[idx]
else:
correct[idx] = self.labels[tag]
idx += 1
correct = self.model.ops.xp.array(correct, dtype='i')
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores /= d_scores.shape[0]
loss = (d_scores**2).sum()
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
class SimilarityHook(BaseThincComponent):
"""
Experimental
A pipeline component to install a hook for supervised similarity into
Doc objects. Requires a Tensorizer to pre-process documents. The similarity
model can be any object obeying the Thinc Model interface. By default,
the model concatenates the elementwise mean and elementwise max of the two
tensors, and compares them using the Cauchy-like similarity function
from Chen (2013):
similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
Where W is a vector of dimension weights, initialized to 1.
"""
name = 'similarity'
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
@classmethod
def Model(cls, length):
return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
def __call__(self, doc):
'''Install similarity hook'''
doc.user_hooks['similarity'] = self.predict
return doc
def pipe(self, docs, **kwargs):
for doc in docs:
yield self(doc)
def predict(self, doc1, doc2):
return self.model.predict([(doc1.tensor, doc2.tensor)])
def update(self, doc1_tensor1_doc2_tensor2, golds, sgd=None, drop=0.):
doc1s, tensor1s, doc2s, tensor2s = doc1_tensor1_doc2_tensor2
sims, bp_sims = self.model.begin_update(zip(tensor1s, tensor2s),
drop=drop)
d_tensor1s, d_tensor2s = bp_sims(golds, sgd=sgd)
return d_tensor1s, d_tensor2s
def begin_training(self, _=tuple(), pipeline=None):
"""
Allocate model, using width from tensorizer in pipeline.
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
"""
if self.model is True:
self.model = self.Model(pipeline[0].model.nO)
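The Cauchy-like similarity described in the SimilarityHook docstring above can be written out directly; a tiny numpy sketch for illustration only, with W being the vector of dimension weights initialised to 1 as the docstring says:

import numpy

def cauchy_similarity(vec1, vec2, W=None):
    # similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
    if W is None:
        W = numpy.ones_like(vec1)
    return 1. / (1. + (W * (vec1 - vec2) ** 2).sum())

a = numpy.array([0.1, 0.3, 0.5])
b = numpy.array([0.1, 0.2, 0.9])
print(cauchy_similarity(a, a))  # 1.0 for identical vectors
print(cauchy_similarity(a, b))  # smaller for more distant vectors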
class TextCategorizer(BaseThincComponent):
name = 'textcat'
@classmethod
def Model(cls, nr_class=1, width=64, **cfg):
return build_text_classifier(nr_class, width, **cfg)
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
@property
def labels(self):
return self.cfg.get('labels', ['LABEL'])
@labels.setter
def labels(self, value):
self.cfg['labels'] = value
def __call__(self, doc):
scores = self.predict([doc])
self.set_annotations([doc], scores)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)
self.set_annotations(docs, scores)
yield from docs
def predict(self, docs):
scores = self.model(docs)
scores = self.model.ops.asarray(scores)
return scores
def set_annotations(self, docs, scores):
for i, doc in enumerate(docs):
for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j])
def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
docs, tensors = docs_tensors
scores, bp_scores = self.model.begin_update(docs, drop=drop)
loss, d_scores = self.get_loss(docs, golds, scores)
d_tensors = bp_scores(d_scores, sgd=sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
return d_tensors
def get_loss(self, docs, golds, scores):
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
for i, gold in enumerate(golds):
for j, label in enumerate(self.labels):
truths[i, j] = label in gold.cats
truths = self.model.ops.asarray(truths)
d_scores = (scores-truths) / scores.shape[0]
mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
return mean_square_error, d_scores
def begin_training(self, gold_tuples=tuple(), pipeline=None):
if pipeline:
token_vector_width = pipeline[0].model.nO
else:
token_vector_width = 64
if self.model is True:
self.model = self.Model(len(self.labels), token_vector_width)
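For reference, the categorizer's get_loss above builds a 0/1 truths matrix from gold.cats and uses a mean-squared-error style objective; a standalone numpy illustration with made-up labels and scores (not part of this commit):

import numpy

labels = ['POSITIVE', 'NEGATIVE']  # hypothetical label set
scores = numpy.asarray([[0.9, 0.2]], dtype='f')  # one document
gold_cats = {'POSITIVE'}  # categories marked in the gold annotation
truths = numpy.asarray([[label in gold_cats for label in labels]], dtype='f')
d_scores = (scores - truths) / scores.shape[0]
mean_square_error = ((scores - truths) ** 2).sum(axis=1).mean()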
cdef class EntityRecognizer(LinearParser):
"""Annotate named entities on Doc objects."""
TransitionSystem = BiluoPushDown
feature_templates = get_feature_templates('ner')
@ -222,9 +616,7 @@ cdef class EntityRecognizer(LinearParser):
cdef class BeamEntityRecognizer(BeamParser):
"""Annotate named entities on Doc objects."""
TransitionSystem = BiluoPushDown
feature_templates = get_feature_templates('ner')
@ -249,32 +641,26 @@ cdef class NeuralDependencyParser(NeuralParser):
name = 'parser'
TransitionSystem = ArcEager
def __reduce__(self):
return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
cdef class NeuralEntityRecognizer(NeuralParser):
name = 'ner'
TransitionSystem = BiluoPushDown
nr_feature = 6
def predict_confidences(self, docs):
tensors = [d.tensor for d in docs]
samples = []
for i in range(10):
states = self.parse_batch(docs, tensors, drop=0.3)
for state in states:
samples.append(self._get_entities(state))
def __reduce__(self):
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
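The new predict_confidences above re-parses each batch ten times with dropout (drop=0.3) and collects the sampled entity analyses. One way such samples could be turned into per-span confidences, shown purely as an illustration (span_confidences and the sample data are hypothetical, not part of this diff):

from collections import Counter

def span_confidences(samples):
    # Fraction of dropout samples in which each (start, end, label) span appears.
    counts = Counter(span for sample in samples for span in sample)
    n = float(len(samples))
    return {span: count / n for span, count in counts.items()}

samples = [[(0, 2, 'PERSON')], [(0, 2, 'PERSON')], [(0, 1, 'ORG')]]
print(span_confidences(samples))  # roughly {(0, 2, 'PERSON'): 0.67, (0, 1, 'ORG'): 0.33}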
cdef class BeamDependencyParser(BeamParser): cdef class BeamDependencyParser(BeamParser):
View File
@ -1,4 +1,5 @@
from libc.stdint cimport int64_t
from libcpp.vector cimport vector
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
@ -8,6 +9,9 @@ from .typedefs cimport attr_t, hash_t
cpdef hash_t hash_string(unicode string) except 0
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
cdef unicode decode_Utf8Str(const Utf8Str* string)
ctypedef union Utf8Str:
@ -17,13 +21,11 @@ ctypedef union Utf8Str:
cdef class StringStore:
cdef Pool mem
cdef bint is_frozen
cdef vector[hash_t] keys
cdef public PreshMap _map
cdef public PreshMap _oov
cdef const Utf8Str* intern_unicode(self, unicode py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
View File
@ -7,11 +7,16 @@ from libc.string cimport memcpy
from libc.stdint cimport uint64_t, uint32_t
from murmurhash.mrmr cimport hash64, hash32
from preshed.maps cimport map_iter, key_t
from libc.stdint cimport uint32_t
import ujson
import dill
from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT
from .typedefs cimport hash_t
from . import util
from .compat import json_dumps
cpdef hash_t hash_string(unicode string) except 0:
@ -27,7 +32,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
return hash32(utf8_string, length, 1)
cdef unicode decode_Utf8Str(const Utf8Str* string):
cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1].decode('utf8')
@ -44,10 +49,10 @@ cdef unicode _decode(const Utf8Str* string):
return string.p[i:length + i].decode('utf8')
cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
cdef int n_length_bytes
cdef int i
cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
cdef uint32_t ulength = length
if length < sizeof(string.s):
string.s[0] = <unsigned char>length
@ -72,129 +77,166 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
cdef class StringStore:
"""Look up strings by 64-bit hashes."""
def __init__(self, strings=None, freeze=False):
"""Create the StringStore.
strings (iterable): A sequence of unicode strings to add to the store.
RETURNS (StringStore): The newly constructed object.
"""
self.mem = Pool()
self._map = PreshMap()
self._oov = PreshMap()
self.is_frozen = freeze
if strings is not None:
for string in strings:
self.add(string)
property size:
def __get__(self):
return self.size -1
def __reduce__(self):
# TODO: OOV words, for the is_frozen stuff?
if self.is_frozen:
raise NotImplementedError(
"Currently missing support for pickling StringStore when "
"is_frozen=True")
return (StringStore, (list(self),))
def __len__(self):
"""
The number of strings in the store.
Returns:
int The number of strings in the store.
"""
return self.size-1
def __getitem__(self, object string_or_id):
"""Retrieve a string from a given hash, or vice versa.
string_or_id (bytes, unicode or uint64): The value to encode.
Returns (unicode or uint64): The value to be retrieved.
"""
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
return 0
elif string_or_id == 0:
return u''
elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id]
cdef hash_t key
if isinstance(string_or_id, unicode):
key = hash_string(string_or_id)
return key
elif isinstance(string_or_id, bytes):
key = hash_utf8(string_or_id, len(string_or_id))
return key
elif string_or_id < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[string_or_id]
else:
key = string_or_id
utf8str = <Utf8Str*>self._map.get(key)
if utf8str is NULL:
raise KeyError(string_or_id)
else:
return decode_Utf8Str(utf8str)
def add(self, string):
"""Add a string to the StringStore.
string (unicode): The string to add.
RETURNS (uint64): The string's hash value.
"""
if isinstance(string, unicode):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_string(string)
self.intern_unicode(string)
elif isinstance(string, bytes):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
key = hash_utf8(string, len(string))
self._intern_utf8(string, len(string))
else:
raise TypeError(
"Can only add unicode or bytes. Got type: %s" % type(string))
return key
def __len__(self):
"""The number of strings in the store.
RETURNS (int): The number of strings in the store.
"""
return self.keys.size()
def __contains__(self, string not None):
"""Check whether a string is in the store.
string (unicode): The string to check.
RETURNS (bool): Whether the store contains the string.
"""
cdef hash_t key
if isinstance(string, int) or isinstance(string, long):
if string == 0:
return True
key = string
elif len(string) == 0:
return True
elif string in SYMBOLS_BY_STR:
return True
elif isinstance(string, unicode):
key = hash_string(string)
else:
string = string.encode('utf8')
key = hash_utf8(string, len(string))
if key < len(SYMBOLS_BY_INT):
return True
else:
return self._map.get(key) is not NULL
def __iter__(self):
"""Iterate over the strings in the store, in order.
YIELDS (unicode): A string in the store.
"""
cdef int i
cdef hash_t key
for i in range(self.keys.size()):
key = self.keys[i]
utf8str = <Utf8Str*>self._map.get(key)
yield decode_Utf8Str(utf8str)
# TODO: Iterate OOV here?
def __reduce__(self):
strings = list(self)
return (StringStore, (strings,), None, None, None)
def to_disk(self, path):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
"""
path = util.ensure_path(path)
strings = list(self)
with path.open('w') as file_:
file_.write(json_dumps(strings))
def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (StringStore): The modified `StringStore` object.
"""
path = util.ensure_path(path)
with path.open('r') as file_:
strings = ujson.load(file_)
self._reset_and_load(strings)
return self
def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `StringStore` object.
"""
return ujson.dumps(list(self))
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (StringStore): The `StringStore` object.
"""
strings = ujson.loads(bytes_data)
self._reset_and_load(strings)
return self
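A short usage sketch of the hash-based API documented above (illustrative only; the strings are arbitrary): add() returns the 64-bit hash, lookups work in both directions, and to_bytes()/from_bytes() round-trip the contents.

from spacy.strings import StringStore

store = StringStore([u'apple', u'orange'])
key = store.add(u'banana')  # 64-bit hash of the string
assert store[u'banana'] == key  # unicode -> hash
assert store[key] == u'banana'  # hash -> unicode
assert u'banana' in store

data = store.to_bytes()
store2 = StringStore().from_bytes(data)
assert store2[key] == u'banana'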
def set_frozen(self, bint is_frozen):
# TODO
self.is_frozen = is_frozen
@ -202,6 +244,15 @@ cdef class StringStore:
def flush_oov(self):
self._oov = PreshMap()
def _reset_and_load(self, strings, freeze=False):
self.mem = Pool()
self._map = PreshMap()
self._oov = PreshMap()
self.keys.clear()
for string in strings:
self.add(string)
self.is_frozen = freeze
cdef const Utf8Str* intern_unicode(self, unicode py_string):
# 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode('utf8')
@ -223,73 +274,11 @@ cdef class StringStore:
key32 = hash32_utf8(utf8_string, length)
# Important: Make the OOV store own the memory. That way it's trivial
# to flush them all.
value = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
self._oov.set(key32, value)
return NULL
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
self._map.set(key, value)
self.keys.push_back(key)
return value
def dump(self, file_):
"""
Save the strings to a JSON file.
Arguments:
file_ (buffer): The file to save the strings.
Returns:
None
"""
string_data = ujson.dumps(list(self))
if not isinstance(string_data, unicode):
string_data = string_data.decode('utf8')
# TODO: OOV?
file_.write(string_data)
def load(self, file_):
"""
Load the strings from a JSON file.
Arguments:
file_ (buffer): The file from which to load the strings.
Returns:
None
"""
strings = ujson.load(file_)
if strings == ['']:
return None
cdef unicode string
for string in strings:
# explicit None/len check instead of simple truth testing
# (bug in Cython <= 0.23.4)
if string is not None and len(string):
self.intern_unicode(string)
def _realloc(self):
# We want to map straight to pointers, but they'll be invalidated if
# we resize our array. So, first we remap to indices, then we resize,
# then we can acquire the new pointers.
cdef Pool tmp_mem = Pool()
keys = <key_t*>tmp_mem.alloc(self.size, sizeof(key_t))
cdef key_t key
cdef void* value
cdef const Utf8Str ptr
cdef int i = 0
cdef size_t offset
while map_iter(self._map.c_map, &i, &key, &value):
# Find array index with pointer arithmetic
offset = ((<Utf8Str*>value) - self.c)
keys[offset] = key
self._resize_at *= 2
cdef size_t new_size = self._resize_at * sizeof(Utf8Str)
self.c = <Utf8Str*>self.mem.realloc(self.c, new_size)
self._map = PreshMap(self.size)
for i in range(self.size):
if keys[i]:
self._map.set(keys[i], &self.c[i])
View File
@ -5,8 +5,6 @@ from .parts_of_speech cimport univ_pos_t
cdef struct LexemeC:
flags_t flags
attr_t lang
@ -25,11 +23,10 @@ cdef struct LexemeC:
float prob
float sentiment
cdef struct SerializedLexemeC:
unsigned char[8 + 8*10 + 4 + 4] data
# sizeof(flags_t) # flags
# + sizeof(attr_t) # lang
# + sizeof(attr_t) # id
@ -50,7 +47,7 @@ cdef struct Entity:
hash_t id
int start
int end
attr_t label
cdef struct TokenC:
@ -58,12 +55,12 @@ cdef struct TokenC:
uint64_t morph
univ_pos_t pos
bint spacy
attr_t tag
int idx
attr_t lemma
attr_t sense
int head
attr_t dep
bint sent_start
uint32_t l_kids
@ -72,5 +69,5 @@ cdef struct TokenC:
uint32_t r_edge
int ent_iob
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
hash_t ent_id
View File
@ -82,6 +82,7 @@ cpdef enum symbol_t:
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB
View File
@ -84,6 +84,7 @@ IDS = {
"ENT_IOB": ENT_IOB, "ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE, "ENT_TYPE": ENT_TYPE,
"HEAD": HEAD, "HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY, "SPACY": SPACY,
"PROB": PROB, "PROB": PROB,
View File
@ -9,6 +9,7 @@ from ..structs cimport TokenC, Entity
from ..lexeme cimport Lexeme
from ..symbols cimport punct
from ..attrs cimport IS_SPACE
from ..typedefs cimport attr_t
cdef inline bint is_space_token(const TokenC* token) nogil:
@ -71,6 +72,45 @@ cdef cppclass StateC:
free(this._stack - PADDING)
free(this.shifted - PADDING)
void set_context_tokens(int* ids, int n) nogil:
if n == 13:
ids[0] = this.B(0)
ids[1] = this.B(1)
ids[2] = this.S(0)
ids[3] = this.S(1)
ids[4] = this.S(2)
ids[5] = this.L(this.S(0), 1)
ids[6] = this.L(this.S(0), 2)
ids[6] = this.R(this.S(0), 1)
ids[7] = this.L(this.B(0), 1)
ids[8] = this.R(this.S(0), 2)
ids[9] = this.L(this.S(1), 1)
ids[10] = this.L(this.S(1), 2)
ids[11] = this.R(this.S(1), 1)
ids[12] = this.R(this.S(1), 2)
elif n == 6:
if this.B(0) >= 0:
ids[0] = this.B(0)
else:
ids[0] = -1
ids[1] = this.B(0)
ids[2] = this.B(1)
ids[3] = this.E(0)
if ids[3] >= 1:
ids[4] = this.E(0)-1
else:
ids[4] = -1
if (ids[3]+1) < this.length:
ids[5] = this.E(0)+1
else:
ids[5] = -1
else:
# TODO error =/
pass
for i in range(n):
if ids[i] >= 0:
ids[i] += this.offset
int S(int i) nogil const:
if i >= this._s_i:
return -1
@ -238,7 +278,7 @@ cdef cppclass StateC:
this._s_i -= 1
this.shifted[this.B(0)] = True
void add_arc(int head, int child, attr_t label) nogil:
if this.has_head(child):
this.del_arc(this.H(child), child)
@ -282,7 +322,7 @@ cdef cppclass StateC:
h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
h.l_kids -= 1
void open_ent(attr_t label) nogil:
this._ents[this._e_i].start = this.B(0)
this._ents[this._e_i].label = label
this._ents[this._e_i].end = -1
@ -294,7 +334,7 @@ cdef cppclass StateC:
this._ents[this._e_i-1].end = this.B(0)+1
this._sent[this.B(0)].ent_iob = 1
void set_ent_tag(int i, int ent_iob, attr_t ent_type) nogil:
if 0 <= i < this.length:
this._sent[i].ent_iob = ent_iob
this._sent[i].ent_type = ent_type
@ -305,16 +345,18 @@ cdef cppclass StateC:
this._break = this._b_i
void clone(const StateC* src) nogil:
this.length = src.length
memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
memcpy(this._stack, src._stack, this.length * sizeof(int))
memcpy(this._buffer, src._buffer, this.length * sizeof(int))
memcpy(this._ents, src._ents, this.length * sizeof(Entity))
memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
this._b_i = src._b_i
this._s_i = src._s_i
this._e_i = src._e_i
this._break = src._break
this.offset = src.offset
this._empty_token = src._empty_token
void fast_forward() nogil:
# space token attachement policy:
View File
@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from .stateclass cimport StateClass
from ..typedefs cimport attr_t
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParseC
View File
@ -9,10 +9,12 @@ import ctypes
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from cymem.cymem cimport Pool
from collections import OrderedDict
from thinc.extra.search cimport Beam
import numpy
from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token
from .nonproj import is_nonproj_tree
from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t
@ -60,7 +62,7 @@ cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) no
cost += 1
if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)):
cost += 1
cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0
return cost
@ -73,7 +75,7 @@ cdef weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nog
cost += gold.heads[target] == B_i
if gold.heads[B_i] == B_i or gold.heads[B_i] < target:
break
if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0:
cost += 1
return cost
@ -84,14 +86,14 @@ cdef weight_t arc_cost(StateClass stcls, const GoldParseC* gold, int head, int c
elif stcls.H(child) == gold.heads[child]:
return 1
# Head in buffer
elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != 0:
return 1
else:
return 0
cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
if not gold.has_dep[child]:
return True
elif gold.heads[child] == head:
return True
@ -99,10 +101,10 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
return False
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil:
if not gold.has_dep[child]:
return True
elif label == 0:
return True
elif gold.labels[child] == label:
return True
@ -111,21 +113,20 @@ cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label)
cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
return gold.heads[word] == word or not gold.has_dep[word]
cdef class Shift:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.push()
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil:
return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
@staticmethod
@ -133,17 +134,17 @@ cdef class Shift:
return push_cost(s, gold, s.B(0))
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef class Reduce:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.stack_depth() >= 2
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
if st.has_head(st.S(0)):
st.pop()
else:
@ -151,7 +152,7 @@ cdef class Reduce:
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
@staticmethod
@ -165,28 +166,28 @@ cdef class Reduce:
cost -= 1
if gold.heads[S_i] == st.S(0):
cost -= 1
if Break.is_valid(st.c, 0) and Break.move_cost(st, gold) == 0:
cost -= 1
return cost
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef class LeftArc:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.B(0), st.S(0), label)
st.pop()
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
@staticmethod
@ -204,23 +205,23 @@ cdef class LeftArc:
return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
cdef class RightArc:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.S(0), st.B(0), label)
st.push()
st.fast_forward()
@staticmethod
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
@staticmethod
@ -233,13 +234,13 @@ cdef class RightArc:
return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
@staticmethod
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
cdef class Break:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int i
if not USE_BREAK:
return False
@ -251,12 +252,12 @@ cdef class Break:
return True
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
st.set_break(st.B_(0).l_edge)
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
@staticmethod
@ -281,13 +282,13 @@ cdef class Break:
return cost + 1
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef int _get_root(int word, const GoldParseC* gold) nogil:
while gold.heads[word] != word and not gold.has_dep[word] and word >= 0:
word = gold.heads[word]
if not gold.has_dep[word]:
return -1
else:
return word
@ -295,9 +296,7 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil:
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
for i in range(st.c.length):
st.c._sent[i].l_edge = i
st.c._sent[i].r_edge = i
st.fast_forward()
@ -313,21 +312,24 @@ cdef class ArcEager(TransitionSystem):
@classmethod
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
OrderedDict((
(SHIFT, ['']),
(REDUCE, ['']),
(RIGHT, []),
(LEFT, []),
(BREAK, ['ROOT'])
)))
seen_actions = set()
for label in kwargs.get('left_labels', []):
if label.upper() != 'ROOT':
if (LEFT, label) not in seen_actions:
actions[LEFT].append(label)
seen_actions.add((LEFT, label))
for label in kwargs.get('right_labels', []):
if label.upper() != 'ROOT':
if (RIGHT, label) not in seen_actions:
actions[RIGHT].append(label)
seen_actions.add((RIGHT, label))
for raw_text, sents in kwargs.get('gold_parses', []):
for (ids, words, tags, heads, labels, iob), ctnts in sents:
@ -338,29 +340,39 @@ cdef class ArcEager(TransitionSystem):
if head < child:
if (RIGHT, label) not in seen_actions:
actions[RIGHT].append(label)
seen_actions.add((RIGHT, label))
elif head > child:
if (LEFT, label) not in seen_actions:
actions[LEFT].append(label)
seen_actions.add((LEFT, label))
return actions
property action_types:
def __get__(self):
return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)
def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.heads)
if all([tag is None for tag in gold.heads[start:end]]):
return False
else:
return True
def preprocess_gold(self, GoldParse gold):
if not self.has_gold(gold):
return None
for i in range(gold.length):
if gold.heads[i] is None or gold.labels[i] is None: # Missing values
gold.c.heads[i] = i
gold.c.has_dep[i] = False
else:
label = gold.labels[i]
gold.c.has_dep[i] = True
if label.upper() == 'ROOT':
label = 'ROOT'
gold.c.heads[i] = gold.heads[i]
gold.c.labels[i] = self.strings.add(label)
return gold
cdef Transition lookup_transition(self, object name) except *:
if '-' in name:
@ -373,15 +385,16 @@ cdef class ArcEager(TransitionSystem):
for i in range(self.n_moves):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
return Transition(clas=0, move=MISSING, label=0)
def move_name(self, int move, attr_t label):
label_str = self.strings[label]
if label_str:
return MOVE_NAMES[move] + '-' + label_str
else:
return MOVE_NAMES[move]
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers
cdef Transition t
@ -414,9 +427,7 @@ cdef class ArcEager(TransitionSystem):
return t
cdef int initialize_state(self, StateC* st) nogil:
for i in range(st.length):
st._sent[i].l_edge = i
st._sent[i].r_edge = i
st.fast_forward()
@ -432,18 +443,19 @@ cdef class ArcEager(TransitionSystem):
cdef int set_valid(self, int* output, const StateC* st) nogil:
cdef bint[N_MOVES] is_valid
is_valid[SHIFT] = Shift.is_valid(st, 0)
is_valid[REDUCE] = Reduce.is_valid(st, 0)
is_valid[LEFT] = LeftArc.is_valid(st, 0)
is_valid[RIGHT] = RightArc.is_valid(st, 0)
is_valid[BREAK] = Break.is_valid(st, 0)
cdef int i
for i in range(self.n_moves):
output[i] = is_valid[self.c[i].move]
cdef int set_costs(self, int* is_valid, weight_t* costs,
StateClass stcls, GoldParse gold) except -1:
cdef int i, move
cdef attr_t label
cdef label_cost_func_t[N_MOVES] label_cost_funcs
cdef move_cost_func_t[N_MOVES] move_cost_funcs
cdef weight_t[N_MOVES] move_costs
@ -461,7 +473,7 @@ cdef class ArcEager(TransitionSystem):
label_cost_funcs[RIGHT] = RightArc.label_cost
label_cost_funcs[BREAK] = Break.label_cost
cdef attr_t* labels = gold.c.labels
cdef int* heads = gold.c.heads
n_gold = 0
@ -501,3 +513,23 @@ cdef class ArcEager(TransitionSystem):
"State at failure:\n"
"%s" % (self.n_moves, stcls.print_state(gold.words)))
assert n_gold >= 1
def get_beam_annot(self, Beam beam):
length = (<StateClass>beam.at(0)).c.length
heads = [{} for _ in range(length)]
deps = [{} for _ in range(length)]
probs = beam.probs
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
self.finalize_state(stcls.c)
if stcls.is_final():
prob = probs[i]
for j in range(stcls.c.length):
head = j + stcls.c._sent[j].head
dep = stcls.c._sent[j].dep
heads[j].setdefault(head, 0.0)
heads[j][head] += prob
deps[j].setdefault(dep, 0.0)
deps[j][dep] += prob
return heads, deps
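get_beam_annot above accumulates, per token, a probability mass for each candidate head and dependency label over the finished beam states. Reading off the most probable analysis from those dictionaries could look like the following sketch (illustrative only; best_analysis and the toy values are not part of this diff):

def best_analysis(heads, deps):
    # heads/deps: one {candidate: accumulated probability} dict per token.
    best_heads = [max(h, key=h.get) if h else None for h in heads]
    best_deps = [max(d, key=d.get) if d else None for d in deps]
    return best_heads, best_deps

heads = [{1: 0.9, 0: 0.1}, {1: 1.0}]
deps = [{'nsubj': 0.9, 'dep': 0.1}, {'ROOT': 1.0}]
print(best_analysis(heads, deps))  # ([1, 1], ['nsubj', 'ROOT'])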
View File
@ -1,7 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX
def english_noun_chunks(obj):
@ -12,9 +12,9 @@ def english_noun_chunks(obj):
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
'attr', 'ROOT']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
@ -48,9 +48,9 @@ def english_noun_chunks(obj):
def german_noun_chunks(obj):
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
doc = obj.doc # Ensure works on both Doc and Span.
np_label = doc.vocab.strings.add('NP')
np_deps = set(doc.vocab.strings.add(label) for label in labels)
close_app = doc.vocab.strings.add('nk')
rbracket = 0
for i, word in enumerate(obj):
@ -66,4 +66,79 @@ def german_noun_chunks(obj):
yield word.left_edge.i, rbracket, np_label
def es_noun_chunks(obj):
doc = obj.doc
np_label = doc.vocab.strings['NP']
left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
right_labels = ['flat', 'fixed', 'compound', 'neg']
stop_labels = ['punct']
np_left_deps = [doc.vocab.strings[label] for label in left_labels]
np_right_deps = [doc.vocab.strings[label] for label in right_labels]
stop_deps = [doc.vocab.strings[label] for label in stop_labels]
def next_token(token):
try:
return token.nbor()
except:
return None
def noun_bounds(root):
def is_verb_token(token):
return token.pos in [VERB, AUX]
left_bound = root
for token in reversed(list(root.lefts)):
if token.dep in np_left_deps:
left_bound = token
right_bound = root
for token in root.rights:
if (token.dep in np_right_deps):
left, right = noun_bounds(token)
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
doc[left_bound.i: right.i])):
break
else:
right_bound = right
return left_bound, right_bound
token = doc[0]
while token and token.i < len(doc):
if token.pos in [PROPN, NOUN, PRON]:
left, right = noun_bounds(token)
yield left.i, right.i+1, np_label
token = right
token = next_token(token)
def french_noun_chunks(obj):
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
doc = obj.doc # Ensure works on both Doc and Span.
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
seen = set()
for i, word in enumerate(obj):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
elif word.dep == conj:
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
'es': es_noun_chunks, 'fr': french_noun_chunks}
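With the Spanish and French chunkers registered in CHUNKERS above, noun chunks become available through the usual Doc.noun_chunks iterator once a parsed Doc exists; a hedged usage sketch, assuming a Spanish model with a parser is installed:

import spacy

nlp = spacy.load('es')
doc = nlp(u'La oficina central de la empresa está en Madrid.')
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.dep_)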
View File
@ -1,6 +1,7 @@
from .transition_system cimport TransitionSystem
from .transition_system cimport Transition
from ..gold cimport GoldParseC
from ..typedefs cimport attr_t
cdef class BiluoPushDown(TransitionSystem):


@ -2,6 +2,10 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam
from collections import OrderedDict
import numpy
from thinc.neural.ops import NumpyOps
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
@ -51,17 +55,29 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil:
cdef class BiluoPushDown(TransitionSystem): cdef class BiluoPushDown(TransitionSystem):
def __init__(self, *args, **kwargs):
TransitionSystem.__init__(self, *args, **kwargs)
def __reduce__(self):
labels_by_action = OrderedDict()
cdef Transition t
for trans in self.c[:self.n_moves]:
label_str = self.strings[trans.label]
labels_by_action.setdefault(trans.move, []).append(label_str)
return (BiluoPushDown, (self.strings, labels_by_action),
None, None)
@classmethod @classmethod
def get_actions(cls, **kwargs): def get_actions(cls, **kwargs):
actions = kwargs.get('actions', actions = kwargs.get('actions',
{ OrderedDict((
MISSING: [''], (MISSING, ['']),
BEGIN: [], (BEGIN, []),
IN: [], (IN, []),
LAST: [], (LAST, []),
UNIT: [], (UNIT, []),
OUT: [''] (OUT, [''])
}) )))
seen_entities = set() seen_entities = set()
for entity_type in kwargs.get('entity_types', []): for entity_type in kwargs.get('entity_types', []):
if entity_type in seen_entities: if entity_type in seen_entities:
@ -87,42 +103,75 @@ cdef class BiluoPushDown(TransitionSystem):
def __get__(self): def __get__(self):
return (BEGIN, IN, LAST, UNIT, OUT) return (BEGIN, IN, LAST, UNIT, OUT)
def move_name(self, int move, int label): def move_name(self, int move, attr_t label):
if move == OUT: if move == OUT:
return 'O' return 'O'
elif move == 'MISSING': elif move == MISSING:
return 'M' return 'M'
else: else:
return MOVE_NAMES[move] + '-' + self.strings[label] return MOVE_NAMES[move] + '-' + self.strings[label]
cdef int preprocess_gold(self, GoldParse gold) except -1: def has_gold(self, GoldParse gold, start=0, end=None):
end = end or len(gold.ner)
if all([tag == '-' for tag in gold.ner[start:end]]):
return False
else:
return True
def preprocess_gold(self, GoldParse gold):
if not self.has_gold(gold):
return None
for i in range(gold.length): for i in range(gold.length):
gold.c.ner[i] = self.lookup_transition(gold.ner[i]) gold.c.ner[i] = self.lookup_transition(gold.ner[i])
# Count frequencies, for use in encoder return gold
if gold.c.ner[i].move in (BEGIN, UNIT):
self.freqs[ENT_IOB][3] += 1 def get_beam_annot(self, Beam beam):
self.freqs[ENT_TYPE][gold.c.ner[i].label] += 1 entities = {}
elif gold.c.ner[i].move in (IN, LAST): probs = beam.probs
self.freqs[ENT_IOB][2] += 1 for i in range(beam.size):
self.freqs[ENT_TYPE][0] += 1 stcls = <StateClass>beam.at(i)
elif gold.c.ner[i].move == OUT: if stcls.is_final():
self.freqs[ENT_IOB][1] += 1 self.finalize_state(stcls.c)
self.freqs[ENT_TYPE][0] += 1 prob = probs[i]
else: for j in range(stcls.c._e_i):
self.freqs[ENT_IOB][1] += 1 start = stcls.c._ents[j].start
self.freqs[ENT_TYPE][0] += 1 end = stcls.c._ents[j].end
label = stcls.c._ents[j].label
entities.setdefault((start, end, label), 0.0)
entities[(start, end, label)] += prob
return entities
def get_beam_parses(self, Beam beam):
parses = []
probs = beam.probs
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
if stcls.is_final():
self.finalize_state(stcls.c)
prob = probs[i]
parse = []
for j in range(stcls.c._e_i):
start = stcls.c._ents[j].start
end = stcls.c._ents[j].end
label = stcls.c._ents[j].label
parse.append((start, end, self.strings[label]))
parses.append((prob, parse))
return parses
cdef Transition lookup_transition(self, object name) except *: cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name == None: if name == '-' or name == None:
move_str = 'M' move_str = 'M'
label = 0 label = 0
elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0)
elif '-' in name: elif '-' in name:
move_str, label_str = name.split('-', 1) move_str, label_str = name.split('-', 1)
# Hacky way to denote 'not this entity' # Hacky way to denote 'not this entity'
if label_str.startswith('!'): if label_str.startswith('!'):
label_str = label_str[1:] label_str = label_str[1:]
move_str = 'x' move_str = 'x'
label = self.strings[label_str] label = self.strings.add(label_str)
else: else:
move_str = name move_str = name
label = 0 label = 0
@ -135,7 +184,7 @@ cdef class BiluoPushDown(TransitionSystem):
else: else:
raise KeyError(name) raise KeyError(name)
cdef Transition init_transition(self, int clas, int move, int label) except *: cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition() # TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers # constructor with the function pointers
cdef Transition t cdef Transition t
@ -184,21 +233,21 @@ cdef class BiluoPushDown(TransitionSystem):
cdef class Missing: cdef class Missing:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
return False return False
@staticmethod @staticmethod
cdef int transition(StateC* s, int label) nogil: cdef int transition(StateC* s, attr_t label) nogil:
pass pass
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 9000 return 9000
cdef class Begin: cdef class Begin:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
# Ensure we don't clobber preset entities. If no entity preset, # Ensure we don't clobber preset entities. If no entity preset,
# ent_iob is 0 # ent_iob is 0
cdef int preset_ent_iob = st.B_(0).ent_iob cdef int preset_ent_iob = st.B_(0).ent_iob
@ -222,16 +271,16 @@ cdef class Begin:
return label != 0 and not st.entity_is_open() return label != 0 and not st.entity_is_open()
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label) st.open_ent(label)
st.set_ent_tag(st.B(0), 3, label) st.set_ent_tag(st.B(0), 3, label)
st.push() st.push()
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING: if g_act == MISSING:
return 0 return 0
@ -251,7 +300,7 @@ cdef class Begin:
cdef class In: cdef class In:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2: if preset_ent_iob == 2:
return False return False
@ -267,17 +316,17 @@ cdef class In:
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.set_ent_tag(st.B(0), 1, label) st.set_ent_tag(st.B(0), 1, label)
st.push() st.push()
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
move = IN move = IN
cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef bint is_sunk = _entity_is_sunk(s, gold.ner) cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
if g_act == MISSING: if g_act == MISSING:
@ -297,30 +346,33 @@ cdef class In:
elif g_act == UNIT: elif g_act == UNIT:
# I, Gold U --> True iff next tag == O # I, Gold U --> True iff next tag == O
return next_act != OUT return next_act != OUT
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return 0
else: else:
return 1 return 1
cdef class Last: cdef class Last:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.B_(1).ent_iob == 1: if st.B_(1).ent_iob == 1:
return False return False
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.close_ent() st.close_ent()
st.set_ent_tag(st.B(0), 1, label) st.set_ent_tag(st.B(0), 1, label)
st.push() st.push()
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
move = LAST move = LAST
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING: if g_act == MISSING:
return 0 return 0
@ -339,13 +391,16 @@ cdef class Last:
elif g_act == UNIT: elif g_act == UNIT:
# L, Gold U --> True # L, Gold U --> True
return 0 return 0
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return 0
else: else:
return 1 return 1
cdef class Unit: cdef class Unit:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2: if preset_ent_iob == 2:
return False return False
@ -358,7 +413,7 @@ cdef class Unit:
return label != 0 and not st.entity_is_open() return label != 0 and not st.entity_is_open()
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label) st.open_ent(label)
st.close_ent() st.close_ent()
st.set_ent_tag(st.B(0), 3, label) st.set_ent_tag(st.B(0), 3, label)
@ -366,9 +421,9 @@ cdef class Unit:
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING: if g_act == MISSING:
return 0 return 0
@ -388,7 +443,7 @@ cdef class Unit:
cdef class Out: cdef class Out:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 3: if preset_ent_iob == 3:
return False return False
@ -397,17 +452,19 @@ cdef class Out:
return not st.entity_is_open() return not st.entity_is_open()
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.set_ent_tag(st.B(0), 2, 0) st.set_ent_tag(st.B(0), 2, 0)
st.push() st.push()
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING or g_act == ISNT: if g_act == ISNT and g_tag == 0:
return 1
elif g_act == MISSING or g_act == ISNT:
return 0 return 0
elif g_act == BEGIN: elif g_act == BEGIN:
# O, Gold B --> False # O, Gold B --> False


@ -5,7 +5,7 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
from collections import Counter from collections import Counter, OrderedDict
import ujson import ujson
import contextlib import contextlib
@ -18,6 +18,7 @@ import dill
import numpy.random import numpy.random
cimport numpy as np cimport numpy as np
from libcpp.vector cimport vector
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals from cpython.exc cimport PyErr_CheckSignals
from libc.stdint cimport uint32_t, uint64_t from libc.stdint cimport uint32_t, uint64_t
@ -28,26 +29,30 @@ from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec from thinc.linalg cimport VecVec
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
from thinc.extra.eg cimport Example from thinc.extra.eg cimport Example
from thinc.extra.search cimport Beam
from cymem.cymem cimport Pool, Address from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from preshed.maps cimport MapStruct from preshed.maps cimport MapStruct
from preshed.maps cimport map_get from preshed.maps cimport map_get
from thinc.api import layerize, chain from thinc.api import layerize, chain, noop, clone
from thinc.neural import Model, Affine, ELU, ReLu, Maxout from thinc.neural import Model, Affine, ELU, ReLu, Maxout
from thinc.neural.ops import NumpyOps from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
from .. import util from .. import util
from ..util import get_async, get_cuda_stream from ..util import get_async, get_cuda_stream
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
from .._ml import Tok2Vec, doc2feats from .._ml import Tok2Vec, doc2feats, rebatch
from ..compat import json_dumps
from . import _parse_features from . import _parse_features
from ._parse_features cimport CONTEXT_SIZE from ._parse_features cimport CONTEXT_SIZE
from ._parse_features cimport fill_context from ._parse_features cimport fill_context
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
from .nonproj import PseudoProjectivity from . import nonproj
from .transition_system import OracleError from .transition_system import OracleError
from .transition_system cimport TransitionSystem, Transition from .transition_system cimport TransitionSystem, Transition
from ..structs cimport TokenC from ..structs cimport TokenC
@ -104,68 +109,75 @@ cdef class precompute_hiddens:
cached = gpu_cached cached = gpu_cached
self.nF = cached.shape[1] self.nF = cached.shape[1]
self.nO = cached.shape[2] self.nO = cached.shape[2]
self.nP = cached.shape[3] self.nP = getattr(lower_model, 'nP', 1)
self.ops = lower_model.ops self.ops = lower_model.ops
self._features = numpy.zeros((batch_size, self.nO, self.nP), dtype='f')
self._is_synchronized = False self._is_synchronized = False
self._cuda_stream = cuda_stream self._cuda_stream = cuda_stream
self._cached = cached self._cached = cached
self._bp_hiddens = bp_features self._bp_hiddens = bp_features
def __call__(self, X): cdef const float* get_feat_weights(self) except NULL:
return self.begin_update(X)[0]
def begin_update(self, token_ids, drop=0.):
self._features.fill(0)
if not self._is_synchronized \ if not self._is_synchronized \
and self._cuda_stream is not None: and self._cuda_stream is not None:
self._cuda_stream.synchronize() self._cuda_stream.synchronize()
self._is_synchronized = True self._is_synchronized = True
return <float*>self._cached.data
def __call__(self, X):
return self.begin_update(X)[0]
def begin_update(self, token_ids, drop=0.):
cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
# This is tricky, but (assuming GPU available); # This is tricky, but (assuming GPU available);
# - Input to forward on CPU # - Input to forward on CPU
# - Output from forward on CPU # - Output from forward on CPU
# - Input to backward on GPU! # - Input to backward on GPU!
# - Output from backward on GPU # - Output from backward on GPU
cdef np.ndarray state_vector = self._features[:len(token_ids)]
cdef np.ndarray hiddens = self._cached
bp_hiddens = self._bp_hiddens bp_hiddens = self._bp_hiddens
feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids cdef int[:, ::1] ids = token_ids
self._sum_features(<float*>state_vector.data, sum_state_features(<float*>state_vector.data,
<float*>hiddens.data, &ids[0,0], feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP) token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
output, bp_output = self._apply_nonlinearity(state_vector) def backward(d_state_vector, sgd=None):
if bp_nonlinearity is not None:
def backward(d_output, sgd=None): d_state_vector = bp_nonlinearity(d_state_vector, sgd)
# This will usually be on GPU # This will usually be on GPU
if isinstance(d_output, numpy.ndarray): if isinstance(d_state_vector, numpy.ndarray):
d_output = self.ops.xp.array(d_output) d_state_vector = self.ops.xp.array(d_state_vector)
d_state_vector = bp_output(d_output, sgd)
d_tokens = bp_hiddens((d_state_vector, token_ids), sgd) d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
return d_tokens return d_tokens
return output, backward return state_vector, backward
def _apply_nonlinearity(self, X): def _nonlinearity(self, state_vector):
if self.nP < 2: if self.nP == 1:
return X.reshape(X.shape[:2]), lambda dX, sgd=None: dX.reshape(X.shape) return state_vector, None
best, which = self.ops.maxout(X) state_vector = state_vector.reshape(
return best, lambda dX, sgd=None: self.ops.backprop_maxout(dX, which, self.nP) (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
best, which = self.ops.maxout(state_vector)
def backprop(d_best, sgd=None):
return self.ops.backprop_maxout(d_best, which, self.nP)
return best, backprop
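For intuition, the ``_nonlinearity`` step above reshapes the ``(batch, nO*nP)`` state vector into ``(batch, nO, nP)`` pieces and keeps the best piece per output unit; a tiny NumPy sketch of the same idea (illustrative shapes, not the thinc ops implementation):

.. code:: python

    import numpy

    nO, nP = 4, 2
    state_vector = numpy.random.rand(3, nO * nP).astype('f')
    pieces = state_vector.reshape((3, nO, nP))
    best = pieces.max(axis=-1)        # corresponds to the `best` returned by ops.maxout
    which = pieces.argmax(axis=-1)    # ...and `which`, needed for backprop_maxout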
cdef void _sum_features(self, float* output,
const float* cached, const int* token_ids, int B, int F, int O) nogil:
cdef int idx, b, f, i cdef void sum_state_features(float* output,
cdef const float* feature const float* cached, const int* token_ids, int B, int F, int O) nogil:
for b in range(B): cdef int idx, b, f, i
for f in range(F): cdef const float* feature
if token_ids[f] < 0: for b in range(B):
continue for f in range(F):
idx = token_ids[f] * F * O + f*O if token_ids[f] < 0:
feature = &cached[idx] continue
for i in range(O): idx = token_ids[f] * F * O + f*O
output[i] += feature[i] feature = &cached[idx]
output += O for i in range(O):
token_ids += F output[i] += feature[i]
output += O
token_ids += F
cdef void cpu_log_loss(float* d_scores, cdef void cpu_log_loss(float* d_scores,
@ -220,25 +232,39 @@ cdef class Parser:
Base class of the DependencyParser and EntityRecognizer. Base class of the DependencyParser and EntityRecognizer.
""" """
@classmethod @classmethod
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg): def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg):
depth = util.env_opt('parser_hidden_depth', depth)
token_vector_width = util.env_opt('token_vector_width', token_vector_width) token_vector_width = util.env_opt('token_vector_width', token_vector_width)
hidden_width = util.env_opt('hidden_width', hidden_width) hidden_width = util.env_opt('hidden_width', hidden_width)
maxout_pieces = util.env_opt('parser_maxout_pieces', 1) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
lower = PrecomputableMaxouts(hidden_width, tensors = Tok2Vec(token_vector_width, 7500, preprocess=doc2feats())
nF=cls.nr_feature, if parser_maxout_pieces == 1:
nI=token_vector_width, lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
pieces=maxout_pieces) nF=cls.nr_feature,
nI=token_vector_width)
else:
lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature,
nP=parser_maxout_pieces,
nI=token_vector_width)
with Model.use_device('cpu'): with Model.use_device('cpu'):
upper = chain( upper = chain(
Maxout(hidden_width), clone(Maxout(hidden_width), (depth-1)),
zero_init(Affine(nr_class)) zero_init(Affine(nr_class, drop_factor=0.0))
) )
# TODO: This is an unfortunate hack atm! # TODO: This is an unfortunate hack atm!
# Used to set input dimensions in network. # Used to set input dimensions in network.
lower.begin_training(lower.ops.allocate((500, token_vector_width))) lower.begin_training(lower.ops.allocate((500, token_vector_width)))
upper.begin_training(upper.ops.allocate((500, hidden_width))) upper.begin_training(upper.ops.allocate((500, hidden_width)))
return lower, upper cfg = {
'nr_class': nr_class,
'depth': depth,
'token_vector_width': token_vector_width,
'hidden_width': hidden_width,
'maxout_pieces': parser_maxout_pieces
}
return (tensors, lower, upper), cfg
def __init__(self, Vocab vocab, moves=True, model=True, **cfg): def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
""" """
@ -274,7 +300,7 @@ cdef class Parser:
def __reduce__(self): def __reduce__(self):
return (Parser, (self.vocab, self.moves, self.model), None, None) return (Parser, (self.vocab, self.moves, self.model), None, None)
def __call__(self, Doc tokens, state=None): def __call__(self, Doc doc, beam_width=None, beam_density=None):
""" """
Apply the parser or entity recognizer, setting the annotations onto the Doc object. Apply the parser or entity recognizer, setting the annotations onto the Doc object.
@ -283,10 +309,26 @@ cdef class Parser:
Returns: Returns:
None None
""" """
self.parse_batch([tokens], state['tokvecs']) if beam_width is None:
return state beam_width = self.cfg.get('beam_width', 1)
if beam_density is None:
beam_density = self.cfg.get('beam_density', 0.001)
cdef Beam beam
if beam_width == 1:
states = self.parse_batch([doc], [doc.tensor])
self.set_annotations([doc], states)
return doc
else:
beam = self.beam_parse([doc], [doc.tensor],
beam_width=beam_width, beam_density=beam_density)[0]
output = self.moves.get_beam_annot(beam)
state = <StateClass>beam.at(0)
self.set_annotations([doc], [state])
_cleanup(beam)
return output
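A sketch of how the new ``__call__`` arguments behave, based only on the code above (``parser`` stands for any trained ``Parser`` subclass; the density value is just the default repeated):

.. code:: python

    doc = parser(doc)                                     # beam_width == 1: greedy decode, Doc returned
    annot = parser(doc, beam_width=8, beam_density=0.001)
    # with a beam, __call__ returns whatever moves.get_beam_annot() produces --
    # for the NER moves above, a dict mapping (start, end, label) to accumulated probability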
def pipe(self, stream, int batch_size=1000, int n_threads=2): def pipe(self, docs, int batch_size=1000, int n_threads=2,
beam_width=1, beam_density=0.001):
""" """
Process a stream of documents. Process a stream of documents.
@ -298,99 +340,244 @@ cdef class Parser:
The number of threads with which to work on the buffer in parallel. The number of threads with which to work on the buffer in parallel.
Yields (Doc): Documents, in order. Yields (Doc): Documents, in order.
""" """
cdef StateClass parse_state
cdef Doc doc cdef Doc doc
queue = [] for docs in cytoolz.partition_all(batch_size, docs):
for batch in cytoolz.partition_all(batch_size, stream): docs = list(docs)
batch = list(batch) tokvecs = [doc.tensor for doc in docs]
docs, states = zip(*batch) if beam_width == 1:
parse_states = self.parse_batch(docs, states[0]['tokvecs']) parse_states = self.parse_batch(docs, tokvecs)
else:
parse_states = self.beam_parse(docs, tokvecs,
beam_width=beam_width, beam_density=beam_density)
self.set_annotations(docs, parse_states) self.set_annotations(docs, parse_states)
yield from zip(docs, states) yield from docs
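And a corresponding streaming sketch; note that ``pipe()`` now takes ``Doc`` objects whose ``doc.tensor`` is already set (the ``ner`` handle and batch size are illustrative):

.. code:: python

    for doc in ner.pipe(docs, batch_size=32, beam_width=4):
        print([(ent.text, ent.label_) for ent in doc.ents])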
def parse_batch(self, docs, tokvecses):
cdef:
precompute_hiddens state2vec
StateClass state
Pool mem
const float* feat_weights
StateC* st
vector[StateC*] next_step, this_step
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
if isinstance(docs, Doc):
docs = [docs]
if isinstance(tokvecses, np.ndarray):
tokvecses = [tokvecses]
tokvecs = self.model[0].ops.flatten(tokvecses)
tokvecs += self.model[0].ops.flatten(self.model[0](docs))
nr_state = len(docs)
nr_class = self.moves.n_moves
nr_dim = tokvecs.shape[1]
nr_feat = self.nr_feature
def parse_batch(self, docs, tokvecs):
cuda_stream = get_cuda_stream() cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs,
cuda_stream, 0.0)
nr_piece = state2vec.nP
states = self.moves.init_batch(docs) states = self.moves.init_batch(docs)
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, for state in states:
cuda_stream, 0.0) if not state.c.is_final():
next_step.push_back(state.c)
todo = [st for st in states if not st.is_final()] feat_weights = state2vec.get_feat_weights()
while todo: cdef int i
token_ids = self.get_token_ids(states) cdef np.ndarray token_ids = numpy.zeros((nr_state, nr_feat), dtype='i')
vectors = state2vec(token_ids) cdef np.ndarray is_valid = numpy.zeros((nr_state, nr_class), dtype='i')
cdef np.ndarray scores
c_token_ids = <int*>token_ids.data
c_is_valid = <int*>is_valid.data
while not next_step.empty():
for i in range(next_step.size()):
st = next_step[i]
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
self.moves.set_valid(&c_is_valid[i*nr_class], st)
vectors = state2vec(token_ids[:next_step.size()])
scores = vec2scores(vectors) scores = vec2scores(vectors)
self.transition_batch(states, scores) c_scores = <float*>scores.data
todo = [st for st in states if not st.is_final()] for i in range(next_step.size()):
st = next_step[i]
guess = arg_max_if_valid(
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
action = self.moves.c[guess]
action.do(st, action.label)
this_step, next_step = next_step, this_step
next_step.clear()
for st in this_step:
if not st.is_final():
next_step.push_back(st)
return states return states
def update(self, docs, golds, state=None, drop=0., sgd=None): def beam_parse(self, docs, tokvecses, int beam_width=8, float beam_density=0.001):
assert state is not None cdef Beam beam
assert 'tokvecs' in state cdef np.ndarray scores
assert 'bp_tokvecs' in state cdef Doc doc
cdef int nr_class = self.moves.n_moves
cdef StateClass stcls, output
tokvecs = self.model[0].ops.flatten(tokvecses)
tokvecs += self.model[0].ops.flatten(self.model[0](docs))
cuda_stream = get_cuda_stream()
state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
cuda_stream, 0.0)
beams = []
cdef int offset = 0
for doc in docs:
beam = Beam(nr_class, beam_width, min_density=beam_density)
beam.initialize(self.moves.init_beam_state, doc.length, doc.c)
for i in range(beam.width):
stcls = <StateClass>beam.at(i)
stcls.c.offset = offset
offset += len(doc)
beam.check_done(_check_final_state, NULL)
while not beam.is_done:
states = []
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
states.append(stcls)
token_ids = self.get_token_ids(states)
vectors = state2vec(token_ids)
scores = vec2scores(vectors)
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
if not stcls.is_final():
self.moves.set_valid(beam.is_valid[i], stcls.c)
for j in range(nr_class):
beam.scores[i][j] = scores[i, j]
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
beams.append(beam)
return beams
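The beams returned here pair with the ``get_beam_parses()`` helper added to the NER transition system earlier in this diff; an illustrative use (variable names assumed):

.. code:: python

    beams = ner.beam_parse([doc], [doc.tensor], beam_width=8)
    for prob, parse in ner.moves.get_beam_parses(beams[0]):
        print(prob, parse)    # parse is a list of (start, end, label) tuples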
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs, tokvec_lists = docs_tokvecs
tokvecs = self.model[0].ops.flatten(tokvec_lists)
if isinstance(docs, Doc) and isinstance(golds, GoldParse): if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs] docs = [docs]
golds = [golds] golds = [golds]
my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs, drop=0.)
my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
tokvecs += my_tokvecs
cuda_stream = get_cuda_stream() cuda_stream = get_cuda_stream()
for gold in golds:
self.moves.preprocess_gold(gold)
tokvecs = state['tokvecs'] states, golds, max_steps = self._init_gold_batch(docs, golds)
bp_tokvecs = state['bp_tokvecs']
states = self.moves.init_batch(docs)
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
drop) 0.0)
todo = [(s, g) for (s, g) in zip(states, golds)
todo = [(s, g) for s, g in zip(states, golds) if not s.is_final()] if not s.is_final() and g is not None]
if not todo:
return None
backprops = [] backprops = []
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
cdef float loss = 0. cdef float loss = 0.
cutoff = max(1, len(todo) // 10) n_steps = 0
while len(todo) >= cutoff: while todo:
states, golds = zip(*todo) states, golds = zip(*todo)
token_ids = self.get_token_ids(states) token_ids = self.get_token_ids(states)
vector, bp_vector = state2vec.begin_update(token_ids, drop=drop) vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
if drop != 0:
mask = vec2scores.ops.get_dropout_mask(vector.shape, drop)
vector *= mask
scores, bp_scores = vec2scores.begin_update(vector, drop=drop) scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
d_scores = self.get_batch_loss(states, golds, scores) d_scores = self.get_batch_loss(states, golds, scores)
d_vector = bp_scores(d_scores, sgd=sgd) d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
loss += (d_scores**2).sum() if drop != 0:
d_vector *= mask
if not isinstance(tokvecs, state2vec.ops.xp.ndarray): if isinstance(self.model[0].ops, CupyOps) \
backprops.append((token_ids, d_vector, bp_vector)) and not isinstance(token_ids, state2vec.ops.xp.ndarray):
else:
# Move token_ids and d_vector to CPU, asynchronously # Move token_ids and d_vector to CPU, asynchronously
backprops.append(( backprops.append((
get_async(cuda_stream, token_ids), get_async(cuda_stream, token_ids),
get_async(cuda_stream, d_vector), get_async(cuda_stream, d_vector),
bp_vector bp_vector
)) ))
else:
backprops.append((token_ids, d_vector, bp_vector))
self.transition_batch(states, scores) self.transition_batch(states, scores)
todo = [st for st in todo if not st[0].is_final()] todo = [st for st in todo if not st[0].is_final()]
if losses is not None:
losses[self.name] += (d_scores**2).sum()
n_steps += 1
if n_steps >= max_steps:
break
self._make_updates(d_tokvecs,
backprops, sgd, cuda_stream)
d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
#bp_my_tokvecs(d_tokvecs, sgd=sgd)
return d_tokvecs
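The reworked ``update()`` now takes a ``(docs, tokvecs)`` pair and an optional ``losses`` dict; a minimal training-step sketch (the optimizer and gold objects are assumed to exist):

.. code:: python

    losses = {}
    d_tokvecs = parser.update(([doc], [doc.tensor]), [gold],
                              drop=0.2, sgd=optimizer, losses=losses)
    # d_tokvecs is the gradient w.r.t. the token vectors, one array per doc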
def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long
doc will get multiple states. Let's say we have a doc of length 2*N,
where N is the shortest doc. We'll make two states, one representing
long_doc[:N], and another representing long_doc[N:]."""
cdef:
StateClass state
Transition action
whole_states = self.moves.init_batch(whole_docs)
max_length = max(5, min(50, min([len(doc) for doc in whole_docs])))
max_moves = 0
states = []
golds = []
for doc, state, gold in zip(whole_docs, whole_states, whole_golds):
gold = self.moves.preprocess_gold(gold)
if gold is None:
continue
oracle_actions = self.moves.get_oracle_sequence(doc, gold)
start = 0
while start < len(doc):
state = state.copy()
n_moves = 0
while state.B(0) < start and not state.is_final():
action = self.moves.c[oracle_actions.pop(0)]
action.do(state.c, action.label)
n_moves += 1
has_gold = self.moves.has_gold(gold, start=start,
end=start+max_length)
if not state.is_final() and has_gold:
states.append(state)
golds.append(gold)
max_moves = max(max_moves, n_moves)
start += min(max_length, len(doc)-start)
max_moves = max(max_moves, len(oracle_actions))
return states, golds, max_moves
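A quick worked example of the batching rule described in the docstring above, ignoring the ``has_gold``/``is_final`` filtering (numbers are illustrative):

.. code:: python

    def n_states(doc_len, shortest_len):
        # mirrors _init_gold_batch: max_length = max(5, min(50, shortest_len))
        max_length = max(5, min(50, shortest_len))
        return -(-doc_len // max_length)   # ceiling division

    assert n_states(20, 8) == 3   # a 20-token doc yields states for [0:8], [8:16], [16:20]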
def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
# Tells CUDA to block, so our async copies complete. # Tells CUDA to block, so our async copies complete.
if cuda_stream is not None: if cuda_stream is not None:
cuda_stream.synchronize() cuda_stream.synchronize()
d_tokvecs = state2vec.ops.allocate(tokvecs.shape) xp = get_array_module(d_tokvecs)
xp = state2vec.ops.xp # Handle for numpy/cupy for ids, d_vector, bp_vector in backprops:
for token_ids, d_vector, bp_vector in backprops:
d_state_features = bp_vector(d_vector, sgd=sgd) d_state_features = bp_vector(d_vector, sgd=sgd)
active_feats = token_ids * (token_ids >= 0) active_feats = ids * (ids >= 0)
active_feats = active_feats.reshape((token_ids.shape[0], token_ids.shape[1], 1)) active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1))
if hasattr(xp, 'scatter_add'): if hasattr(xp, 'scatter_add'):
xp.scatter_add(d_tokvecs, xp.scatter_add(d_tokvecs,
token_ids, d_state_features * active_feats) ids, d_state_features * active_feats)
else: else:
xp.add.at(d_tokvecs, xp.add.at(d_tokvecs,
token_ids, d_state_features * active_feats) ids, d_state_features * active_feats)
bp_tokvecs(d_tokvecs, sgd)
state['parser_loss'] = loss @property
return state def move_names(self):
names = []
for i in range(self.moves.n_moves):
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
names.append(name)
return names
def get_batch_model(self, batch_size, tokvecs, stream, dropout): def get_batch_model(self, batch_size, tokvecs, stream, dropout):
lower, upper = self.model _, lower, upper = self.model
state2vec = precompute_hiddens(batch_size, tokvecs, state2vec = precompute_hiddens(batch_size, tokvecs,
lower, stream, drop=dropout) lower, stream, drop=dropout)
return state2vec, upper return state2vec, upper
@ -400,9 +587,13 @@ cdef class Parser:
def get_token_ids(self, states): def get_token_ids(self, states):
cdef StateClass state cdef StateClass state
cdef int n_tokens = self.nr_feature cdef int n_tokens = self.nr_feature
ids = numpy.zeros((len(states), n_tokens), dtype='i', order='C') cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
dtype='i', order='C')
c_ids = <int*>ids.data
for i, state in enumerate(states): for i, state in enumerate(states):
state.set_context_tokens(ids[i]) if not state.is_final():
state.c.set_context_tokens(c_ids, n_tokens)
c_ids += ids.shape[1]
return ids return ids
def transition_batch(self, states, float[:, ::1] scores): def transition_batch(self, states, float[:, ::1] scores):
@ -445,7 +636,6 @@ cdef class Parser:
self.moves.finalize_doc(doc) self.moves.finalize_doc(doc)
def add_label(self, label): def add_label(self, label):
# Doesn't set label into serializer -- subclasses override it to do that.
for action in self.moves.action_types: for action in self.moves.action_types:
added = self.moves.add_action(action, label) added = self.moves.add_action(action, label)
if added: if added:
@ -456,12 +646,18 @@ cdef class Parser:
def begin_training(self, gold_tuples, **cfg): def begin_training(self, gold_tuples, **cfg):
if 'model' in cfg: if 'model' in cfg:
self.model = cfg['model'] self.model = cfg['model']
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
actions = self.moves.get_actions(gold_parses=gold_tuples) actions = self.moves.get_actions(gold_parses=gold_tuples)
for action, labels in actions.items(): for action, labels in actions.items():
for label in labels: for label in labels:
self.moves.add_action(action, label) self.moves.add_action(action, label)
if self.model is True: if self.model is True:
self.model = self.Model(self.moves.n_moves, **cfg) self.model, cfg = self.Model(self.moves.n_moves, **cfg)
self.cfg.update(cfg)
def preprocess_gold(self, docs_golds):
for doc, gold in docs_golds:
yield doc, gold
def use_params(self, params): def use_params(self, params):
# Can't decorate cdef class :(. Workaround. # Can't decorate cdef class :(. Workaround.
@ -469,21 +665,85 @@ cdef class Parser:
with self.model[1].use_params(params): with self.model[1].use_params(params):
yield yield
def to_disk(self, path): def to_disk(self, path, **exclude):
path = util.ensure_path(path) serializers = {
with (path / 'model.bin').open('wb') as file_: 'tok2vec_model': lambda p: p.open('wb').write(
dill.dump(self.model, file_) self.model[0].to_bytes()),
'lower_model': lambda p: p.open('wb').write(
self.model[1].to_bytes()),
'upper_model': lambda p: p.open('wb').write(
self.model[2].to_bytes()),
'vocab': lambda p: self.vocab.to_disk(p),
'moves': lambda p: self.moves.to_disk(p, strings=False),
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
}
util.to_disk(path, serializers, exclude)
def from_disk(self, path): def from_disk(self, path, **exclude):
path = util.ensure_path(path) deserializers = {
with (path / 'model.bin').open('wb') as file_: 'vocab': lambda p: self.vocab.from_disk(p),
self.model = dill.load(file_) 'moves': lambda p: self.moves.from_disk(p, strings=False),
'cfg': lambda p: self.cfg.update(ujson.load(p.open())),
'model': lambda p: None
}
util.from_disk(path, deserializers, exclude)
if 'model' not in exclude:
path = util.ensure_path(path)
if self.model is True:
self.model, cfg = self.Model(**self.cfg)
else:
cfg = {}
with (path / 'tok2vec_model').open('rb') as file_:
bytes_data = file_.read()
self.model[0].from_bytes(bytes_data)
with (path / 'lower_model').open('rb') as file_:
bytes_data = file_.read()
self.model[1].from_bytes(bytes_data)
with (path / 'upper_model').open('rb') as file_:
bytes_data = file_.read()
self.model[2].from_bytes(bytes_data)
self.cfg.update(cfg)
return self
def to_bytes(self): def to_bytes(self, **exclude):
pass serializers = OrderedDict((
('tok2vec_model', lambda: self.model[0].to_bytes()),
('lower_model', lambda: self.model[1].to_bytes()),
('upper_model', lambda: self.model[2].to_bytes()),
('vocab', lambda: self.vocab.to_bytes()),
('moves', lambda: self.moves.to_bytes(strings=False)),
('cfg', lambda: ujson.dumps(self.cfg))
))
if 'model' in exclude:
exclude['tok2vec_model'] = True
exclude['lower_model'] = True
exclude['upper_model'] = True
exclude.pop('model')
return util.to_bytes(serializers, exclude)
def from_bytes(self, data): def from_bytes(self, bytes_data, **exclude):
pass deserializers = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
('tok2vec_model', lambda b: None),
('lower_model', lambda b: None),
('upper_model', lambda b: None)
))
msg = util.from_bytes(bytes_data, deserializers, exclude)
if 'model' not in exclude:
if self.model is True:
self.model, cfg = self.Model(self.moves.n_moves)
else:
cfg = {}
if 'tok2vec_model' in msg:
self.model[0].from_bytes(msg['tok2vec_model'])
if 'lower_model' in msg:
self.model[1].from_bytes(msg['lower_model'])
if 'upper_model' in msg:
self.model[2].from_bytes(msg['upper_model'])
self.cfg.update(cfg)
return self
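A rough round-trip sketch for the new serialization hooks (the path is illustrative, and it assumes ``parser`` already holds a concrete model rather than ``model=True``):

.. code:: python

    data = parser.to_bytes()
    parser.from_bytes(data)          # restores cfg, moves, vocab and the three sub-models

    parser.to_disk('/tmp/parser')    # writes tok2vec_model, lower_model, upper_model, cfg, ...
    parser.from_disk('/tmp/parser')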
class ParserStateError(ValueError): class ParserStateError(ValueError):
@ -521,6 +781,19 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
return best return best
cdef int arg_maxout_if_valid(const weight_t* scores, const int* is_valid,
int n, int nP) nogil:
cdef int best = -1
cdef float best_score = 0
for i in range(n):
if is_valid[i] >= 1:
for j in range(nP):
if best == -1 or scores[i*nP+j] > best_score:
best = i
best_score = scores[i*nP+j]
return best
cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
int nr_class) except -1: int nr_class) except -1:
cdef weight_t score = 0 cdef weight_t score = 0
@ -531,3 +804,30 @@ cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actio
mode = i mode = i
score = scores[i] score = scores[i]
return mode return mode
# These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateClass>_dest
src = <StateClass>_src
moves = <const Transition*>_moves
dest.clone(src)
moves[clas].do(dest.c, moves[clas].label)
cdef int _check_final_state(void* _state, void* extra_args) except -1:
return (<StateClass>_state).is_final()
def _cleanup(Beam beam):
for i in range(beam.width):
Py_XDECREF(<PyObject*>beam._states[i].content)
Py_XDECREF(<PyObject*>beam._parents[i].content)
cdef hash_t _hash_state(void* _state, void* _) except 0:
state = <StateClass>_state
if state.c.is_final():
return 1
else:
return state.c.hash()


@ -1,10 +1,17 @@
# coding: utf-8 # coding: utf-8
"""
Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
for doing pseudo-projective parsing. The implementation uses the HEAD
decoration scheme.
"""
from __future__ import unicode_literals from __future__ import unicode_literals
from copy import copy from copy import copy
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..attrs import DEP, HEAD from ..attrs import DEP, HEAD
DELIMITER = '||'
def ancestors(tokenid, heads): def ancestors(tokenid, heads):
# returns all words going from the word up the path to the root # returns all words going from the word up the path to the root
@ -60,145 +67,124 @@ def is_nonproj_tree(heads):
return any( is_nonproj_arc(word,heads) for word in range(len(heads)) ) return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
class PseudoProjectivity: def decompose(label):
# implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 return label.partition(DELIMITER)[::2]
# for doing pseudo-projective parsing
# implementation uses the HEAD decoration scheme
delimiter = '||'
@classmethod
def decompose(cls, label):
return label.partition(cls.delimiter)[::2]
@classmethod
def is_decorated(cls, label):
return label.find(cls.delimiter) != -1
@classmethod
def preprocess_training_data(cls, gold_tuples, label_freq_cutoff=30):
preprocessed = []
freqs = {}
for raw_text, sents in gold_tuples:
prepro_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
proj_heads,deco_labels = cls.projectivize(heads,labels)
# set the label to ROOT for each root dependent
deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
# count label frequencies
if label_freq_cutoff > 0:
for label in deco_labels:
if cls.is_decorated(label):
freqs[label] = freqs.get(label,0) + 1
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
preprocessed.append((raw_text, prepro_sents))
if label_freq_cutoff > 0:
return cls._filter_labels(preprocessed,label_freq_cutoff,freqs)
return preprocessed
@classmethod def is_decorated(label):
def projectivize(cls, heads, labels): return label.find(DELIMITER) != -1
# use the algorithm by Nivre & Nilsson 2005
# assumes heads to be a proper tree, i.e. connected and cycle-free
# returns a new pair (heads,labels) which encode
# a projective and decorated tree
proj_heads = copy(heads)
smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads)
if smallest_np_arc == None: # this sentence is already projective
return proj_heads, copy(labels)
while smallest_np_arc != None:
cls._lift(smallest_np_arc, proj_heads)
smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads)
deco_labels = cls._decorate(heads, proj_heads, labels)
return proj_heads, deco_labels
@classmethod def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
def deprojectivize(cls, tokens): preprocessed = []
# reattach arcs with decorated labels (following HEAD scheme) freqs = {}
# for each decorated arc X||Y, search top-down, left-to-right, for raw_text, sents in gold_tuples:
# breadth-first until hitting a Y then make this the new head prepro_sents = []
#parse = tokens.to_array([HEAD, DEP]) for (ids, words, tags, heads, labels, iob), ctnts in sents:
for token in tokens: proj_heads,deco_labels = projectivize(heads,labels)
if cls.is_decorated(token.dep_): # set the label to ROOT for each root dependent
newlabel,headlabel = cls.decompose(token.dep_) deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
newhead = cls._find_new_head(token,headlabel) # count label frequencies
token.head = newhead if label_freq_cutoff > 0:
token.dep_ = newlabel for label in deco_labels:
if is_decorated(label):
freqs[label] = freqs.get(label,0) + 1
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
preprocessed.append((raw_text, prepro_sents))
# tokens.attach(token,newhead,newlabel) if label_freq_cutoff > 0:
#parse[token.i,1] = tokens.vocab.strings[newlabel] return _filter_labels(preprocessed,label_freq_cutoff,freqs)
#parse[token.i,0] = newhead.i - token.i return preprocessed
#tokens.from_array([HEAD, DEP],parse)
@classmethod def projectivize(heads, labels):
def _decorate(cls, heads, proj_heads, labels): # use the algorithm by Nivre & Nilsson 2005
# uses decoration scheme HEAD from Nivre & Nilsson 2005 # assumes heads to be a proper tree, i.e. connected and cycle-free
assert(len(heads) == len(proj_heads) == len(labels)) # returns a new pair (heads,labels) which encode
deco_labels = [] # a projective and decorated tree
for tokenid,head in enumerate(heads): proj_heads = copy(heads)
if head != proj_heads[tokenid]: smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
deco_labels.append('%s%s%s' % (labels[tokenid],cls.delimiter,labels[head])) if smallest_np_arc == None: # this sentence is already projective
else: return proj_heads, copy(labels)
deco_labels.append(labels[tokenid]) while smallest_np_arc != None:
return deco_labels _lift(smallest_np_arc, proj_heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
deco_labels = _decorate(heads, proj_heads, labels)
return proj_heads, deco_labels
@classmethod def deprojectivize(tokens):
def _get_smallest_nonproj_arc(cls, heads): # reattach arcs with decorated labels (following HEAD scheme)
# return the smallest non-proj arc or None # for each decorated arc X||Y, search top-down, left-to-right,
# where size is defined as the distance between dep and head # breadth-first until hitting a Y then make this the new head
# and ties are broken left to right for token in tokens:
smallest_size = float('inf') if is_decorated(token.dep_):
smallest_np_arc = None newlabel,headlabel = decompose(token.dep_)
for tokenid,head in enumerate(heads): newhead = _find_new_head(token,headlabel)
size = abs(tokenid-head) token.head = newhead
if size < smallest_size and is_nonproj_arc(tokenid,heads): token.dep_ = newlabel
smallest_size = size return tokens
smallest_np_arc = tokenid
return smallest_np_arc def _decorate(heads, proj_heads, labels):
# uses decoration scheme HEAD from Nivre & Nilsson 2005
assert(len(heads) == len(proj_heads) == len(labels))
deco_labels = []
for tokenid,head in enumerate(heads):
if head != proj_heads[tokenid]:
deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
else:
deco_labels.append(labels[tokenid])
return deco_labels
@classmethod def _get_smallest_nonproj_arc(heads):
def _lift(cls, tokenid, heads): # return the smallest non-proj arc or None
# reattaches a word to its grandfather # where size is defined as the distance between dep and head
head = heads[tokenid] # and ties are broken left to right
ghead = heads[head] smallest_size = float('inf')
# attach to ghead if head isn't attached to root else attach to root smallest_np_arc = None
heads[tokenid] = ghead if head != ghead else tokenid for tokenid,head in enumerate(heads):
size = abs(tokenid-head)
if size < smallest_size and is_nonproj_arc(tokenid,heads):
smallest_size = size
smallest_np_arc = tokenid
return smallest_np_arc
@classmethod def _lift(tokenid, heads):
def _find_new_head(cls, token, headlabel): # reattaches a word to its grandfather
# search through the tree starting from the head of the given token head = heads[tokenid]
# returns the id of the first descendant with the given label ghead = heads[head]
# if there is none, return the current head (no change) # attach to ghead if head isn't attached to root else attach to root
queue = [token.head] heads[tokenid] = ghead if head != ghead else tokenid
while queue:
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child.is_space: continue
if child == token: continue
if child.dep_ == headlabel:
return child
next_queue.append(child)
queue = next_queue
return token.head
@classmethod def _find_new_head(token, headlabel):
def _filter_labels(cls, gold_tuples, cutoff, freqs): # search through the tree starting from the head of the given token
# throw away infrequent decorated labels # returns the id of the first descendant with the given label
# can't learn them reliably anyway and keeps label set smaller # if there is none, return the current head (no change)
filtered = [] queue = [token.head]
for raw_text, sents in gold_tuples: while queue:
filtered_sents = [] next_queue = []
for (ids, words, tags, heads, labels, iob), ctnts in sents: for qtoken in queue:
filtered_labels = [ cls.decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ] for child in qtoken.children:
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts)) if child.is_space: continue
filtered.append((raw_text, filtered_sents)) if child == token: continue
return filtered if child.dep_ == headlabel:
return child
next_queue.append(child)
queue = next_queue
return token.head
def _filter_labels(gold_tuples, cutoff, freqs):
# throw away infrequent decorated labels
# can't learn them reliably anyway and keeps label set smaller
filtered = []
for raw_text, sents in gold_tuples:
filtered_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
filtered.append((raw_text, filtered_sents))
return filtered
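Since these helpers are now module-level functions, they can be exercised directly; a small sketch (the heads/labels describe a made-up five-token tree, and the module path is assumed from the package layout):

.. code:: python

    from spacy.syntax.nonproj import projectivize, decompose

    heads  = [1, 1, 4, 1, 3]     # token 2 attaches to 4 over token 3 (which hangs off the root),
                                 # so the tree is non-projective
    labels = ['nsubj', 'ROOT', 'dobj', 'advmod', 'prep']
    proj_heads, deco_labels = projectivize(heads, labels)
    # proj_heads  == [1, 1, 3, 1, 3]   (the offending arc is lifted to the grandparent)
    # deco_labels == ['nsubj', 'ROOT', 'dobj||prep', 'advmod', 'prep']
    decompose('dobj||prep')      # -> ('dobj', 'prep')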


@ -33,7 +33,6 @@ from ._parse_features cimport CONTEXT_SIZE
from ._parse_features cimport fill_context from ._parse_features cimport fill_context
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
from .nonproj import PseudoProjectivity
from .transition_system import OracleError from .transition_system import OracleError
from .transition_system cimport TransitionSystem, Transition from .transition_system cimport TransitionSystem, Transition
from ..structs cimport TokenC from ..structs cimport TokenC


@ -4,6 +4,7 @@ from cymem.cymem cimport Pool
cimport cython cimport cython
from ..structs cimport TokenC, Entity from ..structs cimport TokenC, Entity
from ..typedefs cimport attr_t
from ..vocab cimport EMPTY_LEXEME from ..vocab cimport EMPTY_LEXEME
from ._state cimport StateC from ._state cimport StateC
@ -105,19 +106,19 @@ cdef class StateClass:
cdef inline void unshift(self) nogil: cdef inline void unshift(self) nogil:
self.c.unshift() self.c.unshift()
cdef inline void add_arc(self, int head, int child, int label) nogil: cdef inline void add_arc(self, int head, int child, attr_t label) nogil:
self.c.add_arc(head, child, label) self.c.add_arc(head, child, label)
cdef inline void del_arc(self, int head, int child) nogil: cdef inline void del_arc(self, int head, int child) nogil:
self.c.del_arc(head, child) self.c.del_arc(head, child)
cdef inline void open_ent(self, int label) nogil: cdef inline void open_ent(self, attr_t label) nogil:
self.c.open_ent(label) self.c.open_ent(label)
cdef inline void close_ent(self) nogil: cdef inline void close_ent(self) nogil:
self.c.close_ent() self.c.close_ent()
cdef inline void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil: cdef inline void set_ent_tag(self, int i, int ent_iob, attr_t ent_type) nogil:
self.c.set_ent_tag(i, ent_iob, ent_type) self.c.set_ent_tag(i, ent_iob, ent_type)
cdef inline void set_break(self, int i) nogil: cdef inline void set_break(self, int i) nogil:


@ -41,6 +41,11 @@ cdef class StateClass:
def is_final(self): def is_final(self):
return self.c.is_final() return self.c.is_final()
def copy(self):
cdef StateClass new_state = StateClass.init(self.c._sent, self.c.length)
new_state.c.clone(self.c)
return new_state
def print_state(self, words): def print_state(self, words):
words = list(words) + ['_'] words = list(words) + ['_']
top = words[self.S(0)] + '_%d' % self.S_(0).head top = words[self.S(0)] + '_%d' % self.S_(0).head


@ -1,6 +1,7 @@
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from ..typedefs cimport attr_t
from ..structs cimport TokenC from ..structs cimport TokenC
from ..gold cimport GoldParse from ..gold cimport GoldParse
from ..gold cimport GoldParseC from ..gold cimport GoldParseC
@ -13,20 +14,22 @@ from ._state cimport StateC
cdef struct Transition: cdef struct Transition:
int clas int clas
int move int move
int label attr_t label
weight_t score weight_t score
bint (*is_valid)(const StateC* state, int label) nogil bint (*is_valid)(const StateC* state, attr_t label) nogil
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, int label) nogil weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil
int (*do)(StateC* state, int label) nogil int (*do)(StateC* state, attr_t label) nogil
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold,
attr_t label) nogil
ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC*
gold, attr_t label) nogil
ctypedef int (*do_func_t)(StateC* state, int label) nogil ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
@ -36,18 +39,16 @@ cdef class TransitionSystem:
cdef Transition* c cdef Transition* c
cdef readonly int n_moves cdef readonly int n_moves
cdef int _size cdef int _size
cdef public int root_label cdef public attr_t root_label
cdef public freqs cdef public freqs
cdef init_state_t init_beam_state cdef init_state_t init_beam_state
cdef int initialize_state(self, StateC* state) nogil cdef int initialize_state(self, StateC* state) nogil
cdef int finalize_state(self, StateC* state) nogil cdef int finalize_state(self, StateC* state) nogil
cdef int preprocess_gold(self, GoldParse gold) except -1
cdef Transition lookup_transition(self, object name) except * cdef Transition lookup_transition(self, object name) except *
cdef Transition init_transition(self, int clas, int move, int label) except * cdef Transition init_transition(self, int clas, int move, attr_t label) except *
cdef int set_valid(self, int* output, const StateC* st) nogil cdef int set_valid(self, int* output, const StateC* st) nogil


@ -5,11 +5,14 @@ from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from collections import defaultdict from collections import defaultdict, OrderedDict
import ujson
from .. import util
from ..structs cimport TokenC from ..structs cimport TokenC
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
from ..typedefs cimport attr_t
cdef weight_t MIN_SCORE = -90000 cdef weight_t MIN_SCORE = -90000
@ -26,7 +29,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef class TransitionSystem: cdef class TransitionSystem:
def __init__(self, StringStore string_table, dict labels_by_action, _freqs=None): def __init__(self, StringStore string_table, labels_by_action):
self.mem = Pool() self.mem = Pool()
self.strings = string_table self.strings = string_table
self.n_moves = 0 self.n_moves = 0
@ -34,28 +37,20 @@ cdef class TransitionSystem:
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition)) self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
for action, label_strs in sorted(labels_by_action.items()): for action, label_strs in labels_by_action.items():
for label_str in label_strs: for label_str in label_strs:
self.add_action(int(action), label_str) self.add_action(int(action), label_str)
self.root_label = self.strings['ROOT'] self.root_label = self.strings.add('ROOT')
self.freqs = {} if _freqs is None else _freqs
for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
self.freqs[attr] = defaultdict(int)
self.freqs[attr][0] = 1
# Ensure we've seen heads. Need an official dependency length limit...
for i in range(10024):
self.freqs[HEAD][i] = 1
self.freqs[HEAD][-i] = 1
self.init_beam_state = _init_state self.init_beam_state = _init_state
def __reduce__(self): def __reduce__(self):
labels_by_action = {} labels_by_action = OrderedDict()
cdef Transition t cdef Transition t
for trans in self.c[:self.n_moves]: for trans in self.c[:self.n_moves]:
label_str = self.strings[trans.label] label_str = self.strings[trans.label]
labels_by_action.setdefault(trans.move, []).append(label_str) labels_by_action.setdefault(trans.move, []).append(label_str)
return (self.__class__, return (self.__class__,
(self.strings, labels_by_action, self.freqs), (self.strings, labels_by_action),
None, None) None, None)
def init_batch(self, docs): def init_batch(self, docs):
@ -69,6 +64,29 @@ cdef class TransitionSystem:
offset += len(doc) offset += len(doc)
return states return states
def get_oracle_sequence(self, doc, GoldParse gold):
cdef Pool mem = Pool()
costs = <float*>mem.alloc(self.n_moves, sizeof(float))
is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
cdef StateClass state = StateClass(doc, offset=0)
self.initialize_state(state.c)
history = []
while not state.is_final():
self.set_costs(is_valid, costs, state, gold)
for i in range(self.n_moves):
if is_valid[i] and costs[i] <= 0:
action = self.c[i]
history.append(i)
action.do(state.c, action.label)
break
else:
print(gold.words)
print(gold.ner)
print(history)
raise ValueError("Could not find gold move")
return history
cdef int initialize_state(self, StateC* state) nogil: cdef int initialize_state(self, StateC* state) nogil:
pass pass
@ -78,17 +96,19 @@ cdef class TransitionSystem:
def finalize_doc(self, doc): def finalize_doc(self, doc):
pass pass
cdef int preprocess_gold(self, GoldParse gold) except -1: def preprocess_gold(self, GoldParse gold):
raise NotImplementedError raise NotImplementedError
cdef Transition lookup_transition(self, object name) except *: cdef Transition lookup_transition(self, object name) except *:
raise NotImplementedError raise NotImplementedError
cdef Transition init_transition(self, int clas, int move, int label) except *: cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
raise NotImplementedError raise NotImplementedError
def is_valid(self, StateClass stcls, move_name): def is_valid(self, StateClass stcls, move_name):
action = self.lookup_transition(move_name) action = self.lookup_transition(move_name)
if action.move == 0:
return False
return action.is_valid(stcls.c, action.label) return action.is_valid(stcls.c, action.label)
cdef int set_valid(self, int* is_valid, const StateC* st) nogil: cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
@ -100,24 +120,80 @@ cdef class TransitionSystem:
StateClass stcls, GoldParse gold) except -1: StateClass stcls, GoldParse gold) except -1:
cdef int i cdef int i
self.set_valid(is_valid, stcls.c) self.set_valid(is_valid, stcls.c)
cdef int n_gold = 0
for i in range(self.n_moves): for i in range(self.n_moves):
if is_valid[i]: if is_valid[i]:
costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
n_gold += costs[i] <= 0
else: else:
costs[i] = 9000 costs[i] = 9000
if n_gold <= 0:
print(gold.words)
print(gold.ner)
print([gold.c.ner[i].clas for i in range(gold.length)])
print([gold.c.ner[i].move for i in range(gold.length)])
print([gold.c.ner[i].label for i in range(gold.length)])
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
raise ValueError(
"Could not find a gold-standard action to supervise "
"the entity recognizer\n"
"The transition system has %d actions." % (self.n_moves))
def add_action(self, int action, label): def get_class_name(self, int clas):
if not isinstance(label, int): act = self.c[clas]
label = self.strings[label] return self.move_name(act.move, act.label)
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, int):
label_id = self.strings.add(label_name)
else:
label_id = label_name
# Check we're not creating a move we already have, so that this is # Check we're not creating a move we already have, so that this is
# idempotent # idempotent
for trans in self.c[:self.n_moves]: for trans in self.c[:self.n_moves]:
if trans.move == action and trans.label == label: if trans.move == action and trans.label == label_id:
return 0 return 0
if self.n_moves >= self._size: if self.n_moves >= self._size:
self._size *= 2 self._size *= 2
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0])) self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label) assert self.c[self.n_moves].label == label_id
self.n_moves += 1 self.n_moves += 1
return 1 return 1
def to_disk(self, path, **exclude):
with path.open('wb') as file_:
file_.write(self.to_bytes(**exclude))
def from_disk(self, path, **exclude):
with path.open('rb') as file_:
byte_data = file_.read()
self.from_bytes(byte_data, **exclude)
return self
def to_bytes(self, **exclude):
transitions = []
for trans in self.c[:self.n_moves]:
transitions.append({
'clas': trans.clas,
'move': trans.move,
'label': self.strings[trans.label],
'name': self.move_name(trans.move, trans.label)
})
serializers = {
'transitions': lambda: ujson.dumps(transitions),
'strings': lambda: self.strings.to_bytes()
}
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude):
transitions = []
deserializers = {
'transitions': lambda b: transitions.extend(ujson.loads(b)),
'strings': lambda b: self.strings.from_bytes(b)
}
msg = util.from_bytes(bytes_data, deserializers, exclude)
for trans in transitions:
self.add_action(trans['move'], trans['label'])
return self
@ -1,7 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import ujson
from collections import defaultdict from collections import defaultdict
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
@ -15,7 +14,6 @@ from .tokens.doc cimport Doc
from .attrs cimport TAG from .attrs cimport TAG
from .gold cimport GoldParse from .gold cimport GoldParse
from .attrs cimport * from .attrs cimport *
from . import util
cpdef enum: cpdef enum:
@ -108,55 +106,15 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
cdef class Tagger: cdef class Tagger:
""" """Annotate part-of-speech tags on Doc objects."""
Annotate part-of-speech tags on Doc objects.
"""
@classmethod
def load(cls, path, vocab, require=False):
"""
Load the statistical model from the supplied path.
Arguments:
path (Path):
The path to load from.
vocab (Vocab):
The vocabulary. Must be shared by the documents to be processed.
require (bool):
Whether to raise an error if the files are not found.
Returns (Tagger):
The newly created object.
"""
# TODO: Change this to expect config.json when we don't have to
# support old data.
path = util.ensure_path(path)
if (path / 'templates.json').exists():
with (path / 'templates.json').open('r', encoding='utf8') as file_:
templates = ujson.load(file_)
elif require:
raise IOError(
"Required file %s/templates.json not found when loading Tagger" % str(path))
else:
templates = cls.feature_templates
self = cls(vocab, model=None, feature_templates=templates)
if (path / 'model').exists():
self.model.load(str(path / 'model'))
elif require:
raise IOError(
"Required file %s/model not found when loading Tagger" % str(path))
return self
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg): def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
""" """Create a Tagger.
Create a Tagger.
Arguments: vocab (Vocab): The vocabulary object. Must be shared with documents to
vocab (Vocab): be processed.
The vocabulary object. Must be shared with documents to be processed. model (thinc.linear.AveragedPerceptron): The statistical model.
model (thinc.linear.AveragedPerceptron): RETURNS (Tagger): The newly constructed object.
The statistical model.
Returns (Tagger):
The newly constructed object.
""" """
if model is None: if model is None:
model = TaggerModel(cfg.get('features', self.feature_templates), model = TaggerModel(cfg.get('features', self.feature_templates),
@ -186,13 +144,9 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length tokens._py_tokens = [None] * tokens.length
def __call__(self, Doc tokens): def __call__(self, Doc tokens):
""" """Apply the tagger, setting the POS tags onto the Doc object.
Apply the tagger, setting the POS tags onto the Doc object.
Arguments: doc (Doc): The tokens to be tagged.
doc (Doc): The tokens to be tagged.
Returns:
None
""" """
if tokens.length == 0: if tokens.length == 0:
return 0 return 0
@ -215,34 +169,25 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length tokens._py_tokens = [None] * tokens.length
def pipe(self, stream, batch_size=1000, n_threads=2): def pipe(self, stream, batch_size=1000, n_threads=2):
""" """Tag a stream of documents.
Tag a stream of documents.
Arguments: Arguments:
stream: The sequence of documents to tag. stream: The sequence of documents to tag.
batch_size (int): batch_size (int): The number of documents to accumulate into a working set.
The number of documents to accumulate into a working set. n_threads (int): The number of threads with which to work on the buffer
n_threads (int): in parallel, if the Matcher implementation supports multi-threading.
The number of threads with which to work on the buffer in parallel, YIELDS (Doc): Documents, in order.
if the Matcher implementation supports multi-threading.
Yields:
Doc Documents, in order.
""" """
for doc in stream: for doc in stream:
self(doc) self(doc)
yield doc yield doc
def update(self, Doc tokens, GoldParse gold, itn=0): def update(self, Doc tokens, GoldParse gold, itn=0):
""" """Update the statistical model, with tags supplied for the given document.
Update the statistical model, with tags supplied for the given document.
Arguments: doc (Doc): The document to update on.
doc (Doc): gold (GoldParse): Manager for the gold-standard tags.
The document to update on. RETURNS (int): Number of tags predicted correctly.
gold (GoldParse):
Manager for the gold-standard tags.
Returns (int):
Number of tags correct.
""" """
gold_tag_strs = gold.tags gold_tag_strs = gold.tags
assert len(tokens) == len(gold_tag_strs) assert len(tokens) == len(gold_tag_strs)
@ -13,21 +13,32 @@ Tests for spaCy modules and classes live in their own directories of the same na
2. [Dos and don'ts](#dos-and-donts) 2. [Dos and don'ts](#dos-and-donts)
3. [Parameters](#parameters) 3. [Parameters](#parameters)
4. [Fixtures](#fixtures) 4. [Fixtures](#fixtures)
5. [Helpers and utilities](#helpers-and-utilities) 5. [Testing models](#testing-models)
6. [Contributing to the tests](#contributing-to-the-tests) 6. [Helpers and utilities](#helpers-and-utilities)
7. [Contributing to the tests](#contributing-to-the-tests)
## Running the tests ## Running the tests
To show print statements, run the tests with `py.test -s`. To abort after the
first failure, run them with `py.test -x`.
```bash ```bash
py.test spacy # run basic tests py.test spacy # run basic tests
py.test spacy --models # run basic and model tests py.test spacy --models --en # run basic and English model tests
py.test spacy --slow # run basic and slow tests py.test spacy --models --all # run basic and all model tests
py.test spacy --models --slow # run all tests py.test spacy --slow # run basic and slow tests
py.test spacy --models --all --slow # run all tests
``` ```
To show print statements, run the tests with `py.test -s`. To abort after the first failure, run them with `py.test -x`. You can also run tests in a specific file or directory, or even only one
specific test:
```bash
py.test spacy/tests/tokenizer # run all tests in directory
py.test spacy/tests/tokenizer/test_exceptions.py # run all tests in file
py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji # run specific test
```
## Dos and don'ts ## Dos and don'ts
@ -83,14 +94,9 @@ These are the main fixtures that are currently available:
| Fixture | Description | | Fixture | Description |
| --- | --- | | --- | --- |
| `tokenizer` | Creates **all available** language tokenizers and runs the test for **each of them**. | | `tokenizer` | Creates **all available** language tokenizers and runs the test for **each of them**. |
| `en_tokenizer` | Creates an English `Tokenizer` object. | | `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. |
| `de_tokenizer` | Creates a German `Tokenizer` object. | | `en_vocab`, `en_entityrecognizer`, ... | Creates an instance of the English `Vocab`, `EntityRecognizer` object etc. |
| `hu_tokenizer` | Creates a Hungarian `Tokenizer` object. | | `EN`, `DE`, ... | Creates a language class with a loaded model. For more info, see [Testing models](#testing-models). |
| `en_vocab` | Creates an English `Vocab` object. |
| `en_entityrecognizer` | Creates an English `EntityRecognizer` object. |
| `lemmatizer` | Creates a `Lemmatizer` object from the installed language data (`None` if no data is found).
| `EN` | Creates an instance of `English`. Only use for tests that require the models. |
| `DE` | Creates an instance of `German`. Only use for tests that require the models. |
| `text_file` | Creates an instance of `StringIO` to simulate reading from and writing to files. | | `text_file` | Creates an instance of `StringIO` to simulate reading from and writing to files. |
| `text_file_b` | Creates an instance of `ByteIO` to simulate reading from and writing to files. | | `text_file_b` | Creates an instance of `ByteIO` to simulate reading from and writing to files. |
@ -103,6 +109,48 @@ def test_module_do_something(en_tokenizer):
If all tests in a file require a specific configuration, or use the same complex example, it can be helpful to create a separate fixture. This fixture should be added at the top of each file. Make sure to use descriptive names for these fixtures and don't override any of the global fixtures listed above. **From looking at a test, it should immediately be clear which fixtures are used, and where they are coming from.** If all tests in a file require a specific configuration, or use the same complex example, it can be helpful to create a separate fixture. This fixture should be added at the top of each file. Make sure to use descriptive names for these fixtures and don't override any of the global fixtures listed above. **From looking at a test, it should immediately be clear which fixtures are used, and where they are coming from.**
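For instance, a file-specific fixture could look like this (an illustrative sketch, not taken from the actual test suite):

```python
import pytest

@pytest.fixture
def text():
    # shared example sentence used by all tests in this file
    return "This is a sentence. This is another sentence."

def test_en_tokenizer_handles_example(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 10
```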
## Testing models
Models should only be loaded and tested **if absolutely necessary**, for example if you're specifically testing a model's performance, or if your test is related to model loading. If you only need an annotated `Doc`, you should use the `get_doc()` helper function to create it manually instead.
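For example, a test that needs dependency annotations can build the `Doc` by hand (a sketch following the `get_doc()` calls used elsewhere in the test suite; the tags, heads and deps below are made up for illustration):

```python
from .util import get_doc  # adjust the relative import depth to the test's directory

def test_example_without_model(en_tokenizer):
    tokens = en_tokenizer(u"I like cats")
    doc = get_doc(tokens.vocab, [t.text for t in tokens],
                  tags=['PRP', 'VBP', 'NNS'],
                  heads=[1, 0, -1],          # offsets relative to each token
                  deps=['nsubj', 'ROOT', 'dobj'])
    assert doc[2].head.text == "like"
```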
To specify which language models a test is related to, set the language ID as an argument of `@pytest.mark.models`. This allows you to later run the tests with `--models --en`. You can then use the `EN` [fixture](#fixtures) to get a language
class with a loaded model.
```python
@pytest.mark.models('en')
def test_english_model(EN):
doc = EN(u'This is a test')
```
> ⚠️ **Important note:** In order to test models, they need to be installed as a package. The [conftest.py](conftest.py) includes a list of all available models, mapped to their IDs, e.g. `en`. Unless otherwise specified, each model that's installed in your environment will be imported and tested. If you don't have a model installed, **the test will be skipped**.
Under the hood, `pytest.importorskip` is used to import a model package and skip the test if the package is not installed. The `EN` fixture for example gets all
available models for `en`, [parametrizes](#parameters) them to run the test for *each of them*, and uses `load_test_model()` to import the model and run the test, or skip it if the model is not installed.
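Conceptually, the mechanism boils down to something like the following (a simplified sketch, not the exact implementation in `util.py`):

```python
import pytest

def load_test_model(model):
    """Import a model package and return the loaded pipeline, or skip the test."""
    module = pytest.importorskip(model)  # skips the test here if the package isn't installed
    return module.load()                 # model packages expose a load() entry point
```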
### Testing specific models
Using the `load_test_model()` helper function, you can also write tests for specific models, or combinations of them:
```python
from .util import load_test_model
@pytest.mark.models('en')
def test_en_md_only():
nlp = load_test_model('en_core_web_md')
# test something specific to en_core_web_md
@pytest.mark.models('en', 'fr')
@pytest.mark.parametrize('model', ['en_core_web_md', 'fr_depvec_web_lg'])
def test_different_models(model):
nlp = load_test_model(model)
# test something specific to the parametrized models
```
### Known issues and future improvements
Using `importorskip` on a list of model packages is not ideal and we're looking to improve this in the future. But at the moment, it's the best way to ensure that tests are performed on specific model packages only, and that you'll always be able to run the tests, even if you don't have *all available models* installed. (If the tests made a call to `spacy.load('en')` instead, this would load whichever model you've created an `en` shortcut for. This may be one of spaCy's default models, but it could just as easily be your own custom English model.)
The current setup also doesn't provide an easy way to only run tests on specific model versions. The `minversion` keyword argument on `pytest.importorskip` can take care of this, but it currently only checks for the package's `__version__` attribute. An alternative solution would be to load a model package's meta.json and skip if the model's version does not match the one specified in the test.
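Such a version check could look roughly like this (a hypothetical helper, assuming the installed package ships a `meta.json` with a `version` key):

```python
import json
from pathlib import Path
import pytest

def require_model_version(model_path, required_version):
    """Skip the current test unless the installed model matches the required version."""
    meta_file = Path(model_path) / 'meta.json'
    if not meta_file.exists():
        pytest.skip("no meta.json found in %s" % model_path)
    with meta_file.open('r', encoding='utf8') as file_:
        meta = json.load(file_)
    if meta.get('version') != required_version:
        pytest.skip("model version %s does not match required %s"
                    % (meta.get('version'), required_version))
```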
## Helpers and utilities ## Helpers and utilities
@ -152,11 +200,11 @@ print([token.dep_ for token in doc])
**Note:** There's currently no way of setting the serializer data for the parser without loading the models. If this is relevant to your test, constructing the `Doc` via `get_doc()` won't work. **Note:** There's currently no way of setting the serializer data for the parser without loading the models. If this is relevant to your test, constructing the `Doc` via `get_doc()` won't work.
### Other utilities ### Other utilities
| Name | Description | | Name | Description |
| --- | --- | | --- | --- |
| `load_test_model` | Load a model if it's installed as a package, otherwise skip test. |
| `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state. | | `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state. |
| `add_vecs_to_vocab(vocab, vectors)` | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. | | `add_vecs_to_vocab(vocab, vectors)` | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. |
| `get_cosine(vec1, vec2)` | Get cosine for two given vectors. | | `get_cosine(vec1, vec2)` | Get cosine for two given vectors. |
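As a quick illustration of the vector helpers (assuming `get_cosine()` accepts any array-like input):

```python
from .util import get_cosine

def test_cosine_of_orthogonal_vectors():
    # orthogonal vectors have a cosine similarity of 0
    assert get_cosine([1, 0, 0], [0, 1, 0]) == 0
```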
@ -1,25 +1,50 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from ..tokens import Doc
from ..strings import StringStore
from ..lemmatizer import Lemmatizer
from ..attrs import ORTH, TAG, HEAD, DEP
from .. import util
from io import StringIO, BytesIO from io import StringIO, BytesIO
from pathlib import Path from pathlib import Path
import pytest import pytest
from .util import load_test_model
from ..tokens import Doc
from ..strings import StringStore
from .. import util
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb', _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
'nl', 'pl', 'pt', 'sv'] 'nl', 'pl', 'pt', 'sv', 'xx']
_models = {'en': ['en_depent_web_sm', 'en_core_web_md'],
'de': ['de_core_news_md'],
'fr': ['fr_depvec_web_lg'],
'xx': ['xx_ent_web_md']}
@pytest.fixture(params=_languages) # only used for tests that require loading the models
def tokenizer(request): # in all other cases, use specific instances
lang = util.get_lang_class(request.param)
return lang.Defaults.create_tokenizer() @pytest.fixture(params=_models['en'])
def EN(request):
return load_test_model(request.param)
@pytest.fixture(params=_models['de'])
def DE(request):
return load_test_model(request.param)
@pytest.fixture(params=_models['fr'])
def FR(request):
return load_test_model(request.param)
#@pytest.fixture(params=_languages)
#def tokenizer(request):
#lang = util.get_lang_class(request.param)
#return lang.Defaults.create_tokenizer()
@pytest.fixture
def tokenizer():
return util.get_lang_class('xx').Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
@ -47,7 +72,7 @@ def de_tokenizer():
return util.get_lang_class('de').Defaults.create_tokenizer() return util.get_lang_class('de').Defaults.create_tokenizer()
@pytest.fixture(scope='module') @pytest.fixture
def fr_tokenizer(): def fr_tokenizer():
return util.get_lang_class('fr').Defaults.create_tokenizer() return util.get_lang_class('fr').Defaults.create_tokenizer()
@ -91,11 +116,6 @@ def en_entityrecognizer():
return util.get_lang_class('en').Defaults.create_entity() return util.get_lang_class('en').Defaults.create_entity()
@pytest.fixture
def lemmatizer():
return util.get_lang_class('en').Defaults.create_lemmatizer()
@pytest.fixture @pytest.fixture
def text_file(): def text_file():
return StringIO() return StringIO()
@ -105,22 +125,6 @@ def text_file_b():
return BytesIO() return BytesIO()
# only used for tests that require loading the models
# in all other cases, use specific instances
@pytest.fixture(scope="session")
def EN():
return English()
@pytest.fixture(scope="session")
def DE():
return German()
@pytest.fixture(scope="session")
def FR():
return French()
def pytest_addoption(parser): def pytest_addoption(parser):
parser.addoption("--models", action="store_true", parser.addoption("--models", action="store_true",
help="include tests that require full models") help="include tests that require full models")
@ -129,8 +133,18 @@ def pytest_addoption(parser):
parser.addoption("--slow", action="store_true", parser.addoption("--slow", action="store_true",
help="include slow tests") help="include slow tests")
for lang in _languages + ['all']:
parser.addoption("--%s" % lang, action="store_true", help="Use %s models" % lang)
def pytest_runtest_setup(item): def pytest_runtest_setup(item):
for opt in ['models', 'vectors', 'slow']: for opt in ['models', 'vectors', 'slow']:
if opt in item.keywords and not item.config.getoption("--%s" % opt): if opt in item.keywords and not item.config.getoption("--%s" % opt):
pytest.skip("need --%s option to run" % opt) pytest.skip("need --%s option to run" % opt)
# Check if test is marked with models and has arguments set, i.e. specific
# language. If so, skip test if flag not set.
if item.get_marker('models'):
for arg in item.get_marker('models').args:
if not item.config.getoption("--%s" % arg) and not item.config.getoption("--all"):
pytest.skip("need --%s or --all option to run" % arg)
@ -102,7 +102,7 @@ def test_doc_api_getitem(en_tokenizer):
def test_doc_api_serialize(en_tokenizer, text): def test_doc_api_serialize(en_tokenizer, text):
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
new_tokens = get_doc(tokens.vocab).from_bytes(tokens.to_bytes()) new_tokens = get_doc(tokens.vocab).from_bytes(tokens.to_bytes())
assert tokens.string == new_tokens.string assert tokens.text == new_tokens.text
assert [t.text for t in tokens] == [t.text for t in new_tokens] assert [t.text for t in tokens] == [t.text for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
@ -204,6 +204,7 @@ def test_doc_api_right_edge(en_tokenizer):
assert doc[6].right_edge.text == ',' assert doc[6].right_edge.text == ','
@pytest.mark.xfail
@pytest.mark.parametrize('text,vectors', [ @pytest.mark.parametrize('text,vectors', [
("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"]) ("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
]) ])
@ -68,6 +68,7 @@ def test_doc_token_api_is_properties(en_vocab):
assert doc[5].like_email assert doc[5].like_email
@pytest.mark.xfail
@pytest.mark.parametrize('text,vectors', [ @pytest.mark.parametrize('text,vectors', [
("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"]) ("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"])
]) ])
@ -99,8 +100,8 @@ def test_doc_token_api_ancestors(en_tokenizer):
assert [t.text for t in doc[1].ancestors] == ["saw"] assert [t.text for t in doc[1].ancestors] == ["saw"]
assert [t.text for t in doc[2].ancestors] == [] assert [t.text for t in doc[2].ancestors] == []
assert doc[2].is_ancestor_of(doc[7]) assert doc[2].is_ancestor(doc[7])
assert not doc[6].is_ancestor_of(doc[2]) assert not doc[6].is_ancestor(doc[2])
def test_doc_token_api_head_setter(en_tokenizer): def test_doc_token_api_head_setter(en_tokenizer):
@ -155,3 +156,15 @@ def test_doc_token_api_head_setter(en_tokenizer):
assert doc[3].left_edge.i == 0 assert doc[3].left_edge.i == 0
assert doc[4].left_edge.i == 0 assert doc[4].left_edge.i == 0
assert doc[2].left_edge.i == 0 assert doc[2].left_edge.i == 0
def test_sent_start(en_tokenizer):
doc = en_tokenizer(u'This is a sentence. This is another.')
assert not doc[0].sent_start
assert not doc[5].sent_start
doc[5].sent_start = True
assert doc[5].sent_start
assert not doc[0].sent_start
doc.is_parsed = True
assert len(list(doc.sents)) == 2
@ -1,72 +0,0 @@
# coding: utf-8
import pytest
import numpy
@pytest.mark.models
class TestModelSanity:
"""
This is to make sure the model works as expected. The tests make sure that
values are properly set.
Tests are not meant to evaluate the content of the output, only make sure
the output is formally okay.
"""
@pytest.fixture(scope='class', params=['en','de'])
def example(self, request, EN, DE):
assert EN.entity != None
assert DE.entity != None
if request.param == 'en':
doc = EN(u'There was a stranger standing at the big ' +
u'street talking to herself.')
elif request.param == 'de':
doc = DE(u'An der großen Straße stand eine merkwürdige ' +
u'Gestalt und führte Selbstgespräche.')
return doc
def test_tokenization(self, example):
# tokenization should split the document into tokens
assert len(example) > 1
def test_tagging(self, example):
# if tagging was done properly, pos tags shouldn't be empty
assert example.is_tagged
assert all( t.pos != 0 for t in example )
assert all( t.tag != 0 for t in example )
def test_parsing(self, example):
# if parsing was done properly
# - dependency labels shouldn't be empty
# - the head of some tokens should not be root
assert example.is_parsed
assert all( t.dep != 0 for t in example )
assert any( t.dep != i for i,t in enumerate(example) )
def test_ner(self, example):
# if ner was done properly, ent_iob shouldn't be empty
assert all([t.ent_iob != 0 for t in example])
def test_vectors(self, example):
# if vectors are available, they should differ on different words
# this isn't a perfect test since this could in principle fail
# in a sane model as well,
# but that's very unlikely and a good indicator if something is wrong
vector0 = example[0].vector
vector1 = example[1].vector
vector2 = example[2].vector
assert not numpy.array_equal(vector0,vector1)
assert not numpy.array_equal(vector0,vector2)
assert not numpy.array_equal(vector1,vector2)
def test_probs(self, example):
# if frequencies/probabilities are okay, they should differ for
# different words
# this isn't a perfect test since this could in principle fail
# in a sane model as well,
# but that's very unlikely and a good indicator if something is wrong
prob0 = example[0].prob
prob1 = example[1].prob
prob2 = example[2].prob
assert not prob0 == prob1
assert not prob0 == prob2
assert not prob1 == prob2
@ -8,20 +8,33 @@ import pytest
@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"]) @pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
def test_tokenizer_splits_contractions(de_tokenizer, text): def test_de_tokenizer_splits_contractions(de_tokenizer, text):
tokens = de_tokenizer(text) tokens = de_tokenizer(text)
assert len(tokens) == 2 assert len(tokens) == 2
@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."]) @pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
def test_tokenizer_handles_abbr(de_tokenizer, text): def test_de_tokenizer_handles_abbr(de_tokenizer, text):
tokens = de_tokenizer(text) tokens = de_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1
def test_tokenizer_handles_exc_in_text(de_tokenizer): def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
text = "Ich bin z.Zt. im Urlaub." text = "Ich bin z.Zt. im Urlaub."
tokens = de_tokenizer(text) tokens = de_tokenizer(text)
assert len(tokens) == 6 assert len(tokens) == 6
assert tokens[2].text == "z.Zt." assert tokens[2].text == "z.Zt."
assert tokens[2].lemma_ == "zur Zeit" assert tokens[2].lemma_ == "zur Zeit"
@pytest.mark.parametrize('text,norms', [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])])
def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
tokens = de_tokenizer(text)
assert [token.norm_ for token in tokens] == norms
@pytest.mark.xfail
@pytest.mark.parametrize('text,norm', [("daß", "dass")])
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
tokens = de_tokenizer(text)
assert tokens[0].norm_ == norm
@ -0,0 +1,77 @@
# coding: utf-8
from __future__ import unicode_literals
import numpy
import pytest
@pytest.fixture
def example(DE):
"""
This is to make sure the model works as expected. The tests make sure that
values are properly set. Tests are not meant to evaluate the content of the
output, only make sure the output is formally okay.
"""
assert DE.entity != None
return DE('An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.')
@pytest.mark.models('de')
def test_de_models_tokenization(example):
# tokenization should split the document into tokens
assert len(example) > 1
@pytest.mark.xfail
@pytest.mark.models('de')
def test_de_models_tagging(example):
# if tagging was done properly, pos tags shouldn't be empty
assert example.is_tagged
assert all(t.pos != 0 for t in example)
assert all(t.tag != 0 for t in example)
@pytest.mark.models('de')
def test_de_models_parsing(example):
# if parsing was done properly
# - dependency labels shouldn't be empty
# - the head of some tokens should not be root
assert example.is_parsed
assert all(t.dep != 0 for t in example)
assert any(t.dep != i for i,t in enumerate(example))
@pytest.mark.models('de')
def test_de_models_ner(example):
# if ner was done properly, ent_iob shouldn't be empty
assert all([t.ent_iob != 0 for t in example])
@pytest.mark.models('de')
def test_de_models_vectors(example):
# if vectors are available, they should differ on different words
# this isn't a perfect test since this could in principle fail
# in a sane model as well,
# but that's very unlikely and a good indicator if something is wrong
vector0 = example[0].vector
vector1 = example[1].vector
vector2 = example[2].vector
assert not numpy.array_equal(vector0,vector1)
assert not numpy.array_equal(vector0,vector2)
assert not numpy.array_equal(vector1,vector2)
@pytest.mark.xfail
@pytest.mark.models('de')
def test_de_models_probs(example):
# if frequencies/probabilities are okay, they should differ for
# different words
# this isn't a perfect test since this could in principle fail
# in a sane model as well,
# but that's very unlikely and a good indicator if something is wrong
prob0 = example[0].prob
prob1 = example[1].prob
prob2 = example[2].prob
assert not prob0 == prob1
assert not prob0 == prob2
assert not prob1 == prob2
@ -0,0 +1,35 @@
# coding: utf-8
from __future__ import unicode_literals
from ...util import get_doc
import pytest
def test_de_parser_noun_chunks_standard_de(de_tokenizer):
text = "Eine Tasse steht auf dem Tisch."
heads = [1, 1, 0, -1, 1, -2, -4]
tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.']
deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct']
tokens = de_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
chunks = list(doc.noun_chunks)
assert len(chunks) == 2
assert chunks[0].text_with_ws == "Eine Tasse "
assert chunks[1].text_with_ws == "dem Tisch "
def test_de_extended_chunk(de_tokenizer):
text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.']
deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct']
tokens = de_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
chunks = list(doc.noun_chunks)
assert len(chunks) == 3
assert chunks[0].text_with_ws == "Die Sängerin "
assert chunks[1].text_with_ws == "einer Tasse Kaffee "
assert chunks[2].text_with_ws == "Arien "
@ -1,87 +0,0 @@
# coding: utf-8
"""Test that tokens are created correctly for contractions."""
from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_basic_contraction(en_tokenizer):
text = "don't giggle"
tokens = en_tokenizer(text)
assert len(tokens) == 3
assert tokens[1].text == "n't"
text = "i said don't!"
tokens = en_tokenizer(text)
assert len(tokens) == 5
assert tokens[4].text == "!"
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
tokens = en_tokenizer(text_poss)
assert len(tokens) == 2
assert tokens[0].text == text
assert tokens[1].text == "'s"
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
def test_tokenizer_splits_trailing_apos(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == text.split("'")[0]
assert tokens[1].text == "'"
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
def text_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].text == text
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == text.split("'")[0]
assert tokens[1].text == "'ll"
assert tokens[1].lemma_ == "will"
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
tokens_lower = en_tokenizer(text_lower)
tokens_title = en_tokenizer(text_title)
assert tokens_title[0].text == tokens_lower[0].text.title()
assert tokens_lower[0].text == tokens_title[0].text.lower()
assert tokens_lower[1].text == tokens_title[1].text
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
tokens = en_tokenizer(pron + contraction)
assert tokens[0].text == pron
assert tokens[1].text == contraction
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
tokens = en_tokenizer(exc)
assert len(tokens) == 1
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
tokens = en_tokenizer(wo_punct)
assert len(tokens) == 2
tokens = en_tokenizer(w_punct)
assert len(tokens) == 3
Some files were not shown because too many files have changed in this diff.