mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Merge branch 'develop' into polish
This commit is contained in:
commit
d81ceb0cd5
1
.appveyor.yml
Normal file
1
.appveyor.yml
Normal file
|
@ -0,0 +1 @@
|
|||
build: off
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -30,6 +30,7 @@ Profile.prof
|
|||
__pycache__/
|
||||
*.py[cod]
|
||||
.env/
|
||||
.env*
|
||||
.~env/
|
||||
.venv
|
||||
venv/
|
||||
|
|
16
README.rst
16
README.rst
|
@ -4,12 +4,10 @@ spaCy: Industrial-strength NLP
|
|||
spaCy is a library for advanced natural language processing in Python and
|
||||
Cython. spaCy is built on the very latest research, but it isn't researchware.
|
||||
It was designed from day one to be used in real products. spaCy currently supports
|
||||
English, German and French, as well as tokenization for Spanish, Italian,
|
||||
Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew,
|
||||
Chinese and Japanese. It's commercial open-source software, released under the
|
||||
MIT license.
|
||||
|
||||
📊 **Help us improve the library!** `Take the spaCy user survey <https://survey.spacy.io>`_.
|
||||
English, German, French and Spanish, as well as tokenization for Italian,
|
||||
Portuguese, Dutch, Swedish, Finnish, Norwegian, Danish, Hungarian, Polish,
|
||||
Bengali, Hebrew, Chinese and Japanese. It's commercial open-source software,
|
||||
released under the MIT license.
|
||||
|
||||
💫 **Version 1.8 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
|
||||
|
||||
|
@ -85,7 +83,7 @@ Features
|
|||
* GIL-free **multi-threading**
|
||||
* Efficient binary serialization
|
||||
* Easy **deep learning** integration
|
||||
* Statistical models for **English** and **German**
|
||||
* Statistical models for **English**, **German**, **French** and **Spanish**
|
||||
* State-of-the-art speed
|
||||
* Robust, rigorously evaluated accuracy
|
||||
|
||||
|
@ -197,7 +195,7 @@ To load a model, use ``spacy.load()`` with the model's shortcut link:
|
|||
.. code:: python
|
||||
|
||||
import spacy
|
||||
nlp = spacy.load('en_default')
|
||||
nlp = spacy.load('en')
|
||||
doc = nlp(u'This is a sentence.')
|
||||
|
||||
If you've installed a model via pip, you can also ``import`` it directly and
|
||||
|
@ -313,7 +311,7 @@ and ``--model`` are optional and enable additional tests:
|
|||
# make sure you are using recent pytest version
|
||||
python -m pip install -U pytest
|
||||
|
||||
python -m pytest <spacy-directory> --vectors --models --slow
|
||||
python -m pytest <spacy-directory>
|
||||
|
||||
🛠 Changelog
|
||||
============
|
||||
|
|
|
@ -1,68 +1,27 @@
|
|||
from __future__ import unicode_literals, print_function
|
||||
import json
|
||||
import pathlib
|
||||
|
||||
import random
|
||||
|
||||
import spacy
|
||||
from spacy.pipeline import EntityRecognizer
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.tagger import Tagger
|
||||
from spacy.lang.en import English
|
||||
from spacy.gold import GoldParse, biluo_tags_from_offsets
|
||||
|
||||
|
||||
try:
|
||||
unicode
|
||||
except:
|
||||
unicode = str
|
||||
|
||||
|
||||
def train_ner(nlp, train_data, entity_types):
|
||||
# Add new words to vocab.
|
||||
for raw_text, _ in train_data:
|
||||
doc = nlp.make_doc(raw_text)
|
||||
for word in doc:
|
||||
_ = nlp.vocab[word.orth]
|
||||
|
||||
# Train NER.
|
||||
ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
|
||||
for itn in range(5):
|
||||
random.shuffle(train_data)
|
||||
for raw_text, entity_offsets in train_data:
|
||||
doc = nlp.make_doc(raw_text)
|
||||
gold = GoldParse(doc, entities=entity_offsets)
|
||||
ner.update(doc, gold)
|
||||
return ner
|
||||
|
||||
def save_model(ner, model_dir):
|
||||
model_dir = pathlib.Path(model_dir)
|
||||
if not model_dir.exists():
|
||||
model_dir.mkdir()
|
||||
assert model_dir.is_dir()
|
||||
|
||||
with (model_dir / 'config.json').open('wb') as file_:
|
||||
data = json.dumps(ner.cfg)
|
||||
if isinstance(data, unicode):
|
||||
data = data.encode('utf8')
|
||||
file_.write(data)
|
||||
ner.model.dump(str(model_dir / 'model'))
|
||||
if not (model_dir / 'vocab').exists():
|
||||
(model_dir / 'vocab').mkdir()
|
||||
ner.vocab.dump(str(model_dir / 'vocab' / 'lexemes.bin'))
|
||||
with (model_dir / 'vocab' / 'strings.json').open('w', encoding='utf8') as file_:
|
||||
ner.vocab.strings.dump(file_)
|
||||
def reformat_train_data(tokenizer, examples):
|
||||
"""Reformat data to match JSON format"""
|
||||
output = []
|
||||
for i, (text, entity_offsets) in enumerate(examples):
|
||||
doc = tokenizer(text)
|
||||
ner_tags = biluo_tags_from_offsets(tokenizer(text), entity_offsets)
|
||||
words = [w.text for w in doc]
|
||||
tags = ['-'] * len(doc)
|
||||
heads = [0] * len(doc)
|
||||
deps = [''] * len(doc)
|
||||
sentence = (range(len(doc)), words, tags, heads, deps, ner_tags)
|
||||
output.append((text, [(sentence, [])]))
|
||||
return output
|
||||
|
||||
|
||||
def main(model_dir=None):
|
||||
nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
|
||||
|
||||
# v1.1.2 onwards
|
||||
if nlp.tagger is None:
|
||||
print('---- WARNING ----')
|
||||
print('Data directory not found')
|
||||
print('please run: `python -m spacy.en.download --force all` for better performance')
|
||||
print('Using feature templates for tagging')
|
||||
print('-----------------')
|
||||
nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)
|
||||
|
||||
train_data = [
|
||||
(
|
||||
'Who is Shaka Khan?',
|
||||
|
@ -74,23 +33,35 @@ def main(model_dir=None):
|
|||
(len('I like London and '), len('I like London and Berlin'), 'LOC')]
|
||||
)
|
||||
]
|
||||
ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])
|
||||
|
||||
doc = nlp.make_doc('Who is Shaka Khan?')
|
||||
nlp.tagger(doc)
|
||||
ner(doc)
|
||||
nlp = English(pipeline=['tensorizer', 'ner'])
|
||||
get_data = lambda: reformat_train_data(nlp.tokenizer, train_data)
|
||||
optimizer = nlp.begin_training(get_data)
|
||||
for itn in range(100):
|
||||
random.shuffle(train_data)
|
||||
losses = {}
|
||||
for raw_text, entity_offsets in train_data:
|
||||
doc = nlp.make_doc(raw_text)
|
||||
gold = GoldParse(doc, entities=entity_offsets)
|
||||
nlp.update(
|
||||
[doc], # Batch of Doc objects
|
||||
[gold], # Batch of GoldParse objects
|
||||
drop=0.5, # Dropout -- make it harder to memorise data
|
||||
sgd=optimizer, # Callable to update weights
|
||||
losses=losses)
|
||||
print(losses)
|
||||
print("Save to", model_dir)
|
||||
nlp.to_disk(model_dir)
|
||||
print("Load from", model_dir)
|
||||
nlp = spacy.lang.en.English(pipeline=['tensorizer', 'ner'])
|
||||
nlp.from_disk(model_dir)
|
||||
for raw_text, _ in train_data:
|
||||
doc = nlp(raw_text)
|
||||
for word in doc:
|
||||
print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
|
||||
|
||||
if model_dir is not None:
|
||||
save_model(ner, model_dir)
|
||||
|
||||
|
||||
|
||||
|
||||
print(word.text, word.ent_type_, word.ent_iob_)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main('ner')
|
||||
import plac
|
||||
plac.call(main)
|
||||
# Who "" 2
|
||||
# is "" 2
|
||||
# Shaka "" PERSON 3
|
||||
|
|
|
@ -3,8 +3,8 @@ pathlib
|
|||
numpy>=1.7
|
||||
cymem>=1.30,<1.32
|
||||
preshed>=1.0.0,<2.0.0
|
||||
thinc>=6.6.0,<6.7.0
|
||||
murmurhash>=0.26,<0.27
|
||||
thinc>=6.7.3,<6.8.0
|
||||
murmurhash>=0.28,<0.29
|
||||
plac<1.0.0,>=0.9.6
|
||||
six
|
||||
ujson>=1.35
|
||||
|
@ -14,3 +14,6 @@ regex==2017.4.5
|
|||
ftfy>=4.4.2,<5.0.0
|
||||
pytest>=3.0.6,<4.0.0
|
||||
pip>=9.0.0,<10.0.0
|
||||
mock>=2.0.0,<3.0.0
|
||||
msgpack-python
|
||||
msgpack-numpy
|
||||
|
|
11
setup.py
11
setup.py
|
@ -44,7 +44,8 @@ MOD_NAMES = [
|
|||
'spacy.matcher',
|
||||
'spacy.syntax.ner',
|
||||
'spacy.symbols',
|
||||
'spacy.syntax.iterators']
|
||||
'spacy.vectors',
|
||||
]
|
||||
|
||||
|
||||
COMPILE_OPTIONS = {
|
||||
|
@ -188,10 +189,10 @@ def setup_package():
|
|||
ext_modules=ext_modules,
|
||||
install_requires=[
|
||||
'numpy>=1.7',
|
||||
'murmurhash>=0.26,<0.27',
|
||||
'murmurhash>=0.28,<0.29',
|
||||
'cymem>=1.30,<1.32',
|
||||
'preshed>=1.0.0,<2.0.0',
|
||||
'thinc>=6.6.0,<6.7.0',
|
||||
'thinc>=6.7.3,<6.8.0',
|
||||
'plac<1.0.0,>=0.9.6',
|
||||
'pip>=9.0.0,<10.0.0',
|
||||
'six',
|
||||
|
@ -200,7 +201,9 @@ def setup_package():
|
|||
'dill>=0.2,<0.3',
|
||||
'requests>=2.13.0,<3.0.0',
|
||||
'regex==2017.4.5',
|
||||
'ftfy>=4.4.2,<5.0.0'],
|
||||
'ftfy>=4.4.2,<5.0.0',
|
||||
'msgpack-python',
|
||||
'msgpack-numpy'],
|
||||
classifiers=[
|
||||
'Development Status :: 5 - Production/Stable',
|
||||
'Environment :: Console',
|
||||
|
|
|
@ -1,22 +1,17 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import importlib
|
||||
|
||||
from .compat import basestring_
|
||||
from .cli.info import info
|
||||
from .cli.info import info as cli_info
|
||||
from .glossary import explain
|
||||
from .deprecated import resolve_load_name
|
||||
from .about import __version__
|
||||
from . import util
|
||||
|
||||
|
||||
def load(name, **overrides):
|
||||
name = resolve_load_name(name, **overrides)
|
||||
model_path = util.resolve_model_path(name)
|
||||
meta = util.parse_package_meta(model_path)
|
||||
if 'lang' not in meta:
|
||||
raise IOError('No language setting found in model meta.')
|
||||
cls = util.get_lang_class(meta['lang'])
|
||||
overrides['meta'] = meta
|
||||
overrides['path'] = model_path
|
||||
return cls(**overrides)
|
||||
return util.load_model(name, **overrides)
|
||||
|
||||
|
||||
def info(model=None, markdown=False):
|
||||
return cli_info(None, model, markdown)
|
||||
|
|
|
@ -3,135 +3,21 @@ from __future__ import print_function
|
|||
# NB! This breaks in plac on Python 2!!
|
||||
#from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from spacy.cli import download as cli_download
|
||||
from spacy.cli import link as cli_link
|
||||
from spacy.cli import info as cli_info
|
||||
from spacy.cli import package as cli_package
|
||||
from spacy.cli import train as cli_train
|
||||
from spacy.cli import model as cli_model
|
||||
from spacy.cli import convert as cli_convert
|
||||
|
||||
|
||||
class CLI(object):
|
||||
"""
|
||||
Command-line interface for spaCy
|
||||
"""
|
||||
commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')
|
||||
|
||||
@plac.annotations(
|
||||
model=("model to download (shortcut or model name)", "positional", None, str),
|
||||
direct=("force direct download. Needs model name with version and won't "
|
||||
"perform compatibility check", "flag", "d", bool)
|
||||
)
|
||||
def download(self, model, direct=False):
|
||||
"""
|
||||
Download compatible model from default download path using pip. Model
|
||||
can be shortcut, model name or, if --direct flag is set, full model name
|
||||
with version.
|
||||
"""
|
||||
cli_download(model, direct)
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
origin=("package name or local path to model", "positional", None, str),
|
||||
link_name=("name of shortuct link to create", "positional", None, str),
|
||||
force=("force overwriting of existing link", "flag", "f", bool)
|
||||
)
|
||||
def link(self, origin, link_name, force=False):
|
||||
"""
|
||||
Create a symlink for models within the spacy/data directory. Accepts
|
||||
either the name of a pip package, or the local path to the model data
|
||||
directory. Linking models allows loading them via spacy.load(link_name).
|
||||
"""
|
||||
cli_link(origin, link_name, force)
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("optional: shortcut link of model", "positional", None, str),
|
||||
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
|
||||
)
|
||||
def info(self, model=None, markdown=False):
|
||||
"""
|
||||
Print info about spaCy installation. If a model shortcut link is
|
||||
speficied as an argument, print model information. Flag --markdown
|
||||
prints details in Markdown for easy copy-pasting to GitHub issues.
|
||||
"""
|
||||
cli_info(model, markdown)
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
input_dir=("directory with model data", "positional", None, str),
|
||||
output_dir=("output parent directory", "positional", None, str),
|
||||
meta=("path to meta.json", "option", "m", str),
|
||||
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
|
||||
)
|
||||
def package(self, input_dir, output_dir, meta=None, force=False):
|
||||
"""
|
||||
Generate Python package for model data, including meta and required
|
||||
installation files. A new directory will be created in the specified
|
||||
output directory, and model data will be copied over.
|
||||
"""
|
||||
cli_package(input_dir, output_dir, meta, force)
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
lang=("model language", "positional", None, str),
|
||||
output_dir=("output directory to store model in", "positional", None, str),
|
||||
train_data=("location of JSON-formatted training data", "positional", None, str),
|
||||
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
|
||||
n_iter=("number of iterations", "option", "n", int),
|
||||
nsents=("number of sentences", "option", None, int),
|
||||
parser_L1=("L1 regularization penalty for parser", "option", "L", float),
|
||||
use_gpu=("Use GPU", "flag", "g", bool),
|
||||
no_tagger=("Don't train tagger", "flag", "T", bool),
|
||||
no_parser=("Don't train parser", "flag", "P", bool),
|
||||
no_ner=("Don't train NER", "flag", "N", bool)
|
||||
)
|
||||
def train(self, lang, output_dir, train_data, dev_data=None, n_iter=15,
|
||||
nsents=0, parser_L1=0.0, use_gpu=False,
|
||||
no_tagger=False, no_parser=False, no_ner=False):
|
||||
"""
|
||||
Train a model. Expects data in spaCy's JSON format.
|
||||
"""
|
||||
nsents = nsents or None
|
||||
cli_train(lang, output_dir, train_data, dev_data, n_iter, nsents,
|
||||
use_gpu, not no_tagger, not no_parser, not no_ner, parser_L1)
|
||||
|
||||
@plac.annotations(
|
||||
lang=("model language", "positional", None, str),
|
||||
model_dir=("output directory to store model in", "positional", None, str),
|
||||
freqs_data=("tab-separated frequencies file", "positional", None, str),
|
||||
clusters_data=("Brown clusters file", "positional", None, str),
|
||||
vectors_data=("word vectors file", "positional", None, str)
|
||||
)
|
||||
def model(self, lang, model_dir, freqs_data, clusters_data=None, vectors_data=None):
|
||||
"""
|
||||
Initialize a new model and its data directory.
|
||||
"""
|
||||
cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
|
||||
|
||||
@plac.annotations(
|
||||
input_file=("input file", "positional", None, str),
|
||||
output_dir=("output directory for converted file", "positional", None, str),
|
||||
n_sents=("Number of sentences per doc", "option", "n", float),
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool)
|
||||
)
|
||||
def convert(self, input_file, output_dir, n_sents=10, morphology=False):
|
||||
"""
|
||||
Convert files into JSON format for use with train command and other
|
||||
experiment management functions.
|
||||
"""
|
||||
cli_convert(input_file, output_dir, n_sents, morphology)
|
||||
|
||||
|
||||
def __missing__(self, name):
|
||||
print("\n Command %r does not exist."
|
||||
"\n Use the --help flag for a list of available commands.\n" % name)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import plac
|
||||
import sys
|
||||
sys.argv[0] = 'spacy'
|
||||
plac.Interpreter.call(CLI)
|
||||
from spacy.cli import download, link, info, package, train, convert
|
||||
from spacy.util import prints
|
||||
|
||||
commands = {'download': download, 'link': link, 'info': info, 'train': train,
|
||||
'convert': convert, 'package': package}
|
||||
if len(sys.argv) == 1:
|
||||
prints(', '.join(commands), title="Available commands", exits=1)
|
||||
command = sys.argv.pop(1)
|
||||
sys.argv[0] = 'spacy %s' % command
|
||||
if command in commands:
|
||||
plac.call(commands[command])
|
||||
else:
|
||||
prints("Available: %s" % ', '.join(commands),
|
||||
title="Unknown command: %s" % command, exits=1)
|
||||
|
|
134
spacy/_ml.py
134
spacy/_ml.py
|
@ -1,3 +1,4 @@
|
|||
import ujson
|
||||
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
||||
from thinc.neural import Model, Maxout, Softmax, Affine
|
||||
from thinc.neural._classes.hash_embed import HashEmbed
|
||||
|
@ -7,24 +8,33 @@ from thinc.neural._classes.convolution import ExtractWindow
|
|||
from thinc.neural._classes.static_vectors import StaticVectors
|
||||
from thinc.neural._classes.batchnorm import BatchNorm
|
||||
from thinc.neural._classes.resnet import Residual
|
||||
from thinc.neural import ReLu
|
||||
from thinc import describe
|
||||
from thinc.describe import Dimension, Synapses, Biases, Gradient
|
||||
from thinc.neural._classes.affine import _set_dimensions_if_needed
|
||||
|
||||
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP
|
||||
from .attrs import ID, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP
|
||||
from .tokens.doc import Doc
|
||||
|
||||
import numpy
|
||||
import io
|
||||
|
||||
|
||||
def _init_for_precomputed(W, ops):
|
||||
if (W**2).sum() != 0.:
|
||||
return
|
||||
reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
|
||||
ops.xavier_uniform_init(reshaped)
|
||||
W[:] = reshaped.reshape(W.shape)
|
||||
|
||||
@describe.on_data(_set_dimensions_if_needed)
|
||||
@describe.attributes(
|
||||
nI=Dimension("Input size"),
|
||||
nF=Dimension("Number of features"),
|
||||
nO=Dimension("Output size"),
|
||||
W=Synapses("Weights matrix",
|
||||
lambda obj: (obj.nO, obj.nF, obj.nI),
|
||||
lambda W, ops: ops.xavier_uniform_init(W)),
|
||||
lambda obj: (obj.nF, obj.nO, obj.nI),
|
||||
lambda W, ops: _init_for_precomputed(W, ops)),
|
||||
b=Biases("Bias vector",
|
||||
lambda obj: (obj.nO,)),
|
||||
d_W=Gradient("W"),
|
||||
|
@ -39,25 +49,25 @@ class PrecomputableAffine(Model):
|
|||
|
||||
def begin_update(self, X, drop=0.):
|
||||
# X: (b, i)
|
||||
# Xf: (b, f, i)
|
||||
# Yf: (b, f, i)
|
||||
# dY: (b, o)
|
||||
# dYf: (b, f, o)
|
||||
#Yf = numpy.einsum('bi,ofi->bfo', X, self.W)
|
||||
#Yf = numpy.einsum('bi,foi->bfo', X, self.W)
|
||||
Yf = self.ops.xp.tensordot(
|
||||
X, self.W, axes=[[1], [2]]).transpose((0, 2, 1))
|
||||
X, self.W, axes=[[1], [2]])
|
||||
Yf += self.b
|
||||
def backward(dY_ids, sgd=None):
|
||||
tensordot = self.ops.xp.tensordot
|
||||
dY, ids = dY_ids
|
||||
Xf = X[ids]
|
||||
|
||||
#dXf = numpy.einsum('bo,foi->bfi', dY, self.W)
|
||||
dXf = tensordot(dY, self.W, axes=[[1], [1]])
|
||||
#dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
|
||||
dW = self.ops.xp.tensordot(dY, Xf, axes=[[0], [0]])
|
||||
db = dY.sum(axis=0)
|
||||
#dXf = numpy.einsum('bo,ofi->bfi', dY, self.W)
|
||||
dXf = self.ops.xp.tensordot(dY, self.W, axes=[[1], [0]])
|
||||
|
||||
self.d_W += dW
|
||||
self.d_b += db
|
||||
dW = tensordot(dY, Xf, axes=[[0], [0]])
|
||||
# ofi -> foi
|
||||
self.d_W += dW.transpose((1, 0, 2))
|
||||
self.d_b += dY.sum(axis=0)
|
||||
|
||||
if sgd is not None:
|
||||
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
||||
|
@ -80,10 +90,10 @@ class PrecomputableAffine(Model):
|
|||
d_b=Gradient("b")
|
||||
)
|
||||
class PrecomputableMaxouts(Model):
|
||||
def __init__(self, nO=None, nI=None, nF=None, pieces=3, **kwargs):
|
||||
def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs):
|
||||
Model.__init__(self, **kwargs)
|
||||
self.nO = nO
|
||||
self.nP = pieces
|
||||
self.nP = nP
|
||||
self.nI = nI
|
||||
self.nF = nF
|
||||
|
||||
|
@ -121,37 +131,103 @@ class PrecomputableMaxouts(Model):
|
|||
return Yfp, backward
|
||||
|
||||
def Tok2Vec(width, embed_size, preprocess=None):
|
||||
cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE]
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
|
||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
|
||||
lower = get_col(cols.index(LOWER)) >> HashEmbed(width, embed_size)
|
||||
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2)
|
||||
suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2)
|
||||
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2)
|
||||
norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
|
||||
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
|
||||
suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
|
||||
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')
|
||||
|
||||
embed = (norm | prefix | suffix | shape )
|
||||
tok2vec = (
|
||||
flatten
|
||||
>> (lower | prefix | suffix | shape )
|
||||
with_flatten(
|
||||
asarray(Model.ops, dtype='uint64')
|
||||
>> embed
|
||||
>> Maxout(width, width*4, pieces=3)
|
||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
|
||||
>> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)),
|
||||
pad=4)
|
||||
)
|
||||
if preprocess not in (False, None):
|
||||
tok2vec = preprocess >> tok2vec
|
||||
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
|
||||
tok2vec.nO = width
|
||||
tok2vec.embed = embed
|
||||
return tok2vec
|
||||
|
||||
|
||||
def get_col(idx):
|
||||
def asarray(ops, dtype):
|
||||
def forward(X, drop=0.):
|
||||
return ops.asarray(X, dtype=dtype), None
|
||||
return layerize(forward)
|
||||
|
||||
|
||||
def foreach(layer):
|
||||
def forward(Xs, drop=0.):
|
||||
results = []
|
||||
backprops = []
|
||||
for X in Xs:
|
||||
result, bp = layer.begin_update(X, drop=drop)
|
||||
results.append(result)
|
||||
backprops.append(bp)
|
||||
def backward(d_results, sgd=None):
|
||||
dXs = []
|
||||
for d_result, backprop in zip(d_results, backprops):
|
||||
dXs.append(backprop(d_result, sgd))
|
||||
return dXs
|
||||
return results, backward
|
||||
model = layerize(forward)
|
||||
model._layers.append(layer)
|
||||
return model
|
||||
|
||||
|
||||
def rebatch(size, layer):
|
||||
ops = layer.ops
|
||||
def forward(X, drop=0.):
|
||||
if X.shape[0] < size:
|
||||
return layer.begin_update(X)
|
||||
parts = _divide_array(X, size)
|
||||
results, bp_results = zip(*[layer.begin_update(p, drop=drop)
|
||||
for p in parts])
|
||||
y = ops.flatten(results)
|
||||
def backward(dy, sgd=None):
|
||||
d_parts = [bp(y, sgd=sgd) for bp, y in
|
||||
zip(bp_results, _divide_array(dy, size))]
|
||||
try:
|
||||
dX = ops.flatten(d_parts)
|
||||
except TypeError:
|
||||
dX = None
|
||||
except ValueError:
|
||||
dX = None
|
||||
return dX
|
||||
return y, backward
|
||||
model = layerize(forward)
|
||||
model._layers.append(layer)
|
||||
return model
|
||||
|
||||
|
||||
def _divide_array(X, size):
|
||||
parts = []
|
||||
index = 0
|
||||
while index < len(X):
|
||||
parts.append(X[index : index + size])
|
||||
index += size
|
||||
return parts
|
||||
|
||||
|
||||
def get_col(idx):
|
||||
assert idx >= 0, idx
|
||||
def forward(X, drop=0.):
|
||||
assert idx >= 0, idx
|
||||
if isinstance(X, numpy.ndarray):
|
||||
ops = NumpyOps()
|
||||
else:
|
||||
ops = CupyOps()
|
||||
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
|
||||
def backward(y, sgd=None):
|
||||
assert idx >= 0, idx
|
||||
dX = ops.allocate(X.shape)
|
||||
dX[:, idx] += y
|
||||
return dX
|
||||
|
@ -167,21 +243,17 @@ def zero_init(model):
|
|||
|
||||
|
||||
def doc2feats(cols=None):
|
||||
cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE]
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE]
|
||||
def forward(docs, drop=0.):
|
||||
feats = []
|
||||
for doc in docs:
|
||||
if 'cached_feats' not in doc.user_data:
|
||||
doc.user_data['cached_feats'] = model.ops.asarray(
|
||||
doc.to_array(cols),
|
||||
dtype='uint64')
|
||||
feats.append(doc.user_data['cached_feats'])
|
||||
assert feats[-1].dtype == 'uint64'
|
||||
feats.append(doc.to_array(cols))
|
||||
return feats, None
|
||||
model = layerize(forward)
|
||||
model.cols = cols
|
||||
return model
|
||||
|
||||
|
||||
def print_shape(prefix):
|
||||
def forward(X, drop=0.):
|
||||
return X, lambda dX, **kwargs: dX
|
||||
|
|
|
@ -2,16 +2,16 @@
|
|||
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
|
||||
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
||||
|
||||
__title__ = 'spacy'
|
||||
__version__ = '1.8.2'
|
||||
__title__ = 'spacy-nightly'
|
||||
__version__ = '2.0.0a1'
|
||||
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
|
||||
__uri__ = 'https://spacy.io'
|
||||
__author__ = 'Matthew Honnibal'
|
||||
__email__ = 'matt@explosion.ai'
|
||||
__author__ = 'Explosion AI'
|
||||
__email__ = 'contact@explosion.ai'
|
||||
__license__ = 'MIT'
|
||||
|
||||
__docs_models__ = 'https://spacy.io/docs/usage/models'
|
||||
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
|
||||
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
|
||||
__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'
|
||||
__model_files__ = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
|
||||
__model_files__ = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/develop/templates/model/'
|
||||
|
|
|
@ -83,6 +83,7 @@ cpdef enum attr_id_t:
|
|||
ENT_IOB
|
||||
ENT_TYPE
|
||||
HEAD
|
||||
SENT_START
|
||||
SPACY
|
||||
PROB
|
||||
|
||||
|
|
|
@ -85,6 +85,7 @@ IDS = {
|
|||
"ENT_IOB": ENT_IOB,
|
||||
"ENT_TYPE": ENT_TYPE,
|
||||
"HEAD": HEAD,
|
||||
"SENT_START": SENT_START,
|
||||
"SPACY": SPACY,
|
||||
"PROB": PROB,
|
||||
"LANG": LANG,
|
||||
|
@ -149,6 +150,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
|||
else:
|
||||
int_key = IDS[name.upper()]
|
||||
if strings_map is not None and isinstance(value, basestring):
|
||||
if hasattr(strings_map, 'add'):
|
||||
value = strings_map.add(value)
|
||||
else:
|
||||
value = strings_map[value]
|
||||
inty_attrs[int_key] = value
|
||||
return inty_attrs
|
||||
|
|
|
@ -2,6 +2,5 @@ from .download import download
|
|||
from .info import info
|
||||
from .link import link
|
||||
from .package import package
|
||||
from .train import train, train_config
|
||||
from .model import model
|
||||
from .train import train
|
||||
from .convert import convert
|
||||
|
|
|
@ -1,31 +1,43 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
|
||||
from .converters import conllu2json
|
||||
from .converters import conllu2json, iob2json
|
||||
from ..util import prints
|
||||
|
||||
|
||||
# Converters are matched by file extension. To add a converter, add a new entry
|
||||
# to this dict with the file extension mapped to the converter function imported
|
||||
# from /converters.
|
||||
|
||||
CONVERTERS = {
|
||||
'.conllu': conllu2json,
|
||||
'.conll': conllu2json
|
||||
'.conll': conllu2json,
|
||||
'.iob': iob2json
|
||||
}
|
||||
|
||||
|
||||
def convert(input_file, output_dir, *args):
|
||||
@plac.annotations(
|
||||
input_file=("input file", "positional", None, str),
|
||||
output_dir=("output directory for converted file", "positional", None, str),
|
||||
n_sents=("Number of sentences per doc", "option", "n", float),
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool)
|
||||
)
|
||||
def convert(cmd, input_file, output_dir, n_sents, morphology):
|
||||
"""
|
||||
Convert files into JSON format for use with train command and other
|
||||
experiment management functions.
|
||||
"""
|
||||
input_path = Path(input_file)
|
||||
output_path = Path(output_dir)
|
||||
if not input_path.exists():
|
||||
prints(input_path, title="Input file not found", exits=True)
|
||||
prints(input_path, title="Input file not found", exits=1)
|
||||
if not output_path.exists():
|
||||
prints(output_path, title="Output directory not found", exits=True)
|
||||
prints(output_path, title="Output directory not found", exits=1)
|
||||
file_ext = input_path.suffix
|
||||
if not file_ext in CONVERTERS:
|
||||
prints("Can't find converter for %s" % input_path.parts[-1],
|
||||
title="Unknown format", exits=True)
|
||||
CONVERTERS[file_ext](input_path, output_path, *args)
|
||||
title="Unknown format", exits=1)
|
||||
CONVERTERS[file_ext](input_path, output_path,
|
||||
n_sents=n_sents, use_morphology=morphology)
|
||||
|
|
|
@ -1 +1,2 @@
|
|||
from .conllu2json import conllu2json
|
||||
from .iob2json import iob2json
|
||||
|
|
|
@ -73,10 +73,10 @@ def generate_sentence(sent):
|
|||
tokens = []
|
||||
for i, id in enumerate(id_):
|
||||
token = {}
|
||||
token["orth"] = word[id]
|
||||
token["tag"] = tag[id]
|
||||
token["head"] = head[id] - i
|
||||
token["dep"] = dep[id]
|
||||
token["orth"] = word[i]
|
||||
token["tag"] = tag[i]
|
||||
token["head"] = head[i] - id
|
||||
token["dep"] = dep[i]
|
||||
tokens.append(token)
|
||||
sentence["tokens"] = tokens
|
||||
return sentence
|
||||
|
|
45
spacy/cli/converters/iob2json.py
Normal file
45
spacy/cli/converters/iob2json.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
from ...gold import iob_to_biluo
|
||||
|
||||
|
||||
def iob2json(input_path, output_path, n_sents=10, *a, **k):
|
||||
"""
|
||||
Convert IOB files into JSON format for use with train cli.
|
||||
"""
|
||||
# TODO: This isn't complete yet -- need to map from IOB to
|
||||
# BILUO
|
||||
with input_path.open('r', encoding='utf8') as file_:
|
||||
docs = read_iob(file_)
|
||||
|
||||
output_filename = input_path.parts[-1].replace(".iob", ".json")
|
||||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
f.write(json_dumps(docs))
|
||||
prints("Created %d documents" % len(docs),
|
||||
title="Generated output file %s" % path2str(output_file))
|
||||
|
||||
|
||||
def read_iob(file_):
|
||||
sentences = []
|
||||
for line in file_:
|
||||
if not line.strip():
|
||||
continue
|
||||
tokens = [t.split('|') for t in line.split()]
|
||||
if len(tokens[0]) == 3:
|
||||
words, pos, iob = zip(*tokens)
|
||||
else:
|
||||
words, iob = zip(*tokens)
|
||||
pos = ['-'] * len(words)
|
||||
biluo = iob_to_biluo(iob)
|
||||
sentences.append([
|
||||
{'orth': w, 'tag': p, 'ner': ent}
|
||||
for (w, p, ent) in zip(words, pos, biluo)
|
||||
])
|
||||
sentences = [{'tokens': sent} for sent in sentences]
|
||||
paragraphs = [{'sentences': [sent]} for sent in sentences]
|
||||
docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
|
||||
return docs
|
|
@ -1,6 +1,7 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import requests
|
||||
import os
|
||||
import subprocess
|
||||
|
@ -11,7 +12,17 @@ from ..util import prints
|
|||
from .. import about
|
||||
|
||||
|
||||
def download(model, direct=False):
|
||||
@plac.annotations(
|
||||
model=("model to download (shortcut or model name)", "positional", None, str),
|
||||
direct=("force direct download. Needs model name with version and won't "
|
||||
"perform compatibility check", "flag", "d", bool)
|
||||
)
|
||||
def download(cmd, model, direct=False):
|
||||
"""
|
||||
Download compatible model from default download path using pip. Model
|
||||
can be shortcut, model name or, if --direct flag is set, full model name
|
||||
with version.
|
||||
"""
|
||||
if direct:
|
||||
download_model('{m}/{m}.tar.gz'.format(m=model))
|
||||
else:
|
||||
|
@ -20,7 +31,17 @@ def download(model, direct=False):
|
|||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
||||
link(model_name, model, force=True)
|
||||
try:
|
||||
link(None, model_name, model, force=True)
|
||||
except:
|
||||
# Dirty, but since spacy.download and the auto-linking is mostly
|
||||
# a convenience wrapper, it's best to show a success message and
|
||||
# loading instructions, even if linking fails.
|
||||
prints("Creating a shortcut link for 'en' didn't work (maybe you "
|
||||
"don't have admin permissions?), but you can still load "
|
||||
"the model via its full package name:",
|
||||
"nlp = spacy.load('%s')" % model_name,
|
||||
title="Download successful")
|
||||
|
||||
|
||||
def get_json(url, desc):
|
||||
|
@ -28,7 +49,7 @@ def get_json(url, desc):
|
|||
if r.status_code != 200:
|
||||
prints("Couldn't fetch %s. Please find a model for your spaCy installation "
|
||||
"(v%s), and download it manually." % (desc, about.__version__),
|
||||
about.__docs_models__, title="Server error (%d)" % r.status_code, exits=True)
|
||||
about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1)
|
||||
return r.json()
|
||||
|
||||
|
||||
|
@ -38,7 +59,7 @@ def get_compatibility():
|
|||
comp = comp_table['spacy']
|
||||
if version not in comp:
|
||||
prints("No compatible models found for v%s of spaCy." % version,
|
||||
title="Compatibility error", exits=True)
|
||||
title="Compatibility error", exits=1)
|
||||
return comp[version]
|
||||
|
||||
|
||||
|
@ -46,7 +67,7 @@ def get_version(model, comp):
|
|||
if model not in comp:
|
||||
version = about.__version__
|
||||
prints("No compatible model found for '%s' (spaCy v%s)." % (model, version),
|
||||
title="Compatibility error", exits=True)
|
||||
title="Compatibility error", exits=1)
|
||||
return comp[model][0]
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import platform
|
||||
from pathlib import Path
|
||||
|
||||
|
@ -9,17 +10,30 @@ from .. import about
|
|||
from .. import util
|
||||
|
||||
|
||||
def info(model=None, markdown=False):
|
||||
@plac.annotations(
|
||||
model=("optional: shortcut link of model", "positional", None, str),
|
||||
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
|
||||
)
|
||||
def info(cmd, model=None, markdown=False):
|
||||
"""Print info about spaCy installation. If a model shortcut link is
|
||||
speficied as an argument, print model information. Flag --markdown
|
||||
prints details in Markdown for easy copy-pasting to GitHub issues.
|
||||
"""
|
||||
if model:
|
||||
data_path = util.get_data_path()
|
||||
data = util.parse_package_meta(data_path / model, require=True)
|
||||
model_path = Path(__file__).parent / data_path / model
|
||||
if model_path.resolve() != model_path:
|
||||
data['link'] = path2str(model_path)
|
||||
data['source'] = path2str(model_path.resolve())
|
||||
if util.is_package(model):
|
||||
model_path = util.get_package_path(model)
|
||||
else:
|
||||
data['source'] = path2str(model_path)
|
||||
print_info(data, 'model %s' % model, markdown)
|
||||
model_path = util.get_data_path() / model
|
||||
meta_path = model_path / 'meta.json'
|
||||
if not meta_path.is_file():
|
||||
util.prints(meta_path, title="Can't find model meta.json", exits=1)
|
||||
meta = util.read_json(meta_path)
|
||||
if model_path.resolve() != model_path:
|
||||
meta['link'] = path2str(model_path)
|
||||
meta['source'] = path2str(model_path.resolve())
|
||||
else:
|
||||
meta['source'] = path2str(model_path)
|
||||
print_info(meta, 'model %s' % model, markdown)
|
||||
else:
|
||||
data = {'spaCy version': about.__version__,
|
||||
'Location': path2str(Path(__file__).parent.parent),
|
||||
|
|
|
@ -1,24 +1,36 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
|
||||
from ..compat import symlink_to, path2str
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
|
||||
|
||||
def link(origin, link_name, force=False):
|
||||
@plac.annotations(
|
||||
origin=("package name or local path to model", "positional", None, str),
|
||||
link_name=("name of shortuct link to create", "positional", None, str),
|
||||
force=("force overwriting of existing link", "flag", "f", bool)
|
||||
)
|
||||
def link(cmd, origin, link_name, force=False):
|
||||
"""
|
||||
Create a symlink for models within the spacy/data directory. Accepts
|
||||
either the name of a pip package, or the local path to the model data
|
||||
directory. Linking models allows loading them via spacy.load(link_name).
|
||||
"""
|
||||
if util.is_package(origin):
|
||||
model_path = util.get_model_package_path(origin)
|
||||
model_path = util.get_package_path(origin)
|
||||
else:
|
||||
model_path = Path(origin)
|
||||
if not model_path.exists():
|
||||
prints("The data should be located in %s" % path2str(model_path),
|
||||
title="Can't locate model data", exits=True)
|
||||
title="Can't locate model data", exits=1)
|
||||
link_path = util.get_data_path() / link_name
|
||||
if link_path.exists() and not force:
|
||||
prints("To overwrite an existing link, use the --force flag.",
|
||||
title="Link %s already exists" % link_name, exits=True)
|
||||
title="Link %s already exists" % link_name, exits=1)
|
||||
elif link_path.exists():
|
||||
link_path.unlink()
|
||||
try:
|
||||
|
@ -33,5 +45,5 @@ def link(origin, link_name, force=False):
|
|||
title="Error: Couldn't link model to '%s'" % link_name)
|
||||
raise
|
||||
prints("%s --> %s" % (path2str(model_path), path2str(link_path)),
|
||||
"You can now load the model via spacy.load('%s')." % link_name,
|
||||
"You can now load the model via spacy.load('%s')" % link_name,
|
||||
title="Linking successful")
|
||||
|
|
|
@ -1,122 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import gzip
|
||||
import math
|
||||
from ast import literal_eval
|
||||
from preshed.counter import PreshCounter
|
||||
|
||||
from ..vocab import write_binary_vectors
|
||||
from ..compat import fix_text, path2str
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
|
||||
|
||||
def model(lang, model_dir, freqs_data, clusters_data, vectors_data):
|
||||
model_path = util.ensure_path(model_dir)
|
||||
freqs_path = util.ensure_path(freqs_data)
|
||||
clusters_path = util.ensure_path(clusters_data)
|
||||
vectors_path = util.ensure_path(vectors_data)
|
||||
if not freqs_path.is_file():
|
||||
prints(freqs_path, title="No frequencies file found", exits=True)
|
||||
if clusters_path and not clusters_path.is_file():
|
||||
prints(clusters_path, title="No Brown clusters file found", exits=True)
|
||||
if vectors_path and not vectors_path.is_file():
|
||||
prints(vectors_path, title="No word vectors file found", exits=True)
|
||||
vocab = util.get_lang_class(lang).Defaults.create_vocab()
|
||||
probs, oov_prob = read_probs(freqs_path)
|
||||
clusters = read_clusters(clusters_path) if clusters_path else {}
|
||||
populate_vocab(vocab, clusters, probs, oov_prob)
|
||||
create_model(model_path, vectors_path, vocab, oov_prob)
|
||||
|
||||
|
||||
def create_model(model_path, vectors_path, vocab, oov_prob):
|
||||
vocab_path = model_path / 'vocab'
|
||||
lexemes_path = vocab_path / 'lexemes.bin'
|
||||
strings_path = vocab_path / 'strings.json'
|
||||
oov_path = vocab_path / 'oov_prob'
|
||||
|
||||
if not model_path.exists():
|
||||
model_path.mkdir()
|
||||
if not vocab_path.exists():
|
||||
vocab_path.mkdir()
|
||||
vocab.dump(path2str(lexemes_path))
|
||||
with strings_path.open('w') as f:
|
||||
vocab.strings.dump(f)
|
||||
with oov_path.open('w') as f:
|
||||
f.write('%f' % oov_prob)
|
||||
if vectors_path:
|
||||
vectors_dest = vocab_path / 'vec.bin'
|
||||
write_binary_vectors(path2str(vectors_path), path2str(vectors_dest))
|
||||
|
||||
|
||||
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
|
||||
counts = PreshCounter()
|
||||
total = 0
|
||||
freqs_file = check_unzip(freqs_path)
|
||||
for i, line in enumerate(freqs_file):
|
||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
||||
freq = int(freq)
|
||||
counts.inc(i+1, freq)
|
||||
total += freq
|
||||
counts.smooth()
|
||||
log_total = math.log(total)
|
||||
freqs_file = check_unzip(freqs_path)
|
||||
probs = {}
|
||||
for line in freqs_file:
|
||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
||||
doc_freq = int(doc_freq)
|
||||
freq = int(freq)
|
||||
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
|
||||
word = literal_eval(key)
|
||||
smooth_count = counts.smoother(int(freq))
|
||||
probs[word] = math.log(smooth_count) - log_total
|
||||
oov_prob = math.log(counts.smoother(0)) - log_total
|
||||
return probs, oov_prob
|
||||
|
||||
|
||||
def read_clusters(clusters_path):
|
||||
clusters = {}
|
||||
with clusters_path.open() as f:
|
||||
for line in f:
|
||||
try:
|
||||
cluster, word, freq = line.split()
|
||||
word = fix_text(word)
|
||||
except ValueError:
|
||||
continue
|
||||
# If the clusterer has only seen the word a few times, its
|
||||
# cluster is unreliable.
|
||||
if int(freq) >= 3:
|
||||
clusters[word] = cluster
|
||||
else:
|
||||
clusters[word] = '0'
|
||||
# Expand clusters with re-casing
|
||||
for word, cluster in list(clusters.items()):
|
||||
if word.lower() not in clusters:
|
||||
clusters[word.lower()] = cluster
|
||||
if word.title() not in clusters:
|
||||
clusters[word.title()] = cluster
|
||||
if word.upper() not in clusters:
|
||||
clusters[word.upper()] = cluster
|
||||
return clusters
|
||||
|
||||
|
||||
def populate_vocab(vocab, clusters, probs, oov_prob):
|
||||
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
|
||||
lexeme = vocab[word]
|
||||
lexeme.prob = prob
|
||||
lexeme.is_oov = False
|
||||
# Decode as a little-endian string, so that we can do & 15 to get
|
||||
# the first 4 bits. See _parse_features.pyx
|
||||
if word in clusters:
|
||||
lexeme.cluster = int(clusters[word][::-1], 2)
|
||||
else:
|
||||
lexeme.cluster = 0
|
||||
|
||||
|
||||
def check_unzip(file_path):
|
||||
file_path_str = path2str(file_path)
|
||||
if file_path_str.endswith('gz'):
|
||||
return gzip.open(file_path_str)
|
||||
else:
|
||||
return file_path.open()
|
|
@ -1,6 +1,7 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import shutil
|
||||
import requests
|
||||
from pathlib import Path
|
||||
|
@ -11,27 +12,38 @@ from .. import util
|
|||
from .. import about
|
||||
|
||||
|
||||
def package(input_dir, output_dir, meta_path, force):
|
||||
@plac.annotations(
|
||||
input_dir=("directory with model data", "positional", None, str),
|
||||
output_dir=("output parent directory", "positional", None, str),
|
||||
meta=("path to meta.json", "option", "m", str),
|
||||
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
|
||||
)
|
||||
def package(cmd, input_dir, output_dir, meta=None, force=False):
|
||||
"""
|
||||
Generate Python package for model data, including meta and required
|
||||
installation files. A new directory will be created in the specified
|
||||
output directory, and model data will be copied over.
|
||||
"""
|
||||
input_path = util.ensure_path(input_dir)
|
||||
output_path = util.ensure_path(output_dir)
|
||||
meta_path = util.ensure_path(meta_path)
|
||||
meta_path = util.ensure_path(meta)
|
||||
if not input_path or not input_path.exists():
|
||||
prints(input_path, title="Model directory not found", exits=True)
|
||||
prints(input_path, title="Model directory not found", exits=1)
|
||||
if not output_path or not output_path.exists():
|
||||
prints(output_path, title="Output directory not found", exits=True)
|
||||
prints(output_path, title="Output directory not found", exits=1)
|
||||
if meta_path and not meta_path.exists():
|
||||
prints(meta_path, title="meta.json not found", exits=True)
|
||||
prints(meta_path, title="meta.json not found", exits=1)
|
||||
|
||||
template_setup = get_template('setup.py')
|
||||
template_manifest = get_template('MANIFEST.in')
|
||||
template_init = get_template('en_model_name/__init__.py')
|
||||
template_init = get_template('xx_model_name/__init__.py')
|
||||
meta_path = meta_path or input_path / 'meta.json'
|
||||
if meta_path.is_file():
|
||||
prints(meta_path, title="Reading meta.json from file")
|
||||
meta = util.read_json(meta_path)
|
||||
else:
|
||||
meta = generate_meta()
|
||||
validate_meta(meta, ['lang', 'name', 'version'])
|
||||
meta = validate_meta(meta, ['lang', 'name', 'version'])
|
||||
|
||||
model_name = meta['lang'] + '_' + meta['name']
|
||||
model_name_v = model_name + '-' + meta['version']
|
||||
|
@ -55,7 +67,7 @@ def create_dirs(package_path, force):
|
|||
else:
|
||||
prints(package_path, "Please delete the directory and try again, or "
|
||||
"use the --force flag to overwrite existing directories.",
|
||||
title="Package directory already exists", exits=True)
|
||||
title="Package directory already exists", exits=1)
|
||||
Path.mkdir(package_path, parents=True)
|
||||
|
||||
|
||||
|
@ -68,31 +80,45 @@ def generate_meta():
|
|||
settings = [('lang', 'Model language', 'en'),
|
||||
('name', 'Model name', 'model'),
|
||||
('version', 'Model version', '0.0.0'),
|
||||
('spacy_version', 'Required spaCy version', '>=2.0.0,<3.0.0'),
|
||||
('spacy_version', 'Required spaCy version', '>=%s,<3.0.0' % about.__version__),
|
||||
('description', 'Model description', False),
|
||||
('author', 'Author', False),
|
||||
('email', 'Author email', False),
|
||||
('url', 'Author website', False),
|
||||
('license', 'License', 'CC BY-NC 3.0')]
|
||||
|
||||
prints("Enter the package settings for your model.", title="Generating meta.json")
|
||||
meta = {}
|
||||
for setting, desc, default in settings:
|
||||
response = util.get_raw_input(desc, default)
|
||||
meta[setting] = default if response == '' and default else response
|
||||
meta['pipeline'] = generate_pipeline()
|
||||
if about.__title__ != 'spacy':
|
||||
meta['parent_package'] = about.__title__
|
||||
return meta
|
||||
|
||||
|
||||
def generate_pipeline():
|
||||
prints("If set to 'True', the default pipeline is used. If set to 'False', "
|
||||
"the pipeline will be disabled. Components should be specified as a "
|
||||
"comma-separated list of component names, e.g. vectorizer, tagger, "
|
||||
"parser, ner. For more information, see the docs on processing pipelines.",
|
||||
title="Enter your model's pipeline components")
|
||||
pipeline = util.get_raw_input("Pipeline components", True)
|
||||
replace = {'True': True, 'False': False}
|
||||
return replace[pipeline] if pipeline in replace else pipeline.split(', ')
|
||||
|
||||
|
||||
def validate_meta(meta, keys):
|
||||
for key in keys:
|
||||
if key not in meta or meta[key] == '':
|
||||
prints("This setting is required to build your package.",
|
||||
title='No "%s" setting found in meta.json' % key, exits=True)
|
||||
title='No "%s" setting found in meta.json' % key, exits=1)
|
||||
return meta
|
||||
|
||||
|
||||
def get_template(filepath):
|
||||
r = requests.get(about.__model_files__ + filepath)
|
||||
if r.status_code != 200:
|
||||
prints("Couldn't fetch template files from GitHub.",
|
||||
title="Server error (%d)" % r.status_code, exits=True)
|
||||
title="Server error (%d)" % r.status_code, exits=1)
|
||||
return r.text
|
||||
|
|
|
@ -1,82 +1,124 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
import json
|
||||
from collections import defaultdict
|
||||
import cytoolz
|
||||
from pathlib import Path
|
||||
import dill
|
||||
import tqdm
|
||||
from thinc.neural.optimizers import linear_decay
|
||||
from timeit import default_timer as timer
|
||||
|
||||
from ..tokens.doc import Doc
|
||||
from ..scorer import Scorer
|
||||
from ..gold import GoldParse, merge_sents
|
||||
from ..gold import read_json_file as read_gold_json
|
||||
from ..gold import GoldCorpus, minibatch
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
from .. import displacy
|
||||
from ..compat import json_dumps
|
||||
|
||||
|
||||
def train(language, output_dir, train_data, dev_data, n_iter, n_sents,
|
||||
use_gpu, tagger, parser, ner, parser_L1):
|
||||
@plac.annotations(
|
||||
lang=("model language", "positional", None, str),
|
||||
output_dir=("output directory to store model in", "positional", None, str),
|
||||
train_data=("location of JSON-formatted training data", "positional", None, str),
|
||||
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
|
||||
n_iter=("number of iterations", "option", "n", int),
|
||||
n_sents=("number of sentences", "option", "ns", int),
|
||||
use_gpu=("Use GPU", "option", "g", int),
|
||||
resume=("Whether to resume training", "flag", "R", bool),
|
||||
no_tagger=("Don't train tagger", "flag", "T", bool),
|
||||
no_parser=("Don't train parser", "flag", "P", bool),
|
||||
no_entities=("Don't train NER", "flag", "N", bool)
|
||||
)
|
||||
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
||||
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False):
|
||||
"""
|
||||
Train a model. Expects data in spaCy's JSON format.
|
||||
"""
|
||||
util.set_env_log(True)
|
||||
n_sents = n_sents or None
|
||||
output_path = util.ensure_path(output_dir)
|
||||
train_path = util.ensure_path(train_data)
|
||||
dev_path = util.ensure_path(dev_data)
|
||||
if not output_path.exists():
|
||||
prints(output_path, title="Output directory not found", exits=True)
|
||||
output_path.mkdir()
|
||||
if not train_path.exists():
|
||||
prints(train_path, title="Training data not found", exits=True)
|
||||
prints(train_path, title="Training data not found", exits=1)
|
||||
if dev_path and not dev_path.exists():
|
||||
prints(dev_path, title="Development data not found", exits=True)
|
||||
prints(dev_path, title="Development data not found", exits=1)
|
||||
|
||||
lang = util.get_lang_class(language)
|
||||
parser_cfg = {
|
||||
'pseudoprojective': True,
|
||||
'L1': parser_L1,
|
||||
'n_iter': n_iter,
|
||||
'lang': language,
|
||||
'features': lang.Defaults.parser_features}
|
||||
entity_cfg = {
|
||||
'n_iter': n_iter,
|
||||
'lang': language,
|
||||
'features': lang.Defaults.entity_features}
|
||||
tagger_cfg = {
|
||||
'n_iter': n_iter,
|
||||
'lang': language,
|
||||
'features': lang.Defaults.tagger_features}
|
||||
gold_train = list(read_gold_json(train_path, limit=n_sents))
|
||||
gold_dev = list(read_gold_json(dev_path, limit=n_sents)) if dev_path else None
|
||||
lang_class = util.get_lang_class(lang)
|
||||
|
||||
train_model(lang, gold_train, gold_dev, output_path, n_iter, use_gpu=use_gpu)
|
||||
if gold_dev:
|
||||
scorer = evaluate(lang, gold_dev, output_path)
|
||||
print_results(scorer)
|
||||
pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
|
||||
if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
|
||||
if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies')
|
||||
if no_entities and 'entities' in pipeline: pipeline.remove('entities')
|
||||
|
||||
# Take dropout and batch size as generators of values -- dropout
|
||||
# starts high and decays sharply, to force the optimizer to explore.
|
||||
# Batch size starts at 1 and grows, so that we make updates quickly
|
||||
# at the beginning of training.
|
||||
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
|
||||
util.env_opt('dropout_to', 0.2),
|
||||
util.env_opt('dropout_decay', 0.0))
|
||||
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
|
||||
util.env_opt('batch_to', 64),
|
||||
util.env_opt('batch_compound', 1.001))
|
||||
|
||||
if resume:
|
||||
prints(output_path / 'model19.pickle', title="Resuming training")
|
||||
nlp = dill.load((output_path / 'model19.pickle').open('rb'))
|
||||
else:
|
||||
nlp = lang_class(pipeline=pipeline)
|
||||
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
||||
n_train_words = corpus.count_train()
|
||||
|
||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||
|
||||
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
|
||||
try:
|
||||
for i in range(n_iter):
|
||||
if resume:
|
||||
i += 20
|
||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||
train_docs = corpus.train_docs(nlp, projectivize=True,
|
||||
gold_preproc=False, max_length=0)
|
||||
losses = {}
|
||||
for batch in minibatch(train_docs, size=batch_sizes):
|
||||
docs, golds = zip(*batch)
|
||||
nlp.update(docs, golds, sgd=optimizer,
|
||||
drop=next(dropout_rates), losses=losses)
|
||||
pbar.update(sum(len(doc) for doc in docs))
|
||||
|
||||
with nlp.use_params(optimizer.averages):
|
||||
util.set_env_log(False)
|
||||
epoch_model_path = output_path / ('model%d' % i)
|
||||
nlp.to_disk(epoch_model_path)
|
||||
with (output_path / ('model%d.pickle' % i)).open('wb') as file_:
|
||||
dill.dump(nlp, file_, -1)
|
||||
nlp_loaded = lang_class(pipeline=pipeline)
|
||||
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
|
||||
scorer = nlp_loaded.evaluate(
|
||||
corpus.dev_docs(
|
||||
nlp_loaded,
|
||||
gold_preproc=False))
|
||||
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
|
||||
with acc_loc.open('w') as file_:
|
||||
file_.write(json_dumps(scorer.scores))
|
||||
util.set_env_log(True)
|
||||
print_progress(i, losses, scorer.scores)
|
||||
finally:
|
||||
print("Saving model...")
|
||||
with (output_path / 'model-final.pickle').open('wb') as file_:
|
||||
with nlp.use_params(optimizer.averages):
|
||||
dill.dump(nlp, file_, -1)
|
||||
|
||||
|
||||
def train_config(config):
|
||||
config_path = util.ensure_path(config)
|
||||
if not config_path.is_file():
|
||||
prints(config_path, title="Config file not found", exits=True)
|
||||
config = json.load(config_path)
|
||||
for setting in []:
|
||||
if setting not in config.keys():
|
||||
prints("%s not found in config file." % setting, title="Missing setting")
|
||||
|
||||
|
||||
def train_model(Language, train_data, dev_data, output_path, n_iter, **cfg):
|
||||
print("Itn.\tDep. Loss\tUAS\tNER F.\tTag %\tToken %")
|
||||
|
||||
nlp = Language(pipeline=['token_vectors', 'tags', 'dependencies'])
|
||||
dropout = util.env_opt('dropout', 0.0)
|
||||
# TODO: Get spaCy using Thinc's trainer and optimizer
|
||||
with nlp.begin_training(train_data, **cfg) as (trainer, optimizer):
|
||||
for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=True)):
|
||||
losses = defaultdict(float)
|
||||
to_render = []
|
||||
for i, (docs, golds) in enumerate(epoch):
|
||||
state = nlp.update(docs, golds, drop=dropout, sgd=optimizer)
|
||||
losses['dep_loss'] += state.get('parser_loss', 0.0)
|
||||
losses['tag_loss'] += state.get('tag_loss', 0.0)
|
||||
to_render.insert(0, nlp(docs[-1].text))
|
||||
def _render_parses(i, to_render):
|
||||
to_render[0].user_data['title'] = "Batch %d" % i
|
||||
with Path('/tmp/entities.html').open('w') as file_:
|
||||
html = displacy.render(to_render[:5], style='ent', page=True)
|
||||
|
@ -84,49 +126,27 @@ def train_model(Language, train_data, dev_data, output_path, n_iter, **cfg):
|
|||
with Path('/tmp/parses.html').open('w') as file_:
|
||||
html = displacy.render(to_render[:5], style='dep', page=True)
|
||||
file_.write(html)
|
||||
if dev_data:
|
||||
with nlp.use_params(optimizer.averages):
|
||||
dev_scores = trainer.evaluate(dev_data).scores
|
||||
else:
|
||||
dev_scores = defaultdict(float)
|
||||
print_progress(itn, losses, dev_scores)
|
||||
with (output_path / 'model.bin').open('wb') as file_:
|
||||
dill.dump(nlp, file_, -1)
|
||||
#nlp.to_disk(output_path, tokenizer=False)
|
||||
|
||||
|
||||
def evaluate(Language, gold_tuples, path):
|
||||
with (path / 'model.bin').open('rb') as file_:
|
||||
nlp = dill.load(file_)
|
||||
# TODO:
|
||||
# 1. This code is duplicate with spacy.train.Trainer.evaluate
|
||||
# 2. There's currently a semantic difference between pipe and
|
||||
# not pipe! It matters whether we batch the inputs. Must fix!
|
||||
all_docs = []
|
||||
all_golds = []
|
||||
for raw_text, paragraph_tuples in dev_sents:
|
||||
if gold_preproc:
|
||||
raw_text = None
|
||||
else:
|
||||
paragraph_tuples = merge_sents(paragraph_tuples)
|
||||
docs = self.make_docs(raw_text, paragraph_tuples)
|
||||
golds = self.make_golds(docs, paragraph_tuples)
|
||||
all_docs.extend(docs)
|
||||
all_golds.extend(golds)
|
||||
scorer = Scorer()
|
||||
for doc, gold in zip(self.nlp.pipe(all_docs), all_golds):
|
||||
scorer.score(doc, gold)
|
||||
return scorer
|
||||
|
||||
|
||||
def print_progress(itn, losses, dev_scores):
|
||||
# TODO: Fix!
|
||||
def print_progress(itn, losses, dev_scores, wps=0.0):
|
||||
scores = {}
|
||||
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', 'ents_f']:
|
||||
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
|
||||
'ents_p', 'ents_r', 'ents_f', 'wps']:
|
||||
scores[col] = 0.0
|
||||
scores.update(losses)
|
||||
scores['dep_loss'] = losses.get('parser', 0.0)
|
||||
scores['tag_loss'] = losses.get('tagger', 0.0)
|
||||
scores.update(dev_scores)
|
||||
tpl = '{:d}\t{dep_loss:.3f}\t{tag_loss:.3f}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
|
||||
scores['wps'] = wps
|
||||
tpl = '\t'.join((
|
||||
'{:d}',
|
||||
'{dep_loss:.3f}',
|
||||
'{uas:.3f}',
|
||||
'{ents_p:.3f}',
|
||||
'{ents_r:.3f}',
|
||||
'{ents_f:.3f}',
|
||||
'{tags_acc:.3f}',
|
||||
'{token_acc:.3f}',
|
||||
'{wps:.1f}'))
|
||||
print(tpl.format(itn, **scores))
|
||||
|
||||
|
||||
|
|
|
@ -6,6 +6,8 @@ import ftfy
|
|||
import sys
|
||||
import ujson
|
||||
|
||||
from thinc.neural.util import copy_array
|
||||
|
||||
try:
|
||||
import cPickle as pickle
|
||||
except ImportError:
|
||||
|
@ -32,6 +34,7 @@ copy_reg = copy_reg
|
|||
CudaStream = CudaStream
|
||||
cupy = cupy
|
||||
fix_text = ftfy.fix_text
|
||||
copy_array = copy_array
|
||||
|
||||
is_python2 = six.PY2
|
||||
is_python3 = six.PY3
|
||||
|
@ -56,6 +59,11 @@ elif is_python3:
|
|||
json_dumps = lambda data: ujson.dumps(data, indent=2)
|
||||
path2str = lambda path: str(path)
|
||||
|
||||
def getattr_(obj, name, *default):
|
||||
if is_python3 and isinstance(name, bytes):
|
||||
name = name.decode('utf8')
|
||||
return getattr(obj, name, *default)
|
||||
|
||||
|
||||
def symlink_to(orig, dest):
|
||||
if is_python2 and is_windows:
|
||||
|
@ -71,3 +79,16 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
|
|||
(windows == None or windows == is_windows) and
|
||||
(linux == None or linux == is_linux) and
|
||||
(osx == None or osx == is_osx))
|
||||
|
||||
|
||||
def normalize_string_keys(old):
|
||||
'''Given a dictionary, make sure keys are unicode strings, not bytes.'''
|
||||
new = {}
|
||||
for key, value in old.items():
|
||||
if isinstance(key, bytes_):
|
||||
new[key.decode('utf8')] = value
|
||||
else:
|
||||
new[key] = value
|
||||
return new
|
||||
|
||||
|
||||
|
|
|
@ -10,27 +10,28 @@ _html = {}
|
|||
IS_JUPYTER = is_in_jupyter()
|
||||
|
||||
|
||||
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, options={}):
|
||||
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
||||
options={}, manual=False):
|
||||
"""Render displaCy visualisation.
|
||||
|
||||
docs (list or Doc): Document(s) to visualise.
|
||||
style (unicode): Visualisation style, 'dep' or 'ent'.
|
||||
page (bool): Render markup as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
jupyter (bool): Experimental, use Jupyter's display() to output markup.
|
||||
jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
|
||||
options (dict): Visualiser-specific options, e.g. colors.
|
||||
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
|
||||
RETURNS (unicode): Rendered HTML markup.
|
||||
"""
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
if style == 'dep':
|
||||
renderer = DependencyRenderer(options=options)
|
||||
parsed = [parse_deps(doc, options) for doc in docs]
|
||||
elif style == 'ent':
|
||||
renderer = EntityRenderer(options=options)
|
||||
parsed = [parse_ents(doc, options) for doc in docs]
|
||||
else:
|
||||
factories = {'dep': (DependencyRenderer, parse_deps),
|
||||
'ent': (EntityRenderer, parse_ents)}
|
||||
if style not in factories:
|
||||
raise ValueError("Unknown style: %s" % style)
|
||||
if isinstance(docs, Doc) or isinstance(docs, dict):
|
||||
docs = [docs]
|
||||
renderer, converter = factories[style]
|
||||
renderer = renderer(options=options)
|
||||
parsed = [converter(doc, options) for doc in docs] if not manual else docs
|
||||
_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
|
||||
html = _html['parsed']
|
||||
if jupyter: # return HTML rendered by IPython display()
|
||||
|
@ -39,7 +40,8 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, opti
|
|||
return html
|
||||
|
||||
|
||||
def serve(docs, style='dep', page=True, minify=False, options={}, port=5000):
|
||||
def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
||||
port=5000):
|
||||
"""Serve displaCy visualisation.
|
||||
|
||||
docs (list or Doc): Document(s) to visualise.
|
||||
|
@ -47,13 +49,19 @@ def serve(docs, style='dep', page=True, minify=False, options={}, port=5000):
|
|||
page (bool): Render markup as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
options (dict): Visualiser-specific options, e.g. colors.
|
||||
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
|
||||
port (int): Port to serve visualisation.
|
||||
"""
|
||||
from wsgiref import simple_server
|
||||
render(docs, style=style, page=page, minify=minify, options=options)
|
||||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||
httpd = simple_server.make_server('0.0.0.0', port, app)
|
||||
prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
|
||||
try:
|
||||
httpd.serve_forever()
|
||||
except KeyboardInterrupt:
|
||||
prints("Shutting down server on port %d." % port)
|
||||
finally:
|
||||
httpd.server_close()
|
||||
|
||||
|
||||
def app(environ, start_response):
|
||||
|
@ -62,12 +70,13 @@ def app(environ, start_response):
|
|||
return [res]
|
||||
|
||||
|
||||
def parse_deps(doc, options={}):
|
||||
def parse_deps(orig_doc, options={}):
|
||||
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
||||
|
||||
doc (Doc): Document do parse.
|
||||
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
||||
"""
|
||||
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
|
||||
if options.get('collapse_punct', True):
|
||||
spans = []
|
||||
for word in doc[:-1]:
|
||||
|
|
|
@ -18,12 +18,11 @@ class DependencyRenderer(object):
|
|||
offset_x, color, bg, font)
|
||||
"""
|
||||
self.compact = options.get('compact', False)
|
||||
distance, arrow_width = (85, 8) if self.compact else (175, 10)
|
||||
self.word_spacing = options.get('word_spacing', 45)
|
||||
self.arrow_spacing = options.get('arrow_spacing', 20)
|
||||
self.arrow_width = options.get('arrow_width', arrow_width)
|
||||
self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
|
||||
self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
|
||||
self.arrow_stroke = options.get('arrow_stroke', 2)
|
||||
self.distance = options.get('distance', distance)
|
||||
self.distance = options.get('distance', 150 if self.compact else 175)
|
||||
self.offset_x = options.get('offset_x', 50)
|
||||
self.color = options.get('color', '#000000')
|
||||
self.bg = options.get('bg', '#ffffff')
|
||||
|
@ -99,6 +98,8 @@ class DependencyRenderer(object):
|
|||
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
|
||||
-self.arrow_spacing*(self.highest_level-level)/4)
|
||||
y_curve = self.offset_y-level*self.distance/2
|
||||
if self.compact:
|
||||
y_curve = self.offset_y-level*self.distance/6
|
||||
if y_curve == 0 and len(self.levels) > 5:
|
||||
y_curve = -self.distance
|
||||
arrowhead = self.get_arrowhead(direction, x_start, y, x_end)
|
||||
|
@ -175,7 +176,7 @@ class EntityRenderer(object):
|
|||
minify (bool): Minify HTML markup.
|
||||
RETURNS (unicode): Rendered HTML markup.
|
||||
"""
|
||||
rendered = [self.render_ents(p['text'], p['ents'], p['title']) for p in parsed]
|
||||
rendered = [self.render_ents(p['text'], p['ents'], p.get('title', None)) for p in parsed]
|
||||
if page:
|
||||
docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered])
|
||||
markup = TPL_PAGE.format(content=docs)
|
||||
|
|
|
@ -21,7 +21,7 @@ TPL_DEP_WORDS = """
|
|||
TPL_DEP_ARCS = """
|
||||
<g class="displacy-arrow">
|
||||
<path class="displacy-arc" id="arrow-{id}-{i}" stroke-width="{stroke}px" d="{arc}" fill="none" stroke="currentColor"/>
|
||||
<text dy="1.25em" style="font-size: 0.8em">
|
||||
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
|
||||
<textPath xlink:href="#arrow-{id}-{i}" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">{label}</textPath>
|
||||
</text>
|
||||
<path class="displacy-arrowhead" d="{head}" fill="currentColor"/>
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from .structs cimport TokenC
|
||||
from .typedefs cimport attr_t
|
||||
from .syntax.transition_system cimport Transition
|
||||
|
||||
|
||||
cdef struct GoldParseC:
|
||||
int* tags
|
||||
int* heads
|
||||
int* labels
|
||||
int* has_dep
|
||||
attr_t* labels
|
||||
int** brackets
|
||||
Transition* ner
|
||||
|
||||
|
@ -18,15 +20,15 @@ cdef class GoldParse:
|
|||
cdef GoldParseC c
|
||||
|
||||
cdef int length
|
||||
cdef readonly int loss
|
||||
cdef readonly list words
|
||||
cdef readonly list tags
|
||||
cdef readonly list heads
|
||||
cdef readonly list labels
|
||||
cdef readonly dict orths
|
||||
cdef readonly list ner
|
||||
cdef readonly list ents
|
||||
cdef readonly dict brackets
|
||||
cdef public int loss
|
||||
cdef public list words
|
||||
cdef public list tags
|
||||
cdef public list heads
|
||||
cdef public list labels
|
||||
cdef public dict orths
|
||||
cdef public list ner
|
||||
cdef public list ents
|
||||
cdef public dict brackets
|
||||
|
||||
cdef readonly list cand_to_gold
|
||||
cdef readonly list gold_to_cand
|
||||
|
|
263
spacy/gold.pyx
263
spacy/gold.pyx
|
@ -5,10 +5,13 @@ from __future__ import unicode_literals, print_function
|
|||
import io
|
||||
import re
|
||||
import ujson
|
||||
import random
|
||||
import cytoolz
|
||||
|
||||
from .syntax import nonproj
|
||||
from .util import ensure_path
|
||||
from . import util
|
||||
from .tokens import Doc
|
||||
|
||||
|
||||
def tags_to_entities(tags):
|
||||
|
@ -86,8 +89,8 @@ def _min_edit_path(cand_words, gold_words):
|
|||
# TODO: Fix this --- just do it properly, make the full edit matrix and
|
||||
# then walk back over it...
|
||||
# Preprocess inputs
|
||||
cand_words = [punct_re.sub('', w) for w in cand_words]
|
||||
gold_words = [punct_re.sub('', w) for w in gold_words]
|
||||
cand_words = [punct_re.sub('', w).lower() for w in cand_words]
|
||||
gold_words = [punct_re.sub('', w).lower() for w in gold_words]
|
||||
|
||||
if cand_words == gold_words:
|
||||
return 0, ''.join(['M' for _ in gold_words])
|
||||
|
@ -139,8 +142,164 @@ def _min_edit_path(cand_words, gold_words):
|
|||
return prev_costs[n_gold], previous_row[-1]
|
||||
|
||||
|
||||
def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
|
||||
make_supertags = util.env_opt('make_supertags', make_supertags)
|
||||
def minibatch(items, size=8):
|
||||
'''Iterate over batches of items. `size` may be an iterator,
|
||||
so that batch-size can vary on each step.
|
||||
'''
|
||||
items = iter(items)
|
||||
while True:
|
||||
batch_size = next(size) #if hasattr(size, '__next__') else size
|
||||
batch = list(cytoolz.take(int(batch_size), items))
|
||||
if len(batch) == 0:
|
||||
break
|
||||
yield list(batch)
|
||||
|
||||
|
||||
class GoldCorpus(object):
|
||||
"""An annotated corpus, using the JSON file format. Manages
|
||||
annotations for tagging, dependency parsing and NER."""
|
||||
def __init__(self, train_path, dev_path, gold_preproc=True, limit=None):
|
||||
"""Create a GoldCorpus.
|
||||
|
||||
train_path (unicode or Path): File or directory of training data.
|
||||
dev_path (unicode or Path): File or directory of development data.
|
||||
"""
|
||||
self.train_path = util.ensure_path(train_path)
|
||||
self.dev_path = util.ensure_path(dev_path)
|
||||
self.limit = limit
|
||||
self.train_locs = self.walk_corpus(self.train_path)
|
||||
self.dev_locs = self.walk_corpus(self.dev_path)
|
||||
|
||||
@property
|
||||
def train_tuples(self):
|
||||
i = 0
|
||||
for loc in self.train_locs:
|
||||
gold_tuples = read_json_file(loc)
|
||||
for item in gold_tuples:
|
||||
yield item
|
||||
i += len(item[1])
|
||||
if self.limit and i >= self.limit:
|
||||
break
|
||||
|
||||
@property
|
||||
def dev_tuples(self):
|
||||
i = 0
|
||||
for loc in self.dev_locs:
|
||||
gold_tuples = read_json_file(loc)
|
||||
for item in gold_tuples:
|
||||
yield item
|
||||
i += 1
|
||||
if self.limit and i >= self.limit:
|
||||
break
|
||||
|
||||
def count_train(self):
|
||||
n = 0
|
||||
i = 0
|
||||
for raw_text, paragraph_tuples in self.train_tuples:
|
||||
n += sum([len(s[0][1]) for s in paragraph_tuples])
|
||||
if self.limit and i >= self.limit:
|
||||
break
|
||||
i += len(paragraph_tuples)
|
||||
return n
|
||||
|
||||
def train_docs(self, nlp, gold_preproc=False,
|
||||
projectivize=False, max_length=None,
|
||||
noise_level=0.0):
|
||||
train_tuples = self.train_tuples
|
||||
if projectivize:
|
||||
train_tuples = nonproj.preprocess_training_data(
|
||||
self.train_tuples)
|
||||
random.shuffle(train_tuples)
|
||||
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
||||
max_length=max_length,
|
||||
noise_level=noise_level)
|
||||
yield from gold_docs
|
||||
|
||||
def dev_docs(self, nlp, gold_preproc=False):
|
||||
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
|
||||
#gold_docs = nlp.preprocess_gold(gold_docs)
|
||||
yield from gold_docs
|
||||
|
||||
@classmethod
|
||||
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
|
||||
noise_level=0.0):
|
||||
for raw_text, paragraph_tuples in tuples:
|
||||
if gold_preproc:
|
||||
raw_text = None
|
||||
else:
|
||||
paragraph_tuples = merge_sents(paragraph_tuples)
|
||||
|
||||
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
||||
gold_preproc, noise_level=noise_level)
|
||||
golds = cls._make_golds(docs, paragraph_tuples)
|
||||
for doc, gold in zip(docs, golds):
|
||||
if (not max_length) or len(doc) < max_length:
|
||||
yield doc, gold
|
||||
|
||||
@classmethod
|
||||
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
|
||||
noise_level=0.0):
|
||||
if raw_text is not None:
|
||||
raw_text = add_noise(raw_text, noise_level)
|
||||
return [nlp.make_doc(raw_text)]
|
||||
else:
|
||||
return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
|
||||
for (sent_tuples, brackets) in paragraph_tuples]
|
||||
|
||||
@classmethod
|
||||
def _make_golds(cls, docs, paragraph_tuples):
|
||||
assert len(docs) == len(paragraph_tuples)
|
||||
if len(docs) == 1:
|
||||
return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
|
||||
else:
|
||||
return [GoldParse.from_annot_tuples(doc, sent_tuples)
|
||||
for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]
|
||||
|
||||
@staticmethod
|
||||
def walk_corpus(path):
|
||||
if not path.is_dir():
|
||||
return [path]
|
||||
paths = [path]
|
||||
locs = []
|
||||
seen = set()
|
||||
for path in paths:
|
||||
if str(path) in seen:
|
||||
continue
|
||||
seen.add(str(path))
|
||||
if path.parts[-1].startswith('.'):
|
||||
continue
|
||||
elif path.is_dir():
|
||||
paths.extend(path.iterdir())
|
||||
elif path.parts[-1].endswith('.json'):
|
||||
locs.append(path)
|
||||
return locs
|
||||
|
||||
|
||||
def add_noise(orig, noise_level):
|
||||
if random.random() >= noise_level:
|
||||
return orig
|
||||
elif type(orig) == list:
|
||||
corrupted = [_corrupt(word, noise_level) for word in orig]
|
||||
corrupted = [w for w in corrupted if w]
|
||||
return corrupted
|
||||
else:
|
||||
return ''.join(_corrupt(c, noise_level) for c in orig)
|
||||
|
||||
|
||||
def _corrupt(c, noise_level):
|
||||
if random.random() >= noise_level:
|
||||
return c
|
||||
elif c == ' ':
|
||||
return '\n'
|
||||
elif c == '\n':
|
||||
return ' '
|
||||
elif c in ['.', "'", "!", "?"]:
|
||||
return ''
|
||||
else:
|
||||
return c.lower()
|
||||
|
||||
|
||||
def read_json_file(loc, docs_filter=None, limit=None):
|
||||
loc = ensure_path(loc)
|
||||
if loc.is_dir():
|
||||
for filename in loc.iterdir():
|
||||
|
@ -173,8 +332,6 @@ def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
|
|||
if labels[-1].lower() == 'root':
|
||||
labels[-1] = 'ROOT'
|
||||
ner.append(token.get('ner', '-'))
|
||||
if make_supertags:
|
||||
tags[-1] = '-'.join((tags[-1], labels[-1], ner[-1]))
|
||||
sents.append([
|
||||
[ids, words, tags, heads, labels, ner],
|
||||
sent.get('brackets', [])])
|
||||
|
@ -182,7 +339,7 @@ def read_json_file(loc, docs_filter=None, make_supertags=True, limit=None):
|
|||
yield [paragraph.get('raw', None), sents]
|
||||
|
||||
|
||||
def _iob_to_biluo(tags):
|
||||
def iob_to_biluo(tags):
|
||||
out = []
|
||||
curr_label = None
|
||||
tags = list(tags)
|
||||
|
@ -225,25 +382,17 @@ cdef class GoldParse:
|
|||
|
||||
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
|
||||
deps=None, entities=None, make_projective=False):
|
||||
"""
|
||||
Create a GoldParse.
|
||||
"""Create a GoldParse.
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
The document the annotations refer to.
|
||||
words:
|
||||
A sequence of unicode word strings.
|
||||
tags:
|
||||
A sequence of strings, representing tag annotations.
|
||||
heads:
|
||||
A sequence of integers, representing syntactic head offsets.
|
||||
deps:
|
||||
A sequence of strings, representing the syntactic relation types.
|
||||
entities:
|
||||
A sequence of named entity annotations, either as BILUO tag strings,
|
||||
or as (start_char, end_char, label) tuples, representing the entity
|
||||
positions.
|
||||
Returns (GoldParse): The newly constructed object.
|
||||
doc (Doc): The document the annotations refer to.
|
||||
words (iterable): A sequence of unicode word strings.
|
||||
tags (iterable): A sequence of strings, representing tag annotations.
|
||||
heads (iterable): A sequence of integers, representing syntactic head offsets.
|
||||
deps (iterable): A sequence of strings, representing the syntactic relation types.
|
||||
entities (iterable): A sequence of named entity annotations, either as
|
||||
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
|
||||
representing the entity positions.
|
||||
RETURNS (GoldParse): The newly constructed object.
|
||||
"""
|
||||
if words is None:
|
||||
words = [token.text for token in doc]
|
||||
|
@ -268,7 +417,8 @@ cdef class GoldParse:
|
|||
# These are filled by the tagger/parser/entity recogniser
|
||||
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||
self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
|
||||
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
|
||||
|
||||
self.words = [None] * len(doc)
|
||||
|
@ -295,6 +445,9 @@ cdef class GoldParse:
|
|||
else:
|
||||
self.words[i] = words[gold_i]
|
||||
self.tags[i] = tags[gold_i]
|
||||
if heads[gold_i] is None:
|
||||
self.heads[i] = None
|
||||
else:
|
||||
self.heads[i] = self.gold_to_cand[heads[gold_i]]
|
||||
self.labels[i] = deps[gold_i]
|
||||
self.ner[i] = entities[gold_i]
|
||||
|
@ -304,59 +457,49 @@ cdef class GoldParse:
|
|||
raise Exception("Cycle found: %s" % cycle)
|
||||
|
||||
if make_projective:
|
||||
proj_heads,_ = nonproj.PseudoProjectivity.projectivize(self.heads, self.labels)
|
||||
proj_heads,_ = nonproj.projectivize(self.heads, self.labels)
|
||||
self.heads = proj_heads
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
Get the number of gold-standard tokens.
|
||||
"""Get the number of gold-standard tokens.
|
||||
|
||||
Returns (int): The number of gold-standard tokens.
|
||||
RETURNS (int): The number of gold-standard tokens.
|
||||
"""
|
||||
return self.length
|
||||
|
||||
@property
|
||||
def is_projective(self):
|
||||
"""
|
||||
Whether the provided syntactic annotations form a projective dependency
|
||||
tree.
|
||||
"""Whether the provided syntactic annotations form a projective
|
||||
dependency tree.
|
||||
"""
|
||||
return not nonproj.is_nonproj_tree(self.heads)
|
||||
|
||||
|
||||
def biluo_tags_from_offsets(doc, entities):
|
||||
"""
|
||||
Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
||||
scheme (biluo).
|
||||
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
||||
scheme (BILUO).
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
The document that the entity offsets refer to. The output tags will
|
||||
refer to the token boundaries within the document.
|
||||
doc (Doc): The document that the entity offsets refer to. The output tags
|
||||
will refer to the token boundaries within the document.
|
||||
entities (iterable): A sequence of `(start, end, label)` triples. `start` and
|
||||
`end` should be character-offset integers denoting the slice into the
|
||||
original string.
|
||||
|
||||
entities (sequence):
|
||||
A sequence of (start, end, label) triples. start and end should be
|
||||
character-offset integers denoting the slice into the original string.
|
||||
|
||||
Returns:
|
||||
tags (list):
|
||||
A list of unicode strings, describing the tags. Each tag string will
|
||||
be of the form either "", "O" or "{action}-{label}", where action is one
|
||||
of "B", "I", "L", "U". The string "-" is used where the entity
|
||||
offsets don't align with the tokenization in the Doc object. The
|
||||
training algorithm will view these as missing values. "O" denotes
|
||||
a non-entity token. "B" denotes the beginning of a multi-token entity,
|
||||
RETURNS (list): A list of unicode strings, describing the tags. Each tag
|
||||
string will be of the form either "", "O" or "{action}-{label}", where
|
||||
action is one of "B", "I", "L", "U". The string "-" is used where the
|
||||
entity offsets don't align with the tokenization in the `Doc` object. The
|
||||
training algorithm will view these as missing values. "O" denotes a
|
||||
non-entity token. "B" denotes the beginning of a multi-token entity,
|
||||
"I" the inside of an entity of three or more tokens, and "L" the end
|
||||
of an entity of two or more tokens. "U" denotes a single-token entity.
|
||||
|
||||
Example:
|
||||
text = 'I like London.'
|
||||
entities = [(len('I like '), len('I like London'), 'LOC')]
|
||||
doc = nlp.tokenizer(text)
|
||||
|
||||
tags = biluo_tags_from_offsets(doc, entities)
|
||||
|
||||
assert tags == ['O', 'O', 'U-LOC', 'O']
|
||||
EXAMPLE:
|
||||
>>> text = 'I like London.'
|
||||
>>> entities = [(len('I like '), len('I like London'), 'LOC')]
|
||||
>>> doc = nlp.tokenizer(text)
|
||||
>>> tags = biluo_tags_from_offsets(doc, entities)
|
||||
>>> assert tags == ['O', 'O', 'U-LOC', 'O']
|
||||
"""
|
||||
starts = {token.idx: token.i for token in doc}
|
||||
ends = {token.idx+len(token): token.i for token in doc}
|
||||
|
|
|
@ -13,10 +13,7 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class Bengali(Language):
|
||||
lang = 'bn'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
class BengaliDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'bn'
|
||||
|
||||
|
@ -30,4 +27,9 @@ class Bengali(Language):
|
|||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
|
||||
|
||||
class Bengali(Language):
|
||||
lang = 'bn'
|
||||
Defaults = BengaliDefaults
|
||||
|
||||
|
||||
__all__ = ['Bengali']
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
|
||||
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
||||
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
|
||||
|
||||
|
||||
_currency = r"\$|¢|£|€|¥|฿|৳"
|
||||
|
@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
|
|||
_list_punct = LIST_PUNCT + '। ॥'.strip().split()
|
||||
|
||||
|
||||
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
|
||||
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
|
||||
|
||||
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
|
||||
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||
[r'(?<=[0-9])\+',
|
||||
r'(?<=°[FfCcKk])\.',
|
||||
r'(?<=[0-9])(?:{})'.format(_currency),
|
||||
r'(?<=[0-9])(?:{})'.format(UNITS),
|
||||
r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
|
||||
|
||||
_infixes = (LIST_ELLIPSES +
|
||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
||||
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||
|
|
|
@ -20,7 +20,6 @@ _upper = [_latin_upper]
|
|||
_lower = [_latin_lower]
|
||||
_uncased = [_bengali, _hebrew]
|
||||
|
||||
|
||||
ALPHA = merge_char_classes(_upper + _lower + _uncased)
|
||||
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
|
||||
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
|
||||
|
@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
|
|||
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
|
||||
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
|
||||
_hyphens = '- – — -- ---'
|
||||
|
||||
_other_symbols = r'[\p{So}]'
|
||||
|
||||
UNITS = merge_chars(_units)
|
||||
CURRENCY = merge_chars(_currency)
|
||||
QUOTES = merge_chars(_quotes)
|
||||
PUNCT = merge_chars(_punct)
|
||||
HYPHENS = merge_chars(_hyphens)
|
||||
ICONS = _other_symbols
|
||||
|
||||
LIST_UNITS = split_chars(_units)
|
||||
LIST_CURRENCY = split_chars(_currency)
|
||||
|
@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
|
|||
LIST_PUNCT = split_chars(_punct)
|
||||
LIST_HYPHENS = split_chars(_hyphens)
|
||||
LIST_ELLIPSES = [r'\.\.+', '…']
|
||||
LIST_ICONS = [_other_symbols]
|
||||
|
|
|
@ -5,20 +5,24 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class Danish(Language):
|
||||
lang = 'da'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
class DanishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'da'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Danish(Language):
|
||||
lang = 'da'
|
||||
Defaults = DanishDefaults
|
||||
|
||||
|
||||
__all__ = ['Danish']
|
||||
|
|
|
@ -2,24 +2,25 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .norm_exceptions import NORM_EXCEPTIONS
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lemmatizer import LOOKUP
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class German(Language):
|
||||
lang = 'de'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
class GermanDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'de'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
|
||||
NORM_EXCEPTIONS, BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = dict(TAG_MAP)
|
||||
|
@ -31,4 +32,9 @@ class German(Language):
|
|||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class German(Language):
|
||||
lang = 'de'
|
||||
Defaults = GermanDefaults
|
||||
|
||||
|
||||
__all__ = ['German']
|
||||
|
|
17
spacy/lang/de/norm_exceptions.py
Normal file
17
spacy/lang/de/norm_exceptions.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# Here we only want to include the absolute most common words. Otherwise,
|
||||
# this list would get impossibly long for German – especially considering the
|
||||
# old vs. new spelling rules, and all possible cases.
|
||||
|
||||
|
||||
_exc = {
|
||||
"daß": "dass"
|
||||
}
|
||||
|
||||
|
||||
NORM_EXCEPTIONS = {}
|
||||
|
||||
for string, norm in _exc.items():
|
||||
NORM_EXCEPTIONS[string.title()] = norm
|
|
@ -15,9 +15,9 @@ def noun_chunks(obj):
|
|||
# and not just "eine Tasse", same for "das Thema Familie".
|
||||
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
|
||||
doc = obj.doc # Ensure works on both Doc and Span.
|
||||
np_label = doc.vocab.strings['NP']
|
||||
np_deps = set(doc.vocab.strings[label] for label in labels)
|
||||
close_app = doc.vocab.strings['nk']
|
||||
np_label = doc.vocab.strings.add('NP')
|
||||
np_deps = set(doc.vocab.strings.add(label) for label in labels)
|
||||
close_app = doc.vocab.strings.add('nk')
|
||||
|
||||
rbracket = 0
|
||||
for i, word in enumerate(obj):
|
||||
|
|
|
@ -53,97 +53,97 @@ _exc = {
|
|||
|
||||
|
||||
for exc_data in [
|
||||
{ORTH: "'S", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||
{ORTH: "'s", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||
{ORTH: "S'", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||
{ORTH: "s'", LEMMA: PRON_LEMMA, TAG: "PPER"},
|
||||
{ORTH: "'S", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
|
||||
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
|
||||
{ORTH: "S'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
|
||||
{ORTH: "s'", LEMMA: PRON_LEMMA, NORM: "'s", TAG: "PPER"},
|
||||
{ORTH: "'n", LEMMA: "ein", NORM: "ein"},
|
||||
{ORTH: "'ne", LEMMA: "eine", NORM: "eine"},
|
||||
{ORTH: "'nen", LEMMA: "ein", NORM: "einen"},
|
||||
{ORTH: "'nem", LEMMA: "ein", NORM: "einem"},
|
||||
{ORTH: "Abb.", LEMMA: "Abbildung"},
|
||||
{ORTH: "Abk.", LEMMA: "Abkürzung"},
|
||||
{ORTH: "Abt.", LEMMA: "Abteilung"},
|
||||
{ORTH: "Apr.", LEMMA: "April"},
|
||||
{ORTH: "Aug.", LEMMA: "August"},
|
||||
{ORTH: "Bd.", LEMMA: "Band"},
|
||||
{ORTH: "Betr.", LEMMA: "Betreff"},
|
||||
{ORTH: "Bf.", LEMMA: "Bahnhof"},
|
||||
{ORTH: "Bhf.", LEMMA: "Bahnhof"},
|
||||
{ORTH: "Bsp.", LEMMA: "Beispiel"},
|
||||
{ORTH: "Dez.", LEMMA: "Dezember"},
|
||||
{ORTH: "Di.", LEMMA: "Dienstag"},
|
||||
{ORTH: "Do.", LEMMA: "Donnerstag"},
|
||||
{ORTH: "Fa.", LEMMA: "Firma"},
|
||||
{ORTH: "Fam.", LEMMA: "Familie"},
|
||||
{ORTH: "Feb.", LEMMA: "Februar"},
|
||||
{ORTH: "Fr.", LEMMA: "Frau"},
|
||||
{ORTH: "Frl.", LEMMA: "Fräulein"},
|
||||
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof"},
|
||||
{ORTH: "Hr.", LEMMA: "Herr"},
|
||||
{ORTH: "Hrn.", LEMMA: "Herr"},
|
||||
{ORTH: "Jan.", LEMMA: "Januar"},
|
||||
{ORTH: "Jh.", LEMMA: "Jahrhundert"},
|
||||
{ORTH: "Jhd.", LEMMA: "Jahrhundert"},
|
||||
{ORTH: "Jul.", LEMMA: "Juli"},
|
||||
{ORTH: "Jun.", LEMMA: "Juni"},
|
||||
{ORTH: "Mi.", LEMMA: "Mittwoch"},
|
||||
{ORTH: "Mio.", LEMMA: "Million"},
|
||||
{ORTH: "Mo.", LEMMA: "Montag"},
|
||||
{ORTH: "Mrd.", LEMMA: "Milliarde"},
|
||||
{ORTH: "Mrz.", LEMMA: "März"},
|
||||
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"},
|
||||
{ORTH: "Mär.", LEMMA: "März"},
|
||||
{ORTH: "Nov.", LEMMA: "November"},
|
||||
{ORTH: "Nr.", LEMMA: "Nummer"},
|
||||
{ORTH: "Okt.", LEMMA: "Oktober"},
|
||||
{ORTH: "Orig.", LEMMA: "Original"},
|
||||
{ORTH: "Pkt.", LEMMA: "Punkt"},
|
||||
{ORTH: "Prof.", LEMMA: "Professor"},
|
||||
{ORTH: "Red.", LEMMA: "Redaktion"},
|
||||
{ORTH: "Sa.", LEMMA: "Samstag"},
|
||||
{ORTH: "Sep.", LEMMA: "September"},
|
||||
{ORTH: "Sept.", LEMMA: "September"},
|
||||
{ORTH: "So.", LEMMA: "Sonntag"},
|
||||
{ORTH: "Std.", LEMMA: "Stunde"},
|
||||
{ORTH: "Str.", LEMMA: "Straße"},
|
||||
{ORTH: "Tel.", LEMMA: "Telefon"},
|
||||
{ORTH: "Tsd.", LEMMA: "Tausend"},
|
||||
{ORTH: "Univ.", LEMMA: "Universität"},
|
||||
{ORTH: "abzgl.", LEMMA: "abzüglich"},
|
||||
{ORTH: "allg.", LEMMA: "allgemein"},
|
||||
{ORTH: "bspw.", LEMMA: "beispielsweise"},
|
||||
{ORTH: "bzgl.", LEMMA: "bezüglich"},
|
||||
{ORTH: "bzw.", LEMMA: "beziehungsweise"},
|
||||
{ORTH: "Abb.", LEMMA: "Abbildung", NORM: "Abbildung"},
|
||||
{ORTH: "Abk.", LEMMA: "Abkürzung", NORM: "Abkürzung"},
|
||||
{ORTH: "Abt.", LEMMA: "Abteilung", NORM: "Abteilung"},
|
||||
{ORTH: "Apr.", LEMMA: "April", NORM: "April"},
|
||||
{ORTH: "Aug.", LEMMA: "August", NORM: "August"},
|
||||
{ORTH: "Bd.", LEMMA: "Band", NORM: "Band"},
|
||||
{ORTH: "Betr.", LEMMA: "Betreff", NORM: "Betreff"},
|
||||
{ORTH: "Bf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
|
||||
{ORTH: "Bhf.", LEMMA: "Bahnhof", NORM: "Bahnhof"},
|
||||
{ORTH: "Bsp.", LEMMA: "Beispiel", NORM: "Beispiel"},
|
||||
{ORTH: "Dez.", LEMMA: "Dezember", NORM: "Dezember"},
|
||||
{ORTH: "Di.", LEMMA: "Dienstag", NORM: "Dienstag"},
|
||||
{ORTH: "Do.", LEMMA: "Donnerstag", NORM: "Donnerstag"},
|
||||
{ORTH: "Fa.", LEMMA: "Firma", NORM: "Firma"},
|
||||
{ORTH: "Fam.", LEMMA: "Familie", NORM: "Familie"},
|
||||
{ORTH: "Feb.", LEMMA: "Februar", NORM: "Februar"},
|
||||
{ORTH: "Fr.", LEMMA: "Frau", NORM: "Frau"},
|
||||
{ORTH: "Frl.", LEMMA: "Fräulein", NORM: "Fräulein"},
|
||||
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof", NORM: "Hauptbahnhof"},
|
||||
{ORTH: "Hr.", LEMMA: "Herr", NORM: "Herr"},
|
||||
{ORTH: "Hrn.", LEMMA: "Herr", NORM: "Herrn"},
|
||||
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
|
||||
{ORTH: "Jh.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
|
||||
{ORTH: "Jhd.", LEMMA: "Jahrhundert", NORM: "Jahrhundert"},
|
||||
{ORTH: "Jul.", LEMMA: "Juli", NORM: "Juli"},
|
||||
{ORTH: "Jun.", LEMMA: "Juni", NORM: "Juni"},
|
||||
{ORTH: "Mi.", LEMMA: "Mittwoch", NORM: "Mittwoch"},
|
||||
{ORTH: "Mio.", LEMMA: "Million", NORM: "Million"},
|
||||
{ORTH: "Mo.", LEMMA: "Montag", NORM: "Montag"},
|
||||
{ORTH: "Mrd.", LEMMA: "Milliarde", NORM: "Milliarde"},
|
||||
{ORTH: "Mrz.", LEMMA: "März", NORM: "März"},
|
||||
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer", NORM: "Mehrwertsteuer"},
|
||||
{ORTH: "Mär.", LEMMA: "März", NORM: "März"},
|
||||
{ORTH: "Nov.", LEMMA: "November", NORM: "November"},
|
||||
{ORTH: "Nr.", LEMMA: "Nummer", NORM: "Nummer"},
|
||||
{ORTH: "Okt.", LEMMA: "Oktober", NORM: "Oktober"},
|
||||
{ORTH: "Orig.", LEMMA: "Original", NORM: "Original"},
|
||||
{ORTH: "Pkt.", LEMMA: "Punkt", NORM: "Punkt"},
|
||||
{ORTH: "Prof.", LEMMA: "Professor", NORM: "Professor"},
|
||||
{ORTH: "Red.", LEMMA: "Redaktion", NORM: "Redaktion"},
|
||||
{ORTH: "Sa.", LEMMA: "Samstag", NORM: "Samstag"},
|
||||
{ORTH: "Sep.", LEMMA: "September", NORM: "September"},
|
||||
{ORTH: "Sept.", LEMMA: "September", NORM: "September"},
|
||||
{ORTH: "So.", LEMMA: "Sonntag", NORM: "Sonntag"},
|
||||
{ORTH: "Std.", LEMMA: "Stunde", NORM: "Stunde"},
|
||||
{ORTH: "Str.", LEMMA: "Straße", NORM: "Straße"},
|
||||
{ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
|
||||
{ORTH: "Tsd.", LEMMA: "Tausend", NORM: "Tausend"},
|
||||
{ORTH: "Univ.", LEMMA: "Universität", NORM: "Universität"},
|
||||
{ORTH: "abzgl.", LEMMA: "abzüglich", NORM: "abzüglich"},
|
||||
{ORTH: "allg.", LEMMA: "allgemein", NORM: "allgemein"},
|
||||
{ORTH: "bspw.", LEMMA: "beispielsweise", NORM: "beispielsweise"},
|
||||
{ORTH: "bzgl.", LEMMA: "bezüglich", NORM: "bezüglich"},
|
||||
{ORTH: "bzw.", LEMMA: "beziehungsweise", NORM: "beziehungsweise"},
|
||||
{ORTH: "d.h.", LEMMA: "das heißt"},
|
||||
{ORTH: "dgl.", LEMMA: "dergleichen"},
|
||||
{ORTH: "ebd.", LEMMA: "ebenda"},
|
||||
{ORTH: "eigtl.", LEMMA: "eigentlich"},
|
||||
{ORTH: "engl.", LEMMA: "englisch"},
|
||||
{ORTH: "evtl.", LEMMA: "eventuell"},
|
||||
{ORTH: "frz.", LEMMA: "französisch"},
|
||||
{ORTH: "gegr.", LEMMA: "gegründet"},
|
||||
{ORTH: "ggf.", LEMMA: "gegebenenfalls"},
|
||||
{ORTH: "ggfs.", LEMMA: "gegebenenfalls"},
|
||||
{ORTH: "ggü.", LEMMA: "gegenüber"},
|
||||
{ORTH: "dgl.", LEMMA: "dergleichen", NORM: "dergleichen"},
|
||||
{ORTH: "ebd.", LEMMA: "ebenda", NORM: "ebenda"},
|
||||
{ORTH: "eigtl.", LEMMA: "eigentlich", NORM: "eigentlich"},
|
||||
{ORTH: "engl.", LEMMA: "englisch", NORM: "englisch"},
|
||||
{ORTH: "evtl.", LEMMA: "eventuell", NORM: "eventuell"},
|
||||
{ORTH: "frz.", LEMMA: "französisch", NORM: "französisch"},
|
||||
{ORTH: "gegr.", LEMMA: "gegründet", NORM: "gegründet"},
|
||||
{ORTH: "ggf.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
|
||||
{ORTH: "ggfs.", LEMMA: "gegebenenfalls", NORM: "gegebenenfalls"},
|
||||
{ORTH: "ggü.", LEMMA: "gegenüber", NORM: "gegenüber"},
|
||||
{ORTH: "i.O.", LEMMA: "in Ordnung"},
|
||||
{ORTH: "i.d.R.", LEMMA: "in der Regel"},
|
||||
{ORTH: "incl.", LEMMA: "inklusive"},
|
||||
{ORTH: "inkl.", LEMMA: "inklusive"},
|
||||
{ORTH: "insb.", LEMMA: "insbesondere"},
|
||||
{ORTH: "kath.", LEMMA: "katholisch"},
|
||||
{ORTH: "lt.", LEMMA: "laut"},
|
||||
{ORTH: "max.", LEMMA: "maximal"},
|
||||
{ORTH: "min.", LEMMA: "minimal"},
|
||||
{ORTH: "mind.", LEMMA: "mindestens"},
|
||||
{ORTH: "mtl.", LEMMA: "monatlich"},
|
||||
{ORTH: "incl.", LEMMA: "inklusive", NORM: "inklusive"},
|
||||
{ORTH: "inkl.", LEMMA: "inklusive", NORM: "inklusive"},
|
||||
{ORTH: "insb.", LEMMA: "insbesondere", NORM: "insbesondere"},
|
||||
{ORTH: "kath.", LEMMA: "katholisch", NORM: "katholisch"},
|
||||
{ORTH: "lt.", LEMMA: "laut", NORM: "laut"},
|
||||
{ORTH: "max.", LEMMA: "maximal", NORM: "maximal"},
|
||||
{ORTH: "min.", LEMMA: "minimal", NORM: "minimal"},
|
||||
{ORTH: "mind.", LEMMA: "mindestens", NORM: "mindestens"},
|
||||
{ORTH: "mtl.", LEMMA: "monatlich", NORM: "monatlich"},
|
||||
{ORTH: "n.Chr.", LEMMA: "nach Christus"},
|
||||
{ORTH: "orig.", LEMMA: "original"},
|
||||
{ORTH: "röm.", LEMMA: "römisch"},
|
||||
{ORTH: "orig.", LEMMA: "original", NORM: "original"},
|
||||
{ORTH: "röm.", LEMMA: "römisch", NORM: "römisch"},
|
||||
{ORTH: "s.o.", LEMMA: "siehe oben"},
|
||||
{ORTH: "sog.", LEMMA: "so genannt"},
|
||||
{ORTH: "stellv.", LEMMA: "stellvertretend"},
|
||||
{ORTH: "tägl.", LEMMA: "täglich"},
|
||||
{ORTH: "tägl.", LEMMA: "täglich", NORM: "täglich"},
|
||||
{ORTH: "u.U.", LEMMA: "unter Umständen"},
|
||||
{ORTH: "u.s.w.", LEMMA: "und so weiter"},
|
||||
{ORTH: "u.v.m.", LEMMA: "und vieles mehr"},
|
||||
|
@ -153,9 +153,9 @@ for exc_data in [
|
|||
{ORTH: "v.Chr.", LEMMA: "vor Christus"},
|
||||
{ORTH: "v.a.", LEMMA: "vor allem"},
|
||||
{ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"},
|
||||
{ORTH: "vgl.", LEMMA: "vergleiche"},
|
||||
{ORTH: "vllt.", LEMMA: "vielleicht"},
|
||||
{ORTH: "vlt.", LEMMA: "vielleicht"},
|
||||
{ORTH: "vgl.", LEMMA: "vergleiche", NORM: "vergleiche"},
|
||||
{ORTH: "vllt.", LEMMA: "vielleicht", NORM: "vielleicht"},
|
||||
{ORTH: "vlt.", LEMMA: "vielleicht", NORM: "vielleicht"},
|
||||
{ORTH: "z.B.", LEMMA: "zum Beispiel"},
|
||||
{ORTH: "z.Bsp.", LEMMA: "zum Beispiel"},
|
||||
{ORTH: "z.T.", LEMMA: "zum Teil"},
|
||||
|
@ -163,7 +163,7 @@ for exc_data in [
|
|||
{ORTH: "z.Zt.", LEMMA: "zur Zeit"},
|
||||
{ORTH: "z.b.", LEMMA: "zum Beispiel"},
|
||||
{ORTH: "zzgl.", LEMMA: "zuzüglich"},
|
||||
{ORTH: "österr.", LEMMA: "österreichisch"}]:
|
||||
{ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
|
||||
_exc[exc_data[ORTH]] = [dict(exc_data)]
|
||||
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .norm_exceptions import NORM_EXCEPTIONS
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
@ -10,18 +11,18 @@ from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
|
|||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class English(Language):
|
||||
lang = 'en'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
class EnglishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'en'
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
lex_attr_getters[LANG] = lambda text: 'en'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
|
||||
BASE_NORMS, NORM_EXCEPTIONS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = dict(TAG_MAP)
|
||||
|
@ -30,7 +31,12 @@ class English(Language):
|
|||
lemma_rules = dict(LEMMA_RULES)
|
||||
lemma_index = dict(LEMMA_INDEX)
|
||||
lemma_exc = dict(LEMMA_EXC)
|
||||
sytax_iterators = dict(SYNTAX_ITERATORS)
|
||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||
|
||||
|
||||
class English(Language):
|
||||
lang = 'en'
|
||||
Defaults = EnglishDefaults
|
||||
|
||||
|
||||
__all__ = ['English']
|
||||
|
|
1761
spacy/lang/en/norm_exceptions.py
Normal file
1761
spacy/lang/en/norm_exceptions.py
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -11,9 +11,9 @@ def noun_chunks(obj):
|
|||
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
|
||||
'attr', 'ROOT']
|
||||
doc = obj.doc # Ensure works on both Doc and Span.
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings['conj']
|
||||
np_label = doc.vocab.strings['NP']
|
||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
conj = doc.vocab.strings.add('conj')
|
||||
np_label = doc.vocab.strings.add('NP')
|
||||
seen = set()
|
||||
for i, word in enumerate(obj):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
|
|
|
@ -15,20 +15,20 @@ _exclude = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell",
|
|||
for pron in ["i"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'m"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP", "tenspect": 1, "number": 1}]
|
||||
|
||||
_exc[orth + "m"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 }]
|
||||
|
||||
_exc[orth + "'ma"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'m", LEMMA: "be", NORM: "am"},
|
||||
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
|
||||
|
||||
_exc[orth + "ma"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "m", LEMMA: "be", NORM: "am"},
|
||||
{ORTH: "a", LEMMA: "going to", NORM: "gonna"}]
|
||||
|
||||
|
@ -36,72 +36,72 @@ for pron in ["i"]:
|
|||
for pron in ["i", "you", "he", "she", "it", "we", "they"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'ll"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "ll"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "'ll've"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "llve"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "'d"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'d", LEMMA: "would", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "d"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "d", LEMMA: "would", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "'d've"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "dve"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "d", LEMMA: "would", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
|
||||
for pron in ["i", "you", "we", "they"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'ve"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "ve"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
|
||||
for pron in ["you", "we", "they"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'re"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
|
||||
|
||||
_exc[orth + "re"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "re", LEMMA: "be", NORM: "are", TAG: "VBZ"}]
|
||||
|
||||
|
||||
for pron in ["he", "she", "it"]:
|
||||
for orth in [pron, pron.title()]:
|
||||
_exc[orth + "'s"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: "'s"}]
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "'s", NORM: "'s"}]
|
||||
|
||||
_exc[orth + "s"] = [
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"},
|
||||
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
|
||||
{ORTH: "s"}]
|
||||
|
||||
|
||||
|
@ -110,111 +110,111 @@ for pron in ["he", "she", "it"]:
|
|||
for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
|
||||
for orth in [word, word.title()]:
|
||||
_exc[orth + "'s"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'s"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'s", NORM: "'s"}]
|
||||
|
||||
_exc[orth + "s"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "s"}]
|
||||
|
||||
_exc[orth + "'ll"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "ll"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"}]
|
||||
|
||||
_exc[orth + "'ll've"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'ll", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "llve"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "ll", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "'re"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'re", LEMMA: "be", NORM: "are"}]
|
||||
|
||||
_exc[orth + "re"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "re", LEMMA: "be", NORM: "are"}]
|
||||
|
||||
_exc[orth + "'ve"] = [
|
||||
{ORTH: orth},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "ve"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "'d"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'d"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'d", NORM: "'d"}]
|
||||
|
||||
_exc[orth + "d"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "d"}]
|
||||
|
||||
_exc[orth + "'d've"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "'d", LEMMA: "would", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[orth + "dve"] = [
|
||||
{ORTH: orth, LEMMA: word},
|
||||
{ORTH: "d", LEMMA: "would", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: orth, LEMMA: word, NORM: word},
|
||||
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
|
||||
# Verbs
|
||||
|
||||
for verb_data in [
|
||||
{ORTH: "ca", LEMMA: "can", TAG: "MD"},
|
||||
{ORTH: "could", TAG: "MD"},
|
||||
{ORTH: "do", LEMMA: "do"},
|
||||
{ORTH: "does", LEMMA: "do"},
|
||||
{ORTH: "did", LEMMA: "do", TAG: "VBD"},
|
||||
{ORTH: "had", LEMMA: "have", TAG: "VBD"},
|
||||
{ORTH: "may", TAG: "MD"},
|
||||
{ORTH: "might", TAG: "MD"},
|
||||
{ORTH: "must", TAG: "MD"},
|
||||
{ORTH: "need"},
|
||||
{ORTH: "ought"},
|
||||
{ORTH: "sha", LEMMA: "shall", TAG: "MD"},
|
||||
{ORTH: "should", TAG: "MD"},
|
||||
{ORTH: "wo", LEMMA: "will", TAG: "MD"},
|
||||
{ORTH: "would", TAG: "MD"}]:
|
||||
{ORTH: "ca", LEMMA: "can", NORM: "can", TAG: "MD"},
|
||||
{ORTH: "could", NORM: "could", TAG: "MD"},
|
||||
{ORTH: "do", LEMMA: "do", NORM: "do"},
|
||||
{ORTH: "does", LEMMA: "do", NORM: "does"},
|
||||
{ORTH: "did", LEMMA: "do", NORM: "do", TAG: "VBD"},
|
||||
{ORTH: "had", LEMMA: "have", NORM: "have", TAG: "VBD"},
|
||||
{ORTH: "may", NORM: "may", TAG: "MD"},
|
||||
{ORTH: "might", NORM: "might", TAG: "MD"},
|
||||
{ORTH: "must", NORM: "must", TAG: "MD"},
|
||||
{ORTH: "need", NORM: "need"},
|
||||
{ORTH: "ought", NORM: "ought", TAG: "MD"},
|
||||
{ORTH: "sha", LEMMA: "shall", NORM: "shall", TAG: "MD"},
|
||||
{ORTH: "should", NORM: "should", TAG: "MD"},
|
||||
{ORTH: "wo", LEMMA: "will", NORM: "will", TAG: "MD"},
|
||||
{ORTH: "would", NORM: "would", TAG: "MD"}]:
|
||||
verb_data_tc = dict(verb_data)
|
||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||
for data in [verb_data, verb_data_tc]:
|
||||
_exc[data[ORTH] + "n't"] = [
|
||||
dict(data),
|
||||
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
|
||||
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
||||
|
||||
_exc[data[ORTH] + "nt"] = [
|
||||
dict(data),
|
||||
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
|
||||
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
||||
|
||||
_exc[data[ORTH] + "n't've"] = [
|
||||
dict(data),
|
||||
{ORTH: "n't", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
_exc[data[ORTH] + "ntve"] = [
|
||||
dict(data),
|
||||
{ORTH: "nt", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}]
|
||||
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}]
|
||||
|
||||
|
||||
for verb_data in [
|
||||
{ORTH: "could", TAG: "MD"},
|
||||
{ORTH: "might"},
|
||||
{ORTH: "must"},
|
||||
{ORTH: "should"}]:
|
||||
{ORTH: "could", NORM: "could", TAG: "MD"},
|
||||
{ORTH: "might", NORM: "might", TAG: "MD"},
|
||||
{ORTH: "must", NORM: "must", TAG: "MD"},
|
||||
{ORTH: "should", NORM: "should", TAG: "MD"}]:
|
||||
verb_data_tc = dict(verb_data)
|
||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||
for data in [verb_data, verb_data_tc]:
|
||||
|
@ -228,21 +228,21 @@ for verb_data in [
|
|||
|
||||
|
||||
for verb_data in [
|
||||
{ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"},
|
||||
{ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2},
|
||||
{ORTH: "is", LEMMA: "be", TAG: "VBZ"},
|
||||
{ORTH: "was", LEMMA: "be"},
|
||||
{ORTH: "were", LEMMA: "be"}]:
|
||||
{ORTH: "ai", LEMMA: "be", TAG: "VBP", "number": 2},
|
||||
{ORTH: "are", LEMMA: "be", NORM: "are", TAG: "VBP", "number": 2},
|
||||
{ORTH: "is", LEMMA: "be", NORM: "is", TAG: "VBZ"},
|
||||
{ORTH: "was", LEMMA: "be", NORM: "was"},
|
||||
{ORTH: "were", LEMMA: "be", NORM: "were"}]:
|
||||
verb_data_tc = dict(verb_data)
|
||||
verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
|
||||
for data in [verb_data, verb_data_tc]:
|
||||
_exc[data[ORTH] + "n't"] = [
|
||||
dict(data),
|
||||
{ORTH: "n't", LEMMA: "not", TAG: "RB"}]
|
||||
{ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
||||
|
||||
_exc[data[ORTH] + "nt"] = [
|
||||
dict(data),
|
||||
{ORTH: "nt", LEMMA: "not", TAG: "RB"}]
|
||||
{ORTH: "nt", LEMMA: "not", NORM: "not", TAG: "RB"}]
|
||||
|
||||
|
||||
# Other contractions with trailing apostrophe
|
||||
|
@ -250,10 +250,10 @@ for verb_data in [
|
|||
for exc_data in [
|
||||
{ORTH: "doin", LEMMA: "do", NORM: "doing"},
|
||||
{ORTH: "goin", LEMMA: "go", NORM: "going"},
|
||||
{ORTH: "nothin", LEMMA: "nothing"},
|
||||
{ORTH: "nuthin", LEMMA: "nothing"},
|
||||
{ORTH: "ol", LEMMA: "old"},
|
||||
{ORTH: "somethin", LEMMA: "something"}]:
|
||||
{ORTH: "nothin", LEMMA: "nothing", NORM: "nothing"},
|
||||
{ORTH: "nuthin", LEMMA: "nothing", NORM: "nothing"},
|
||||
{ORTH: "ol", LEMMA: "old", NORM: "old"},
|
||||
{ORTH: "somethin", LEMMA: "something", NORM: "something"}]:
|
||||
exc_data_tc = dict(exc_data)
|
||||
exc_data_tc[ORTH] = exc_data_tc[ORTH].title()
|
||||
for data in [exc_data, exc_data_tc]:
|
||||
|
@ -266,10 +266,10 @@ for exc_data in [
|
|||
# Other contractions with leading apostrophe
|
||||
|
||||
for exc_data in [
|
||||
{ORTH: "cause", LEMMA: "because"},
|
||||
{ORTH: "cause", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"},
|
||||
{ORTH: "ll", LEMMA: "will"},
|
||||
{ORTH: "nuff", LEMMA: "enough"}]:
|
||||
{ORTH: "ll", LEMMA: "will", NORM: "will"},
|
||||
{ORTH: "nuff", LEMMA: "enough", NORM: "enough"}]:
|
||||
exc_data_apos = dict(exc_data)
|
||||
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
|
||||
for data in [exc_data, exc_data_apos]:
|
||||
|
@ -282,11 +282,11 @@ for h in range(1, 12 + 1):
|
|||
for period in ["a.m.", "am"]:
|
||||
_exc["%d%s" % (h, period)] = [
|
||||
{ORTH: "%d" % h},
|
||||
{ORTH: period, LEMMA: "a.m."}]
|
||||
{ORTH: period, LEMMA: "a.m.", NORM: "a.m."}]
|
||||
for period in ["p.m.", "pm"]:
|
||||
_exc["%d%s" % (h, period)] = [
|
||||
{ORTH: "%d" % h},
|
||||
{ORTH: period, LEMMA: "p.m."}]
|
||||
{ORTH: period, LEMMA: "p.m.", NORM: "p.m."}]
|
||||
|
||||
|
||||
# Rest
|
||||
|
@ -306,56 +306,56 @@ _other_exc = {
|
|||
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
|
||||
|
||||
"How'd'y": [
|
||||
{ORTH: "How", LEMMA: "how"},
|
||||
{ORTH: "How", LEMMA: "how", NORM: "how"},
|
||||
{ORTH: "'d", LEMMA: "do"},
|
||||
{ORTH: "'y", LEMMA: PRON_LEMMA, NORM: "you"}],
|
||||
|
||||
"not've": [
|
||||
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
||||
|
||||
"notve": [
|
||||
{ORTH: "not", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
||||
|
||||
"Not've": [
|
||||
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", TAG: "VB"}],
|
||||
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||
{ORTH: "'ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
||||
|
||||
"Notve": [
|
||||
{ORTH: "Not", LEMMA: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", TAG: "VB"}],
|
||||
{ORTH: "Not", LEMMA: "not", NORM: "not", TAG: "RB"},
|
||||
{ORTH: "ve", LEMMA: "have", NORM: "have", TAG: "VB"}],
|
||||
|
||||
"cannot": [
|
||||
{ORTH: "can", LEMMA: "can", TAG: "MD"},
|
||||
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
|
||||
|
||||
"Cannot": [
|
||||
{ORTH: "Can", LEMMA: "can", TAG: "MD"},
|
||||
{ORTH: "Can", LEMMA: "can", NORM: "can", TAG: "MD"},
|
||||
{ORTH: "not", LEMMA: "not", TAG: "RB"}],
|
||||
|
||||
"gonna": [
|
||||
{ORTH: "gon", LEMMA: "go", NORM: "going"},
|
||||
{ORTH: "na", LEMMA: "to"}],
|
||||
{ORTH: "na", LEMMA: "to", NORM: "to"}],
|
||||
|
||||
"Gonna": [
|
||||
{ORTH: "Gon", LEMMA: "go", NORM: "going"},
|
||||
{ORTH: "na", LEMMA: "to"}],
|
||||
{ORTH: "na", LEMMA: "to", NORM: "to"}],
|
||||
|
||||
"gotta": [
|
||||
{ORTH: "got"},
|
||||
{ORTH: "ta", LEMMA: "to"}],
|
||||
{ORTH: "ta", LEMMA: "to", NORM: "to"}],
|
||||
|
||||
"Gotta": [
|
||||
{ORTH: "Got"},
|
||||
{ORTH: "ta", LEMMA: "to"}],
|
||||
{ORTH: "Got", NORM: "got"},
|
||||
{ORTH: "ta", LEMMA: "to", NORM: "to"}],
|
||||
|
||||
"let's": [
|
||||
{ORTH: "let"},
|
||||
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}],
|
||||
|
||||
"Let's": [
|
||||
{ORTH: "Let", LEMMA: "let"},
|
||||
{ORTH: "Let", LEMMA: "let", NORM: "let"},
|
||||
{ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}]
|
||||
}
|
||||
|
||||
|
@ -363,72 +363,80 @@ _exc.update(_other_exc)
|
|||
|
||||
|
||||
for exc_data in [
|
||||
{ORTH: "'S", LEMMA: "'s"},
|
||||
{ORTH: "'s", LEMMA: "'s"},
|
||||
{ORTH: "\u2018S", LEMMA: "'s"},
|
||||
{ORTH: "\u2018s", LEMMA: "'s"},
|
||||
{ORTH: "and/or", LEMMA: "and/or", TAG: "CC"},
|
||||
{ORTH: "'S", LEMMA: "'s", NORM: "'s"},
|
||||
{ORTH: "'s", LEMMA: "'s", NORM: "'s"},
|
||||
{ORTH: "\u2018S", LEMMA: "'s", NORM: "'s"},
|
||||
{ORTH: "\u2018s", LEMMA: "'s", NORM: "'s"},
|
||||
{ORTH: "and/or", LEMMA: "and/or", NORM: "and/or", TAG: "CC"},
|
||||
{ORTH: "w/o", LEMMA: "without", NORM: "without"},
|
||||
{ORTH: "'re", LEMMA: "be", NORM: "are"},
|
||||
{ORTH: "'Cause", LEMMA: "because"},
|
||||
{ORTH: "'cause", LEMMA: "because"},
|
||||
{ORTH: "ma'am", LEMMA: "madam"},
|
||||
{ORTH: "Ma'am", LEMMA: "madam"},
|
||||
{ORTH: "o'clock", LEMMA: "o'clock"},
|
||||
{ORTH: "O'clock", LEMMA: "o'clock"},
|
||||
{ORTH: "'Cause", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'cause", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'cos", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'Cos", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'coz", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'Coz", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'cuz", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'Cuz", LEMMA: "because", NORM: "because"},
|
||||
{ORTH: "'bout", LEMMA: "about", NORM: "about"},
|
||||
{ORTH: "ma'am", LEMMA: "madam", NORM: "madam"},
|
||||
{ORTH: "Ma'am", LEMMA: "madam", NORM: "madam"},
|
||||
{ORTH: "o'clock", LEMMA: "o'clock", NORM: "o'clock"},
|
||||
{ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
|
||||
|
||||
{ORTH: "Mt.", LEMMA: "Mount"},
|
||||
{ORTH: "Ak.", LEMMA: "Alaska"},
|
||||
{ORTH: "Ala.", LEMMA: "Alabama"},
|
||||
{ORTH: "Apr.", LEMMA: "April"},
|
||||
{ORTH: "Ariz.", LEMMA: "Arizona"},
|
||||
{ORTH: "Ark.", LEMMA: "Arkansas"},
|
||||
{ORTH: "Aug.", LEMMA: "August"},
|
||||
{ORTH: "Calif.", LEMMA: "California"},
|
||||
{ORTH: "Colo.", LEMMA: "Colorado"},
|
||||
{ORTH: "Conn.", LEMMA: "Connecticut"},
|
||||
{ORTH: "Dec.", LEMMA: "December"},
|
||||
{ORTH: "Del.", LEMMA: "Delaware"},
|
||||
{ORTH: "Feb.", LEMMA: "February"},
|
||||
{ORTH: "Fla.", LEMMA: "Florida"},
|
||||
{ORTH: "Ga.", LEMMA: "Georgia"},
|
||||
{ORTH: "Ia.", LEMMA: "Iowa"},
|
||||
{ORTH: "Id.", LEMMA: "Idaho"},
|
||||
{ORTH: "Ill.", LEMMA: "Illinois"},
|
||||
{ORTH: "Ind.", LEMMA: "Indiana"},
|
||||
{ORTH: "Jan.", LEMMA: "January"},
|
||||
{ORTH: "Jul.", LEMMA: "July"},
|
||||
{ORTH: "Jun.", LEMMA: "June"},
|
||||
{ORTH: "Kan.", LEMMA: "Kansas"},
|
||||
{ORTH: "Kans.", LEMMA: "Kansas"},
|
||||
{ORTH: "Ky.", LEMMA: "Kentucky"},
|
||||
{ORTH: "La.", LEMMA: "Louisiana"},
|
||||
{ORTH: "Mar.", LEMMA: "March"},
|
||||
{ORTH: "Mass.", LEMMA: "Massachusetts"},
|
||||
{ORTH: "May.", LEMMA: "May"},
|
||||
{ORTH: "Mich.", LEMMA: "Michigan"},
|
||||
{ORTH: "Minn.", LEMMA: "Minnesota"},
|
||||
{ORTH: "Miss.", LEMMA: "Mississippi"},
|
||||
{ORTH: "N.C.", LEMMA: "North Carolina"},
|
||||
{ORTH: "N.D.", LEMMA: "North Dakota"},
|
||||
{ORTH: "N.H.", LEMMA: "New Hampshire"},
|
||||
{ORTH: "N.J.", LEMMA: "New Jersey"},
|
||||
{ORTH: "N.M.", LEMMA: "New Mexico"},
|
||||
{ORTH: "N.Y.", LEMMA: "New York"},
|
||||
{ORTH: "Neb.", LEMMA: "Nebraska"},
|
||||
{ORTH: "Nebr.", LEMMA: "Nebraska"},
|
||||
{ORTH: "Nev.", LEMMA: "Nevada"},
|
||||
{ORTH: "Nov.", LEMMA: "November"},
|
||||
{ORTH: "Oct.", LEMMA: "October"},
|
||||
{ORTH: "Okla.", LEMMA: "Oklahoma"},
|
||||
{ORTH: "Ore.", LEMMA: "Oregon"},
|
||||
{ORTH: "Pa.", LEMMA: "Pennsylvania"},
|
||||
{ORTH: "S.C.", LEMMA: "South Carolina"},
|
||||
{ORTH: "Sep.", LEMMA: "September"},
|
||||
{ORTH: "Sept.", LEMMA: "September"},
|
||||
{ORTH: "Tenn.", LEMMA: "Tennessee"},
|
||||
{ORTH: "Va.", LEMMA: "Virginia"},
|
||||
{ORTH: "Wash.", LEMMA: "Washington"},
|
||||
{ORTH: "Wis.", LEMMA: "Wisconsin"}]:
|
||||
{ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
|
||||
{ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
|
||||
{ORTH: "Ala.", LEMMA: "Alabama", NORM: "Alabama"},
|
||||
{ORTH: "Apr.", LEMMA: "April", NORM: "April"},
|
||||
{ORTH: "Ariz.", LEMMA: "Arizona", NORM: "Arizona"},
|
||||
{ORTH: "Ark.", LEMMA: "Arkansas", NORM: "Arkansas"},
|
||||
{ORTH: "Aug.", LEMMA: "August", NORM: "August"},
|
||||
{ORTH: "Calif.", LEMMA: "California", NORM: "California"},
|
||||
{ORTH: "Colo.", LEMMA: "Colorado", NORM: "Colorado"},
|
||||
{ORTH: "Conn.", LEMMA: "Connecticut", NORM: "Connecticut"},
|
||||
{ORTH: "Dec.", LEMMA: "December", NORM: "December"},
|
||||
{ORTH: "Del.", LEMMA: "Delaware", NORM: "Delaware"},
|
||||
{ORTH: "Feb.", LEMMA: "February", NORM: "February"},
|
||||
{ORTH: "Fla.", LEMMA: "Florida", NORM: "Florida"},
|
||||
{ORTH: "Ga.", LEMMA: "Georgia", NORM: "Georgia"},
|
||||
{ORTH: "Ia.", LEMMA: "Iowa", NORM: "Iowa"},
|
||||
{ORTH: "Id.", LEMMA: "Idaho", NORM: "Idaho"},
|
||||
{ORTH: "Ill.", LEMMA: "Illinois", NORM: "Illinois"},
|
||||
{ORTH: "Ind.", LEMMA: "Indiana", NORM: "Indiana"},
|
||||
{ORTH: "Jan.", LEMMA: "January", NORM: "January"},
|
||||
{ORTH: "Jul.", LEMMA: "July", NORM: "July"},
|
||||
{ORTH: "Jun.", LEMMA: "June", NORM: "June"},
|
||||
{ORTH: "Kan.", LEMMA: "Kansas", NORM: "Kansas"},
|
||||
{ORTH: "Kans.", LEMMA: "Kansas", NORM: "Kansas"},
|
||||
{ORTH: "Ky.", LEMMA: "Kentucky", NORM: "Kentucky"},
|
||||
{ORTH: "La.", LEMMA: "Louisiana", NORM: "Louisiana"},
|
||||
{ORTH: "Mar.", LEMMA: "March", NORM: "March"},
|
||||
{ORTH: "Mass.", LEMMA: "Massachusetts", NORM: "Massachusetts"},
|
||||
{ORTH: "May.", LEMMA: "May", NORM: "May"},
|
||||
{ORTH: "Mich.", LEMMA: "Michigan", NORM: "Michigan"},
|
||||
{ORTH: "Minn.", LEMMA: "Minnesota", NORM: "Minnesota"},
|
||||
{ORTH: "Miss.", LEMMA: "Mississippi", NORM: "Mississippi"},
|
||||
{ORTH: "N.C.", LEMMA: "North Carolina", NORM: "North Carolina"},
|
||||
{ORTH: "N.D.", LEMMA: "North Dakota", NORM: "North Dakota"},
|
||||
{ORTH: "N.H.", LEMMA: "New Hampshire", NORM: "New Hampshire"},
|
||||
{ORTH: "N.J.", LEMMA: "New Jersey", NORM: "New Jersey"},
|
||||
{ORTH: "N.M.", LEMMA: "New Mexico", NORM: "New Mexico"},
|
||||
{ORTH: "N.Y.", LEMMA: "New York", NORM: "New York"},
|
||||
{ORTH: "Neb.", LEMMA: "Nebraska", NORM: "Nebraska"},
|
||||
{ORTH: "Nebr.", LEMMA: "Nebraska", NORM: "Nebraska"},
|
||||
{ORTH: "Nev.", LEMMA: "Nevada", NORM: "Nevada"},
|
||||
{ORTH: "Nov.", LEMMA: "November", NORM: "November"},
|
||||
{ORTH: "Oct.", LEMMA: "October", NORM: "October"},
|
||||
{ORTH: "Okla.", LEMMA: "Oklahoma", NORM: "Oklahoma"},
|
||||
{ORTH: "Ore.", LEMMA: "Oregon", NORM: "Oregon"},
|
||||
{ORTH: "Pa.", LEMMA: "Pennsylvania", NORM: "Pennsylvania"},
|
||||
{ORTH: "S.C.", LEMMA: "South Carolina", NORM: "South Carolina"},
|
||||
{ORTH: "Sep.", LEMMA: "September", NORM: "September"},
|
||||
{ORTH: "Sept.", LEMMA: "September", NORM: "September"},
|
||||
{ORTH: "Tenn.", LEMMA: "Tennessee", NORM: "Tennessee"},
|
||||
{ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
|
||||
{ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
|
||||
{ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
|
||||
_exc[exc_data[ORTH]] = [dict(exc_data)]
|
||||
|
||||
|
||||
|
|
|
@ -5,21 +5,25 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lemmatizer import LOOKUP
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class SpanishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'es'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = dict(TAG_MAP)
|
||||
stop_words = set(STOP_WORDS)
|
||||
sytax_iterators = dict(SYNTAX_ITERATORS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
|
@ -28,7 +32,7 @@ class SpanishDefaults(Language.Defaults):
|
|||
|
||||
class Spanish(Language):
|
||||
lang = 'es'
|
||||
|
||||
Defaults = SpanishDefaults
|
||||
|
||||
|
||||
__all__ = ['Spanish']
|
||||
|
|
55
spacy/lang/es/syntax_iterators.py
Normal file
55
spacy/lang/es/syntax_iterators.py
Normal file
|
@ -0,0 +1,55 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
||||
|
||||
|
||||
def noun_chunks(obj):
|
||||
doc = obj.doc
|
||||
np_label = doc.vocab.strings['NP']
|
||||
left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
|
||||
right_labels = ['flat', 'fixed', 'compound', 'neg']
|
||||
stop_labels = ['punct']
|
||||
np_left_deps = [doc.vocab.strings[label] for label in left_labels]
|
||||
np_right_deps = [doc.vocab.strings[label] for label in right_labels]
|
||||
stop_deps = [doc.vocab.strings[label] for label in stop_labels]
|
||||
token = doc[0]
|
||||
while token and token.i < len(doc):
|
||||
if token.pos in [PROPN, NOUN, PRON]:
|
||||
left, right = noun_bounds(token)
|
||||
yield left.i, right.i+1, np_label
|
||||
token = right
|
||||
token = next_token(token)
|
||||
|
||||
|
||||
def is_verb_token(token):
|
||||
return token.pos in [VERB, AUX]
|
||||
|
||||
|
||||
def next_token(token):
|
||||
try:
|
||||
return token.nbor()
|
||||
except:
|
||||
return None
|
||||
|
||||
|
||||
def noun_bounds(root):
|
||||
left_bound = root
|
||||
for token in reversed(list(root.lefts)):
|
||||
if token.dep in np_left_deps:
|
||||
left_bound = token
|
||||
right_bound = root
|
||||
for token in root.rights:
|
||||
if (token.dep in np_right_deps):
|
||||
left, right = noun_bounds(token)
|
||||
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
|
||||
doc[left_bound.i: right.i])):
|
||||
break
|
||||
else:
|
||||
right_bound = right
|
||||
return left_bound, right_bound
|
||||
|
||||
|
||||
SYNTAX_ITERATORS = {
|
||||
'noun_chunks': noun_chunks
|
||||
}
|
|
@ -6,37 +6,13 @@ from ...deprecated import PRON_LEMMA
|
|||
|
||||
|
||||
_exc = {
|
||||
"al": [
|
||||
{ORTH: "a", LEMMA: "a", TAG: ADP},
|
||||
{ORTH: "l", LEMMA: "el", TAG: DET}],
|
||||
|
||||
"consigo": [
|
||||
{ORTH: "con", LEMMA: "con"},
|
||||
{ORTH: "sigo", LEMMA: PRON_LEMMA, NORM: "sí"}],
|
||||
|
||||
"conmigo": [
|
||||
{ORTH: "con", LEMMA: "con"},
|
||||
{ORTH: "migo", LEMMA: PRON_LEMMA, NORM: "mí"}],
|
||||
|
||||
"contigo": [
|
||||
{ORTH: "con", LEMMA: "con"},
|
||||
{ORTH: "tigo", LEMMA: PRON_LEMMA, NORM: "ti"}],
|
||||
|
||||
"del": [
|
||||
{ORTH: "de", LEMMA: "de", TAG: ADP},
|
||||
{ORTH: "l", LEMMA: "el", TAG: DET}],
|
||||
|
||||
"pel": [
|
||||
{ORTH: "pe", LEMMA: "per", TAG: ADP},
|
||||
{ORTH: "l", LEMMA: "el", TAG: DET}],
|
||||
|
||||
"pal": [
|
||||
{ORTH: "pa", LEMMA: "para"},
|
||||
{ORTH: "l", LEMMA: "el"}],
|
||||
{ORTH: "l", LEMMA: "el", NORM: "el"}],
|
||||
|
||||
"pala": [
|
||||
{ORTH: "pa", LEMMA: "para"},
|
||||
{ORTH: "la"}]
|
||||
{ORTH: "la", LEMMA: "la", NORM: "la"}]
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -5,20 +5,24 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class Finnish(Language):
|
||||
lang = 'fi'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
class FinnishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'fi'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Finnish(Language):
|
||||
lang = 'fi'
|
||||
Defaults = FinnishDefaults
|
||||
|
||||
|
||||
__all__ = ['Finnish']
|
||||
|
|
|
@ -5,30 +5,36 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
|
|||
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lemmatizer import LOOKUP
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class French(Language):
|
||||
lang = 'fr'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
class FrenchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'fr'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
token_match = TOKEN_MATCH
|
||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class French(Language):
|
||||
lang = 'fr'
|
||||
Defaults = FrenchDefaults
|
||||
|
||||
|
||||
__all__ = ['French']
|
||||
|
|
File diff suppressed because it is too large
Load Diff
42
spacy/lang/fr/syntax_iterators.py
Normal file
42
spacy/lang/fr/syntax_iterators.py
Normal file
|
@ -0,0 +1,42 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
|
||||
|
||||
def noun_chunks(obj):
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
|
||||
doc = obj.doc # Ensure works on both Doc and Span.
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add('conj')
|
||||
np_label = doc.vocab.strings.add('NP')
|
||||
seen = set()
|
||||
for i, word in enumerate(obj):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.i in seen:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||
elif word.dep == conj:
|
||||
head = word.head
|
||||
while head.dep == conj and head.head.i < head.i:
|
||||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||
|
||||
|
||||
SYNTAX_ITERATORS = {
|
||||
'noun_chunks': noun_chunks
|
||||
}
|
|
@ -9,10 +9,7 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class Hebrew(Language):
|
||||
lang = 'he'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
class HebrewDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'he'
|
||||
|
||||
|
@ -20,4 +17,9 @@ class Hebrew(Language):
|
|||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Hebrew(Language):
|
||||
lang = 'he'
|
||||
Defaults = HebrewDefaults
|
||||
|
||||
|
||||
__all__ = ['Hebrew']
|
||||
|
|
|
@ -7,18 +7,17 @@ from .stop_words import STOP_WORDS
|
|||
from .lemmatizer import LOOKUP
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class Hungarian(Language):
|
||||
lang = 'hu'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
class HungarianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'hu'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
@ -32,4 +31,9 @@ class Hungarian(Language):
|
|||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class Hungarian(Language):
|
||||
lang = 'hu'
|
||||
Defaults = HungarianDefaults
|
||||
|
||||
|
||||
__all__ = ['Hungarian']
|
||||
|
|
|
@ -1,18 +1,18 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..punctuation import TOKENIZER_INFIXES
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
|
||||
from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||
|
||||
LIST_ICONS = [r'[\p{So}--[°]]']
|
||||
|
||||
_currency = r'\$|¢|£|€|¥|฿'
|
||||
_quotes = QUOTES.replace("'", '')
|
||||
|
||||
_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||
[r'[,.:](?=[{a}])'.format(a=ALPHA)])
|
||||
|
||||
_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES)
|
||||
|
||||
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
||||
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||
[r'(?<=[0-9])\+',
|
||||
r'(?<=°[FfCcKk])\.',
|
||||
r'(?<=[0-9])(?:{})'.format(_currency),
|
||||
|
@ -20,16 +20,14 @@ _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
|||
r'(?<=[{}{}{}(?:{})])\.'.format(ALPHA_LOWER, r'%²\-\)\]\+', QUOTES, _currency),
|
||||
r'(?<=[{})])-e'.format(ALPHA_LOWER)])
|
||||
|
||||
|
||||
_infixes = (LIST_ELLIPSES +
|
||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
||||
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)])
|
||||
|
||||
|
||||
TOKENIZER_PREFIXES = _prefixes
|
||||
TOKENIZER_SUFFIXES = _suffixes
|
||||
TOKENIZER_INFIXES = _infixes
|
||||
|
|
|
@ -5,18 +5,17 @@ from .stop_words import STOP_WORDS
|
|||
from .lemmatizer import LOOKUP
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class Italian(Language):
|
||||
lang = 'it'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
class ItalianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'it'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
@ -26,4 +25,9 @@ class Italian(Language):
|
|||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class Italian(Language):
|
||||
lang = 'it'
|
||||
Defaults = ItalianDefaults
|
||||
|
||||
|
||||
__all__ = ['Italian']
|
||||
|
|
|
@ -125,7 +125,7 @@ def word_shape(text):
|
|||
|
||||
LEX_ATTRS = {
|
||||
attrs.LOWER: lambda string: string.lower(),
|
||||
attrs.NORM: lambda string: string,
|
||||
attrs.NORM: lambda string: string.lower(),
|
||||
attrs.PREFIX: lambda string: string[0],
|
||||
attrs.SUFFIX: lambda string: string[-3:],
|
||||
attrs.CLUSTER: lambda string: 0,
|
||||
|
|
|
@ -6,20 +6,24 @@ from .stop_words import STOP_WORDS
|
|||
from .morph_rules import MORPH_RULES
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class Norwegian(Language):
|
||||
lang = 'nb'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
class NorwegianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'nb'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Norwegian(Language):
|
||||
lang = 'nb'
|
||||
Defaults = NorwegianDefaults
|
||||
|
||||
|
||||
__all__ = ['Norwegian']
|
||||
|
|
|
@ -4,21 +4,24 @@ from __future__ import unicode_literals
|
|||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
|
||||
class Dutch(Language):
|
||||
lang = 'nl'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
class DutchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'nl'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Dutch(Language):
|
||||
lang = 'nl'
|
||||
Defaults = DutchDefaults
|
||||
|
||||
|
||||
__all__ = ['Dutch']
|
||||
|
|
46
spacy/lang/norm_exceptions.py
Normal file
46
spacy/lang/norm_exceptions.py
Normal file
|
@ -0,0 +1,46 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# These exceptions are used to add NORM values based on a token's ORTH value.
|
||||
# Individual languages can also add their own exceptions and overwrite them -
|
||||
# for example, British vs. American spelling in English.
|
||||
|
||||
# Norms are only set if no alternative is provided in the tokenizer exceptions.
|
||||
# Note that this does not change any other token attributes. Its main purpose
|
||||
# is to normalise the word representations so that equivalent tokens receive
|
||||
# similar representations. For example: $ and € are very different, but they're
|
||||
# both currency symbols. By normalising currency symbols to $, all symbols are
|
||||
# seen as similar, no matter how common they are in the training data.
|
||||
|
||||
|
||||
BASE_NORMS = {
|
||||
"'s": "'s",
|
||||
"'S": "'s",
|
||||
"’s": "'s",
|
||||
"’S": "'s",
|
||||
"’": "'",
|
||||
"‘": "'",
|
||||
"´": "'",
|
||||
"`": "'",
|
||||
"”": '"',
|
||||
"“": '"',
|
||||
"''": '"',
|
||||
"``": '"',
|
||||
"´´": '"',
|
||||
"„": '"',
|
||||
"»": '"',
|
||||
"«": '"',
|
||||
"…": "...",
|
||||
"—": "-",
|
||||
"–": "-",
|
||||
"--": "-",
|
||||
"---": "-",
|
||||
"€": "$",
|
||||
"£": "$",
|
||||
"¥": "$",
|
||||
"฿": "$",
|
||||
"US$": "$",
|
||||
"C$": "$",
|
||||
"A$": "$"
|
||||
}
|
|
@ -5,20 +5,24 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class Polish(Language):
|
||||
lang = 'pl'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
class PolishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'pl'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Polish(Language):
|
||||
lang = 'pl'
|
||||
Defaults = PolishDefaults
|
||||
|
||||
|
||||
__all__ = ['Polish']
|
||||
|
|
|
@ -7,18 +7,17 @@ from .lex_attrs import LEX_ATTRS
|
|||
from .lemmatizer import LOOKUP
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class Portuguese(Language):
|
||||
lang = 'pt'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
class PortugueseDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'pt'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
|
@ -29,4 +28,9 @@ class Portuguese(Language):
|
|||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class Portuguese(Language):
|
||||
lang = 'pt'
|
||||
Defaults = PortugueseDefaults
|
||||
|
||||
|
||||
__all__ = ['Portuguese']
|
||||
|
|
|
@ -2,15 +2,16 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
||||
from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
|
||||
from .char_classes import CURRENCY, UNITS
|
||||
from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
|
||||
from .char_classes import QUOTES, CURRENCY, UNITS
|
||||
|
||||
|
||||
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
||||
LIST_CURRENCY)
|
||||
LIST_CURRENCY + LIST_ICONS)
|
||||
|
||||
|
||||
_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
||||
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||
["'s", "'S", "’s", "’S"] +
|
||||
[r'(?<=[0-9])\+',
|
||||
r'(?<=°[FfCcKk])\.',
|
||||
r'(?<=[0-9])(?:{})'.format(CURRENCY),
|
||||
|
@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
|
|||
r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
|
||||
|
||||
|
||||
_infixes = (LIST_ELLIPSES +
|
||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
||||
[r'(?<=[0-9])[+\-\*^](?=[0-9-])',
|
||||
r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
|
|
|
@ -7,18 +7,17 @@ from .morph_rules import MORPH_RULES
|
|||
from .lemmatizer import LEMMA_RULES, LOOKUP
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class Swedish(Language):
|
||||
lang = 'sv'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
class SwedishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'sv'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
@ -28,4 +27,9 @@ class Swedish(Language):
|
|||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class Swedish(Language):
|
||||
lang = 'sv'
|
||||
Defaults = SwedishDefaults
|
||||
|
||||
|
||||
__all__ = ['Swedish']
|
||||
|
|
28
spacy/lang/xx/__init__.py
Normal file
28
spacy/lang/xx/__init__.py
Normal file
|
@ -0,0 +1,28 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, NORM
|
||||
from ...util import update_exc, add_lookups
|
||||
|
||||
|
||||
class MultiLanguageDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'xx'
|
||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
|
||||
|
||||
class MultiLanguage(Language):
|
||||
"""Language class to be used for models that support multiple languages.
|
||||
This module allows models to specify their language ID as 'xx'.
|
||||
"""
|
||||
lang = 'xx'
|
||||
Defaults = MultiLanguageDefaults
|
||||
|
||||
|
||||
__all__ = ['MultiLanguage']
|
|
@ -6,16 +6,23 @@ import dill
|
|||
import numpy
|
||||
from thinc.neural import Model
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.optimizers import Adam, SGD
|
||||
import random
|
||||
import ujson
|
||||
from collections import OrderedDict
|
||||
|
||||
from .tokenizer import Tokenizer
|
||||
from .vocab import Vocab
|
||||
from .tagger import Tagger
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .train import Trainer
|
||||
from .syntax.parser import get_templates
|
||||
from .syntax.nonproj import PseudoProjectivity
|
||||
from .syntax import nonproj
|
||||
|
||||
from .pipeline import NeuralDependencyParser, EntityRecognizer
|
||||
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
|
||||
from .pipeline import NeuralLabeller
|
||||
from .pipeline import SimilarityHook
|
||||
|
||||
from .compat import json_dumps
|
||||
from .attrs import IS_STOP
|
||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
|
@ -23,6 +30,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH
|
|||
from .lang.tag_map import TAG_MAP
|
||||
from .lang.lex_attrs import LEX_ATTRS
|
||||
from . import util
|
||||
from .scorer import Scorer
|
||||
|
||||
|
||||
class BaseDefaults(object):
|
||||
|
@ -80,21 +88,34 @@ class BaseDefaults(object):
|
|||
return NeuralEntityRecognizer(nlp.vocab, **cfg)
|
||||
|
||||
@classmethod
|
||||
def create_pipeline(cls, nlp=None):
|
||||
def create_pipeline(cls, nlp=None, disable=tuple()):
|
||||
meta = nlp.meta if nlp is not None else {}
|
||||
# Resolve strings, like "cnn", "lstm", etc
|
||||
pipeline = []
|
||||
for entry in cls.pipeline:
|
||||
if entry in disable or getattr(entry, 'name', entry) in disable:
|
||||
continue
|
||||
factory = cls.Defaults.factories[entry]
|
||||
pipeline.append(factory(nlp, **meta.get(entry, {})))
|
||||
return pipeline
|
||||
|
||||
factories = {
|
||||
'make_doc': create_tokenizer,
|
||||
'token_vectors': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
|
||||
'tags': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
|
||||
'dependencies': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),
|
||||
'entities': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
|
||||
'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
|
||||
'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
|
||||
'parser': lambda nlp, **cfg: [
|
||||
NeuralDependencyParser(nlp.vocab, **cfg),
|
||||
nonproj.deprojectivize],
|
||||
'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
|
||||
# Temporary compatibility -- delete after pivot
|
||||
'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
|
||||
'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
|
||||
'dependencies': lambda nlp, **cfg: [
|
||||
NeuralDependencyParser(nlp.vocab, **cfg),
|
||||
nonproj.deprojectivize,
|
||||
],
|
||||
'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
|
||||
'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)]
|
||||
}
|
||||
|
||||
token_match = TOKEN_MATCH
|
||||
|
@ -112,19 +133,39 @@ class BaseDefaults(object):
|
|||
lemma_index = {}
|
||||
morph_rules = {}
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
syntax_iterators = {}
|
||||
|
||||
|
||||
class Language(object):
|
||||
"""
|
||||
A text-processing pipeline. Usually you'll load this once per process, and
|
||||
pass the instance around your program.
|
||||
"""A text-processing pipeline. Usually you'll load this once per process,
|
||||
and pass the instance around your application.
|
||||
|
||||
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
||||
object and processing pipeline.
|
||||
lang (unicode): Two-letter language ID, i.e. ISO code.
|
||||
"""
|
||||
Defaults = BaseDefaults
|
||||
lang = None
|
||||
|
||||
def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
|
||||
self.meta = dict(meta)
|
||||
def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={},
|
||||
disable=tuple(), **kwargs):
|
||||
"""Initialise a Language object.
|
||||
|
||||
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
|
||||
`Language.Defaults.create_vocab`.
|
||||
make_doc (callable): A function that takes text and returns a `Doc`
|
||||
object. Usually a `Tokenizer`.
|
||||
pipeline (list): A list of annotation processes or IDs of annotation,
|
||||
processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
|
||||
up in `Language.Defaults.factories`.
|
||||
disable (list): A list of component names to exclude from the pipeline.
|
||||
The disable list has priority over the pipeline list -- if the same
|
||||
string occurs in both, the component is not loaded.
|
||||
meta (dict): Custom meta data for the Language class. Is written to by
|
||||
models to add model meta data.
|
||||
RETURNS (Language): The newly constructed object.
|
||||
"""
|
||||
self.meta = dict(meta)
|
||||
if vocab is True:
|
||||
factory = self.Defaults.create_vocab
|
||||
vocab = factory(self, **meta.get('vocab', {}))
|
||||
|
@ -132,11 +173,15 @@ class Language(object):
|
|||
if make_doc is True:
|
||||
factory = self.Defaults.create_tokenizer
|
||||
make_doc = factory(self, **meta.get('tokenizer', {}))
|
||||
self.make_doc = make_doc
|
||||
self.tokenizer = make_doc
|
||||
if pipeline is True:
|
||||
self.pipeline = self.Defaults.create_pipeline(self)
|
||||
self.pipeline = self.Defaults.create_pipeline(self, disable)
|
||||
elif pipeline:
|
||||
self.pipeline = list(pipeline)
|
||||
# Careful not to do getattr(p, 'name', None) here
|
||||
# If we had disable=[None], we'd disable everything!
|
||||
self.pipeline = [p for p in pipeline
|
||||
if p not in disable
|
||||
and getattr(p, 'name', p) not in disable]
|
||||
# Resolve strings, like "cnn", "lstm", etc
|
||||
for i, entry in enumerate(self.pipeline):
|
||||
if entry in self.Defaults.factories:
|
||||
|
@ -144,82 +189,187 @@ class Language(object):
|
|||
self.pipeline[i] = factory(self, **meta.get(entry, {}))
|
||||
else:
|
||||
self.pipeline = []
|
||||
flat_list = []
|
||||
for pipe in self.pipeline:
|
||||
if isinstance(pipe, list):
|
||||
flat_list.extend(pipe)
|
||||
else:
|
||||
flat_list.append(pipe)
|
||||
self.pipeline = flat_list
|
||||
|
||||
def __call__(self, text, state=None, **disabled):
|
||||
"""
|
||||
Apply the pipeline to some text. The text can span multiple sentences,
|
||||
# Conveniences to access pipeline components
|
||||
@property
|
||||
def tensorizer(self):
|
||||
return self.get_component('tensorizer')
|
||||
|
||||
@property
|
||||
def tagger(self):
|
||||
return self.get_component('tagger')
|
||||
|
||||
@property
|
||||
def parser(self):
|
||||
return self.get_component('parser')
|
||||
|
||||
@property
|
||||
def entity(self):
|
||||
return self.get_component('ner')
|
||||
|
||||
@property
|
||||
def matcher(self):
|
||||
return self.get_component('matcher')
|
||||
|
||||
def get_component(self, name):
|
||||
if self.pipeline in (True, None):
|
||||
return None
|
||||
for proc in self.pipeline:
|
||||
if hasattr(proc, 'name') and proc.name.endswith(name):
|
||||
return proc
|
||||
return None
|
||||
|
||||
def __call__(self, text, disable=[]):
|
||||
"""'Apply the pipeline to some text. The text can span multiple sentences,
|
||||
and can contain arbtrary whitespace. Alignment into the original string
|
||||
is preserved.
|
||||
|
||||
Args:
|
||||
text (unicode): The text to be processed.
|
||||
state: Arbitrary
|
||||
disable (list): Names of the pipeline components to disable.
|
||||
RETURNS (Doc): A container for accessing the annotations.
|
||||
|
||||
Returns:
|
||||
doc (Doc): A container for accessing the annotations.
|
||||
|
||||
Example:
|
||||
>>> from spacy.en import English
|
||||
>>> nlp = English()
|
||||
EXAMPLE:
|
||||
>>> tokens = nlp('An example sentence. Another example sentence.')
|
||||
>>> tokens[0].orth_, tokens[0].head.tag_
|
||||
>>> tokens[0].text, tokens[0].head.tag_
|
||||
('An', 'NN')
|
||||
"""
|
||||
doc = self.make_doc(text)
|
||||
for proc in self.pipeline:
|
||||
name = getattr(proc, 'name', None)
|
||||
if name in disabled and not disabled[name]:
|
||||
if name in disable:
|
||||
continue
|
||||
state = proc(doc, state=state)
|
||||
doc = proc(doc)
|
||||
return doc
|
||||
|
||||
def update(self, docs, golds, state=None, drop=0., sgd=None):
|
||||
def make_doc(self, text):
|
||||
return self.tokenizer(text)
|
||||
|
||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||
"""Update the models in the pipeline.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
golds (iterable): A batch of `GoldParse` objects.
|
||||
drop (float): The droput rate.
|
||||
sgd (callable): An optimizer.
|
||||
RETURNS (dict): Results from the update.
|
||||
|
||||
EXAMPLE:
|
||||
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||
>>> for epoch in trainer.epochs(gold):
|
||||
>>> for docs, golds in epoch:
|
||||
>>> state = nlp.update(docs, golds, sgd=optimizer)
|
||||
"""
|
||||
tok2vec = self.pipeline[0]
|
||||
feats = tok2vec.doc2feats(docs)
|
||||
grads = {}
|
||||
def get_grads(W, dW, key=None):
|
||||
grads[key] = (W, dW)
|
||||
state = {} if state is None else state
|
||||
for process in self.pipeline:
|
||||
if hasattr(process, 'update'):
|
||||
state = process.update(docs, golds,
|
||||
state=state,
|
||||
drop=drop,
|
||||
sgd=get_grads)
|
||||
else:
|
||||
process(docs, state=state)
|
||||
if sgd is not None:
|
||||
pipes = list(self.pipeline[1:])
|
||||
random.shuffle(pipes)
|
||||
for proc in pipes:
|
||||
if not hasattr(proc, 'update'):
|
||||
continue
|
||||
tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
|
||||
d_tokvecses = proc.update((docs, tokvecses), golds,
|
||||
drop=drop, sgd=get_grads, losses=losses)
|
||||
if d_tokvecses is not None:
|
||||
bp_tokvecses(d_tokvecses, sgd=sgd)
|
||||
for key, (W, dW) in grads.items():
|
||||
# TODO: Unhack this when thinc improves
|
||||
if isinstance(W, numpy.ndarray):
|
||||
sgd.ops = NumpyOps()
|
||||
else:
|
||||
sgd.ops = CupyOps()
|
||||
sgd(W, dW, key=key)
|
||||
return state
|
||||
# Clear the tensor variable, to free GPU memory.
|
||||
# If we don't do this, the memory leak gets pretty
|
||||
# bad, because we may be holding part of a batch.
|
||||
for doc in docs:
|
||||
doc.tensor = None
|
||||
|
||||
@contextmanager
|
||||
def begin_training(self, gold_tuples, **cfg):
|
||||
def preprocess_gold(self, docs_golds):
|
||||
"""Can be called before training to pre-process gold data. By default,
|
||||
it handles nonprojectivity and adds missing tags to the tag map.
|
||||
|
||||
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
|
||||
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
|
||||
"""
|
||||
for proc in self.pipeline:
|
||||
if hasattr(proc, 'preprocess_gold'):
|
||||
docs_golds = proc.preprocess_gold(docs_golds)
|
||||
for doc, gold in docs_golds:
|
||||
yield doc, gold
|
||||
|
||||
def begin_training(self, get_gold_tuples, **cfg):
|
||||
"""Allocate models, pre-process training data and acquire a trainer and
|
||||
optimizer. Used as a contextmanager.
|
||||
|
||||
gold_tuples (iterable): Gold-standard training data.
|
||||
**cfg: Config parameters.
|
||||
YIELDS (tuple): A trainer and an optimizer.
|
||||
|
||||
EXAMPLE:
|
||||
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||
>>> for epoch in trainer.epochs(gold):
|
||||
>>> for docs, golds in epoch:
|
||||
>>> state = nlp.update(docs, golds, sgd=optimizer)
|
||||
"""
|
||||
if self.parser:
|
||||
self.pipeline.append(NeuralLabeller(self.vocab))
|
||||
# Populate vocab
|
||||
for _, annots_brackets in gold_tuples:
|
||||
for _, annots_brackets in get_gold_tuples():
|
||||
for annots, _ in annots_brackets:
|
||||
for word in annots[1]:
|
||||
_ = self.vocab[word]
|
||||
# Handle crossing dependencies
|
||||
gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
|
||||
contexts = []
|
||||
if cfg.get('use_gpu'):
|
||||
if cfg.get('device', -1) >= 0:
|
||||
import cupy.cuda.device
|
||||
device = cupy.cuda.device.Device(cfg['device'])
|
||||
device.use()
|
||||
Model.ops = CupyOps()
|
||||
Model.Ops = CupyOps
|
||||
print("Use GPU")
|
||||
else:
|
||||
device = None
|
||||
for proc in self.pipeline:
|
||||
if hasattr(proc, 'begin_training'):
|
||||
context = proc.begin_training(gold_tuples,
|
||||
context = proc.begin_training(get_gold_tuples(),
|
||||
pipeline=self.pipeline)
|
||||
contexts.append(context)
|
||||
trainer = Trainer(self, gold_tuples, **cfg)
|
||||
yield trainer, trainer.optimizer
|
||||
learn_rate = util.env_opt('learn_rate', 0.001)
|
||||
beta1 = util.env_opt('optimizer_B1', 0.9)
|
||||
beta2 = util.env_opt('optimizer_B2', 0.999)
|
||||
eps = util.env_opt('optimizer_eps', 1e-08)
|
||||
L2 = util.env_opt('L2_penalty', 1e-6)
|
||||
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
||||
optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
||||
beta2=beta2, eps=eps)
|
||||
optimizer.max_grad_norm = max_grad_norm
|
||||
optimizer.device = device
|
||||
return optimizer
|
||||
|
||||
def evaluate(self, docs_golds):
|
||||
docs, golds = zip(*docs_golds)
|
||||
scorer = Scorer()
|
||||
for doc, gold in zip(self.pipe(docs, batch_size=32), golds):
|
||||
scorer.score(doc, gold)
|
||||
doc.tensor = None
|
||||
return scorer
|
||||
|
||||
@contextmanager
|
||||
def use_params(self, params, **cfg):
|
||||
"""Replace weights of models in the pipeline with those provided in the
|
||||
params dictionary. Can be used as a contextmanager, in which case,
|
||||
models go back to their original weights after the block.
|
||||
|
||||
params (dict): A dictionary of parameters keyed by model ID.
|
||||
**cfg: Config parameters.
|
||||
|
||||
EXAMPLE:
|
||||
>>> with nlp.use_params(optimizer.averages):
|
||||
>>> nlp.to_disk('/tmp/checkpoint')
|
||||
"""
|
||||
contexts = [pipe.use_params(params) for pipe
|
||||
in self.pipeline if hasattr(pipe, 'use_params')]
|
||||
# TODO: Having trouble with contextlib
|
||||
|
@ -236,98 +386,141 @@ class Language(object):
|
|||
except StopIteration:
|
||||
pass
|
||||
|
||||
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
|
||||
"""
|
||||
Process texts as a stream, and yield Doc objects in order.
|
||||
def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
|
||||
"""Process texts as a stream, and yield `Doc` objects in order. Supports
|
||||
GIL-free multi-threading.
|
||||
|
||||
Supports GIL-free multi-threading.
|
||||
texts (iterator): A sequence of texts to process.
|
||||
n_threads (int): The number of worker threads to use. If -1, OpenMP will
|
||||
decide how many to use at run time. Default is 2.
|
||||
batch_size (int): The number of texts to buffer.
|
||||
disable (list): Names of the pipeline components to disable.
|
||||
YIELDS (Doc): Documents in the order of the original text.
|
||||
|
||||
Arguments:
|
||||
texts (iterator)
|
||||
tag (bool)
|
||||
parse (bool)
|
||||
entity (bool)
|
||||
EXAMPLE:
|
||||
>>> texts = [u'One document.', u'...', u'Lots of documents']
|
||||
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
|
||||
>>> assert doc.is_parsed
|
||||
"""
|
||||
#stream = ((self.make_doc(text), None) for text in texts)
|
||||
stream = ((doc, {}) for doc in texts)
|
||||
docs = (self.make_doc(text) for text in texts)
|
||||
docs = texts
|
||||
for proc in self.pipeline:
|
||||
name = getattr(proc, 'name', None)
|
||||
if name in disabled and not disabled[name]:
|
||||
if name in disable:
|
||||
continue
|
||||
|
||||
if hasattr(proc, 'pipe'):
|
||||
stream = proc.pipe(stream, n_threads=n_threads, batch_size=batch_size)
|
||||
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
|
||||
else:
|
||||
stream = (proc(doc, state) for doc, state in stream)
|
||||
for doc, state in stream:
|
||||
# Apply the function, but yield the doc
|
||||
docs = _pipe(proc, docs)
|
||||
for doc in docs:
|
||||
yield doc
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
"""Save the current state to a directory.
|
||||
def to_disk(self, path, disable=tuple()):
|
||||
"""Save the current state to a directory. If a model is loaded, this
|
||||
will include the model.
|
||||
|
||||
Args:
|
||||
path: A path to a directory, which will be created if it doesn't
|
||||
exist. Paths may be either strings or pathlib.Path-like
|
||||
objects.
|
||||
**exclude: Prevent named attributes from being saved.
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
disable (list): Names of pipeline components to disable and prevent
|
||||
from being saved.
|
||||
|
||||
EXAMPLE:
|
||||
>>> nlp.to_disk('/path/to/models')
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
if not path.exists():
|
||||
path.mkdir()
|
||||
if not path.is_dir():
|
||||
raise IOError("Output path must be a directory")
|
||||
props = {}
|
||||
for name, value in self.__dict__.items():
|
||||
if name in exclude:
|
||||
serializers = OrderedDict((
|
||||
('vocab', lambda p: self.vocab.to_disk(p)),
|
||||
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
|
||||
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
|
||||
))
|
||||
for proc in self.pipeline:
|
||||
if not hasattr(proc, 'name'):
|
||||
continue
|
||||
if hasattr(value, 'to_disk'):
|
||||
value.to_disk(path / name)
|
||||
else:
|
||||
props[name] = value
|
||||
with (path / 'props.pickle').open('wb') as file_:
|
||||
dill.dump(props, file_)
|
||||
if proc.name in disable:
|
||||
continue
|
||||
if not hasattr(proc, 'to_disk'):
|
||||
continue
|
||||
serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
|
||||
util.to_disk(path, serializers, {p: False for p in disable})
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
"""Load the current state from a directory.
|
||||
def from_disk(self, path, disable=tuple()):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it. If the saved `Language` object contains a model, the
|
||||
model will be loaded.
|
||||
|
||||
Args:
|
||||
path: A path to a directory. Paths may be either strings or
|
||||
pathlib.Path-like objects.
|
||||
**exclude: Prevent named attributes from being saved.
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
disable (list): Names of the pipeline components to disable.
|
||||
RETURNS (Language): The modified `Language` object.
|
||||
|
||||
EXAMPLE:
|
||||
>>> from spacy.language import Language
|
||||
>>> nlp = Language().from_disk('/path/to/models')
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
for name in path.iterdir():
|
||||
if name not in exclude and hasattr(self, str(name)):
|
||||
getattr(self, name).from_disk(path / name)
|
||||
with (path / 'props.pickle').open('rb') as file_:
|
||||
bytes_data = file_.read()
|
||||
self.from_bytes(bytes_data, **exclude)
|
||||
deserializers = OrderedDict((
|
||||
('vocab', lambda p: self.vocab.from_disk(p)),
|
||||
('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
|
||||
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
|
||||
))
|
||||
for proc in self.pipeline:
|
||||
if not hasattr(proc, 'name'):
|
||||
continue
|
||||
if proc.name in disable:
|
||||
continue
|
||||
if not hasattr(proc, 'to_disk'):
|
||||
continue
|
||||
deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
|
||||
exclude = {p: False for p in disable}
|
||||
if not (path / 'vocab').exists():
|
||||
exclude['vocab'] = True
|
||||
util.from_disk(path, deserializers, exclude)
|
||||
return self
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
def to_bytes(self, disable=[]):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
Args:
|
||||
path: A path to a directory. Paths may be either strings or
|
||||
pathlib.Path-like objects.
|
||||
**exclude: Prevent named attributes from being serialized.
|
||||
disable (list): Nameds of pipeline components to disable and prevent
|
||||
from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `Language` object.
|
||||
"""
|
||||
props = dict(self.__dict__)
|
||||
for key in exclude:
|
||||
if key in props:
|
||||
props.pop(key)
|
||||
return dill.dumps(props, -1)
|
||||
serializers = OrderedDict((
|
||||
('vocab', lambda: self.vocab.to_bytes()),
|
||||
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
|
||||
('meta', lambda: ujson.dumps(self.meta))
|
||||
))
|
||||
for i, proc in enumerate(self.pipeline):
|
||||
if getattr(proc, 'name', None) in disable:
|
||||
continue
|
||||
if not hasattr(proc, 'to_bytes'):
|
||||
continue
|
||||
serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False)
|
||||
return util.to_bytes(serializers, {})
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
def from_bytes(self, bytes_data, disable=[]):
|
||||
"""Load state from a binary string.
|
||||
|
||||
Args:
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Prevent named attributes from being loaded.
|
||||
disable (list): Names of the pipeline components to disable.
|
||||
RETURNS (Language): The `Language` object.
|
||||
"""
|
||||
props = dill.loads(bytes_data)
|
||||
for key, value in props.items():
|
||||
if key not in exclude:
|
||||
setattr(self, key, value)
|
||||
deserializers = OrderedDict((
|
||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||
('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
|
||||
('meta', lambda b: self.meta.update(ujson.loads(b)))
|
||||
))
|
||||
for i, proc in enumerate(self.pipeline):
|
||||
if getattr(proc, 'name', None) in disable:
|
||||
continue
|
||||
if not hasattr(proc, 'from_bytes'):
|
||||
continue
|
||||
deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
|
||||
msg = util.from_bytes(bytes_data, deserializers, {})
|
||||
return self
|
||||
|
||||
|
||||
def _pipe(func, docs):
|
||||
for doc in docs:
|
||||
func(doc)
|
||||
yield doc
|
||||
|
|
|
@ -27,7 +27,7 @@ cdef class Lexeme:
|
|||
cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil:
|
||||
cdef SerializedLexemeC lex_data
|
||||
buff = <const unsigned char*>&lex.flags
|
||||
end = <const unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
|
||||
end = <const unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
|
||||
for i in range(sizeof(lex_data.data)):
|
||||
lex_data.data[i] = buff[i]
|
||||
return lex_data
|
||||
|
@ -35,7 +35,7 @@ cdef class Lexeme:
|
|||
@staticmethod
|
||||
cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
|
||||
buff = <unsigned char*>&lex.flags
|
||||
end = <unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
|
||||
end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
|
||||
for i in range(sizeof(lex_data.data)):
|
||||
buff[i] = lex_data.data[i]
|
||||
|
||||
|
|
117
spacy/lexeme.pyx
117
spacy/lexeme.pyx
|
@ -30,19 +30,16 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
|||
|
||||
|
||||
cdef class Lexeme:
|
||||
"""
|
||||
An entry in the vocabulary. A Lexeme has no string context --- it's a
|
||||
"""An entry in the vocabulary. A `Lexeme` has no string context – it's a
|
||||
word-type, as opposed to a word token. It therefore has no part-of-speech
|
||||
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
|
||||
tag).
|
||||
"""
|
||||
def __init__(self, Vocab vocab, int orth):
|
||||
"""
|
||||
Create a Lexeme object.
|
||||
def __init__(self, Vocab vocab, attr_t orth):
|
||||
"""Create a Lexeme object.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab): The parent vocabulary
|
||||
orth (int): The orth id of the lexeme.
|
||||
orth (uint64): The orth id of the lexeme.
|
||||
Returns (Lexeme): The newly constructd object.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
|
@ -54,7 +51,7 @@ cdef class Lexeme:
|
|||
if isinstance(other, Lexeme):
|
||||
a = self.orth
|
||||
b = other.orth
|
||||
elif isinstance(other, int):
|
||||
elif isinstance(other, long):
|
||||
a = self.orth
|
||||
b = other
|
||||
elif isinstance(other, str):
|
||||
|
@ -82,35 +79,28 @@ cdef class Lexeme:
|
|||
return self.c.orth
|
||||
|
||||
def set_flag(self, attr_id_t flag_id, bint value):
|
||||
"""
|
||||
Change the value of a boolean flag.
|
||||
"""Change the value of a boolean flag.
|
||||
|
||||
Arguments:
|
||||
flag_id (int): The attribute ID of the flag to set.
|
||||
value (bool): The new value of the flag.
|
||||
"""
|
||||
Lexeme.c_set_flag(self.c, flag_id, value)
|
||||
|
||||
def check_flag(self, attr_id_t flag_id):
|
||||
"""
|
||||
Check the value of a boolean flag.
|
||||
"""Check the value of a boolean flag.
|
||||
|
||||
Arguments:
|
||||
flag_id (int): The attribute ID of the flag to query.
|
||||
Returns (bool): The value of the flag.
|
||||
RETURNS (bool): The value of the flag.
|
||||
"""
|
||||
return True if Lexeme.c_check_flag(self.c, flag_id) else False
|
||||
|
||||
def similarity(self, other):
|
||||
"""
|
||||
Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
"""Compute a semantic similarity estimate. Defaults to cosine over
|
||||
vectors.
|
||||
|
||||
Arguments:
|
||||
other:
|
||||
The object to compare with. By default, accepts Doc, Span,
|
||||
Token and Lexeme objects.
|
||||
Returns:
|
||||
score (float): A scalar similarity score. Higher is more similar.
|
||||
other (object): The object to compare with. By default, accepts `Doc`,
|
||||
`Span`, `Token` and `Lexeme` objects.
|
||||
RETURNS (float): A scalar similarity score. Higher is more similar.
|
||||
"""
|
||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||
return 0.0
|
||||
|
@ -119,7 +109,7 @@ cdef class Lexeme:
|
|||
def to_bytes(self):
|
||||
lex_data = Lexeme.c_to_bytes(self.c)
|
||||
start = <const char*>&self.c.flags
|
||||
end = <const char*>&self.c.l2_norm + sizeof(self.c.l2_norm)
|
||||
end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
|
||||
assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data))
|
||||
byte_string = b'\0' * sizeof(lex_data.data)
|
||||
byte_chars = <char*>byte_string
|
||||
|
@ -140,22 +130,29 @@ cdef class Lexeme:
|
|||
self.orth = self.c.orth
|
||||
|
||||
property has_vector:
|
||||
"""A boolean value indicating whether a word vector is associated with
|
||||
the object.
|
||||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef int i
|
||||
for i in range(self.vocab.vectors_length):
|
||||
if self.c.vector[i] != 0:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
return self.vocab.has_vector(self.c.orth)
|
||||
|
||||
property vector_norm:
|
||||
def __get__(self):
|
||||
return self.c.l2_norm
|
||||
"""The L2 norm of the lexeme's vector representation.
|
||||
|
||||
def __set__(self, float value):
|
||||
self.c.l2_norm = value
|
||||
RETURNS (float): The L2 norm of the vector representation.
|
||||
"""
|
||||
def __get__(self):
|
||||
vector = self.vector
|
||||
return numpy.sqrt((vector**2).sum())
|
||||
|
||||
property vector:
|
||||
"""A real-valued meaning representation.
|
||||
|
||||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||
representing the lexeme's semantics.
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef int length = self.vocab.vectors_length
|
||||
if length == 0:
|
||||
|
@ -165,27 +162,16 @@ cdef class Lexeme:
|
|||
"model doesn't include word vectors. For more info, see "
|
||||
"the documentation: \n%s\n" % about.__docs_models__
|
||||
)
|
||||
|
||||
vector_view = <float[:length,]>self.c.vector
|
||||
return numpy.asarray(vector_view)
|
||||
return self.vocab.get_vector(self.c.orth)
|
||||
|
||||
def __set__(self, vector):
|
||||
assert len(vector) == self.vocab.vectors_length
|
||||
cdef float value
|
||||
cdef double norm = 0.0
|
||||
for i, value in enumerate(vector):
|
||||
self.c.vector[i] = value
|
||||
norm += value * value
|
||||
self.c.l2_norm = sqrt(norm)
|
||||
self.vocab.set_vector(self.c.orth, vector)
|
||||
|
||||
property rank:
|
||||
def __get__(self):
|
||||
return self.c.id
|
||||
|
||||
property repvec:
|
||||
def __get__(self):
|
||||
raise AttributeError("lex.repvec has been renamed to lex.vector")
|
||||
|
||||
property sentiment:
|
||||
def __get__(self):
|
||||
return self.c.sentiment
|
||||
|
@ -196,33 +182,41 @@ cdef class Lexeme:
|
|||
def __get__(self):
|
||||
return self.vocab.strings[self.c.orth]
|
||||
|
||||
property text:
|
||||
"""A unicode representation of the token text.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the token.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.orth_
|
||||
|
||||
property lower:
|
||||
def __get__(self): return self.c.lower
|
||||
def __set__(self, int x): self.c.lower = x
|
||||
def __set__(self, attr_t x): self.c.lower = x
|
||||
|
||||
property norm:
|
||||
def __get__(self): return self.c.norm
|
||||
def __set__(self, int x): self.c.norm = x
|
||||
def __set__(self, attr_t x): self.c.norm = x
|
||||
|
||||
property shape:
|
||||
def __get__(self): return self.c.shape
|
||||
def __set__(self, int x): self.c.shape = x
|
||||
def __set__(self, attr_t x): self.c.shape = x
|
||||
|
||||
property prefix:
|
||||
def __get__(self): return self.c.prefix
|
||||
def __set__(self, int x): self.c.prefix = x
|
||||
def __set__(self, attr_t x): self.c.prefix = x
|
||||
|
||||
property suffix:
|
||||
def __get__(self): return self.c.suffix
|
||||
def __set__(self, int x): self.c.suffix = x
|
||||
def __set__(self, attr_t x): self.c.suffix = x
|
||||
|
||||
property cluster:
|
||||
def __get__(self): return self.c.cluster
|
||||
def __set__(self, int x): self.c.cluster = x
|
||||
def __set__(self, attr_t x): self.c.cluster = x
|
||||
|
||||
property lang:
|
||||
def __get__(self): return self.c.lang
|
||||
def __set__(self, int x): self.c.lang = x
|
||||
def __set__(self, attr_t x): self.c.lang = x
|
||||
|
||||
property prob:
|
||||
def __get__(self): return self.c.prob
|
||||
|
@ -230,27 +224,27 @@ cdef class Lexeme:
|
|||
|
||||
property lower_:
|
||||
def __get__(self): return self.vocab.strings[self.c.lower]
|
||||
def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]
|
||||
def __set__(self, unicode x): self.c.lower = self.vocab.strings.add(x)
|
||||
|
||||
property norm_:
|
||||
def __get__(self): return self.vocab.strings[self.c.norm]
|
||||
def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]
|
||||
def __set__(self, unicode x): self.c.norm = self.vocab.strings.add(x)
|
||||
|
||||
property shape_:
|
||||
def __get__(self): return self.vocab.strings[self.c.shape]
|
||||
def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]
|
||||
def __set__(self, unicode x): self.c.shape = self.vocab.strings.add(x)
|
||||
|
||||
property prefix_:
|
||||
def __get__(self): return self.vocab.strings[self.c.prefix]
|
||||
def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x]
|
||||
def __set__(self, unicode x): self.c.prefix = self.vocab.strings.add(x)
|
||||
|
||||
property suffix_:
|
||||
def __get__(self): return self.vocab.strings[self.c.suffix]
|
||||
def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
|
||||
def __set__(self, unicode x): self.c.suffix = self.vocab.strings.add(x)
|
||||
|
||||
property lang_:
|
||||
def __get__(self): return self.vocab.strings[self.c.lang]
|
||||
def __set__(self, unicode x): self.c.lang = self.vocab.strings[x]
|
||||
def __set__(self, unicode x): self.c.lang = self.vocab.strings.add(x)
|
||||
|
||||
property flags:
|
||||
def __get__(self): return self.c.flags
|
||||
|
@ -258,7 +252,7 @@ cdef class Lexeme:
|
|||
|
||||
property is_oov:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_OOV, x)
|
||||
def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x)
|
||||
|
||||
property is_stop:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)
|
||||
|
@ -308,7 +302,6 @@ cdef class Lexeme:
|
|||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
||||
|
||||
|
||||
property like_url:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
||||
|
|
|
@ -87,7 +87,7 @@ ctypedef TokenPatternC* TokenPatternC_ptr
|
|||
ctypedef pair[int, TokenPatternC_ptr] StateC
|
||||
|
||||
|
||||
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
|
||||
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
|
||||
object token_specs) except NULL:
|
||||
pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
|
||||
cdef int i
|
||||
|
@ -99,15 +99,21 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
|
|||
pattern[i].attrs[j].attr = attr
|
||||
pattern[i].attrs[j].value = value
|
||||
i = len(token_specs)
|
||||
pattern[i].attrs = <AttrValueC*>mem.alloc(3, sizeof(AttrValueC))
|
||||
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
|
||||
pattern[i].attrs[0].attr = ID
|
||||
pattern[i].attrs[0].value = entity_id
|
||||
pattern[i].attrs[1].attr = ENT_TYPE
|
||||
pattern[i].attrs[1].value = label
|
||||
pattern[i].nr_attr = 0
|
||||
return pattern
|
||||
|
||||
|
||||
cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
|
||||
while pattern.nr_attr != 0:
|
||||
pattern += 1
|
||||
id_attr = pattern[0].attrs[0]
|
||||
assert id_attr.attr == ID
|
||||
return id_attr.value
|
||||
|
||||
|
||||
cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
|
||||
for attr in pattern.attrs[:pattern.nr_attr]:
|
||||
if get_token_attr(token, attr.attr) != attr.value:
|
||||
|
@ -148,7 +154,7 @@ def _convert_strings(token_specs, string_store):
|
|||
if isinstance(attr, basestring):
|
||||
attr = attrs.IDS.get(attr.upper())
|
||||
if isinstance(value, basestring):
|
||||
value = string_store[value]
|
||||
value = string_store.add(value)
|
||||
if isinstance(value, bool):
|
||||
value = int(value)
|
||||
if attr is not None:
|
||||
|
@ -159,14 +165,14 @@ def _convert_strings(token_specs, string_store):
|
|||
|
||||
|
||||
def merge_phrase(matcher, doc, i, matches):
|
||||
'''Callback to merge a phrase on match'''
|
||||
"""Callback to merge a phrase on match."""
|
||||
ent_id, label, start, end = matches[i]
|
||||
span = doc[start : end]
|
||||
span.merge(ent_type=label, ent_id=ent_id)
|
||||
|
||||
|
||||
cdef class Matcher:
|
||||
'''Match sequences of tokens, based on pattern rules.'''
|
||||
"""Match sequences of tokens, based on pattern rules."""
|
||||
cdef Pool mem
|
||||
cdef vector[TokenPatternC*] patterns
|
||||
cdef readonly Vocab vocab
|
||||
|
@ -175,37 +181,12 @@ cdef class Matcher:
|
|||
cdef public object _callbacks
|
||||
cdef public object _acceptors
|
||||
|
||||
@classmethod
|
||||
def load(cls, path, vocab):
|
||||
"""
|
||||
Load the matcher and patterns from a file path.
|
||||
def __init__(self, vocab):
|
||||
"""Create the Matcher.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
Path to a JSON-formatted patterns file.
|
||||
vocab (Vocab):
|
||||
The vocabulary that the documents to match over will refer to.
|
||||
Returns:
|
||||
Matcher: The newly constructed object.
|
||||
"""
|
||||
if (path / 'gazetteer.json').exists():
|
||||
with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
|
||||
patterns = ujson.load(file_)
|
||||
else:
|
||||
patterns = {}
|
||||
return cls(vocab, patterns)
|
||||
|
||||
def __init__(self, vocab, patterns={}):
|
||||
"""
|
||||
Create the Matcher.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
The vocabulary object, which must be shared with the documents
|
||||
the matcher will operate on.
|
||||
patterns (dict): Patterns to add to the matcher.
|
||||
Returns:
|
||||
The newly constructed object.
|
||||
vocab (Vocab): The vocabulary object, which must be shared with the
|
||||
documents the matcher will operate on.
|
||||
RETURNS (Matcher): The newly constructed object.
|
||||
"""
|
||||
self._patterns = {}
|
||||
self._entities = {}
|
||||
|
@ -213,144 +194,111 @@ cdef class Matcher:
|
|||
self._callbacks = {}
|
||||
self.vocab = vocab
|
||||
self.mem = Pool()
|
||||
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
|
||||
self.add_entity(entity_key, attrs)
|
||||
for spec in specs:
|
||||
self.add_pattern(entity_key, spec, label=etype)
|
||||
|
||||
def __reduce__(self):
|
||||
return (self.__class__, (self.vocab, self._patterns), None, None)
|
||||
|
||||
property n_patterns:
|
||||
def __get__(self): return self.patterns.size()
|
||||
def __len__(self):
|
||||
"""Get the number of rules added to the matcher. Note that this only
|
||||
returns the number of rules (identical with the number of IDs), not the
|
||||
number of individual patterns.
|
||||
|
||||
def add_entity(self, entity_key, attrs=None, if_exists='raise',
|
||||
acceptor=None, on_match=None):
|
||||
RETURNS (int): The number of rules.
|
||||
"""
|
||||
Add an entity to the matcher.
|
||||
return len(self._patterns)
|
||||
|
||||
Arguments:
|
||||
entity_key (unicode or int):
|
||||
An ID for the entity.
|
||||
attrs:
|
||||
Attributes to associate with the Matcher.
|
||||
if_exists ('raise', 'ignore' or 'update'):
|
||||
Controls what happens if the entity ID already exists. Defaults to 'raise'.
|
||||
acceptor:
|
||||
Callback function to filter matches of the entity.
|
||||
on_match:
|
||||
Callback function to act on matches of the entity.
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
if if_exists not in ('raise', 'ignore', 'update'):
|
||||
raise ValueError(
|
||||
"Unexpected value for if_exists: %s.\n"
|
||||
"Expected one of: ['raise', 'ignore', 'update']" % if_exists)
|
||||
if attrs is None:
|
||||
attrs = {}
|
||||
entity_key = self.normalize_entity_key(entity_key)
|
||||
if self.has_entity(entity_key):
|
||||
if if_exists == 'raise':
|
||||
raise KeyError(
|
||||
"Tried to add entity %s. Entity exists, and if_exists='raise'.\n"
|
||||
"Set if_exists='ignore' or if_exists='update', or check with "
|
||||
"matcher.has_entity()")
|
||||
elif if_exists == 'ignore':
|
||||
return
|
||||
self._entities[entity_key] = dict(attrs)
|
||||
self._patterns.setdefault(entity_key, [])
|
||||
self._acceptors[entity_key] = acceptor
|
||||
self._callbacks[entity_key] = on_match
|
||||
def __contains__(self, key):
|
||||
"""Check whether the matcher contains rules for a match ID.
|
||||
|
||||
def add_pattern(self, entity_key, token_specs, label=""):
|
||||
key (unicode): The match ID.
|
||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||
"""
|
||||
Add a pattern to the matcher.
|
||||
return len(self._patterns)
|
||||
|
||||
Arguments:
|
||||
entity_key (unicode or int):
|
||||
An ID for the entity.
|
||||
token_specs:
|
||||
Description of the pattern to be matched.
|
||||
label:
|
||||
Label to assign to the matched pattern. Defaults to "".
|
||||
Returns:
|
||||
None
|
||||
def add(self, key, on_match, *patterns):
|
||||
"""Add a match-rule to the matcher.
|
||||
A match-rule consists of: an ID key, an on_match callback, and one or
|
||||
more patterns. If the key exists, the patterns are appended to the
|
||||
previous ones, and the previous on_match callback is replaced. The
|
||||
`on_match` callback will receive the arguments `(matcher, doc, i,
|
||||
matches)`. You can also set `on_match` to `None` to not perform any
|
||||
actions. A pattern consists of one or more `token_specs`, where a
|
||||
`token_spec` is a dictionary mapping attribute IDs to values. Token
|
||||
descriptors can also include quantifiers. There are currently important
|
||||
known problems with the quantifiers – see the docs.
|
||||
"""
|
||||
token_specs = list(token_specs)
|
||||
if len(token_specs) == 0:
|
||||
for pattern in patterns:
|
||||
if len(pattern) == 0:
|
||||
msg = ("Cannot add pattern for zero tokens to matcher.\n"
|
||||
"entity_key: {entity_key}\n"
|
||||
"label: {label}")
|
||||
raise ValueError(msg.format(entity_key=entity_key, label=label))
|
||||
entity_key = self.normalize_entity_key(entity_key)
|
||||
if not self.has_entity(entity_key):
|
||||
self.add_entity(entity_key)
|
||||
if isinstance(label, basestring):
|
||||
label = self.vocab.strings[label]
|
||||
elif label is None:
|
||||
label = 0
|
||||
spec = _convert_strings(token_specs, self.vocab.strings)
|
||||
"key: {key}\n")
|
||||
raise ValueError(msg.format(key=key))
|
||||
key = self._normalize_key(key)
|
||||
self._patterns.setdefault(key, [])
|
||||
self._callbacks[key] = on_match
|
||||
|
||||
self.patterns.push_back(init_pattern(self.mem, entity_key, label, spec))
|
||||
self._patterns[entity_key].append((label, token_specs))
|
||||
for pattern in patterns:
|
||||
specs = _convert_strings(pattern, self.vocab.strings)
|
||||
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
||||
self._patterns[key].append(specs)
|
||||
|
||||
def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None):
|
||||
self.add_entity(entity_key, attrs=attrs, if_exists='update',
|
||||
acceptor=acceptor, on_match=on_match)
|
||||
for spec in specs:
|
||||
self.add_pattern(entity_key, spec, label=label)
|
||||
def remove(self, key):
|
||||
"""Remove a rule from the matcher. A KeyError is raised if the key does
|
||||
not exist.
|
||||
|
||||
def normalize_entity_key(self, entity_key):
|
||||
if isinstance(entity_key, basestring):
|
||||
return self.vocab.strings[entity_key]
|
||||
key (unicode): The ID of the match rule.
|
||||
"""
|
||||
key = self._normalize_key(key)
|
||||
self._patterns.pop(key)
|
||||
self._callbacks.pop(key)
|
||||
cdef int i = 0
|
||||
while i < self.patterns.size():
|
||||
pattern_key = get_pattern_key(self.patterns.at(i))
|
||||
if pattern_key == key:
|
||||
self.patterns.erase(self.patterns.begin()+i)
|
||||
else:
|
||||
return entity_key
|
||||
i += 1
|
||||
|
||||
def has_entity(self, entity_key):
|
||||
"""
|
||||
Check whether the matcher has an entity.
|
||||
def has_key(self, key):
|
||||
"""Check whether the matcher has a rule with a given key.
|
||||
|
||||
Arguments:
|
||||
entity_key (string or int): The entity key to check.
|
||||
Returns:
|
||||
bool: Whether the matcher has the entity.
|
||||
key (string or int): The key to check.
|
||||
RETURNS (bool): Whether the matcher has the rule.
|
||||
"""
|
||||
entity_key = self.normalize_entity_key(entity_key)
|
||||
return entity_key in self._entities
|
||||
key = self._normalize_key(key)
|
||||
return key in self._patterns
|
||||
|
||||
def get_entity(self, entity_key):
|
||||
"""
|
||||
Retrieve the attributes stored for an entity.
|
||||
def get(self, key, default=None):
|
||||
"""Retrieve the pattern stored for a key.
|
||||
|
||||
Arguments:
|
||||
entity_key (unicode or int): The entity to retrieve.
|
||||
Returns:
|
||||
The entity attributes if present, otherwise None.
|
||||
key (unicode or int): The key to retrieve.
|
||||
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
|
||||
"""
|
||||
entity_key = self.normalize_entity_key(entity_key)
|
||||
if entity_key in self._entities:
|
||||
return self._entities[entity_key]
|
||||
else:
|
||||
return None
|
||||
key = self._normalize_key(key)
|
||||
if key not in self._patterns:
|
||||
return default
|
||||
return (self._callbacks[key], self._patterns[key])
|
||||
|
||||
def __call__(self, Doc doc, acceptor=None):
|
||||
"""
|
||||
Find all token sequences matching the supplied patterns on the Doc.
|
||||
def pipe(self, docs, batch_size=1000, n_threads=2):
|
||||
"""Match a stream of documents, yielding them in turn.
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
The document to match over.
|
||||
Returns:
|
||||
list
|
||||
A list of (entity_key, label_id, start, end) tuples,
|
||||
describing the matches. A match tuple describes a span doc[start:end].
|
||||
The label_id and entity_key are both integers.
|
||||
docs (iterable): A stream of documents.
|
||||
batch_size (int): The number of documents to accumulate into a working set.
|
||||
n_threads (int): The number of threads with which to work on the buffer
|
||||
in parallel, if the `Matcher` implementation supports multi-threading.
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
for doc in docs:
|
||||
self(doc)
|
||||
yield doc
|
||||
|
||||
def __call__(self, Doc doc):
|
||||
"""Find all token sequences matching the supplied patterns on the `Doc`.
|
||||
|
||||
doc (Doc): The document to match over.
|
||||
RETURNS (list): A list of `(key, label_id, start, end)` tuples,
|
||||
describing the matches. A match tuple describes a span
|
||||
`doc[start:end]`. The `label_id` and `key` are both integers.
|
||||
"""
|
||||
if acceptor is not None:
|
||||
raise ValueError(
|
||||
"acceptor keyword argument to Matcher deprecated. Specify acceptor "
|
||||
"functions when you add patterns instead.")
|
||||
cdef vector[StateC] partials
|
||||
cdef int n_partials = 0
|
||||
cdef int q = 0
|
||||
|
@ -388,13 +336,7 @@ cdef class Matcher:
|
|||
end = token_i+1
|
||||
ent_id = state.second[1].attrs[0].value
|
||||
label = state.second[1].attrs[1].value
|
||||
acceptor = self._acceptors.get(ent_id)
|
||||
if acceptor is None:
|
||||
matches.append((ent_id, label, start, end))
|
||||
else:
|
||||
match = acceptor(doc, ent_id, label, start, end)
|
||||
if match:
|
||||
matches.append(match)
|
||||
matches.append((ent_id, start, end))
|
||||
partials.resize(q)
|
||||
# Check whether we open any new patterns on this token
|
||||
for pattern in self.patterns:
|
||||
|
@ -419,13 +361,7 @@ cdef class Matcher:
|
|||
end = token_i+1
|
||||
ent_id = pattern[1].attrs[0].value
|
||||
label = pattern[1].attrs[1].value
|
||||
acceptor = self._acceptors.get(ent_id)
|
||||
if acceptor is None:
|
||||
matches.append((ent_id, label, start, end))
|
||||
else:
|
||||
match = acceptor(doc, ent_id, label, start, end)
|
||||
if match:
|
||||
matches.append(match)
|
||||
matches.append((ent_id, start, end))
|
||||
# Look for open patterns that are actually satisfied
|
||||
for state in partials:
|
||||
while state.second.quantifier in (ZERO, ZERO_PLUS):
|
||||
|
@ -435,36 +371,19 @@ cdef class Matcher:
|
|||
end = len(doc)
|
||||
ent_id = state.second.attrs[0].value
|
||||
label = state.second.attrs[0].value
|
||||
acceptor = self._acceptors.get(ent_id)
|
||||
if acceptor is None:
|
||||
matches.append((ent_id, label, start, end))
|
||||
else:
|
||||
match = acceptor(doc, ent_id, label, start, end)
|
||||
if match:
|
||||
matches.append(match)
|
||||
for i, (ent_id, label, start, end) in enumerate(matches):
|
||||
matches.append((ent_id, start, end))
|
||||
for i, (ent_id, start, end) in enumerate(matches):
|
||||
on_match = self._callbacks.get(ent_id)
|
||||
if on_match is not None:
|
||||
on_match(self, doc, i, matches)
|
||||
# TODO: only return (match_id, start, end)
|
||||
return matches
|
||||
|
||||
def pipe(self, docs, batch_size=1000, n_threads=2):
|
||||
"""
|
||||
Match a stream of documents, yielding them in turn.
|
||||
|
||||
Arguments:
|
||||
docs: A stream of documents.
|
||||
batch_size (int):
|
||||
The number of documents to accumulate into a working set.
|
||||
n_threads (int):
|
||||
The number of threads with which to work on the buffer in parallel,
|
||||
if the Matcher implementation supports multi-threading.
|
||||
Yields:
|
||||
Doc Documents, in order.
|
||||
"""
|
||||
for doc in docs:
|
||||
self(doc)
|
||||
yield doc
|
||||
def _normalize_key(self, key):
|
||||
if isinstance(key, basestring):
|
||||
return self.vocab.strings.add(key)
|
||||
else:
|
||||
return key
|
||||
|
||||
|
||||
def get_bilou(length):
|
||||
|
@ -550,7 +469,7 @@ cdef class PhraseMatcher:
|
|||
self(doc)
|
||||
yield doc
|
||||
|
||||
def accept_match(self, Doc doc, int ent_id, int label, int start, int end):
|
||||
def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end):
|
||||
assert (end - start) < self.max_length
|
||||
cdef int i, j
|
||||
for i in range(self.max_length):
|
||||
|
|
|
@ -30,6 +30,7 @@ cdef class Morphology:
|
|||
cdef public object n_tags
|
||||
cdef public object reverse_index
|
||||
cdef public object tag_names
|
||||
cdef public object exc
|
||||
|
||||
cdef RichTagC* rich_tags
|
||||
cdef PreshMapArray _cache
|
||||
|
|
|
@ -33,36 +33,43 @@ def _normalize_props(props):
|
|||
|
||||
|
||||
cdef class Morphology:
|
||||
def __init__(self, StringStore string_store, tag_map, lemmatizer):
|
||||
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
|
||||
self.mem = Pool()
|
||||
self.strings = string_store
|
||||
self.tag_map = {}
|
||||
self.lemmatizer = lemmatizer
|
||||
self.n_tags = len(tag_map) + 1
|
||||
self.n_tags = len(tag_map)
|
||||
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||
self.reverse_index = {}
|
||||
|
||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
|
||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||
attrs = _normalize_props(attrs)
|
||||
self.tag_map[tag_str] = dict(attrs)
|
||||
attrs = _normalize_props(attrs)
|
||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||
self.rich_tags[i].id = i
|
||||
self.rich_tags[i].name = self.strings[tag_str]
|
||||
self.rich_tags[i].name = self.strings.add(tag_str)
|
||||
self.rich_tags[i].morph = 0
|
||||
self.rich_tags[i].pos = attrs[POS]
|
||||
self.reverse_index[self.rich_tags[i].name] = i
|
||||
self._cache = PreshMapArray(self.n_tags)
|
||||
self.exc = {}
|
||||
if exc is not None:
|
||||
for (tag_str, orth_str), attrs in exc.items():
|
||||
self.add_special_case(tag_str, orth_str, attrs)
|
||||
|
||||
def __reduce__(self):
|
||||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None)
|
||||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
||||
self.exc), None, None)
|
||||
|
||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||
if isinstance(tag, basestring):
|
||||
tag_id = self.reverse_index[self.strings[tag]]
|
||||
else:
|
||||
tag = self.strings.add(tag)
|
||||
if tag in self.reverse_index:
|
||||
tag_id = self.reverse_index[tag]
|
||||
self.assign_tag_id(token, tag_id)
|
||||
else:
|
||||
token.tag = tag
|
||||
|
||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||
if tag_id >= self.n_tags:
|
||||
|
@ -73,7 +80,7 @@ cdef class Morphology:
|
|||
# the statistical model fails.
|
||||
# Related to Issue #220
|
||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||
tag_id = self.reverse_index[self.strings['SP']]
|
||||
tag_id = self.reverse_index[self.strings.add('SP')]
|
||||
rich_tag = self.rich_tags[tag_id]
|
||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
||||
if analysis is NULL:
|
||||
|
@ -104,7 +111,8 @@ cdef class Morphology:
|
|||
tag (unicode): The part-of-speech tag to key the exception.
|
||||
orth (unicode): The word-form to key the exception.
|
||||
"""
|
||||
tag = self.strings[tag_str]
|
||||
self.exc[(tag_str, orth_str)] = dict(attrs)
|
||||
tag = self.strings.add(tag_str)
|
||||
tag_id = self.reverse_index[tag]
|
||||
orth = self.strings[orth_str]
|
||||
cdef RichTagC rich_tag = self.rich_tags[tag_id]
|
||||
|
@ -140,14 +148,14 @@ cdef class Morphology:
|
|||
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
||||
cdef unicode py_string = self.strings[orth]
|
||||
if self.lemmatizer is None:
|
||||
return self.strings[py_string.lower()]
|
||||
return self.strings.add(py_string.lower())
|
||||
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
|
||||
return self.strings[py_string.lower()]
|
||||
return self.strings.add(py_string.lower())
|
||||
cdef set lemma_strings
|
||||
cdef unicode lemma_string
|
||||
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
||||
lemma_string = sorted(lemma_strings)[0]
|
||||
lemma = self.strings[lemma_string]
|
||||
lemma = self.strings.add(lemma_string)
|
||||
return lemma
|
||||
|
||||
|
||||
|
|
|
@ -9,12 +9,18 @@ import numpy
|
|||
cimport numpy as np
|
||||
import cytoolz
|
||||
import util
|
||||
from collections import OrderedDict
|
||||
import ujson
|
||||
import msgpack
|
||||
|
||||
from thinc.api import add, layerize, chain, clone, concatenate
|
||||
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
||||
from thinc.neural import Model, Maxout, Softmax, Affine
|
||||
from thinc.neural._classes.hash_embed import HashEmbed
|
||||
from thinc.neural.util import to_categorical
|
||||
|
||||
from thinc.neural.pooling import Pooling, max_pool, mean_pool
|
||||
from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
||||
|
||||
from thinc.neural._classes.convolution import ExtractWindow
|
||||
from thinc.neural._classes.resnet import Residual
|
||||
from thinc.neural._classes.batchnorm import BatchNorm as BN
|
||||
|
@ -31,110 +37,194 @@ from .syntax.stateclass cimport StateClass
|
|||
from .gold cimport GoldParse
|
||||
from .morphology cimport Morphology
|
||||
from .vocab cimport Vocab
|
||||
from .syntax import nonproj
|
||||
from .compat import json_dumps
|
||||
|
||||
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
|
||||
from ._ml import Tok2Vec, flatten, get_col, doc2feats
|
||||
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
|
||||
from .parts_of_speech import X
|
||||
|
||||
|
||||
class TokenVectorEncoder(object):
|
||||
'''Assign position-sensitive vectors to tokens, using a CNN or RNN.'''
|
||||
name = 'tok2vec'
|
||||
"""Assign position-sensitive vectors to tokens, using a CNN or RNN."""
|
||||
name = 'tensorizer'
|
||||
|
||||
@classmethod
|
||||
def Model(cls, width=128, embed_size=5000, **cfg):
|
||||
def Model(cls, width=128, embed_size=7500, **cfg):
|
||||
"""Create a new statistical model for the class.
|
||||
|
||||
width (int): Output size of the model.
|
||||
embed_size (int): Number of vectors in the embedding table.
|
||||
**cfg: Config parameters.
|
||||
RETURNS (Model): A `thinc.neural.Model` or similar instance.
|
||||
"""
|
||||
width = util.env_opt('token_vector_width', width)
|
||||
embed_size = util.env_opt('embed_size', embed_size)
|
||||
return Tok2Vec(width, embed_size, preprocess=None)
|
||||
|
||||
def __init__(self, vocab, model=True, **cfg):
|
||||
"""Construct a new statistical model. Weights are not allocated on
|
||||
initialisation.
|
||||
|
||||
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
|
||||
instance with the `Doc` objects it will process.
|
||||
model (Model): A `Model` instance or `True` allocate one later.
|
||||
**cfg: Config parameters.
|
||||
|
||||
EXAMPLE:
|
||||
>>> from spacy.pipeline import TokenVectorEncoder
|
||||
>>> tok2vec = TokenVectorEncoder(nlp.vocab)
|
||||
>>> tok2vec.model = tok2vec.Model(128, 5000)
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.doc2feats = doc2feats()
|
||||
self.model = model
|
||||
|
||||
def __call__(self, docs, state=None):
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
tokvecs = self.predict(docs)
|
||||
self.set_annotations(docs, tokvecs)
|
||||
state = {} if state is None else state
|
||||
state['tokvecs'] = tokvecs
|
||||
return state
|
||||
def __call__(self, doc):
|
||||
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
||||
model. Vectors are set to the `Doc.tensor` attribute.
|
||||
|
||||
docs (Doc or iterable): One or more documents to add vectors to.
|
||||
RETURNS (dict or None): Intermediate computations.
|
||||
"""
|
||||
tokvecses = self.predict([doc])
|
||||
self.set_annotations([doc], tokvecses)
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||
for batch in cytoolz.partition_all(batch_size, stream):
|
||||
docs, states = zip(*batch)
|
||||
tokvecs = self.predict(docs)
|
||||
self.set_annotations(docs, tokvecs)
|
||||
for state in states:
|
||||
state['tokvecs'] = tokvecs
|
||||
yield from zip(docs, states)
|
||||
"""Process `Doc` objects as a stream.
|
||||
|
||||
stream (iterator): A sequence of `Doc` objects to process.
|
||||
batch_size (int): Number of `Doc` objects to group.
|
||||
n_threads (int): Number of threads.
|
||||
YIELDS (iterator): A sequence of `Doc` objects, in order of input.
|
||||
"""
|
||||
for docs in cytoolz.partition_all(batch_size, stream):
|
||||
docs = list(docs)
|
||||
tokvecses = self.predict(docs)
|
||||
self.set_annotations(docs, tokvecses)
|
||||
yield from docs
|
||||
|
||||
def predict(self, docs):
|
||||
"""Return a single tensor for a batch of documents.
|
||||
|
||||
docs (iterable): A sequence of `Doc` objects.
|
||||
RETURNS (object): Vector representations for each token in the documents.
|
||||
"""
|
||||
feats = self.doc2feats(docs)
|
||||
tokvecs = self.model(feats)
|
||||
return tokvecs
|
||||
|
||||
def set_annotations(self, docs, tokvecs):
|
||||
start = 0
|
||||
for doc in docs:
|
||||
doc.tensor = tokvecs[start : start + len(doc)]
|
||||
start += len(doc)
|
||||
def set_annotations(self, docs, tokvecses):
|
||||
"""Set the tensor attribute for a batch of documents.
|
||||
|
||||
def update(self, docs, golds, state=None,
|
||||
drop=0., sgd=None):
|
||||
docs (iterable): A sequence of `Doc` objects.
|
||||
tokvecs (object): Vector representation for each token in the documents.
|
||||
"""
|
||||
for doc, tokvecs in zip(docs, tokvecses):
|
||||
assert tokvecs.shape[0] == len(doc)
|
||||
doc.tensor = tokvecs
|
||||
|
||||
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
|
||||
"""Update the model.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
golds (iterable): A batch of `GoldParse` objects.
|
||||
drop (float): The droput rate.
|
||||
sgd (callable): An optimizer.
|
||||
RETURNS (dict): Results from the update.
|
||||
"""
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
golds = [golds]
|
||||
state = {} if state is None else state
|
||||
feats = self.doc2feats(docs)
|
||||
tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop)
|
||||
state['feats'] = feats
|
||||
state['tokvecs'] = tokvecs
|
||||
state['bp_tokvecs'] = bp_tokvecs
|
||||
return state
|
||||
return tokvecs, bp_tokvecs
|
||||
|
||||
def get_loss(self, docs, golds, scores):
|
||||
# TODO: implement
|
||||
raise NotImplementedError
|
||||
|
||||
def begin_training(self, gold_tuples, pipeline=None):
|
||||
"""Allocate models, pre-process training data and acquire a trainer and
|
||||
optimizer.
|
||||
|
||||
gold_tuples (iterable): Gold-standard training data.
|
||||
pipeline (list): The pipeline the model is part of.
|
||||
"""
|
||||
self.doc2feats = doc2feats()
|
||||
if self.model is True:
|
||||
self.model = self.Model()
|
||||
|
||||
def use_params(self, params):
|
||||
"""Replace weights of models in the pipeline with those provided in the
|
||||
params dictionary.
|
||||
|
||||
params (dict): A dictionary of parameters keyed by model ID.
|
||||
"""
|
||||
with self.model.use_params(params):
|
||||
yield
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
serialize = OrderedDict((
|
||||
('model', lambda: self.model.to_bytes()),
|
||||
('vocab', lambda: self.vocab.to_bytes())
|
||||
))
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
if self.model is True:
|
||||
self.model = self.Model()
|
||||
deserialize = OrderedDict((
|
||||
('model', lambda b: self.model.from_bytes(b)),
|
||||
('vocab', lambda b: self.vocab.from_bytes(b))
|
||||
))
|
||||
util.from_bytes(bytes_data, deserialize, exclude)
|
||||
return self
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
serialize = OrderedDict((
|
||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
||||
('vocab', lambda p: self.vocab.to_disk(p))
|
||||
))
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
if self.model is True:
|
||||
self.model = self.Model()
|
||||
deserialize = OrderedDict((
|
||||
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
|
||||
('vocab', lambda p: self.vocab.from_disk(p))
|
||||
))
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
return self
|
||||
|
||||
|
||||
class NeuralTagger(object):
|
||||
name = 'nn_tagger'
|
||||
name = 'tagger'
|
||||
def __init__(self, vocab, model=True):
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
||||
def __call__(self, doc, state=None):
|
||||
assert state is not None
|
||||
assert 'tokvecs' in state
|
||||
tokvecs = state['tokvecs']
|
||||
tags = self.predict(tokvecs)
|
||||
def __call__(self, doc):
|
||||
tags = self.predict([doc.tensor])
|
||||
self.set_annotations([doc], tags)
|
||||
return state
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||
for batch in cytoolz.partition_all(batch_size, stream):
|
||||
docs, states = zip(*batch)
|
||||
tag_ids = self.predict(states[0]['tokvecs'])
|
||||
for docs in cytoolz.partition_all(batch_size, stream):
|
||||
tokvecs = [d.tensor for d in docs]
|
||||
tag_ids = self.predict(tokvecs)
|
||||
self.set_annotations(docs, tag_ids)
|
||||
for state in states:
|
||||
state['tag_ids'] = tag_ids
|
||||
yield from zip(docs, states)
|
||||
yield from docs
|
||||
|
||||
def predict(self, tokvecs):
|
||||
scores = self.model(tokvecs)
|
||||
scores = self.model.ops.flatten(scores)
|
||||
guesses = scores.argmax(axis=1)
|
||||
if not isinstance(guesses, numpy.ndarray):
|
||||
guesses = guesses.get()
|
||||
guesses = self.model.ops.unflatten(guesses,
|
||||
[tv.shape[0] for tv in tokvecs])
|
||||
return guesses
|
||||
|
||||
def set_annotations(self, docs, batch_tag_ids):
|
||||
|
@ -142,46 +232,48 @@ class NeuralTagger(object):
|
|||
docs = [docs]
|
||||
cdef Doc doc
|
||||
cdef int idx = 0
|
||||
cdef int i, j, tag_id
|
||||
cdef Vocab vocab = self.vocab
|
||||
for i, doc in enumerate(docs):
|
||||
doc_tag_ids = batch_tag_ids[idx:idx+len(doc)]
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
for j, tag_id in enumerate(doc_tag_ids):
|
||||
# Don't clobber preset POS tags
|
||||
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
|
||||
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
|
||||
idx += 1
|
||||
doc.is_tagged = True
|
||||
|
||||
def update(self, docs, golds, state=None, drop=0., sgd=None):
|
||||
state = {} if state is None else state
|
||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||
docs, tokvecs = docs_tokvecs
|
||||
|
||||
tokvecs = state['tokvecs']
|
||||
bp_tokvecs = state['bp_tokvecs']
|
||||
if self.model.nI is None:
|
||||
self.model.nI = tokvecs.shape[1]
|
||||
self.model.nI = tokvecs[0].shape[1]
|
||||
|
||||
tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
|
||||
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
||||
|
||||
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
||||
|
||||
bp_tokvecs(d_tokvecs, sgd=sgd)
|
||||
|
||||
state['tag_scores'] = tag_scores
|
||||
state['tag_loss'] = loss
|
||||
return state
|
||||
return d_tokvecs
|
||||
|
||||
def get_loss(self, docs, golds, scores):
|
||||
scores = self.model.ops.flatten(scores)
|
||||
tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)}
|
||||
|
||||
cdef int idx = 0
|
||||
correct = numpy.zeros((scores.shape[0],), dtype='i')
|
||||
guesses = scores.argmax(axis=1)
|
||||
for gold in golds:
|
||||
for tag in gold.tags:
|
||||
if tag is None:
|
||||
correct[idx] = guesses[idx]
|
||||
else:
|
||||
correct[idx] = tag_index[tag]
|
||||
idx += 1
|
||||
correct = self.model.ops.xp.array(correct, dtype='i')
|
||||
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
||||
d_scores /= d_scores.shape[0]
|
||||
loss = (d_scores**2).sum()
|
||||
d_scores = self.model.ops.asarray(d_scores, dtype='f')
|
||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
||||
return float(loss), d_scores
|
||||
|
||||
def begin_training(self, gold_tuples, pipeline=None):
|
||||
|
@ -195,22 +287,242 @@ class NeuralTagger(object):
|
|||
new_tag_map[tag] = orig_tag_map[tag]
|
||||
else:
|
||||
new_tag_map[tag] = {POS: X}
|
||||
if 'SP' not in new_tag_map:
|
||||
new_tag_map['SP'] = orig_tag_map.get('SP', {POS: X})
|
||||
cdef Vocab vocab = self.vocab
|
||||
if new_tag_map:
|
||||
vocab.morphology = Morphology(vocab.strings, new_tag_map,
|
||||
vocab.morphology.lemmatizer)
|
||||
self.model = Softmax(self.vocab.morphology.n_tags)
|
||||
print("Tagging", self.model.nO, "tags")
|
||||
vocab.morphology.lemmatizer,
|
||||
exc=vocab.morphology.exc)
|
||||
token_vector_width = pipeline[0].model.nO
|
||||
if self.model is True:
|
||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
||||
|
||||
@classmethod
|
||||
def Model(cls, n_tags, token_vector_width):
|
||||
return with_flatten(
|
||||
chain(Maxout(token_vector_width, token_vector_width),
|
||||
Softmax(n_tags, token_vector_width)))
|
||||
|
||||
def use_params(self, params):
|
||||
with self.model.use_params(params):
|
||||
yield
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
serialize = OrderedDict((
|
||||
('model', lambda: self.model.to_bytes()),
|
||||
('vocab', lambda: self.vocab.to_bytes()),
|
||||
('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map,
|
||||
use_bin_type=True,
|
||||
encoding='utf8'))
|
||||
))
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
def load_model(b):
|
||||
if self.model is True:
|
||||
token_vector_width = util.env_opt('token_vector_width', 128)
|
||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
||||
self.model.from_bytes(b)
|
||||
|
||||
def load_tag_map(b):
|
||||
tag_map = msgpack.loads(b, encoding='utf8')
|
||||
self.vocab.morphology = Morphology(
|
||||
self.vocab.strings, tag_map=tag_map,
|
||||
lemmatizer=self.vocab.morphology.lemmatizer,
|
||||
exc=self.vocab.morphology.exc)
|
||||
|
||||
deserialize = OrderedDict((
|
||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||
('tag_map', load_tag_map),
|
||||
('model', lambda b: load_model(b)),
|
||||
))
|
||||
util.from_bytes(bytes_data, deserialize, exclude)
|
||||
return self
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
serialize = OrderedDict((
|
||||
('vocab', lambda p: self.vocab.to_disk(p)),
|
||||
('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
|
||||
self.vocab.morphology.tag_map,
|
||||
use_bin_type=True,
|
||||
encoding='utf8'))),
|
||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
||||
))
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
def load_model(p):
|
||||
if self.model is True:
|
||||
token_vector_width = util.env_opt('token_vector_width', 128)
|
||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
||||
self.model.from_bytes(p.open('rb').read())
|
||||
|
||||
def load_tag_map(p):
|
||||
with p.open('rb') as file_:
|
||||
tag_map = msgpack.loads(file_.read(), encoding='utf8')
|
||||
self.vocab.morphology = Morphology(
|
||||
self.vocab.strings, tag_map=tag_map,
|
||||
lemmatizer=self.vocab.morphology.lemmatizer,
|
||||
exc=self.vocab.morphology.exc)
|
||||
|
||||
deserialize = OrderedDict((
|
||||
('vocab', lambda p: self.vocab.from_disk(p)),
|
||||
('tag_map', load_tag_map),
|
||||
('model', load_model),
|
||||
))
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
return self
|
||||
|
||||
|
||||
class NeuralLabeller(NeuralTagger):
|
||||
name = 'nn_labeller'
|
||||
def __init__(self, vocab, model=True):
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.labels = {}
|
||||
|
||||
def set_annotations(self, docs, dep_ids):
|
||||
pass
|
||||
|
||||
def begin_training(self, gold_tuples, pipeline=None):
|
||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
||||
for raw_text, annots_brackets in gold_tuples:
|
||||
for annots, brackets in annots_brackets:
|
||||
ids, words, tags, heads, deps, ents = annots
|
||||
for dep in deps:
|
||||
if dep not in self.labels:
|
||||
self.labels[dep] = len(self.labels)
|
||||
token_vector_width = pipeline[0].model.nO
|
||||
if self.model is True:
|
||||
self.model = self.Model(len(self.labels), token_vector_width)
|
||||
|
||||
@classmethod
|
||||
def Model(cls, n_tags, token_vector_width):
|
||||
return with_flatten(
|
||||
chain(Maxout(token_vector_width, token_vector_width),
|
||||
Softmax(n_tags, token_vector_width)))
|
||||
|
||||
def get_loss(self, docs, golds, scores):
|
||||
scores = self.model.ops.flatten(scores)
|
||||
cdef int idx = 0
|
||||
correct = numpy.zeros((scores.shape[0],), dtype='i')
|
||||
guesses = scores.argmax(axis=1)
|
||||
for gold in golds:
|
||||
for tag in gold.labels:
|
||||
if tag is None or tag not in self.labels:
|
||||
correct[idx] = guesses[idx]
|
||||
else:
|
||||
correct[idx] = self.labels[tag]
|
||||
idx += 1
|
||||
correct = self.model.ops.xp.array(correct, dtype='i')
|
||||
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
||||
d_scores /= d_scores.shape[0]
|
||||
loss = (d_scores**2).sum()
|
||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
||||
return float(loss), d_scores
|
||||
|
||||
|
||||
class SimilarityHook(object):
|
||||
"""
|
||||
Experimental
|
||||
|
||||
A pipeline component to install a hook for supervised similarity into
|
||||
Doc objects. Requires a Tensorizer to pre-process documents. The similarity
|
||||
model can be any object obeying the Thinc Model interface. By default,
|
||||
the model concatenates the elementwise mean and elementwise max of the two
|
||||
tensors, and compares them using the Cauchy-like similarity function
|
||||
from Chen (2013):
|
||||
|
||||
similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
|
||||
|
||||
Where W is a vector of dimension weights, initialized to 1.
|
||||
"""
|
||||
name = 'similarity'
|
||||
def __init__(self, vocab, model=True):
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
||||
@classmethod
|
||||
def Model(cls, length):
|
||||
return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
|
||||
|
||||
def __call__(self, doc):
|
||||
'''Install similarity hook'''
|
||||
doc.user_hooks['similarity'] = self.predict
|
||||
return doc
|
||||
|
||||
def pipe(self, docs, **kwargs):
|
||||
for doc in docs:
|
||||
yield self(doc)
|
||||
|
||||
def predict(self, doc1, doc2):
|
||||
return self.model.predict([(doc1.tensor, doc2.tensor)])
|
||||
|
||||
def update(self, doc1_tensor1_doc2_tensor2, golds, sgd=None, drop=0.):
|
||||
doc1s, tensor1s, doc2s, tensor2s = doc1_tensor1_doc2_tensor2
|
||||
sims, bp_sims = self.model.begin_update(zip(tensor1s, tensor2s),
|
||||
drop=drop)
|
||||
d_tensor1s, d_tensor2s = bp_sims(golds, sgd=sgd)
|
||||
|
||||
return d_tensor1s, d_tensor2s
|
||||
|
||||
def begin_training(self, _, pipeline=None):
|
||||
"""
|
||||
Allocate model, using width from tensorizer in pipeline.
|
||||
|
||||
gold_tuples (iterable): Gold-standard training data.
|
||||
pipeline (list): The pipeline the model is part of.
|
||||
"""
|
||||
if self.model is True:
|
||||
self.model = self.Model(pipeline[0].model.nO)
|
||||
|
||||
def use_params(self, params):
|
||||
"""Replace weights of models in the pipeline with those provided in the
|
||||
params dictionary.
|
||||
|
||||
params (dict): A dictionary of parameters keyed by model ID.
|
||||
"""
|
||||
with self.model.use_params(params):
|
||||
yield
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
serialize = OrderedDict((
|
||||
('model', lambda: self.model.to_bytes()),
|
||||
('vocab', lambda: self.vocab.to_bytes())
|
||||
))
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
if self.model is True:
|
||||
self.model = self.Model()
|
||||
deserialize = OrderedDict((
|
||||
('model', lambda b: self.model.from_bytes(b)),
|
||||
('vocab', lambda b: self.vocab.from_bytes(b))
|
||||
))
|
||||
util.from_bytes(bytes_data, deserialize, exclude)
|
||||
return self
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
serialize = OrderedDict((
|
||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
||||
('vocab', lambda p: self.vocab.to_disk(p))
|
||||
))
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
if self.model is True:
|
||||
self.model = self.Model()
|
||||
deserialize = OrderedDict((
|
||||
('model', lambda p: self.model.from_bytes(p.open('rb').read())),
|
||||
('vocab', lambda p: self.vocab.from_disk(p))
|
||||
))
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
return self
|
||||
|
||||
|
||||
cdef class EntityRecognizer(LinearParser):
|
||||
"""
|
||||
Annotate named entities on Doc objects.
|
||||
"""
|
||||
"""Annotate named entities on Doc objects."""
|
||||
TransitionSystem = BiluoPushDown
|
||||
|
||||
feature_templates = get_feature_templates('ner')
|
||||
|
@ -222,9 +534,7 @@ cdef class EntityRecognizer(LinearParser):
|
|||
|
||||
|
||||
cdef class BeamEntityRecognizer(BeamParser):
|
||||
"""
|
||||
Annotate named entities on Doc objects.
|
||||
"""
|
||||
"""Annotate named entities on Doc objects."""
|
||||
TransitionSystem = BiluoPushDown
|
||||
|
||||
feature_templates = get_feature_templates('ner')
|
||||
|
@ -249,32 +559,18 @@ cdef class NeuralDependencyParser(NeuralParser):
|
|||
name = 'parser'
|
||||
TransitionSystem = ArcEager
|
||||
|
||||
def __reduce__(self):
|
||||
return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
|
||||
|
||||
|
||||
cdef class NeuralEntityRecognizer(NeuralParser):
|
||||
name = 'entity'
|
||||
name = 'ner'
|
||||
TransitionSystem = BiluoPushDown
|
||||
|
||||
nr_feature = 6
|
||||
|
||||
def get_token_ids(self, states):
|
||||
cdef StateClass state
|
||||
cdef int n_tokens = 6
|
||||
ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c')
|
||||
for i, state in enumerate(states):
|
||||
ids[i, 0] = state.c.B(0)-1
|
||||
ids[i, 1] = state.c.B(0)
|
||||
ids[i, 2] = state.c.B(1)
|
||||
ids[i, 3] = state.c.E(0)
|
||||
ids[i, 4] = state.c.E(0)-1
|
||||
ids[i, 5] = state.c.E(0)+1
|
||||
for j in range(6):
|
||||
if ids[i, j] >= state.c.length:
|
||||
ids[i, j] = -1
|
||||
if ids[i, j] != -1:
|
||||
ids[i, j] += state.c.offset
|
||||
return ids
|
||||
|
||||
|
||||
def __reduce__(self):
|
||||
return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
|
||||
|
||||
|
||||
cdef class BeamDependencyParser(BeamParser):
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from libc.stdint cimport int64_t
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
|
@ -8,6 +9,9 @@ from .typedefs cimport attr_t, hash_t
|
|||
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0
|
||||
cdef hash_t hash_utf8(char* utf8_string, int length) nogil
|
||||
|
||||
cdef unicode decode_Utf8Str(const Utf8Str* string)
|
||||
|
||||
|
||||
ctypedef union Utf8Str:
|
||||
|
@ -17,13 +21,11 @@ ctypedef union Utf8Str:
|
|||
|
||||
cdef class StringStore:
|
||||
cdef Pool mem
|
||||
cdef Utf8Str* c
|
||||
cdef int64_t size
|
||||
cdef bint is_frozen
|
||||
|
||||
cdef vector[hash_t] keys
|
||||
cdef public PreshMap _map
|
||||
cdef public PreshMap _oov
|
||||
cdef int64_t _resize_at
|
||||
|
||||
cdef const Utf8Str* intern_unicode(self, unicode py_string)
|
||||
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
|
||||
|
|
|
@ -7,11 +7,16 @@ from libc.string cimport memcpy
|
|||
from libc.stdint cimport uint64_t, uint32_t
|
||||
from murmurhash.mrmr cimport hash64, hash32
|
||||
from preshed.maps cimport map_iter, key_t
|
||||
from libc.stdint cimport uint32_t
|
||||
import ujson
|
||||
import dill
|
||||
|
||||
from .symbols import IDS as SYMBOLS_BY_STR
|
||||
from .symbols import NAMES as SYMBOLS_BY_INT
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
from libc.stdint cimport uint32_t
|
||||
|
||||
import ujson
|
||||
from . import util
|
||||
from .compat import json_dumps
|
||||
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0:
|
||||
|
@ -27,7 +32,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
|
|||
return hash32(utf8_string, length, 1)
|
||||
|
||||
|
||||
cdef unicode _decode(const Utf8Str* string):
|
||||
cdef unicode decode_Utf8Str(const Utf8Str* string):
|
||||
cdef int i, length
|
||||
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
|
||||
return string.s[1:string.s[0]+1].decode('utf8')
|
||||
|
@ -44,10 +49,10 @@ cdef unicode _decode(const Utf8Str* string):
|
|||
return string.p[i:length + i].decode('utf8')
|
||||
|
||||
|
||||
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
|
||||
cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
|
||||
cdef int n_length_bytes
|
||||
cdef int i
|
||||
cdef Utf8Str string
|
||||
cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
|
||||
cdef uint32_t ulength = length
|
||||
if length < sizeof(string.s):
|
||||
string.s[0] = <unsigned char>length
|
||||
|
@ -72,129 +77,166 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
|
|||
|
||||
|
||||
cdef class StringStore:
|
||||
"""
|
||||
Map strings to and from integer IDs.
|
||||
"""
|
||||
"""Look up strings by 64-bit hashes."""
|
||||
def __init__(self, strings=None, freeze=False):
|
||||
"""
|
||||
Create the StringStore.
|
||||
"""Create the StringStore.
|
||||
|
||||
Arguments:
|
||||
strings: A sequence of unicode strings to add to the store.
|
||||
strings (iterable): A sequence of unicode strings to add to the store.
|
||||
RETURNS (StringStore): The newly constructed object.
|
||||
"""
|
||||
self.mem = Pool()
|
||||
self._map = PreshMap()
|
||||
self._oov = PreshMap()
|
||||
self._resize_at = 10000
|
||||
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
|
||||
self.size = 1
|
||||
self.is_frozen = freeze
|
||||
if strings is not None:
|
||||
for string in strings:
|
||||
_ = self[string]
|
||||
|
||||
property size:
|
||||
def __get__(self):
|
||||
return self.size -1
|
||||
|
||||
def __reduce__(self):
|
||||
# TODO: OOV words, for the is_frozen stuff?
|
||||
if self.is_frozen:
|
||||
raise NotImplementedError(
|
||||
"Currently missing support for pickling StringStore when "
|
||||
"is_frozen=True")
|
||||
return (StringStore, (list(self),))
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
The number of strings in the store.
|
||||
|
||||
Returns:
|
||||
int The number of strings in the store.
|
||||
"""
|
||||
return self.size-1
|
||||
self.add(string)
|
||||
|
||||
def __getitem__(self, object string_or_id):
|
||||
"""
|
||||
Retrieve a string from a given integer ID, or vice versa.
|
||||
"""Retrieve a string from a given hash, or vice versa.
|
||||
|
||||
Arguments:
|
||||
string_or_id (bytes or unicode or int):
|
||||
The value to encode.
|
||||
Returns:
|
||||
unicode or int: The value to retrieved.
|
||||
string_or_id (bytes, unicode or uint64): The value to encode.
|
||||
Returns (unicode or uint64): The value to be retrieved.
|
||||
"""
|
||||
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
|
||||
return 0
|
||||
elif string_or_id == 0:
|
||||
return u''
|
||||
elif string_or_id in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[string_or_id]
|
||||
|
||||
cdef bytes byte_string
|
||||
cdef const Utf8Str* utf8str
|
||||
cdef uint64_t int_id
|
||||
cdef uint32_t oov_id
|
||||
if isinstance(string_or_id, (int, long)):
|
||||
int_id = string_or_id
|
||||
oov_id = string_or_id
|
||||
if int_id < <uint64_t>self.size:
|
||||
return _decode(&self.c[int_id])
|
||||
cdef hash_t key
|
||||
|
||||
if isinstance(string_or_id, unicode):
|
||||
key = hash_string(string_or_id)
|
||||
return key
|
||||
elif isinstance(string_or_id, bytes):
|
||||
key = hash_utf8(string_or_id, len(string_or_id))
|
||||
return key
|
||||
elif string_or_id < len(SYMBOLS_BY_INT):
|
||||
return SYMBOLS_BY_INT[string_or_id]
|
||||
else:
|
||||
utf8str = <Utf8Str*>self._oov.get(oov_id)
|
||||
if utf8str is not NULL:
|
||||
return _decode(utf8str)
|
||||
else:
|
||||
raise IndexError(string_or_id)
|
||||
else:
|
||||
if isinstance(string_or_id, bytes):
|
||||
byte_string = <bytes>string_or_id
|
||||
elif isinstance(string_or_id, unicode):
|
||||
byte_string = (<unicode>string_or_id).encode('utf8')
|
||||
else:
|
||||
raise TypeError(type(string_or_id))
|
||||
utf8str = self._intern_utf8(byte_string, len(byte_string))
|
||||
key = string_or_id
|
||||
utf8str = <Utf8Str*>self._map.get(key)
|
||||
if utf8str is NULL:
|
||||
# TODO: We need to use 32 bit here, for compatibility with the
|
||||
# vocabulary values. This makes birthday paradox probabilities
|
||||
# pretty bad.
|
||||
# We could also get unlucky here, and hash into a value that
|
||||
# collides with the 'real' strings.
|
||||
return hash32_utf8(byte_string, len(byte_string))
|
||||
raise KeyError(string_or_id)
|
||||
else:
|
||||
return utf8str - self.c
|
||||
return decode_Utf8Str(utf8str)
|
||||
|
||||
def __contains__(self, unicode string not None):
|
||||
def add(self, string):
|
||||
"""Add a string to the StringStore.
|
||||
|
||||
string (unicode): The string to add.
|
||||
RETURNS (uint64): The string's hash value.
|
||||
"""
|
||||
Check whether a string is in the store.
|
||||
if isinstance(string, unicode):
|
||||
if string in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[string]
|
||||
key = hash_string(string)
|
||||
self.intern_unicode(string)
|
||||
elif isinstance(string, bytes):
|
||||
if string in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[string]
|
||||
key = hash_utf8(string, len(string))
|
||||
self._intern_utf8(string, len(string))
|
||||
else:
|
||||
raise TypeError(
|
||||
"Can only add unicode or bytes. Got type: %s" % type(string))
|
||||
return key
|
||||
|
||||
def __len__(self):
|
||||
"""The number of strings in the store.
|
||||
|
||||
RETURNS (int): The number of strings in the store.
|
||||
"""
|
||||
return self.keys.size()
|
||||
|
||||
def __contains__(self, string not None):
|
||||
"""Check whether a string is in the store.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to check.
|
||||
Returns bool:
|
||||
Whether the store contains the string.
|
||||
RETURNS (bool): Whether the store contains the string.
|
||||
"""
|
||||
if len(string) == 0:
|
||||
cdef hash_t key
|
||||
if isinstance(string, int) or isinstance(string, long):
|
||||
if string == 0:
|
||||
return True
|
||||
cdef hash_t key = hash_string(string)
|
||||
key = string
|
||||
elif len(string) == 0:
|
||||
return True
|
||||
elif string in SYMBOLS_BY_STR:
|
||||
return True
|
||||
elif isinstance(string, unicode):
|
||||
key = hash_string(string)
|
||||
else:
|
||||
string = string.encode('utf8')
|
||||
key = hash_utf8(string, len(string))
|
||||
if key < len(SYMBOLS_BY_INT):
|
||||
return True
|
||||
else:
|
||||
return self._map.get(key) is not NULL
|
||||
|
||||
def __iter__(self):
|
||||
"""
|
||||
Iterate over the strings in the store, in order.
|
||||
"""Iterate over the strings in the store, in order.
|
||||
|
||||
Yields: unicode A string in the store.
|
||||
YIELDS (unicode): A string in the store.
|
||||
"""
|
||||
cdef int i
|
||||
for i in range(self.size):
|
||||
yield _decode(&self.c[i]) if i > 0 else u''
|
||||
cdef hash_t key
|
||||
for i in range(self.keys.size()):
|
||||
key = self.keys[i]
|
||||
utf8str = <Utf8Str*>self._map.get(key)
|
||||
yield decode_Utf8Str(utf8str)
|
||||
# TODO: Iterate OOV here?
|
||||
|
||||
def __reduce__(self):
|
||||
strings = [""]
|
||||
for i in range(1, self.size):
|
||||
string = &self.c[i]
|
||||
py_string = _decode(string)
|
||||
strings.append(py_string)
|
||||
strings = list(self)
|
||||
return (StringStore, (strings,), None, None, None)
|
||||
|
||||
def to_disk(self, path):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
strings = list(self)
|
||||
with path.open('w') as file_:
|
||||
file_.write(json_dumps(strings))
|
||||
|
||||
def from_disk(self, path):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
RETURNS (StringStore): The modified `StringStore` object.
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
with path.open('r') as file_:
|
||||
strings = ujson.load(file_)
|
||||
self._reset_and_load(strings)
|
||||
return self
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `StringStore` object.
|
||||
"""
|
||||
return ujson.dumps(list(self))
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
"""Load state from a binary string.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (StringStore): The `StringStore` object.
|
||||
"""
|
||||
strings = ujson.loads(bytes_data)
|
||||
self._reset_and_load(strings)
|
||||
return self
|
||||
|
||||
def set_frozen(self, bint is_frozen):
|
||||
# TODO
|
||||
self.is_frozen = is_frozen
|
||||
|
@ -202,6 +244,15 @@ cdef class StringStore:
|
|||
def flush_oov(self):
|
||||
self._oov = PreshMap()
|
||||
|
||||
def _reset_and_load(self, strings, freeze=False):
|
||||
self.mem = Pool()
|
||||
self._map = PreshMap()
|
||||
self._oov = PreshMap()
|
||||
self.keys.clear()
|
||||
for string in strings:
|
||||
self.add(string)
|
||||
self.is_frozen = freeze
|
||||
|
||||
cdef const Utf8Str* intern_unicode(self, unicode py_string):
|
||||
# 0 means missing, but we don't bother offsetting the index.
|
||||
cdef bytes byte_string = py_string.encode('utf8')
|
||||
|
@ -223,73 +274,11 @@ cdef class StringStore:
|
|||
key32 = hash32_utf8(utf8_string, length)
|
||||
# Important: Make the OOV store own the memory. That way it's trivial
|
||||
# to flush them all.
|
||||
value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
|
||||
value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
|
||||
value = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
|
||||
self._oov.set(key32, value)
|
||||
return NULL
|
||||
|
||||
if self.size == self._resize_at:
|
||||
self._realloc()
|
||||
self.c[self.size] = _allocate(self.mem, <unsigned char*>utf8_string, length)
|
||||
self._map.set(key, <void*>&self.c[self.size])
|
||||
self.size += 1
|
||||
return &self.c[self.size-1]
|
||||
|
||||
def dump(self, file_):
|
||||
"""
|
||||
Save the strings to a JSON file.
|
||||
|
||||
Arguments:
|
||||
file_ (buffer): The file to save the strings.
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
string_data = ujson.dumps(list(self))
|
||||
if not isinstance(string_data, unicode):
|
||||
string_data = string_data.decode('utf8')
|
||||
# TODO: OOV?
|
||||
file_.write(string_data)
|
||||
|
||||
def load(self, file_):
|
||||
"""
|
||||
Load the strings from a JSON file.
|
||||
|
||||
Arguments:
|
||||
file_ (buffer): The file from which to load the strings.
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
strings = ujson.load(file_)
|
||||
if strings == ['']:
|
||||
return None
|
||||
cdef unicode string
|
||||
for string in strings:
|
||||
# explicit None/len check instead of simple truth testing
|
||||
# (bug in Cython <= 0.23.4)
|
||||
if string is not None and len(string):
|
||||
self.intern_unicode(string)
|
||||
|
||||
def _realloc(self):
|
||||
# We want to map straight to pointers, but they'll be invalidated if
|
||||
# we resize our array. So, first we remap to indices, then we resize,
|
||||
# then we can acquire the new pointers.
|
||||
cdef Pool tmp_mem = Pool()
|
||||
keys = <key_t*>tmp_mem.alloc(self.size, sizeof(key_t))
|
||||
cdef key_t key
|
||||
cdef void* value
|
||||
cdef const Utf8Str ptr
|
||||
cdef int i = 0
|
||||
cdef size_t offset
|
||||
while map_iter(self._map.c_map, &i, &key, &value):
|
||||
# Find array index with pointer arithmetic
|
||||
offset = ((<Utf8Str*>value) - self.c)
|
||||
keys[offset] = key
|
||||
|
||||
self._resize_at *= 2
|
||||
cdef size_t new_size = self._resize_at * sizeof(Utf8Str)
|
||||
self.c = <Utf8Str*>self.mem.realloc(self.c, new_size)
|
||||
|
||||
self._map = PreshMap(self.size)
|
||||
for i in range(self.size):
|
||||
if keys[i]:
|
||||
self._map.set(keys[i], &self.c[i])
|
||||
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
|
||||
self._map.set(key, value)
|
||||
self.keys.push_back(key)
|
||||
return value
|
||||
|
|
|
@ -5,8 +5,6 @@ from .parts_of_speech cimport univ_pos_t
|
|||
|
||||
|
||||
cdef struct LexemeC:
|
||||
float* vector
|
||||
|
||||
flags_t flags
|
||||
|
||||
attr_t lang
|
||||
|
@ -25,11 +23,10 @@ cdef struct LexemeC:
|
|||
|
||||
float prob
|
||||
float sentiment
|
||||
float l2_norm
|
||||
|
||||
|
||||
cdef struct SerializedLexemeC:
|
||||
unsigned char[4*13 + 8] data
|
||||
unsigned char[8 + 8*10 + 4 + 4] data
|
||||
# sizeof(flags_t) # flags
|
||||
# + sizeof(attr_t) # lang
|
||||
# + sizeof(attr_t) # id
|
||||
|
@ -50,7 +47,7 @@ cdef struct Entity:
|
|||
hash_t id
|
||||
int start
|
||||
int end
|
||||
int label
|
||||
attr_t label
|
||||
|
||||
|
||||
cdef struct TokenC:
|
||||
|
@ -58,12 +55,12 @@ cdef struct TokenC:
|
|||
uint64_t morph
|
||||
univ_pos_t pos
|
||||
bint spacy
|
||||
int tag
|
||||
attr_t tag
|
||||
int idx
|
||||
int lemma
|
||||
int sense
|
||||
attr_t lemma
|
||||
attr_t sense
|
||||
int head
|
||||
int dep
|
||||
attr_t dep
|
||||
bint sent_start
|
||||
|
||||
uint32_t l_kids
|
||||
|
@ -72,5 +69,5 @@ cdef struct TokenC:
|
|||
uint32_t r_edge
|
||||
|
||||
int ent_iob
|
||||
int ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
||||
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
||||
hash_t ent_id
|
||||
|
|
|
@ -82,6 +82,7 @@ cpdef enum symbol_t:
|
|||
ENT_IOB
|
||||
ENT_TYPE
|
||||
HEAD
|
||||
SENT_START
|
||||
SPACY
|
||||
PROB
|
||||
|
||||
|
|
|
@ -84,6 +84,7 @@ IDS = {
|
|||
"ENT_IOB": ENT_IOB,
|
||||
"ENT_TYPE": ENT_TYPE,
|
||||
"HEAD": HEAD,
|
||||
"SENT_START": SENT_START,
|
||||
"SPACY": SPACY,
|
||||
"PROB": PROB,
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@ from ..structs cimport TokenC, Entity
|
|||
from ..lexeme cimport Lexeme
|
||||
from ..symbols cimport punct
|
||||
from ..attrs cimport IS_SPACE
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
|
||||
cdef inline bint is_space_token(const TokenC* token) nogil:
|
||||
|
@ -71,6 +72,45 @@ cdef cppclass StateC:
|
|||
free(this._stack - PADDING)
|
||||
free(this.shifted - PADDING)
|
||||
|
||||
void set_context_tokens(int* ids, int n) nogil:
|
||||
if n == 13:
|
||||
ids[0] = this.B(0)
|
||||
ids[1] = this.B(1)
|
||||
ids[2] = this.S(0)
|
||||
ids[3] = this.S(1)
|
||||
ids[4] = this.S(2)
|
||||
ids[5] = this.L(this.S(0), 1)
|
||||
ids[6] = this.L(this.S(0), 2)
|
||||
ids[6] = this.R(this.S(0), 1)
|
||||
ids[7] = this.L(this.B(0), 1)
|
||||
ids[8] = this.R(this.S(0), 2)
|
||||
ids[9] = this.L(this.S(1), 1)
|
||||
ids[10] = this.L(this.S(1), 2)
|
||||
ids[11] = this.R(this.S(1), 1)
|
||||
ids[12] = this.R(this.S(1), 2)
|
||||
elif n == 6:
|
||||
if this.B(0) >= 0:
|
||||
ids[0] = this.B(0)
|
||||
else:
|
||||
ids[0] = -1
|
||||
ids[1] = this.B(0)
|
||||
ids[2] = this.B(1)
|
||||
ids[3] = this.E(0)
|
||||
if ids[3] >= 1:
|
||||
ids[4] = this.E(0)-1
|
||||
else:
|
||||
ids[4] = -1
|
||||
if (ids[3]+1) < this.length:
|
||||
ids[5] = this.E(0)+1
|
||||
else:
|
||||
ids[5] = -1
|
||||
else:
|
||||
# TODO error =/
|
||||
pass
|
||||
for i in range(n):
|
||||
if ids[i] >= 0:
|
||||
ids[i] += this.offset
|
||||
|
||||
int S(int i) nogil const:
|
||||
if i >= this._s_i:
|
||||
return -1
|
||||
|
@ -238,7 +278,7 @@ cdef cppclass StateC:
|
|||
this._s_i -= 1
|
||||
this.shifted[this.B(0)] = True
|
||||
|
||||
void add_arc(int head, int child, int label) nogil:
|
||||
void add_arc(int head, int child, attr_t label) nogil:
|
||||
if this.has_head(child):
|
||||
this.del_arc(this.H(child), child)
|
||||
|
||||
|
@ -282,7 +322,7 @@ cdef cppclass StateC:
|
|||
h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
|
||||
h.l_kids -= 1
|
||||
|
||||
void open_ent(int label) nogil:
|
||||
void open_ent(attr_t label) nogil:
|
||||
this._ents[this._e_i].start = this.B(0)
|
||||
this._ents[this._e_i].label = label
|
||||
this._ents[this._e_i].end = -1
|
||||
|
@ -294,7 +334,7 @@ cdef cppclass StateC:
|
|||
this._ents[this._e_i-1].end = this.B(0)+1
|
||||
this._sent[this.B(0)].ent_iob = 1
|
||||
|
||||
void set_ent_tag(int i, int ent_iob, int ent_type) nogil:
|
||||
void set_ent_tag(int i, int ent_iob, attr_t ent_type) nogil:
|
||||
if 0 <= i < this.length:
|
||||
this._sent[i].ent_iob = ent_iob
|
||||
this._sent[i].ent_type = ent_type
|
||||
|
@ -305,16 +345,18 @@ cdef cppclass StateC:
|
|||
this._break = this._b_i
|
||||
|
||||
void clone(const StateC* src) nogil:
|
||||
this.length = src.length
|
||||
memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
|
||||
memcpy(this._stack, src._stack, this.length * sizeof(int))
|
||||
memcpy(this._buffer, src._buffer, this.length * sizeof(int))
|
||||
memcpy(this._ents, src._ents, this.length * sizeof(Entity))
|
||||
memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
|
||||
this.length = src.length
|
||||
this._b_i = src._b_i
|
||||
this._s_i = src._s_i
|
||||
this._e_i = src._e_i
|
||||
this._break = src._break
|
||||
this.offset = src.offset
|
||||
this._empty_token = src._empty_token
|
||||
|
||||
void fast_forward() nogil:
|
||||
# space token attachement policy:
|
||||
|
|
|
@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
|
|||
from thinc.typedefs cimport weight_t
|
||||
|
||||
from .stateclass cimport StateClass
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
from ..gold cimport GoldParseC
|
||||
|
|
|
@ -9,10 +9,10 @@ import ctypes
|
|||
from libc.stdint cimport uint32_t
|
||||
from libc.string cimport memcpy
|
||||
from cymem.cymem cimport Pool
|
||||
from collections import OrderedDict
|
||||
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC, is_space_token
|
||||
from .nonproj import PseudoProjectivity
|
||||
from .nonproj import is_nonproj_tree
|
||||
from .transition_system cimport do_func_t, get_cost_func_t
|
||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||
|
@ -60,7 +60,7 @@ cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) no
|
|||
cost += 1
|
||||
if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)):
|
||||
cost += 1
|
||||
cost += Break.is_valid(stcls.c, -1) and Break.move_cost(stcls, gold) == 0
|
||||
cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0
|
||||
return cost
|
||||
|
||||
|
||||
|
@ -73,7 +73,7 @@ cdef weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nog
|
|||
cost += gold.heads[target] == B_i
|
||||
if gold.heads[B_i] == B_i or gold.heads[B_i] < target:
|
||||
break
|
||||
if Break.is_valid(stcls.c, -1) and Break.move_cost(stcls, gold) == 0:
|
||||
if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0:
|
||||
cost += 1
|
||||
return cost
|
||||
|
||||
|
@ -84,14 +84,14 @@ cdef weight_t arc_cost(StateClass stcls, const GoldParseC* gold, int head, int c
|
|||
elif stcls.H(child) == gold.heads[child]:
|
||||
return 1
|
||||
# Head in buffer
|
||||
elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != -1:
|
||||
elif gold.heads[child] >= stcls.B(0) and stcls.B(1) != 0:
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
|
||||
if gold.labels[child] == -1:
|
||||
if not gold.has_dep[child]:
|
||||
return True
|
||||
elif gold.heads[child] == head:
|
||||
return True
|
||||
|
@ -99,10 +99,10 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
|
|||
return False
|
||||
|
||||
|
||||
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil:
|
||||
if gold.labels[child] == -1:
|
||||
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil:
|
||||
if not gold.has_dep[child]:
|
||||
return True
|
||||
elif label == -1:
|
||||
elif label == 0:
|
||||
return True
|
||||
elif gold.labels[child] == label:
|
||||
return True
|
||||
|
@ -111,21 +111,20 @@ cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label)
|
|||
|
||||
|
||||
cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
|
||||
return gold.labels[word] == -1 or gold.heads[word] == word
|
||||
|
||||
return gold.heads[word] == word or not gold.has_dep[word]
|
||||
|
||||
cdef class Shift:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.push()
|
||||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass st, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil:
|
||||
return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -133,17 +132,17 @@ cdef class Shift:
|
|||
return push_cost(s, gold, s.B(0))
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return 0
|
||||
|
||||
|
||||
cdef class Reduce:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return st.stack_depth() >= 2
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
if st.has_head(st.S(0)):
|
||||
st.pop()
|
||||
else:
|
||||
|
@ -151,7 +150,7 @@ cdef class Reduce:
|
|||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -165,28 +164,28 @@ cdef class Reduce:
|
|||
cost -= 1
|
||||
if gold.heads[S_i] == st.S(0):
|
||||
cost -= 1
|
||||
if Break.is_valid(st.c, -1) and Break.move_cost(st, gold) == 0:
|
||||
if Break.is_valid(st.c, 0) and Break.move_cost(st, gold) == 0:
|
||||
cost -= 1
|
||||
return cost
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return 0
|
||||
|
||||
|
||||
cdef class LeftArc:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return not st.B_(0).sent_start
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.add_arc(st.B(0), st.S(0), label)
|
||||
st.pop()
|
||||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -204,23 +203,23 @@ cdef class LeftArc:
|
|||
return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
|
||||
|
||||
|
||||
cdef class RightArc:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return not st.B_(0).sent_start
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.add_arc(st.S(0), st.B(0), label)
|
||||
st.push()
|
||||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -233,13 +232,13 @@ cdef class RightArc:
|
|||
return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
|
||||
|
||||
|
||||
cdef class Break:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int i
|
||||
if not USE_BREAK:
|
||||
return False
|
||||
|
@ -251,12 +250,12 @@ cdef class Break:
|
|||
return True
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.set_break(st.B_(0).l_edge)
|
||||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -281,13 +280,13 @@ cdef class Break:
|
|||
return cost + 1
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return 0
|
||||
|
||||
cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
||||
while gold.heads[word] != word and gold.labels[word] != -1 and word >= 0:
|
||||
while gold.heads[word] != word and not gold.has_dep[word] and word >= 0:
|
||||
word = gold.heads[word]
|
||||
if gold.labels[word] == -1:
|
||||
if not gold.has_dep[word]:
|
||||
return -1
|
||||
else:
|
||||
return word
|
||||
|
@ -295,9 +294,7 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
|||
|
||||
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
||||
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
|
||||
# Ensure sent_start is set to 0 throughout
|
||||
for i in range(st.c.length):
|
||||
st.c._sent[i].sent_start = False
|
||||
st.c._sent[i].l_edge = i
|
||||
st.c._sent[i].r_edge = i
|
||||
st.fast_forward()
|
||||
|
@ -313,21 +310,24 @@ cdef class ArcEager(TransitionSystem):
|
|||
@classmethod
|
||||
def get_actions(cls, **kwargs):
|
||||
actions = kwargs.get('actions',
|
||||
{
|
||||
SHIFT: [''],
|
||||
REDUCE: [''],
|
||||
RIGHT: [],
|
||||
LEFT: [],
|
||||
BREAK: ['ROOT']})
|
||||
OrderedDict((
|
||||
(SHIFT, ['']),
|
||||
(REDUCE, ['']),
|
||||
(RIGHT, []),
|
||||
(LEFT, []),
|
||||
(BREAK, ['ROOT'])
|
||||
)))
|
||||
seen_actions = set()
|
||||
for label in kwargs.get('left_labels', []):
|
||||
if label.upper() != 'ROOT':
|
||||
if (LEFT, label) not in seen_actions:
|
||||
actions[LEFT].append(label)
|
||||
seen_actions.add((LEFT, label))
|
||||
for label in kwargs.get('right_labels', []):
|
||||
if label.upper() != 'ROOT':
|
||||
if (RIGHT, label) not in seen_actions:
|
||||
actions[RIGHT].append(label)
|
||||
seen_actions.add((RIGHT, label))
|
||||
|
||||
for raw_text, sents in kwargs.get('gold_parses', []):
|
||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||
|
@ -338,29 +338,39 @@ cdef class ArcEager(TransitionSystem):
|
|||
if head < child:
|
||||
if (RIGHT, label) not in seen_actions:
|
||||
actions[RIGHT].append(label)
|
||||
seen_actions.add((RIGHT, label))
|
||||
elif head > child:
|
||||
if (LEFT, label) not in seen_actions:
|
||||
actions[LEFT].append(label)
|
||||
seen_actions.add((LEFT, label))
|
||||
return actions
|
||||
|
||||
property action_types:
|
||||
def __get__(self):
|
||||
return (SHIFT, REDUCE, LEFT, RIGHT, BREAK)
|
||||
|
||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
||||
def has_gold(self, GoldParse gold, start=0, end=None):
|
||||
end = end or len(gold.heads)
|
||||
if all([tag is None for tag in gold.heads[start:end]]):
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def preprocess_gold(self, GoldParse gold):
|
||||
if not self.has_gold(gold):
|
||||
return None
|
||||
for i in range(gold.length):
|
||||
if gold.heads[i] is None: # Missing values
|
||||
gold.c.heads[i] = i
|
||||
gold.c.labels[i] = -1
|
||||
gold.c.has_dep[i] = False
|
||||
else:
|
||||
label = gold.labels[i]
|
||||
gold.c.has_dep[i] = True
|
||||
if label.upper() == 'ROOT':
|
||||
label = 'ROOT'
|
||||
gold.c.heads[i] = gold.heads[i]
|
||||
gold.c.labels[i] = self.strings[label]
|
||||
# Count frequencies, for use in encoder
|
||||
self.freqs[HEAD][gold.c.heads[i] - i] += 1
|
||||
self.freqs[DEP][gold.c.labels[i]] += 1
|
||||
gold.c.labels[i] = self.strings.add(label)
|
||||
return gold
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
if '-' in name:
|
||||
|
@ -374,14 +384,14 @@ cdef class ArcEager(TransitionSystem):
|
|||
if self.c[i].move == move and self.c[i].label == label:
|
||||
return self.c[i]
|
||||
|
||||
def move_name(self, int move, int label):
|
||||
def move_name(self, int move, attr_t label):
|
||||
label_str = self.strings[label]
|
||||
if label_str:
|
||||
return MOVE_NAMES[move] + '-' + label_str
|
||||
else:
|
||||
return MOVE_NAMES[move]
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||
# constructor with the function pointers
|
||||
cdef Transition t
|
||||
|
@ -414,9 +424,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
return t
|
||||
|
||||
cdef int initialize_state(self, StateC* st) nogil:
|
||||
# Ensure sent_start is set to 0 throughout
|
||||
for i in range(st.length):
|
||||
st._sent[i].sent_start = False
|
||||
st._sent[i].l_edge = i
|
||||
st._sent[i].r_edge = i
|
||||
st.fast_forward()
|
||||
|
@ -432,18 +440,19 @@ cdef class ArcEager(TransitionSystem):
|
|||
|
||||
cdef int set_valid(self, int* output, const StateC* st) nogil:
|
||||
cdef bint[N_MOVES] is_valid
|
||||
is_valid[SHIFT] = Shift.is_valid(st, -1)
|
||||
is_valid[REDUCE] = Reduce.is_valid(st, -1)
|
||||
is_valid[LEFT] = LeftArc.is_valid(st, -1)
|
||||
is_valid[RIGHT] = RightArc.is_valid(st, -1)
|
||||
is_valid[BREAK] = Break.is_valid(st, -1)
|
||||
is_valid[SHIFT] = Shift.is_valid(st, 0)
|
||||
is_valid[REDUCE] = Reduce.is_valid(st, 0)
|
||||
is_valid[LEFT] = LeftArc.is_valid(st, 0)
|
||||
is_valid[RIGHT] = RightArc.is_valid(st, 0)
|
||||
is_valid[BREAK] = Break.is_valid(st, 0)
|
||||
cdef int i
|
||||
for i in range(self.n_moves):
|
||||
output[i] = is_valid[self.c[i].move]
|
||||
|
||||
cdef int set_costs(self, int* is_valid, weight_t* costs,
|
||||
StateClass stcls, GoldParse gold) except -1:
|
||||
cdef int i, move, label
|
||||
cdef int i, move
|
||||
cdef attr_t label
|
||||
cdef label_cost_func_t[N_MOVES] label_cost_funcs
|
||||
cdef move_cost_func_t[N_MOVES] move_cost_funcs
|
||||
cdef weight_t[N_MOVES] move_costs
|
||||
|
@ -461,7 +470,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
label_cost_funcs[RIGHT] = RightArc.label_cost
|
||||
label_cost_funcs[BREAK] = Break.label_cost
|
||||
|
||||
cdef int* labels = gold.c.labels
|
||||
cdef attr_t* labels = gold.c.labels
|
||||
cdef int* heads = gold.c.heads
|
||||
|
||||
n_gold = 0
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..parts_of_speech cimport NOUN, PROPN, PRON
|
||||
from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX
|
||||
|
||||
|
||||
def english_noun_chunks(obj):
|
||||
|
@ -12,9 +12,9 @@ def english_noun_chunks(obj):
|
|||
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
|
||||
'attr', 'ROOT']
|
||||
doc = obj.doc # Ensure works on both Doc and Span.
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings['conj']
|
||||
np_label = doc.vocab.strings['NP']
|
||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
conj = doc.vocab.strings.add('conj')
|
||||
np_label = doc.vocab.strings.add('NP')
|
||||
seen = set()
|
||||
for i, word in enumerate(obj):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
|
@ -48,9 +48,9 @@ def english_noun_chunks(obj):
|
|||
def german_noun_chunks(obj):
|
||||
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
|
||||
doc = obj.doc # Ensure works on both Doc and Span.
|
||||
np_label = doc.vocab.strings['NP']
|
||||
np_deps = set(doc.vocab.strings[label] for label in labels)
|
||||
close_app = doc.vocab.strings['nk']
|
||||
np_label = doc.vocab.strings.add('NP')
|
||||
np_deps = set(doc.vocab.strings.add(label) for label in labels)
|
||||
close_app = doc.vocab.strings.add('nk')
|
||||
|
||||
rbracket = 0
|
||||
for i, word in enumerate(obj):
|
||||
|
@ -66,4 +66,79 @@ def german_noun_chunks(obj):
|
|||
yield word.left_edge.i, rbracket, np_label
|
||||
|
||||
|
||||
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks}
|
||||
def es_noun_chunks(obj):
|
||||
doc = obj.doc
|
||||
np_label = doc.vocab.strings['NP']
|
||||
left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
|
||||
right_labels = ['flat', 'fixed', 'compound', 'neg']
|
||||
stop_labels = ['punct']
|
||||
np_left_deps = [doc.vocab.strings[label] for label in left_labels]
|
||||
np_right_deps = [doc.vocab.strings[label] for label in right_labels]
|
||||
stop_deps = [doc.vocab.strings[label] for label in stop_labels]
|
||||
|
||||
def next_token(token):
|
||||
try:
|
||||
return token.nbor()
|
||||
except:
|
||||
return None
|
||||
|
||||
def noun_bounds(root):
|
||||
def is_verb_token(token):
|
||||
return token.pos in [VERB, AUX]
|
||||
|
||||
left_bound = root
|
||||
for token in reversed(list(root.lefts)):
|
||||
if token.dep in np_left_deps:
|
||||
left_bound = token
|
||||
right_bound = root
|
||||
for token in root.rights:
|
||||
if (token.dep in np_right_deps):
|
||||
left, right = noun_bounds(token)
|
||||
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
|
||||
doc[left_bound.i: right.i])):
|
||||
break
|
||||
else:
|
||||
right_bound = right
|
||||
return left_bound, right_bound
|
||||
|
||||
token = doc[0]
|
||||
while token and token.i < len(doc):
|
||||
if token.pos in [PROPN, NOUN, PRON]:
|
||||
left, right = noun_bounds(token)
|
||||
yield left.i, right.i+1, np_label
|
||||
token = right
|
||||
token = next_token(token)
|
||||
|
||||
|
||||
def french_noun_chunks(obj):
|
||||
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
|
||||
doc = obj.doc # Ensure works on both Doc and Span.
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add('conj')
|
||||
np_label = doc.vocab.strings.add('NP')
|
||||
seen = set()
|
||||
for i, word in enumerate(obj):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.i in seen:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||
elif word.dep == conj:
|
||||
head = word.head
|
||||
while head.dep == conj and head.head.i < head.i:
|
||||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||
|
||||
|
||||
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
|
||||
'es': es_noun_chunks, 'fr': french_noun_chunks}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from .transition_system cimport TransitionSystem
|
||||
from .transition_system cimport Transition
|
||||
from ..gold cimport GoldParseC
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
|
||||
cdef class BiluoPushDown(TransitionSystem):
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from thinc.typedefs cimport weight_t
|
||||
from collections import OrderedDict
|
||||
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
|
@ -51,17 +52,29 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil:
|
|||
|
||||
|
||||
cdef class BiluoPushDown(TransitionSystem):
|
||||
def __init__(self, *args, **kwargs):
|
||||
TransitionSystem.__init__(self, *args, **kwargs)
|
||||
|
||||
def __reduce__(self):
|
||||
labels_by_action = OrderedDict()
|
||||
cdef Transition t
|
||||
for trans in self.c[:self.n_moves]:
|
||||
label_str = self.strings[trans.label]
|
||||
labels_by_action.setdefault(trans.move, []).append(label_str)
|
||||
return (BiluoPushDown, (self.strings, labels_by_action),
|
||||
None, None)
|
||||
|
||||
@classmethod
|
||||
def get_actions(cls, **kwargs):
|
||||
actions = kwargs.get('actions',
|
||||
{
|
||||
MISSING: [''],
|
||||
BEGIN: [],
|
||||
IN: [],
|
||||
LAST: [],
|
||||
UNIT: [],
|
||||
OUT: ['']
|
||||
})
|
||||
OrderedDict((
|
||||
(MISSING, ['']),
|
||||
(BEGIN, []),
|
||||
(IN, []),
|
||||
(LAST, []),
|
||||
(UNIT, []),
|
||||
(OUT, [''])
|
||||
)))
|
||||
seen_entities = set()
|
||||
for entity_type in kwargs.get('entity_types', []):
|
||||
if entity_type in seen_entities:
|
||||
|
@ -87,32 +100,30 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
def __get__(self):
|
||||
return (BEGIN, IN, LAST, UNIT, OUT)
|
||||
|
||||
def move_name(self, int move, int label):
|
||||
def move_name(self, int move, attr_t label):
|
||||
if move == OUT:
|
||||
return 'O'
|
||||
elif move == 'MISSING':
|
||||
elif move == MISSING:
|
||||
return 'M'
|
||||
else:
|
||||
return MOVE_NAMES[move] + '-' + self.strings[label]
|
||||
|
||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
||||
def has_gold(self, GoldParse gold, start=0, end=None):
|
||||
end = end or len(gold.ner)
|
||||
if all([tag == '-' for tag in gold.ner[start:end]]):
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def preprocess_gold(self, GoldParse gold):
|
||||
if not self.has_gold(gold):
|
||||
return None
|
||||
for i in range(gold.length):
|
||||
gold.c.ner[i] = self.lookup_transition(gold.ner[i])
|
||||
# Count frequencies, for use in encoder
|
||||
if gold.c.ner[i].move in (BEGIN, UNIT):
|
||||
self.freqs[ENT_IOB][3] += 1
|
||||
self.freqs[ENT_TYPE][gold.c.ner[i].label] += 1
|
||||
elif gold.c.ner[i].move in (IN, LAST):
|
||||
self.freqs[ENT_IOB][2] += 1
|
||||
self.freqs[ENT_TYPE][0] += 1
|
||||
elif gold.c.ner[i].move == OUT:
|
||||
self.freqs[ENT_IOB][1] += 1
|
||||
self.freqs[ENT_TYPE][0] += 1
|
||||
else:
|
||||
self.freqs[ENT_IOB][1] += 1
|
||||
self.freqs[ENT_TYPE][0] += 1
|
||||
return gold
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
cdef attr_t label
|
||||
if name == '-' or name == None:
|
||||
move_str = 'M'
|
||||
label = 0
|
||||
|
@ -122,7 +133,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
if label_str.startswith('!'):
|
||||
label_str = label_str[1:]
|
||||
move_str = 'x'
|
||||
label = self.strings[label_str]
|
||||
label = self.strings.add(label_str)
|
||||
else:
|
||||
move_str = name
|
||||
label = 0
|
||||
|
@ -135,7 +146,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
else:
|
||||
raise KeyError(name)
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||
# constructor with the function pointers
|
||||
cdef Transition t
|
||||
|
@ -184,21 +195,21 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
|
||||
cdef class Missing:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* s, int label) nogil:
|
||||
cdef int transition(StateC* s, attr_t label) nogil:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return 9000
|
||||
|
||||
|
||||
cdef class Begin:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
# Ensure we don't clobber preset entities. If no entity preset,
|
||||
# ent_iob is 0
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
|
@ -222,16 +233,16 @@ cdef class Begin:
|
|||
return label != 0 and not st.entity_is_open()
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.open_ent(label)
|
||||
st.set_ent_tag(st.B(0), 3, label)
|
||||
st.push()
|
||||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
|
||||
if g_act == MISSING:
|
||||
return 0
|
||||
|
@ -251,7 +262,7 @@ cdef class Begin:
|
|||
|
||||
cdef class In:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
if preset_ent_iob == 2:
|
||||
return False
|
||||
|
@ -267,17 +278,17 @@ cdef class In:
|
|||
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.set_ent_tag(st.B(0), 1, label)
|
||||
st.push()
|
||||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
move = IN
|
||||
cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
|
||||
|
||||
if g_act == MISSING:
|
||||
|
@ -303,24 +314,24 @@ cdef class In:
|
|||
|
||||
cdef class Last:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
if st.B_(1).ent_iob == 1:
|
||||
return False
|
||||
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.close_ent()
|
||||
st.set_ent_tag(st.B(0), 1, label)
|
||||
st.push()
|
||||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
move = LAST
|
||||
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
|
||||
if g_act == MISSING:
|
||||
return 0
|
||||
|
@ -345,7 +356,7 @@ cdef class Last:
|
|||
|
||||
cdef class Unit:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
if preset_ent_iob == 2:
|
||||
return False
|
||||
|
@ -358,7 +369,7 @@ cdef class Unit:
|
|||
return label != 0 and not st.entity_is_open()
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.open_ent(label)
|
||||
st.close_ent()
|
||||
st.set_ent_tag(st.B(0), 3, label)
|
||||
|
@ -366,9 +377,9 @@ cdef class Unit:
|
|||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
|
||||
if g_act == MISSING:
|
||||
return 0
|
||||
|
@ -388,7 +399,7 @@ cdef class Unit:
|
|||
|
||||
cdef class Out:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
if preset_ent_iob == 3:
|
||||
return False
|
||||
|
@ -397,15 +408,15 @@ cdef class Out:
|
|||
return not st.entity_is_open()
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.set_ent_tag(st.B(0), 2, 0)
|
||||
st.push()
|
||||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
|
||||
if g_act == MISSING or g_act == ISNT:
|
||||
return 0
|
||||
|
|
|
@ -14,4 +14,8 @@ cdef class Parser:
|
|||
cdef readonly TransitionSystem moves
|
||||
cdef readonly object cfg
|
||||
|
||||
cdef void _parse_step(self, StateC* state,
|
||||
const float* feat_weights,
|
||||
int nr_class, int nr_feat, int nr_piece) nogil
|
||||
|
||||
#cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from collections import Counter
|
||||
from collections import Counter, OrderedDict
|
||||
import ujson
|
||||
import contextlib
|
||||
|
||||
|
@ -18,6 +18,7 @@ import dill
|
|||
import numpy.random
|
||||
cimport numpy as np
|
||||
|
||||
from libcpp.vector cimport vector
|
||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||
from cpython.exc cimport PyErr_CheckSignals
|
||||
from libc.stdint cimport uint32_t, uint64_t
|
||||
|
@ -28,26 +29,29 @@ from thinc.linear.avgtron cimport AveragedPerceptron
|
|||
from thinc.linalg cimport VecVec
|
||||
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
|
||||
from thinc.extra.eg cimport Example
|
||||
|
||||
from cymem.cymem cimport Pool, Address
|
||||
from murmurhash.mrmr cimport hash64
|
||||
from preshed.maps cimport MapStruct
|
||||
from preshed.maps cimport map_get
|
||||
|
||||
from thinc.api import layerize, chain
|
||||
from thinc.api import layerize, chain, noop, clone
|
||||
from thinc.neural import Model, Affine, ELU, ReLu, Maxout
|
||||
from thinc.neural.ops import NumpyOps
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.util import get_array_module
|
||||
|
||||
from .. import util
|
||||
from ..util import get_async, get_cuda_stream
|
||||
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
||||
from .._ml import Tok2Vec, doc2feats
|
||||
from .._ml import Tok2Vec, doc2feats, rebatch
|
||||
from ..compat import json_dumps
|
||||
|
||||
from . import _parse_features
|
||||
from ._parse_features cimport CONTEXT_SIZE
|
||||
from ._parse_features cimport fill_context
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
from .nonproj import PseudoProjectivity
|
||||
from . import nonproj
|
||||
from .transition_system import OracleError
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
from ..structs cimport TokenC
|
||||
|
@ -104,55 +108,64 @@ cdef class precompute_hiddens:
|
|||
cached = gpu_cached
|
||||
self.nF = cached.shape[1]
|
||||
self.nO = cached.shape[2]
|
||||
self.nP = cached.shape[3]
|
||||
self.nP = getattr(lower_model, 'nP', 1)
|
||||
self.ops = lower_model.ops
|
||||
self._features = numpy.zeros((batch_size, self.nO, self.nP), dtype='f')
|
||||
self._features = numpy.zeros((batch_size, self.nO*self.nP), dtype='f')
|
||||
self._is_synchronized = False
|
||||
self._cuda_stream = cuda_stream
|
||||
self._cached = cached
|
||||
self._bp_hiddens = bp_features
|
||||
|
||||
cdef const float* get_feat_weights(self) except NULL:
|
||||
if not self._is_synchronized \
|
||||
and self._cuda_stream is not None:
|
||||
self._cuda_stream.synchronize()
|
||||
self._is_synchronized = True
|
||||
return <float*>self._cached.data
|
||||
|
||||
def __call__(self, X):
|
||||
return self.begin_update(X)[0]
|
||||
|
||||
def begin_update(self, token_ids, drop=0.):
|
||||
self._features.fill(0)
|
||||
if not self._is_synchronized \
|
||||
and self._cuda_stream is not None:
|
||||
self._cuda_stream.synchronize()
|
||||
self._is_synchronized = True
|
||||
# This is tricky, but (assuming GPU available);
|
||||
# - Input to forward on CPU
|
||||
# - Output from forward on CPU
|
||||
# - Input to backward on GPU!
|
||||
# - Output from backward on GPU
|
||||
cdef np.ndarray state_vector = self._features[:len(token_ids)]
|
||||
cdef np.ndarray hiddens = self._cached
|
||||
bp_hiddens = self._bp_hiddens
|
||||
|
||||
feat_weights = self.get_feat_weights()
|
||||
cdef int[:, ::1] ids = token_ids
|
||||
self._sum_features(<float*>state_vector.data,
|
||||
<float*>hiddens.data, &ids[0,0],
|
||||
sum_state_features(<float*>state_vector.data,
|
||||
feat_weights, &ids[0,0],
|
||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
||||
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
||||
|
||||
output, bp_output = self._apply_nonlinearity(state_vector)
|
||||
|
||||
def backward(d_output, sgd=None):
|
||||
def backward(d_state_vector, sgd=None):
|
||||
if bp_nonlinearity is not None:
|
||||
d_state_vector = bp_nonlinearity(d_state_vector, sgd)
|
||||
# This will usually be on GPU
|
||||
if isinstance(d_output, numpy.ndarray):
|
||||
d_output = self.ops.xp.array(d_output)
|
||||
d_state_vector = bp_output(d_output, sgd)
|
||||
if isinstance(d_state_vector, numpy.ndarray):
|
||||
d_state_vector = self.ops.xp.array(d_state_vector)
|
||||
d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
|
||||
return d_tokens
|
||||
return output, backward
|
||||
return state_vector, backward
|
||||
|
||||
def _apply_nonlinearity(self, X):
|
||||
if self.nP < 2:
|
||||
return X.reshape(X.shape[:2]), lambda dX, sgd=None: dX.reshape(X.shape)
|
||||
best, which = self.ops.maxout(X)
|
||||
return best, lambda dX, sgd=None: self.ops.backprop_maxout(dX, which, self.nP)
|
||||
def _nonlinearity(self, state_vector):
|
||||
if self.nP == 1:
|
||||
return state_vector, None
|
||||
state_vector = state_vector.reshape(
|
||||
(state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
|
||||
best, which = self.ops.maxout(state_vector)
|
||||
def backprop(d_best, sgd=None):
|
||||
return self.ops.backprop_maxout(d_best, which, self.nP)
|
||||
return best, backprop
|
||||
|
||||
cdef void _sum_features(self, float* output,
|
||||
|
||||
|
||||
cdef void sum_state_features(float* output,
|
||||
const float* cached, const int* token_ids, int B, int F, int O) nogil:
|
||||
cdef int idx, b, f, i
|
||||
cdef const float* feature
|
||||
|
@ -220,25 +233,43 @@ cdef class Parser:
|
|||
Base class of the DependencyParser and EntityRecognizer.
|
||||
"""
|
||||
@classmethod
|
||||
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg):
|
||||
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg):
|
||||
depth = util.env_opt('parser_hidden_depth', depth)
|
||||
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
||||
hidden_width = util.env_opt('hidden_width', hidden_width)
|
||||
maxout_pieces = util.env_opt('parser_maxout_pieces', 1)
|
||||
lower = PrecomputableMaxouts(hidden_width,
|
||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
|
||||
if parser_maxout_pieces == 1:
|
||||
lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
|
||||
nF=cls.nr_feature,
|
||||
nI=token_vector_width,
|
||||
pieces=maxout_pieces)
|
||||
nI=token_vector_width)
|
||||
else:
|
||||
lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class,
|
||||
nF=cls.nr_feature,
|
||||
nP=parser_maxout_pieces,
|
||||
nI=token_vector_width)
|
||||
|
||||
with Model.use_device('cpu'):
|
||||
if depth == 0:
|
||||
upper = chain()
|
||||
upper.is_noop = True
|
||||
else:
|
||||
upper = chain(
|
||||
Maxout(hidden_width),
|
||||
zero_init(Affine(nr_class))
|
||||
clone(Maxout(hidden_width), (depth-1)),
|
||||
zero_init(Affine(nr_class, drop_factor=0.0))
|
||||
)
|
||||
upper.is_noop = False
|
||||
# TODO: This is an unfortunate hack atm!
|
||||
# Used to set input dimensions in network.
|
||||
lower.begin_training(lower.ops.allocate((500, token_vector_width)))
|
||||
upper.begin_training(upper.ops.allocate((500, hidden_width)))
|
||||
return lower, upper
|
||||
cfg = {
|
||||
'nr_class': nr_class,
|
||||
'depth': depth,
|
||||
'token_vector_width': token_vector_width,
|
||||
'hidden_width': hidden_width,
|
||||
'maxout_pieces': parser_maxout_pieces
|
||||
}
|
||||
return (lower, upper), cfg
|
||||
|
||||
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
||||
"""
|
||||
|
@ -274,7 +305,7 @@ cdef class Parser:
|
|||
def __reduce__(self):
|
||||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||
|
||||
def __call__(self, Doc tokens, state=None):
|
||||
def __call__(self, Doc doc):
|
||||
"""
|
||||
Apply the parser or entity recognizer, setting the annotations onto the Doc object.
|
||||
|
||||
|
@ -283,10 +314,11 @@ cdef class Parser:
|
|||
Returns:
|
||||
None
|
||||
"""
|
||||
self.parse_batch([tokens], state['tokvecs'])
|
||||
return state
|
||||
states = self.parse_batch([doc], [doc.tensor])
|
||||
self.set_annotations([doc], states)
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, int batch_size=1000, int n_threads=2):
|
||||
def pipe(self, docs, int batch_size=1000, int n_threads=2):
|
||||
"""
|
||||
Process a stream of documents.
|
||||
|
||||
|
@ -301,93 +333,217 @@ cdef class Parser:
|
|||
cdef StateClass parse_state
|
||||
cdef Doc doc
|
||||
queue = []
|
||||
for batch in cytoolz.partition_all(batch_size, stream):
|
||||
batch = list(batch)
|
||||
docs, states = zip(*batch)
|
||||
parse_states = self.parse_batch(docs, states[0]['tokvecs'])
|
||||
for docs in cytoolz.partition_all(batch_size, docs):
|
||||
docs = list(docs)
|
||||
tokvecs = [d.tensor for d in docs]
|
||||
parse_states = self.parse_batch(docs, tokvecs)
|
||||
self.set_annotations(docs, parse_states)
|
||||
yield from zip(docs, states)
|
||||
yield from docs
|
||||
|
||||
def parse_batch(self, docs, tokvecses):
|
||||
cdef:
|
||||
precompute_hiddens state2vec
|
||||
StateClass state
|
||||
Pool mem
|
||||
const float* feat_weights
|
||||
StateC* st
|
||||
vector[StateC*] next_step, this_step
|
||||
int nr_class, nr_feat, nr_piece, nr_dim, nr_state
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
|
||||
tokvecs = self.model[0].ops.flatten(tokvecses)
|
||||
|
||||
nr_state = len(docs)
|
||||
nr_class = self.moves.n_moves
|
||||
nr_dim = tokvecs.shape[1]
|
||||
nr_feat = self.nr_feature
|
||||
|
||||
def parse_batch(self, docs, tokvecs):
|
||||
cuda_stream = get_cuda_stream()
|
||||
state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs,
|
||||
cuda_stream, 0.0)
|
||||
nr_piece = state2vec.nP
|
||||
|
||||
states = self.moves.init_batch(docs)
|
||||
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs,
|
||||
cuda_stream, 0.0)
|
||||
for state in states:
|
||||
if not state.c.is_final():
|
||||
next_step.push_back(state.c)
|
||||
|
||||
todo = [st for st in states if not st.is_final()]
|
||||
while todo:
|
||||
token_ids = self.get_token_ids(states)
|
||||
vectors = state2vec(token_ids)
|
||||
feat_weights = state2vec.get_feat_weights()
|
||||
cdef int i
|
||||
cdef np.ndarray token_ids = numpy.zeros((nr_state, nr_feat), dtype='i')
|
||||
cdef np.ndarray is_valid = numpy.zeros((nr_state, nr_class), dtype='i')
|
||||
cdef np.ndarray scores
|
||||
c_token_ids = <int*>token_ids.data
|
||||
c_is_valid = <int*>is_valid.data
|
||||
cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
|
||||
while not next_step.empty():
|
||||
if not has_hidden:
|
||||
for i in cython.parallel.prange(
|
||||
next_step.size(), num_threads=6, nogil=True):
|
||||
self._parse_step(next_step[i],
|
||||
feat_weights, nr_class, nr_feat, nr_piece)
|
||||
else:
|
||||
for i in range(next_step.size()):
|
||||
st = next_step[i]
|
||||
st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
|
||||
self.moves.set_valid(&c_is_valid[i*nr_class], st)
|
||||
vectors = state2vec(token_ids[:next_step.size()])
|
||||
scores = vec2scores(vectors)
|
||||
self.transition_batch(states, scores)
|
||||
todo = [st for st in states if not st.is_final()]
|
||||
c_scores = <float*>scores.data
|
||||
for i in range(next_step.size()):
|
||||
st = next_step[i]
|
||||
guess = arg_max_if_valid(
|
||||
&c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
|
||||
action = self.moves.c[guess]
|
||||
action.do(st, action.label)
|
||||
this_step, next_step = next_step, this_step
|
||||
next_step.clear()
|
||||
for st in this_step:
|
||||
if not st.is_final():
|
||||
next_step.push_back(st)
|
||||
return states
|
||||
|
||||
def update(self, docs, golds, state=None, drop=0., sgd=None):
|
||||
assert state is not None
|
||||
assert 'tokvecs' in state
|
||||
assert 'bp_tokvecs' in state
|
||||
cdef void _parse_step(self, StateC* state,
|
||||
const float* feat_weights,
|
||||
int nr_class, int nr_feat, int nr_piece) nogil:
|
||||
'''This only works with no hidden layers -- fast but inaccurate'''
|
||||
#for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
|
||||
# self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
|
||||
token_ids = <int*>calloc(nr_feat, sizeof(int))
|
||||
scores = <float*>calloc(nr_class * nr_piece, sizeof(float))
|
||||
is_valid = <int*>calloc(nr_class, sizeof(int))
|
||||
|
||||
state.set_context_tokens(token_ids, nr_feat)
|
||||
sum_state_features(scores,
|
||||
feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece)
|
||||
self.moves.set_valid(is_valid, state)
|
||||
guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece)
|
||||
action = self.moves.c[guess]
|
||||
action.do(state, action.label)
|
||||
|
||||
free(is_valid)
|
||||
free(scores)
|
||||
free(token_ids)
|
||||
|
||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||
if losses is not None and self.name not in losses:
|
||||
losses[self.name] = 0.
|
||||
docs, tokvec_lists = docs_tokvecs
|
||||
tokvecs = self.model[0].ops.flatten(tokvec_lists)
|
||||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||
docs = [docs]
|
||||
golds = [golds]
|
||||
|
||||
cuda_stream = get_cuda_stream()
|
||||
for gold in golds:
|
||||
self.moves.preprocess_gold(gold)
|
||||
|
||||
tokvecs = state['tokvecs']
|
||||
bp_tokvecs = state['bp_tokvecs']
|
||||
|
||||
states = self.moves.init_batch(docs)
|
||||
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
||||
state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
|
||||
drop)
|
||||
|
||||
todo = [(s, g) for s, g in zip(states, golds) if not s.is_final()]
|
||||
0.0)
|
||||
todo = [(s, g) for (s, g) in zip(states, golds)
|
||||
if not s.is_final() and g is not None]
|
||||
if not todo:
|
||||
return None
|
||||
|
||||
backprops = []
|
||||
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
|
||||
cdef float loss = 0.
|
||||
cutoff = max(1, len(todo) // 10)
|
||||
while len(todo) >= cutoff:
|
||||
n_steps = 0
|
||||
while todo:
|
||||
states, golds = zip(*todo)
|
||||
|
||||
token_ids = self.get_token_ids(states)
|
||||
vector, bp_vector = state2vec.begin_update(token_ids, drop=drop)
|
||||
vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
|
||||
if drop != 0:
|
||||
mask = vec2scores.ops.get_dropout_mask(vector.shape, drop)
|
||||
vector *= mask
|
||||
scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
|
||||
|
||||
d_scores = self.get_batch_loss(states, golds, scores)
|
||||
d_vector = bp_scores(d_scores, sgd=sgd)
|
||||
loss += (d_scores**2).sum()
|
||||
d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
|
||||
if drop != 0:
|
||||
d_vector *= mask
|
||||
|
||||
if not isinstance(tokvecs, state2vec.ops.xp.ndarray):
|
||||
backprops.append((token_ids, d_vector, bp_vector))
|
||||
else:
|
||||
if isinstance(self.model[0].ops, CupyOps) \
|
||||
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
||||
# Move token_ids and d_vector to CPU, asynchronously
|
||||
backprops.append((
|
||||
get_async(cuda_stream, token_ids),
|
||||
get_async(cuda_stream, d_vector),
|
||||
bp_vector
|
||||
))
|
||||
else:
|
||||
backprops.append((token_ids, d_vector, bp_vector))
|
||||
self.transition_batch(states, scores)
|
||||
todo = [st for st in todo if not st[0].is_final()]
|
||||
if losses is not None:
|
||||
losses[self.name] += (d_scores**2).sum()
|
||||
n_steps += 1
|
||||
if n_steps >= max_steps:
|
||||
break
|
||||
self._make_updates(d_tokvecs,
|
||||
backprops, sgd, cuda_stream)
|
||||
return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
|
||||
|
||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||
"""Make a square batch, of length equal to the shortest doc. A long
|
||||
doc will get multiple states. Let's say we have a doc of length 2*N,
|
||||
where N is the shortest doc. We'll make two states, one representing
|
||||
long_doc[:N], and another representing long_doc[N:]."""
|
||||
cdef:
|
||||
StateClass state
|
||||
Transition action
|
||||
whole_states = self.moves.init_batch(whole_docs)
|
||||
max_length = max(5, min(50, min([len(doc) for doc in whole_docs])))
|
||||
max_moves = 0
|
||||
states = []
|
||||
golds = []
|
||||
for doc, state, gold in zip(whole_docs, whole_states, whole_golds):
|
||||
gold = self.moves.preprocess_gold(gold)
|
||||
if gold is None:
|
||||
continue
|
||||
oracle_actions = self.moves.get_oracle_sequence(doc, gold)
|
||||
start = 0
|
||||
while start < len(doc):
|
||||
state = state.copy()
|
||||
n_moves = 0
|
||||
while state.B(0) < start and not state.is_final():
|
||||
action = self.moves.c[oracle_actions.pop(0)]
|
||||
action.do(state.c, action.label)
|
||||
n_moves += 1
|
||||
has_gold = self.moves.has_gold(gold, start=start,
|
||||
end=start+max_length)
|
||||
if not state.is_final() and has_gold:
|
||||
states.append(state)
|
||||
golds.append(gold)
|
||||
max_moves = max(max_moves, n_moves)
|
||||
start += min(max_length, len(doc)-start)
|
||||
max_moves = max(max_moves, len(oracle_actions))
|
||||
return states, golds, max_moves
|
||||
|
||||
def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
|
||||
# Tells CUDA to block, so our async copies complete.
|
||||
if cuda_stream is not None:
|
||||
cuda_stream.synchronize()
|
||||
d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
|
||||
xp = state2vec.ops.xp # Handle for numpy/cupy
|
||||
for token_ids, d_vector, bp_vector in backprops:
|
||||
xp = get_array_module(d_tokvecs)
|
||||
for ids, d_vector, bp_vector in backprops:
|
||||
d_state_features = bp_vector(d_vector, sgd=sgd)
|
||||
active_feats = token_ids * (token_ids >= 0)
|
||||
active_feats = active_feats.reshape((token_ids.shape[0], token_ids.shape[1], 1))
|
||||
active_feats = ids * (ids >= 0)
|
||||
active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1))
|
||||
if hasattr(xp, 'scatter_add'):
|
||||
xp.scatter_add(d_tokvecs,
|
||||
token_ids, d_state_features * active_feats)
|
||||
ids, d_state_features * active_feats)
|
||||
else:
|
||||
xp.add.at(d_tokvecs,
|
||||
token_ids, d_state_features * active_feats)
|
||||
bp_tokvecs(d_tokvecs, sgd)
|
||||
state['parser_loss'] = loss
|
||||
return state
|
||||
ids, d_state_features * active_feats)
|
||||
|
||||
@property
|
||||
def move_names(self):
|
||||
names = []
|
||||
for i in range(self.moves.n_moves):
|
||||
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
|
||||
names.append(name)
|
||||
return names
|
||||
|
||||
def get_batch_model(self, batch_size, tokvecs, stream, dropout):
|
||||
lower, upper = self.model
|
||||
|
@ -400,9 +556,12 @@ cdef class Parser:
|
|||
def get_token_ids(self, states):
|
||||
cdef StateClass state
|
||||
cdef int n_tokens = self.nr_feature
|
||||
ids = numpy.zeros((len(states), n_tokens), dtype='i', order='C')
|
||||
cdef np.ndarray ids = numpy.zeros((len(states), n_tokens),
|
||||
dtype='i', order='C')
|
||||
c_ids = <int*>ids.data
|
||||
for i, state in enumerate(states):
|
||||
state.set_context_tokens(ids[i])
|
||||
state.c.set_context_tokens(c_ids, n_tokens)
|
||||
c_ids += ids.shape[1]
|
||||
return ids
|
||||
|
||||
def transition_batch(self, states, float[:, ::1] scores):
|
||||
|
@ -445,7 +604,6 @@ cdef class Parser:
|
|||
self.moves.finalize_doc(doc)
|
||||
|
||||
def add_label(self, label):
|
||||
# Doesn't set label into serializer -- subclasses override it to do that.
|
||||
for action in self.moves.action_types:
|
||||
added = self.moves.add_action(action, label)
|
||||
if added:
|
||||
|
@ -456,12 +614,18 @@ cdef class Parser:
|
|||
def begin_training(self, gold_tuples, **cfg):
|
||||
if 'model' in cfg:
|
||||
self.model = cfg['model']
|
||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
||||
actions = self.moves.get_actions(gold_parses=gold_tuples)
|
||||
for action, labels in actions.items():
|
||||
for label in labels:
|
||||
self.moves.add_action(action, label)
|
||||
if self.model is True:
|
||||
self.model = self.Model(self.moves.n_moves, **cfg)
|
||||
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
|
||||
self.cfg.update(cfg)
|
||||
|
||||
def preprocess_gold(self, docs_golds):
|
||||
for doc, gold in docs_golds:
|
||||
yield doc, gold
|
||||
|
||||
def use_params(self, params):
|
||||
# Can't decorate cdef class :(. Workaround.
|
||||
|
@ -469,21 +633,75 @@ cdef class Parser:
|
|||
with self.model[1].use_params(params):
|
||||
yield
|
||||
|
||||
def to_disk(self, path):
|
||||
def to_disk(self, path, **exclude):
|
||||
serializers = {
|
||||
'lower_model': lambda p: p.open('wb').write(
|
||||
self.model[0].to_bytes()),
|
||||
'upper_model': lambda p: p.open('wb').write(
|
||||
self.model[1].to_bytes()),
|
||||
'vocab': lambda p: self.vocab.to_disk(p),
|
||||
'moves': lambda p: self.moves.to_disk(p, strings=False),
|
||||
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
|
||||
}
|
||||
util.to_disk(path, serializers, exclude)
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
deserializers = {
|
||||
'vocab': lambda p: self.vocab.from_disk(p),
|
||||
'moves': lambda p: self.moves.from_disk(p, strings=False),
|
||||
'cfg': lambda p: self.cfg.update(ujson.load(p.open())),
|
||||
'model': lambda p: None
|
||||
}
|
||||
util.from_disk(path, deserializers, exclude)
|
||||
if 'model' not in exclude:
|
||||
path = util.ensure_path(path)
|
||||
with (path / 'model.bin').open('wb') as file_:
|
||||
dill.dump(self.model, file_)
|
||||
if self.model is True:
|
||||
self.model, cfg = self.Model(**self.cfg)
|
||||
else:
|
||||
cfg = {}
|
||||
with (path / 'lower_model').open('rb') as file_:
|
||||
bytes_data = file_.read()
|
||||
self.model[0].from_bytes(bytes_data)
|
||||
with (path / 'upper_model').open('rb') as file_:
|
||||
bytes_data = file_.read()
|
||||
self.model[1].from_bytes(bytes_data)
|
||||
self.cfg.update(cfg)
|
||||
return self
|
||||
|
||||
def from_disk(self, path):
|
||||
path = util.ensure_path(path)
|
||||
with (path / 'model.bin').open('wb') as file_:
|
||||
self.model = dill.load(file_)
|
||||
def to_bytes(self, **exclude):
|
||||
serializers = OrderedDict((
|
||||
('lower_model', lambda: self.model[0].to_bytes()),
|
||||
('upper_model', lambda: self.model[1].to_bytes()),
|
||||
('vocab', lambda: self.vocab.to_bytes()),
|
||||
('moves', lambda: self.moves.to_bytes(strings=False)),
|
||||
('cfg', lambda: ujson.dumps(self.cfg))
|
||||
))
|
||||
if 'model' in exclude:
|
||||
exclude['lower_model'] = True
|
||||
exclude['upper_model'] = True
|
||||
exclude.pop('model')
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def to_bytes(self):
|
||||
pass
|
||||
|
||||
def from_bytes(self, data):
|
||||
pass
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
deserializers = OrderedDict((
|
||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
||||
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
||||
('lower_model', lambda b: None),
|
||||
('upper_model', lambda b: None)
|
||||
))
|
||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||
if 'model' not in exclude:
|
||||
if self.model is True:
|
||||
self.model, cfg = self.Model(self.moves.n_moves)
|
||||
else:
|
||||
cfg = {}
|
||||
if 'lower_model' in msg:
|
||||
self.model[0].from_bytes(msg['lower_model'])
|
||||
if 'upper_model' in msg:
|
||||
self.model[1].from_bytes(msg['upper_model'])
|
||||
self.cfg.update(cfg)
|
||||
return self
|
||||
|
||||
|
||||
class ParserStateError(ValueError):
|
||||
|
@ -521,6 +739,19 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
|
|||
return best
|
||||
|
||||
|
||||
cdef int arg_maxout_if_valid(const weight_t* scores, const int* is_valid,
|
||||
int n, int nP) nogil:
|
||||
cdef int best = -1
|
||||
cdef float best_score = 0
|
||||
for i in range(n):
|
||||
if is_valid[i] >= 1:
|
||||
for j in range(nP):
|
||||
if best == -1 or scores[i*nP+j] > best_score:
|
||||
best = i
|
||||
best_score = scores[i*nP+j]
|
||||
return best
|
||||
|
||||
|
||||
cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
|
||||
int nr_class) except -1:
|
||||
cdef weight_t score = 0
|
||||
|
|
|
@ -1,10 +1,17 @@
|
|||
# coding: utf-8
|
||||
"""
|
||||
Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
|
||||
for doing pseudo-projective parsing implementation uses the HEAD decoration
|
||||
scheme.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
from copy import copy
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..attrs import DEP, HEAD
|
||||
|
||||
DELIMITER = '||'
|
||||
|
||||
|
||||
def ancestors(tokenid, heads):
|
||||
# returns all words going from the word up the path to the root
|
||||
|
@ -60,95 +67,77 @@ def is_nonproj_tree(heads):
|
|||
return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
|
||||
|
||||
|
||||
class PseudoProjectivity:
|
||||
# implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
|
||||
# for doing pseudo-projective parsing
|
||||
# implementation uses the HEAD decoration scheme
|
||||
def decompose(label):
|
||||
return label.partition(DELIMITER)[::2]
|
||||
|
||||
delimiter = '||'
|
||||
|
||||
@classmethod
|
||||
def decompose(cls, label):
|
||||
return label.partition(cls.delimiter)[::2]
|
||||
def is_decorated(label):
|
||||
return label.find(DELIMITER) != -1
|
||||
|
||||
@classmethod
|
||||
def is_decorated(cls, label):
|
||||
return label.find(cls.delimiter) != -1
|
||||
|
||||
@classmethod
|
||||
def preprocess_training_data(cls, gold_tuples, label_freq_cutoff=30):
|
||||
def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
|
||||
preprocessed = []
|
||||
freqs = {}
|
||||
for raw_text, sents in gold_tuples:
|
||||
prepro_sents = []
|
||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||
proj_heads,deco_labels = cls.projectivize(heads,labels)
|
||||
proj_heads,deco_labels = projectivize(heads,labels)
|
||||
# set the label to ROOT for each root dependent
|
||||
deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
|
||||
# count label frequencies
|
||||
if label_freq_cutoff > 0:
|
||||
for label in deco_labels:
|
||||
if cls.is_decorated(label):
|
||||
if is_decorated(label):
|
||||
freqs[label] = freqs.get(label,0) + 1
|
||||
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
|
||||
preprocessed.append((raw_text, prepro_sents))
|
||||
|
||||
if label_freq_cutoff > 0:
|
||||
return cls._filter_labels(preprocessed,label_freq_cutoff,freqs)
|
||||
return _filter_labels(preprocessed,label_freq_cutoff,freqs)
|
||||
return preprocessed
|
||||
|
||||
|
||||
@classmethod
|
||||
def projectivize(cls, heads, labels):
|
||||
def projectivize(heads, labels):
|
||||
# use the algorithm by Nivre & Nilsson 2005
|
||||
# assumes heads to be a proper tree, i.e. connected and cycle-free
|
||||
# returns a new pair (heads,labels) which encode
|
||||
# a projective and decorated tree
|
||||
proj_heads = copy(heads)
|
||||
smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads)
|
||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
||||
if smallest_np_arc == None: # this sentence is already projective
|
||||
return proj_heads, copy(labels)
|
||||
while smallest_np_arc != None:
|
||||
cls._lift(smallest_np_arc, proj_heads)
|
||||
smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads)
|
||||
deco_labels = cls._decorate(heads, proj_heads, labels)
|
||||
_lift(smallest_np_arc, proj_heads)
|
||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
||||
deco_labels = _decorate(heads, proj_heads, labels)
|
||||
return proj_heads, deco_labels
|
||||
|
||||
|
||||
@classmethod
|
||||
def deprojectivize(cls, tokens):
|
||||
def deprojectivize(tokens):
|
||||
# reattach arcs with decorated labels (following HEAD scheme)
|
||||
# for each decorated arc X||Y, search top-down, left-to-right,
|
||||
# breadth-first until hitting a Y then make this the new head
|
||||
#parse = tokens.to_array([HEAD, DEP])
|
||||
for token in tokens:
|
||||
if cls.is_decorated(token.dep_):
|
||||
newlabel,headlabel = cls.decompose(token.dep_)
|
||||
newhead = cls._find_new_head(token,headlabel)
|
||||
if is_decorated(token.dep_):
|
||||
newlabel,headlabel = decompose(token.dep_)
|
||||
newhead = _find_new_head(token,headlabel)
|
||||
token.head = newhead
|
||||
token.dep_ = newlabel
|
||||
return tokens
|
||||
|
||||
# tokens.attach(token,newhead,newlabel)
|
||||
#parse[token.i,1] = tokens.vocab.strings[newlabel]
|
||||
#parse[token.i,0] = newhead.i - token.i
|
||||
#tokens.from_array([HEAD, DEP],parse)
|
||||
|
||||
|
||||
@classmethod
|
||||
def _decorate(cls, heads, proj_heads, labels):
|
||||
def _decorate(heads, proj_heads, labels):
|
||||
# uses decoration scheme HEAD from Nivre & Nilsson 2005
|
||||
assert(len(heads) == len(proj_heads) == len(labels))
|
||||
deco_labels = []
|
||||
for tokenid,head in enumerate(heads):
|
||||
if head != proj_heads[tokenid]:
|
||||
deco_labels.append('%s%s%s' % (labels[tokenid],cls.delimiter,labels[head]))
|
||||
deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
|
||||
else:
|
||||
deco_labels.append(labels[tokenid])
|
||||
return deco_labels
|
||||
|
||||
|
||||
@classmethod
|
||||
def _get_smallest_nonproj_arc(cls, heads):
|
||||
def _get_smallest_nonproj_arc(heads):
|
||||
# return the smallest non-proj arc or None
|
||||
# where size is defined as the distance between dep and head
|
||||
# and ties are broken left to right
|
||||
|
@ -162,8 +151,7 @@ class PseudoProjectivity:
|
|||
return smallest_np_arc
|
||||
|
||||
|
||||
@classmethod
|
||||
def _lift(cls, tokenid, heads):
|
||||
def _lift(tokenid, heads):
|
||||
# reattaches a word to it's grandfather
|
||||
head = heads[tokenid]
|
||||
ghead = heads[head]
|
||||
|
@ -171,8 +159,7 @@ class PseudoProjectivity:
|
|||
heads[tokenid] = ghead if head != ghead else tokenid
|
||||
|
||||
|
||||
@classmethod
|
||||
def _find_new_head(cls, token, headlabel):
|
||||
def _find_new_head(token, headlabel):
|
||||
# search through the tree starting from the head of the given token
|
||||
# returns the id of the first descendant with the given label
|
||||
# if there is none, return the current head (no change)
|
||||
|
@ -190,15 +177,14 @@ class PseudoProjectivity:
|
|||
return token.head
|
||||
|
||||
|
||||
@classmethod
|
||||
def _filter_labels(cls, gold_tuples, cutoff, freqs):
|
||||
def _filter_labels(gold_tuples, cutoff, freqs):
|
||||
# throw away infrequent decorated labels
|
||||
# can't learn them reliably anyway and keeps label set smaller
|
||||
filtered = []
|
||||
for raw_text, sents in gold_tuples:
|
||||
filtered_sents = []
|
||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||
filtered_labels = [ cls.decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
|
||||
filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
|
||||
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
|
||||
filtered.append((raw_text, filtered_sents))
|
||||
return filtered
|
||||
|
|
|
@ -33,7 +33,6 @@ from ._parse_features cimport CONTEXT_SIZE
|
|||
from ._parse_features cimport fill_context
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
from .nonproj import PseudoProjectivity
|
||||
from .transition_system import OracleError
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
from ..structs cimport TokenC
|
||||
|
|
|
@ -4,6 +4,7 @@ from cymem.cymem cimport Pool
|
|||
cimport cython
|
||||
|
||||
from ..structs cimport TokenC, Entity
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
from ..vocab cimport EMPTY_LEXEME
|
||||
from ._state cimport StateC
|
||||
|
@ -105,19 +106,19 @@ cdef class StateClass:
|
|||
cdef inline void unshift(self) nogil:
|
||||
self.c.unshift()
|
||||
|
||||
cdef inline void add_arc(self, int head, int child, int label) nogil:
|
||||
cdef inline void add_arc(self, int head, int child, attr_t label) nogil:
|
||||
self.c.add_arc(head, child, label)
|
||||
|
||||
cdef inline void del_arc(self, int head, int child) nogil:
|
||||
self.c.del_arc(head, child)
|
||||
|
||||
cdef inline void open_ent(self, int label) nogil:
|
||||
cdef inline void open_ent(self, attr_t label) nogil:
|
||||
self.c.open_ent(label)
|
||||
|
||||
cdef inline void close_ent(self) nogil:
|
||||
self.c.close_ent()
|
||||
|
||||
cdef inline void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil:
|
||||
cdef inline void set_ent_tag(self, int i, int ent_iob, attr_t ent_type) nogil:
|
||||
self.c.set_ent_tag(i, ent_iob, ent_type)
|
||||
|
||||
cdef inline void set_break(self, int i) nogil:
|
||||
|
|
|
@ -41,6 +41,11 @@ cdef class StateClass:
|
|||
def is_final(self):
|
||||
return self.c.is_final()
|
||||
|
||||
def copy(self):
|
||||
cdef StateClass new_state = StateClass.init(self.c._sent, self.c.length)
|
||||
new_state.c.clone(self.c)
|
||||
return new_state
|
||||
|
||||
def print_state(self, words):
|
||||
words = list(words) + ['_']
|
||||
top = words[self.S(0)] + '_%d' % self.S_(0).head
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport weight_t
|
||||
|
||||
from ..typedefs cimport attr_t
|
||||
from ..structs cimport TokenC
|
||||
from ..gold cimport GoldParse
|
||||
from ..gold cimport GoldParseC
|
||||
|
@ -13,20 +14,22 @@ from ._state cimport StateC
|
|||
cdef struct Transition:
|
||||
int clas
|
||||
int move
|
||||
int label
|
||||
attr_t label
|
||||
|
||||
weight_t score
|
||||
|
||||
bint (*is_valid)(const StateC* state, int label) nogil
|
||||
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, int label) nogil
|
||||
int (*do)(StateC* state, int label) nogil
|
||||
bint (*is_valid)(const StateC* state, attr_t label) nogil
|
||||
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil
|
||||
int (*do)(StateC* state, attr_t label) nogil
|
||||
|
||||
|
||||
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
|
||||
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold,
|
||||
attr_tlabel) nogil
|
||||
ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
|
||||
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
|
||||
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC*
|
||||
gold, attr_t label) nogil
|
||||
|
||||
ctypedef int (*do_func_t)(StateC* state, int label) nogil
|
||||
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
|
||||
|
||||
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
|
||||
|
||||
|
@ -36,18 +39,16 @@ cdef class TransitionSystem:
|
|||
cdef Transition* c
|
||||
cdef readonly int n_moves
|
||||
cdef int _size
|
||||
cdef public int root_label
|
||||
cdef public attr_t root_label
|
||||
cdef public freqs
|
||||
cdef init_state_t init_beam_state
|
||||
|
||||
cdef int initialize_state(self, StateC* state) nogil
|
||||
cdef int finalize_state(self, StateC* state) nogil
|
||||
|
||||
cdef int preprocess_gold(self, GoldParse gold) except -1
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *
|
||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *
|
||||
|
||||
cdef int set_valid(self, int* output, const StateC* st) nogil
|
||||
|
||||
|
|
|
@ -5,11 +5,14 @@ from __future__ import unicode_literals
|
|||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport weight_t
|
||||
from collections import defaultdict
|
||||
from collections import defaultdict, OrderedDict
|
||||
import ujson
|
||||
|
||||
from .. import util
|
||||
from ..structs cimport TokenC
|
||||
from .stateclass cimport StateClass
|
||||
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
|
||||
cdef weight_t MIN_SCORE = -90000
|
||||
|
@ -26,7 +29,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
|||
|
||||
|
||||
cdef class TransitionSystem:
|
||||
def __init__(self, StringStore string_table, dict labels_by_action, _freqs=None):
|
||||
def __init__(self, StringStore string_table, labels_by_action):
|
||||
self.mem = Pool()
|
||||
self.strings = string_table
|
||||
self.n_moves = 0
|
||||
|
@ -34,28 +37,20 @@ cdef class TransitionSystem:
|
|||
|
||||
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
|
||||
|
||||
for action, label_strs in sorted(labels_by_action.items()):
|
||||
for action, label_strs in labels_by_action.items():
|
||||
for label_str in label_strs:
|
||||
self.add_action(int(action), label_str)
|
||||
self.root_label = self.strings['ROOT']
|
||||
self.freqs = {} if _freqs is None else _freqs
|
||||
for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
|
||||
self.freqs[attr] = defaultdict(int)
|
||||
self.freqs[attr][0] = 1
|
||||
# Ensure we've seen heads. Need an official dependency length limit...
|
||||
for i in range(10024):
|
||||
self.freqs[HEAD][i] = 1
|
||||
self.freqs[HEAD][-i] = 1
|
||||
self.root_label = self.strings.add('ROOT')
|
||||
self.init_beam_state = _init_state
|
||||
|
||||
def __reduce__(self):
|
||||
labels_by_action = {}
|
||||
labels_by_action = OrderedDict()
|
||||
cdef Transition t
|
||||
for trans in self.c[:self.n_moves]:
|
||||
label_str = self.strings[trans.label]
|
||||
labels_by_action.setdefault(trans.move, []).append(label_str)
|
||||
return (self.__class__,
|
||||
(self.strings, labels_by_action, self.freqs),
|
||||
(self.strings, labels_by_action),
|
||||
None, None)
|
||||
|
||||
def init_batch(self, docs):
|
||||
|
@ -69,6 +64,29 @@ cdef class TransitionSystem:
|
|||
offset += len(doc)
|
||||
return states
|
||||
|
||||
def get_oracle_sequence(self, doc, GoldParse gold):
|
||||
cdef Pool mem = Pool()
|
||||
costs = <float*>mem.alloc(self.n_moves, sizeof(float))
|
||||
is_valid = <int*>mem.alloc(self.n_moves, sizeof(int))
|
||||
|
||||
cdef StateClass state = StateClass(doc, offset=0)
|
||||
self.initialize_state(state.c)
|
||||
history = []
|
||||
while not state.is_final():
|
||||
self.set_costs(is_valid, costs, state, gold)
|
||||
for i in range(self.n_moves):
|
||||
if is_valid[i] and costs[i] <= 0:
|
||||
action = self.c[i]
|
||||
history.append(i)
|
||||
action.do(state.c, action.label)
|
||||
break
|
||||
else:
|
||||
print(gold.words)
|
||||
print(gold.ner)
|
||||
print(history)
|
||||
raise ValueError("Could not find gold move")
|
||||
return history
|
||||
|
||||
cdef int initialize_state(self, StateC* state) nogil:
|
||||
pass
|
||||
|
||||
|
@ -78,13 +96,13 @@ cdef class TransitionSystem:
|
|||
def finalize_doc(self, doc):
|
||||
pass
|
||||
|
||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
||||
def preprocess_gold(self, GoldParse gold):
|
||||
raise NotImplementedError
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
raise NotImplementedError
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||
raise NotImplementedError
|
||||
|
||||
def is_valid(self, StateClass stcls, move_name):
|
||||
|
@ -100,24 +118,76 @@ cdef class TransitionSystem:
|
|||
StateClass stcls, GoldParse gold) except -1:
|
||||
cdef int i
|
||||
self.set_valid(is_valid, stcls.c)
|
||||
cdef int n_gold = 0
|
||||
for i in range(self.n_moves):
|
||||
if is_valid[i]:
|
||||
costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
|
||||
n_gold += costs[i] <= 0
|
||||
else:
|
||||
costs[i] = 9000
|
||||
if n_gold <= 0:
|
||||
print(gold.words)
|
||||
print(gold.ner)
|
||||
print([gold.c.ner[i].clas for i in range(gold.length)])
|
||||
print([gold.c.ner[i].move for i in range(gold.length)])
|
||||
print([gold.c.ner[i].label for i in range(gold.length)])
|
||||
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
|
||||
raise ValueError(
|
||||
"Could not find a gold-standard action to supervise "
|
||||
"the entity recognizer\n"
|
||||
"The transition system has %d actions." % (self.n_moves))
|
||||
|
||||
def add_action(self, int action, label):
|
||||
if not isinstance(label, int):
|
||||
label = self.strings[label]
|
||||
def add_action(self, int action, label_name):
|
||||
cdef attr_t label_id
|
||||
if not isinstance(label_name, int):
|
||||
label_id = self.strings.add(label_name)
|
||||
else:
|
||||
label_id = label_name
|
||||
# Check we're not creating a move we already have, so that this is
|
||||
# idempotent
|
||||
for trans in self.c[:self.n_moves]:
|
||||
if trans.move == action and trans.label == label:
|
||||
if trans.move == action and trans.label == label_id:
|
||||
return 0
|
||||
if self.n_moves >= self._size:
|
||||
self._size *= 2
|
||||
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
|
||||
|
||||
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label)
|
||||
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
|
||||
assert self.c[self.n_moves].label == label_id
|
||||
self.n_moves += 1
|
||||
return 1
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
with path.open('wb') as file_:
|
||||
file_.write(self.to_bytes(**exclude))
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
with path.open('rb') as file_:
|
||||
byte_data = file_.read()
|
||||
self.from_bytes(byte_data, **exclude)
|
||||
return self
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
transitions = []
|
||||
for trans in self.c[:self.n_moves]:
|
||||
transitions.append({
|
||||
'clas': trans.clas,
|
||||
'move': trans.move,
|
||||
'label': self.strings[trans.label],
|
||||
'name': self.move_name(trans.move, trans.label)
|
||||
})
|
||||
serializers = {
|
||||
'transitions': lambda: ujson.dumps(transitions),
|
||||
'strings': lambda: self.strings.to_bytes()
|
||||
}
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
transitions = []
|
||||
deserializers = {
|
||||
'transitions': lambda b: transitions.extend(ujson.loads(b)),
|
||||
'strings': lambda b: self.strings.from_bytes(b)
|
||||
}
|
||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||
for trans in transitions:
|
||||
self.add_action(trans['move'], trans['label'])
|
||||
return self
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import ujson
|
||||
from collections import defaultdict
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
|
@ -15,7 +14,6 @@ from .tokens.doc cimport Doc
|
|||
from .attrs cimport TAG
|
||||
from .gold cimport GoldParse
|
||||
from .attrs cimport *
|
||||
from . import util
|
||||
|
||||
|
||||
cpdef enum:
|
||||
|
@ -108,55 +106,15 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
|||
|
||||
|
||||
cdef class Tagger:
|
||||
"""
|
||||
Annotate part-of-speech tags on Doc objects.
|
||||
"""
|
||||
@classmethod
|
||||
def load(cls, path, vocab, require=False):
|
||||
"""
|
||||
Load the statistical model from the supplied path.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
The path to load from.
|
||||
vocab (Vocab):
|
||||
The vocabulary. Must be shared by the documents to be processed.
|
||||
require (bool):
|
||||
Whether to raise an error if the files are not found.
|
||||
Returns (Tagger):
|
||||
The newly created object.
|
||||
"""
|
||||
# TODO: Change this to expect config.json when we don't have to
|
||||
# support old data.
|
||||
path = util.ensure_path(path)
|
||||
if (path / 'templates.json').exists():
|
||||
with (path / 'templates.json').open('r', encoding='utf8') as file_:
|
||||
templates = ujson.load(file_)
|
||||
elif require:
|
||||
raise IOError(
|
||||
"Required file %s/templates.json not found when loading Tagger" % str(path))
|
||||
else:
|
||||
templates = cls.feature_templates
|
||||
self = cls(vocab, model=None, feature_templates=templates)
|
||||
|
||||
if (path / 'model').exists():
|
||||
self.model.load(str(path / 'model'))
|
||||
elif require:
|
||||
raise IOError(
|
||||
"Required file %s/model not found when loading Tagger" % str(path))
|
||||
return self
|
||||
"""Annotate part-of-speech tags on Doc objects."""
|
||||
|
||||
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
|
||||
"""
|
||||
Create a Tagger.
|
||||
"""Create a Tagger.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
The vocabulary object. Must be shared with documents to be processed.
|
||||
model (thinc.linear.AveragedPerceptron):
|
||||
The statistical model.
|
||||
Returns (Tagger):
|
||||
The newly constructed object.
|
||||
vocab (Vocab): The vocabulary object. Must be shared with documents to
|
||||
be processed.
|
||||
model (thinc.linear.AveragedPerceptron): The statistical model.
|
||||
RETURNS (Tagger): The newly constructed object.
|
||||
"""
|
||||
if model is None:
|
||||
model = TaggerModel(cfg.get('features', self.feature_templates),
|
||||
|
@ -186,13 +144,9 @@ cdef class Tagger:
|
|||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
def __call__(self, Doc tokens):
|
||||
"""
|
||||
Apply the tagger, setting the POS tags onto the Doc object.
|
||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
||||
|
||||
Arguments:
|
||||
doc (Doc): The tokens to be tagged.
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
if tokens.length == 0:
|
||||
return 0
|
||||
|
@ -215,34 +169,25 @@ cdef class Tagger:
|
|||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
def pipe(self, stream, batch_size=1000, n_threads=2):
|
||||
"""
|
||||
Tag a stream of documents.
|
||||
"""Tag a stream of documents.
|
||||
|
||||
Arguments:
|
||||
stream: The sequence of documents to tag.
|
||||
batch_size (int):
|
||||
The number of documents to accumulate into a working set.
|
||||
n_threads (int):
|
||||
The number of threads with which to work on the buffer in parallel,
|
||||
if the Matcher implementation supports multi-threading.
|
||||
Yields:
|
||||
Doc Documents, in order.
|
||||
batch_size (int): The number of documents to accumulate into a working set.
|
||||
n_threads (int): The number of threads with which to work on the buffer
|
||||
in parallel, if the Matcher implementation supports multi-threading.
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
for doc in stream:
|
||||
self(doc)
|
||||
yield doc
|
||||
|
||||
def update(self, Doc tokens, GoldParse gold, itn=0):
|
||||
"""
|
||||
Update the statistical model, with tags supplied for the given document.
|
||||
"""Update the statistical model, with tags supplied for the given document.
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
The document to update on.
|
||||
gold (GoldParse):
|
||||
Manager for the gold-standard tags.
|
||||
Returns (int):
|
||||
Number of tags correct.
|
||||
doc (Doc): The document to update on.
|
||||
gold (GoldParse): Manager for the gold-standard tags.
|
||||
RETURNS (int): Number of tags predicted correctly.
|
||||
"""
|
||||
gold_tag_strs = gold.tags
|
||||
assert len(tokens) == len(gold_tag_strs)
|
||||
|
|
|
@ -13,21 +13,32 @@ Tests for spaCy modules and classes live in their own directories of the same na
|
|||
2. [Dos and don'ts](#dos-and-donts)
|
||||
3. [Parameters](#parameters)
|
||||
4. [Fixtures](#fixtures)
|
||||
5. [Helpers and utilities](#helpers-and-utilities)
|
||||
6. [Contributing to the tests](#contributing-to-the-tests)
|
||||
5. [Testing models](#testing-models)
|
||||
6. [Helpers and utilities](#helpers-and-utilities)
|
||||
7. [Contributing to the tests](#contributing-to-the-tests)
|
||||
|
||||
|
||||
## Running the tests
|
||||
|
||||
To show print statements, run the tests with `py.test -s`. To abort after the
|
||||
first failure, run them with `py.test -x`.
|
||||
|
||||
```bash
|
||||
py.test spacy # run basic tests
|
||||
py.test spacy --models # run basic and model tests
|
||||
py.test spacy --models --en # run basic and English model tests
|
||||
py.test spacy --models --all # run basic and all model tests
|
||||
py.test spacy --slow # run basic and slow tests
|
||||
py.test spacy --models --slow # run all tests
|
||||
py.test spacy --models --all --slow # run all tests
|
||||
```
|
||||
|
||||
To show print statements, run the tests with `py.test -s`. To abort after the first failure, run them with `py.test -x`.
|
||||
You can also run tests in a specific file or directory, or even only one
|
||||
specific test:
|
||||
|
||||
```bash
|
||||
py.test spacy/tests/tokenizer # run all tests in directory
|
||||
py.test spacy/tests/tokenizer/test_exceptions.py # run all tests in file
|
||||
py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji # run specific test
|
||||
```
|
||||
|
||||
## Dos and don'ts
|
||||
|
||||
|
@ -83,14 +94,9 @@ These are the main fixtures that are currently available:
|
|||
| Fixture | Description |
|
||||
| --- | --- |
|
||||
| `tokenizer` | Creates **all available** language tokenizers and runs the test for **each of them**. |
|
||||
| `en_tokenizer` | Creates an English `Tokenizer` object. |
|
||||
| `de_tokenizer` | Creates a German `Tokenizer` object. |
|
||||
| `hu_tokenizer` | Creates a Hungarian `Tokenizer` object. |
|
||||
| `en_vocab` | Creates an English `Vocab` object. |
|
||||
| `en_entityrecognizer` | Creates an English `EntityRecognizer` object. |
|
||||
| `lemmatizer` | Creates a `Lemmatizer` object from the installed language data (`None` if no data is found).
|
||||
| `EN` | Creates an instance of `English`. Only use for tests that require the models. |
|
||||
| `DE` | Creates an instance of `German`. Only use for tests that require the models. |
|
||||
| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. |
|
||||
| `en_vocab`, `en_entityrecognizer`, ... | Creates an instance of the English `Vocab`, `EntityRecognizer` object etc. |
|
||||
| `EN`, `DE`, ... | Creates a language class with a loaded model. For more info, see [Testing models](#testing-models). |
|
||||
| `text_file` | Creates an instance of `StringIO` to simulate reading from and writing to files. |
|
||||
| `text_file_b` | Creates an instance of `ByteIO` to simulate reading from and writing to files. |
|
||||
|
||||
|
@ -103,6 +109,48 @@ def test_module_do_something(en_tokenizer):
|
|||
|
||||
If all tests in a file require a specific configuration, or use the same complex example, it can be helpful to create a separate fixture. This fixture should be added at the top of each file. Make sure to use descriptive names for these fixtures and don't override any of the global fixtures listed above. **From looking at a test, it should immediately be clear which fixtures are used, and where they are coming from.**
|
||||
|
||||
## Testing models
|
||||
|
||||
Models should only be loaded and tested **if absolutely necessary** – for example, if you're specifically testing a model's performance, or if your test is related to model loading. If you only need an annotated `Doc`, you should use the `get_doc()` helper function to create it manually instead.
|
||||
|
||||
To specify which language models a test is related to, set the language ID as an argument of `@pytest.mark.models`. This allows you to later run the tests with `--models --en`. You can then use the `EN` [fixture](#fixtures) to get a language
|
||||
class with a loaded model.
|
||||
|
||||
```python
|
||||
@pytest.mark.models('en')
|
||||
def test_english_model(EN):
|
||||
doc = EN(u'This is a test')
|
||||
```
|
||||
|
||||
> ⚠️ **Important note:** In order to test models, they need to be installed as a packge. The [conftest.py](conftest.py) includes a list of all available models, mapped to their IDs, e.g. `en`. Unless otherwise specified, each model that's installed in your environment will be imported and tested. If you don't have a model installed, **the test will be skipped**.
|
||||
|
||||
Under the hood, `pytest.importorskip` is used to import a model package and skip the test if the package is not installed. The `EN` fixture for example gets all
|
||||
available models for `en`, [parametrizes](#parameters) them to run the test for *each of them*, and uses `load_test_model()` to import the model and run the test, or skip it if the model is not installed.
|
||||
|
||||
### Testing specific models
|
||||
|
||||
Using the `load_test_model()` helper function, you can also write tests for specific models, or combinations of them:
|
||||
|
||||
```python
|
||||
from .util import load_test_model
|
||||
|
||||
@pytest.mark.models('en')
|
||||
def test_en_md_only():
|
||||
nlp = load_test_model('en_core_web_md')
|
||||
# test something specific to en_core_web_md
|
||||
|
||||
@pytest.mark.models('en', 'fr')
|
||||
@pytest.mark.parametrize('model', ['en_core_web_md', 'fr_depvec_web_lg'])
|
||||
def test_different_models(model):
|
||||
nlp = load_test_model(model)
|
||||
# test something specific to the parametrized models
|
||||
```
|
||||
|
||||
### Known issues and future improvements
|
||||
|
||||
Using `importorskip` on a list of model packages is not ideal and we're looking to improve this in the future. But at the moment, it's the best way to ensure that tests are performed on specific model packages only, and that you'll always be able to run the tests, even if you don't have *all available models* installed. (If the tests made a call to `spacy.load('en')` instead, this would load whichever model you've created an `en` shortcut for. This may be one of spaCy's default models, but it could just as easily be your own custom English model.)
|
||||
|
||||
The current setup also doesn't provide an easy way to only run tests on specific model versions. The `minversion` keyword argument on `pytest.importorskip` can take care of this, but it currently only checks for the package's `__version__` attribute. An alternative solution would be to load a model package's meta.json and skip if the model's version does not match the one specified in the test.
|
||||
|
||||
## Helpers and utilities
|
||||
|
||||
|
@ -152,11 +200,11 @@ print([token.dep_ for token in doc])
|
|||
|
||||
**Note:** There's currently no way of setting the serializer data for the parser without loading the models. If this is relevant to your test, constructing the `Doc` via `get_doc()` won't work.
|
||||
|
||||
|
||||
### Other utilities
|
||||
|
||||
| Name | Description |
|
||||
| --- | --- |
|
||||
| `load_test_model` | Load a model if it's installed as a package, otherwise skip test. |
|
||||
| `apply_transition_sequence(parser, doc, sequence)` | Perform a series of pre-specified transitions, to put the parser in a desired state. |
|
||||
| `add_vecs_to_vocab(vocab, vectors)` | Add list of vector tuples (`[("text", [1, 2, 3])]`) to given vocab. All vectors need to have the same length. |
|
||||
| `get_cosine(vec1, vec2)` | Get cosine for two given vectors. |
|
||||
|
|
|
@ -1,25 +1,50 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..tokens import Doc
|
||||
from ..strings import StringStore
|
||||
from ..lemmatizer import Lemmatizer
|
||||
from ..attrs import ORTH, TAG, HEAD, DEP
|
||||
from .. import util
|
||||
|
||||
from io import StringIO, BytesIO
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from .util import load_test_model
|
||||
from ..tokens import Doc
|
||||
from ..strings import StringStore
|
||||
from .. import util
|
||||
|
||||
|
||||
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
|
||||
'nl', 'pl', 'pt', 'sv']
|
||||
'nl', 'pl', 'pt', 'sv', 'xx']
|
||||
_models = {'en': ['en_depent_web_sm', 'en_core_web_md'],
|
||||
'de': ['de_core_news_md'],
|
||||
'fr': ['fr_depvec_web_lg'],
|
||||
'xx': ['xx_ent_web_md']}
|
||||
|
||||
|
||||
@pytest.fixture(params=_languages)
|
||||
def tokenizer(request):
|
||||
lang = util.get_lang_class(request.param)
|
||||
return lang.Defaults.create_tokenizer()
|
||||
# only used for tests that require loading the models
|
||||
# in all other cases, use specific instances
|
||||
|
||||
@pytest.fixture(params=_models['en'])
|
||||
def EN(request):
|
||||
return load_test_model(request.param)
|
||||
|
||||
|
||||
@pytest.fixture(params=_models['de'])
|
||||
def DE(request):
|
||||
return load_test_model(request.param)
|
||||
|
||||
|
||||
@pytest.fixture(params=_models['fr'])
|
||||
def FR(request):
|
||||
return load_test_model(request.param)
|
||||
|
||||
|
||||
#@pytest.fixture(params=_languages)
|
||||
#def tokenizer(request):
|
||||
#lang = util.get_lang_class(request.param)
|
||||
#return lang.Defaults.create_tokenizer()
|
||||
|
||||
@pytest.fixture
|
||||
def tokenizer():
|
||||
return util.get_lang_class('xx').Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -47,7 +72,7 @@ def de_tokenizer():
|
|||
return util.get_lang_class('de').Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
@pytest.fixture
|
||||
def fr_tokenizer():
|
||||
return util.get_lang_class('fr').Defaults.create_tokenizer()
|
||||
|
||||
|
@ -91,11 +116,6 @@ def en_entityrecognizer():
|
|||
return util.get_lang_class('en').Defaults.create_entity()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lemmatizer():
|
||||
return util.get_lang_class('en').Defaults.create_lemmatizer()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def text_file():
|
||||
return StringIO()
|
||||
|
@ -105,22 +125,6 @@ def text_file_b():
|
|||
return BytesIO()
|
||||
|
||||
|
||||
# only used for tests that require loading the models
|
||||
# in all other cases, use specific instances
|
||||
@pytest.fixture(scope="session")
|
||||
def EN():
|
||||
return English()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def DE():
|
||||
return German()
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def FR():
|
||||
return French()
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption("--models", action="store_true",
|
||||
help="include tests that require full models")
|
||||
|
@ -129,8 +133,18 @@ def pytest_addoption(parser):
|
|||
parser.addoption("--slow", action="store_true",
|
||||
help="include slow tests")
|
||||
|
||||
for lang in _languages + ['all']:
|
||||
parser.addoption("--%s" % lang, action="store_true", help="Use %s models" % lang)
|
||||
|
||||
|
||||
def pytest_runtest_setup(item):
|
||||
for opt in ['models', 'vectors', 'slow']:
|
||||
if opt in item.keywords and not item.config.getoption("--%s" % opt):
|
||||
pytest.skip("need --%s option to run" % opt)
|
||||
|
||||
# Check if test is marked with models and has arguments set, i.e. specific
|
||||
# language. If so, skip test if flag not set.
|
||||
if item.get_marker('models'):
|
||||
for arg in item.get_marker('models').args:
|
||||
if not item.config.getoption("--%s" % arg) and not item.config.getoption("--all"):
|
||||
pytest.skip("need --%s or --all option to run" % arg)
|
||||
|
|
|
@ -102,7 +102,7 @@ def test_doc_api_getitem(en_tokenizer):
|
|||
def test_doc_api_serialize(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
new_tokens = get_doc(tokens.vocab).from_bytes(tokens.to_bytes())
|
||||
assert tokens.string == new_tokens.string
|
||||
assert tokens.text == new_tokens.text
|
||||
assert [t.text for t in tokens] == [t.text for t in new_tokens]
|
||||
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
|
||||
|
||||
|
@ -204,6 +204,7 @@ def test_doc_api_right_edge(en_tokenizer):
|
|||
assert doc[6].right_edge.text == ','
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text,vectors', [
|
||||
("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
|
||||
])
|
||||
|
|
|
@ -68,6 +68,7 @@ def test_doc_token_api_is_properties(en_vocab):
|
|||
assert doc[5].like_email
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text,vectors', [
|
||||
("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"])
|
||||
])
|
||||
|
@ -99,8 +100,8 @@ def test_doc_token_api_ancestors(en_tokenizer):
|
|||
assert [t.text for t in doc[1].ancestors] == ["saw"]
|
||||
assert [t.text for t in doc[2].ancestors] == []
|
||||
|
||||
assert doc[2].is_ancestor_of(doc[7])
|
||||
assert not doc[6].is_ancestor_of(doc[2])
|
||||
assert doc[2].is_ancestor(doc[7])
|
||||
assert not doc[6].is_ancestor(doc[2])
|
||||
|
||||
|
||||
def test_doc_token_api_head_setter(en_tokenizer):
|
||||
|
@ -155,3 +156,15 @@ def test_doc_token_api_head_setter(en_tokenizer):
|
|||
assert doc[3].left_edge.i == 0
|
||||
assert doc[4].left_edge.i == 0
|
||||
assert doc[2].left_edge.i == 0
|
||||
|
||||
|
||||
def test_sent_start(en_tokenizer):
|
||||
doc = en_tokenizer(u'This is a sentence. This is another.')
|
||||
assert not doc[0].sent_start
|
||||
assert not doc[5].sent_start
|
||||
doc[5].sent_start = True
|
||||
assert doc[5].sent_start
|
||||
assert not doc[0].sent_start
|
||||
doc.is_parsed = True
|
||||
assert len(list(doc.sents)) == 2
|
||||
|
||||
|
|
|
@ -1,72 +0,0 @@
|
|||
# coding: utf-8
|
||||
|
||||
import pytest
|
||||
import numpy
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
class TestModelSanity:
|
||||
"""
|
||||
This is to make sure the model works as expected. The tests make sure that
|
||||
values are properly set.
|
||||
Tests are not meant to evaluate the content of the output, only make sure
|
||||
the output is formally okay.
|
||||
"""
|
||||
@pytest.fixture(scope='class', params=['en','de'])
|
||||
def example(self, request, EN, DE):
|
||||
assert EN.entity != None
|
||||
assert DE.entity != None
|
||||
if request.param == 'en':
|
||||
doc = EN(u'There was a stranger standing at the big ' +
|
||||
u'street talking to herself.')
|
||||
elif request.param == 'de':
|
||||
doc = DE(u'An der großen Straße stand eine merkwürdige ' +
|
||||
u'Gestalt und führte Selbstgespräche.')
|
||||
return doc
|
||||
|
||||
def test_tokenization(self, example):
|
||||
# tokenization should split the document into tokens
|
||||
assert len(example) > 1
|
||||
|
||||
def test_tagging(self, example):
|
||||
# if tagging was done properly, pos tags shouldn't be empty
|
||||
assert example.is_tagged
|
||||
assert all( t.pos != 0 for t in example )
|
||||
assert all( t.tag != 0 for t in example )
|
||||
|
||||
def test_parsing(self, example):
|
||||
# if parsing was done properly
|
||||
# - dependency labels shouldn't be empty
|
||||
# - the head of some tokens should not be root
|
||||
assert example.is_parsed
|
||||
assert all( t.dep != 0 for t in example )
|
||||
assert any( t.dep != i for i,t in enumerate(example) )
|
||||
|
||||
def test_ner(self, example):
|
||||
# if ner was done properly, ent_iob shouldn't be empty
|
||||
assert all([t.ent_iob != 0 for t in example])
|
||||
|
||||
def test_vectors(self, example):
|
||||
# if vectors are available, they should differ on different words
|
||||
# this isn't a perfect test since this could in principle fail
|
||||
# in a sane model as well,
|
||||
# but that's very unlikely and a good indicator if something is wrong
|
||||
vector0 = example[0].vector
|
||||
vector1 = example[1].vector
|
||||
vector2 = example[2].vector
|
||||
assert not numpy.array_equal(vector0,vector1)
|
||||
assert not numpy.array_equal(vector0,vector2)
|
||||
assert not numpy.array_equal(vector1,vector2)
|
||||
|
||||
def test_probs(self, example):
|
||||
# if frequencies/probabilities are okay, they should differ for
|
||||
# different words
|
||||
# this isn't a perfect test since this could in principle fail
|
||||
# in a sane model as well,
|
||||
# but that's very unlikely and a good indicator if something is wrong
|
||||
prob0 = example[0].prob
|
||||
prob1 = example[1].prob
|
||||
prob2 = example[2].prob
|
||||
assert not prob0 == prob1
|
||||
assert not prob0 == prob2
|
||||
assert not prob1 == prob2
|
|
@ -8,20 +8,33 @@ import pytest
|
|||
|
||||
|
||||
@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
|
||||
def test_tokenizer_splits_contractions(de_tokenizer, text):
|
||||
def test_de_tokenizer_splits_contractions(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
|
||||
def test_tokenizer_handles_abbr(de_tokenizer, text):
|
||||
def test_de_tokenizer_handles_abbr(de_tokenizer, text):
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
def test_tokenizer_handles_exc_in_text(de_tokenizer):
|
||||
def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
|
||||
text = "Ich bin z.Zt. im Urlaub."
|
||||
tokens = de_tokenizer(text)
|
||||
assert len(tokens) == 6
|
||||
assert tokens[2].text == "z.Zt."
|
||||
assert tokens[2].lemma_ == "zur Zeit"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,norms', [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])])
|
||||
def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
|
||||
tokens = de_tokenizer(text)
|
||||
assert [token.norm_ for token in tokens] == norms
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text,norm', [("daß", "dass")])
|
||||
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
|
||||
tokens = de_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
|
77
spacy/tests/lang/de/test_models.py
Normal file
77
spacy/tests/lang/de/test_models.py
Normal file
|
@ -0,0 +1,77 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def example(DE):
|
||||
"""
|
||||
This is to make sure the model works as expected. The tests make sure that
|
||||
values are properly set. Tests are not meant to evaluate the content of the
|
||||
output, only make sure the output is formally okay.
|
||||
"""
|
||||
assert DE.entity != None
|
||||
return DE('An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.')
|
||||
|
||||
|
||||
@pytest.mark.models('de')
|
||||
def test_de_models_tokenization(example):
|
||||
# tokenization should split the document into tokens
|
||||
assert len(example) > 1
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.models('de')
|
||||
def test_de_models_tagging(example):
|
||||
# if tagging was done properly, pos tags shouldn't be empty
|
||||
assert example.is_tagged
|
||||
assert all(t.pos != 0 for t in example)
|
||||
assert all(t.tag != 0 for t in example)
|
||||
|
||||
|
||||
@pytest.mark.models('de')
|
||||
def test_de_models_parsing(example):
|
||||
# if parsing was done properly
|
||||
# - dependency labels shouldn't be empty
|
||||
# - the head of some tokens should not be root
|
||||
assert example.is_parsed
|
||||
assert all(t.dep != 0 for t in example)
|
||||
assert any(t.dep != i for i,t in enumerate(example))
|
||||
|
||||
|
||||
@pytest.mark.models('de')
|
||||
def test_de_models_ner(example):
|
||||
# if ner was done properly, ent_iob shouldn't be empty
|
||||
assert all([t.ent_iob != 0 for t in example])
|
||||
|
||||
|
||||
@pytest.mark.models('de')
|
||||
def test_de_models_vectors(example):
|
||||
# if vectors are available, they should differ on different words
|
||||
# this isn't a perfect test since this could in principle fail
|
||||
# in a sane model as well,
|
||||
# but that's very unlikely and a good indicator if something is wrong
|
||||
vector0 = example[0].vector
|
||||
vector1 = example[1].vector
|
||||
vector2 = example[2].vector
|
||||
assert not numpy.array_equal(vector0,vector1)
|
||||
assert not numpy.array_equal(vector0,vector2)
|
||||
assert not numpy.array_equal(vector1,vector2)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.models('de')
|
||||
def test_de_models_probs(example):
|
||||
# if frequencies/probabilities are okay, they should differ for
|
||||
# different words
|
||||
# this isn't a perfect test since this could in principle fail
|
||||
# in a sane model as well,
|
||||
# but that's very unlikely and a good indicator if something is wrong
|
||||
prob0 = example[0].prob
|
||||
prob1 = example[1].prob
|
||||
prob2 = example[2].prob
|
||||
assert not prob0 == prob1
|
||||
assert not prob0 == prob2
|
||||
assert not prob1 == prob2
|
35
spacy/tests/lang/de/test_parser.py
Normal file
35
spacy/tests/lang/de/test_parser.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...util import get_doc
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_de_parser_noun_chunks_standard_de(de_tokenizer):
|
||||
text = "Eine Tasse steht auf dem Tisch."
|
||||
heads = [1, 1, 0, -1, 1, -2, -4]
|
||||
tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.']
|
||||
deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct']
|
||||
|
||||
tokens = de_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
|
||||
chunks = list(doc.noun_chunks)
|
||||
assert len(chunks) == 2
|
||||
assert chunks[0].text_with_ws == "Eine Tasse "
|
||||
assert chunks[1].text_with_ws == "dem Tisch "
|
||||
|
||||
|
||||
def test_de_extended_chunk(de_tokenizer):
|
||||
text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
|
||||
heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
|
||||
tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.']
|
||||
deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct']
|
||||
|
||||
tokens = de_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
|
||||
chunks = list(doc.noun_chunks)
|
||||
assert len(chunks) == 3
|
||||
assert chunks[0].text_with_ws == "Die Sängerin "
|
||||
assert chunks[1].text_with_ws == "einer Tasse Kaffee "
|
||||
assert chunks[2].text_with_ws == "Arien "
|
|
@ -1,87 +0,0 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokens are created correctly for contractions."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_tokenizer_handles_basic_contraction(en_tokenizer):
|
||||
text = "don't giggle"
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[1].text == "n't"
|
||||
text = "i said don't!"
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 5
|
||||
assert tokens[4].text == "!"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
|
||||
def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
|
||||
def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
|
||||
tokens = en_tokenizer(text_poss)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text == text
|
||||
assert tokens[1].text == "'s"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
|
||||
def test_tokenizer_splits_trailing_apos(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text == text.split("'")[0]
|
||||
assert tokens[1].text == "'"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
|
||||
def text_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
assert tokens[0].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
|
||||
def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text == text.split("'")[0]
|
||||
assert tokens[1].text == "'ll"
|
||||
assert tokens[1].lemma_ == "will"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
|
||||
def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
|
||||
tokens_lower = en_tokenizer(text_lower)
|
||||
tokens_title = en_tokenizer(text_title)
|
||||
assert tokens_title[0].text == tokens_lower[0].text.title()
|
||||
assert tokens_lower[0].text == tokens_title[0].text.lower()
|
||||
assert tokens_lower[1].text == tokens_title[1].text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
|
||||
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
|
||||
def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
|
||||
tokens = en_tokenizer(pron + contraction)
|
||||
assert tokens[0].text == pron
|
||||
assert tokens[1].text == contraction
|
||||
|
||||
|
||||
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
|
||||
def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
|
||||
tokens = en_tokenizer(exc)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
|
||||
def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
|
||||
tokens = en_tokenizer(wo_punct)
|
||||
assert len(tokens) == 2
|
||||
tokens = en_tokenizer(w_punct)
|
||||
assert len(tokens) == 3
|
|
@ -1,19 +1,96 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokenizer exceptions are handled correctly."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_en_tokenizer_handles_basic_contraction(en_tokenizer):
|
||||
text = "don't giggle"
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[1].text == "n't"
|
||||
text = "i said don't!"
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 5
|
||||
assert tokens[4].text == "!"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
|
||||
def test_en_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
|
||||
def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
|
||||
tokens = en_tokenizer(text_poss)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text == text
|
||||
assert tokens[1].text == "'s"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
|
||||
def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text == text.split("'")[0]
|
||||
assert tokens[1].text == "'"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
|
||||
def text_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
assert tokens[0].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
|
||||
def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text == text.split("'")[0]
|
||||
assert tokens[1].text == "'ll"
|
||||
assert tokens[1].lemma_ == "will"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
|
||||
def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
|
||||
tokens_lower = en_tokenizer(text_lower)
|
||||
tokens_title = en_tokenizer(text_title)
|
||||
assert tokens_title[0].text == tokens_lower[0].text.title()
|
||||
assert tokens_lower[0].text == tokens_title[0].text.lower()
|
||||
assert tokens_lower[1].text == tokens_title[1].text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
|
||||
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
|
||||
def test_en_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
|
||||
tokens = en_tokenizer(pron + contraction)
|
||||
assert tokens[0].text == pron
|
||||
assert tokens[1].text == contraction
|
||||
|
||||
|
||||
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
|
||||
def test_en_tokenizer_excludes_ambiguous(en_tokenizer, exc):
|
||||
tokens = en_tokenizer(exc)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
|
||||
def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
|
||||
tokens = en_tokenizer(wo_punct)
|
||||
assert len(tokens) == 2
|
||||
tokens = en_tokenizer(w_punct)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
|
||||
def test_tokenizer_handles_abbr(en_tokenizer, text):
|
||||
def test_en_tokenizer_handles_abbr(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
def test_tokenizer_handles_exc_in_text(en_tokenizer):
|
||||
def test_en_tokenizer_handles_exc_in_text(en_tokenizer):
|
||||
text = "It's mediocre i.e. bad."
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 6
|
||||
|
@ -21,7 +98,19 @@ def test_tokenizer_handles_exc_in_text(en_tokenizer):
|
|||
|
||||
|
||||
@pytest.mark.parametrize('text', ["1am", "12a.m.", "11p.m.", "4pm"])
|
||||
def test_tokenizer_handles_times(en_tokenizer, text):
|
||||
def test_en_tokenizer_handles_times(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[1].lemma_ in ["a.m.", "p.m."]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,norms', [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])])
|
||||
def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
|
||||
tokens = en_tokenizer(text)
|
||||
assert [token.norm_ for token in tokens] == norms
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,norm', [("radicalised", "radicalized"), ("cuz", "because")])
|
||||
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[0].norm_ == norm
|
||||
|
|
|
@ -7,7 +7,7 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
def test_simple_punct(en_tokenizer):
|
||||
def test_en_simple_punct(en_tokenizer):
|
||||
text = "to walk, do foo"
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[0].idx == 0
|
||||
|
@ -17,7 +17,7 @@ def test_simple_punct(en_tokenizer):
|
|||
assert tokens[4].idx == 12
|
||||
|
||||
|
||||
def test_complex_punct(en_tokenizer):
|
||||
def test_en_complex_punct(en_tokenizer):
|
||||
text = "Tom (D., Ill.)!"
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[0].idx == 0
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user