Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"

This reverts commit c9ba3d3c2d, reversing
changes made to 92c26a35d4.
This commit is contained in:
Matthew Honnibal 2018-03-27 19:23:02 +02:00
parent 8b7a74570f
commit 1f7229f40f
67 changed files with 4799 additions and 1040 deletions


@@ -32,7 +32,7 @@ test_script:
   # Note that you must use the environment variable %PYTHON% to refer to
   # the interpreter you're using - Appveyor does not do anything special
   # to put the Python version you want to use on PATH.
-  - "%PYTHON%\\python.exe -m pytest spacy/"
+  - "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs"

 after_test:
   # This step builds your wheels.

11
.buildkite/train.yml Normal file

@@ -0,0 +1,11 @@
steps:
  -
    command: "fab env clean make test wheel"
    label: ":dizzy: :python:"
    artifact_paths: "dist/*.whl"
  - wait
  - trigger: "spacy-train-from-wheel"
    label: ":dizzy: :train:"
    build:
      env:
        SPACY_VERSION: "{$SPACY_VERSION}"


@@ -182,7 +182,7 @@ If you've made a contribution to spaCy, you should fill in the
 [spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that
 your contribution can be used across the project. If you agree to be bound by
 the terms of the agreement, fill in the [template](.github/CONTRIBUTOR_AGREEMENT.md)
-and include it with your pull request, or sumit it separately to
+and include it with your pull request, or submit it separately to
 [`.github/contributors/`](/.github/contributors). The name of the file should be
 your GitHub username, with the extension `.md`. For example, the user
 example_user would create the file `.github/contributors/example_user.md`.

392
examples/training/conllu.py Normal file

@@ -0,0 +1,392 @@
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
.conllu format for development data, allowing the official scorer to be used.
'''
from __future__ import unicode_literals
import plac
import tqdm
import attr
from pathlib import Path
import re
import sys
import json
import spacy
import spacy.util
from spacy.tokens import Token, Doc
from spacy.gold import GoldParse
from spacy.syntax.nonproj import projectivize
from collections import defaultdict, Counter
from timeit import default_timer as timer
from spacy.matcher import Matcher
import itertools
import random
import numpy.random
import cytoolz
import conll17_ud_eval
import spacy.lang.zh
import spacy.lang.ja
spacy.lang.zh.Chinese.Defaults.use_jieba = False
spacy.lang.ja.Japanese.Defaults.use_janome = False
random.seed(0)
numpy.random.seed(0)
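# Batch up (doc, gold) pairs so that each batch holds roughly `size` words in
# total: the items are shuffled, then docs are accumulated until the current
# batch's word budget is used up. `size` may be an int or an iterable yielding
# a per-batch budget.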
def minibatch_by_words(items, size=5000):
random.shuffle(items)
if isinstance(size, int):
size_ = itertools.repeat(size)
else:
size_ = size
items = iter(items)
while True:
batch_size = next(size_)
batch = []
while batch_size >= 0:
try:
doc, gold = next(items)
except StopIteration:
if batch:
yield batch
return
batch_size -= len(doc)
batch.append((doc, gold))
if batch:
yield batch
else:
break
################
# Data reading #
################
space_re = re.compile(r'\s+')
def split_text(text):
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
max_doc_length=None, limit=None):
'''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
include Doc objects created using nlp.make_doc and then aligned against
the gold-standard sequences. If oracle_segments=True, include Doc objects
created from the gold-standard segments. At least one must be True.'''
if not raw_text and not oracle_segments:
raise ValueError("At least one of raw_text or oracle_segments must be True")
paragraphs = split_text(text_file.read())
conllu = read_conllu(conllu_file)
# sd is spacy doc; cd is conllu doc
# cs is conllu sent, ct is conllu token
docs = []
golds = []
for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)):
sent_annots = []
for cs in cd:
sent = defaultdict(list)
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
if '.' in id_:
continue
if '-' in id_:
continue
id_ = int(id_)-1
head = int(head)-1 if head != '0' else id_
sent['words'].append(word)
sent['tags'].append(tag)
sent['heads'].append(head)
sent['deps'].append('ROOT' if dep == 'root' else dep)
sent['spaces'].append(space_after == '_')
sent['entities'] = ['-'] * len(sent['words'])
sent['heads'], sent['deps'] = projectivize(sent['heads'],
sent['deps'])
if oracle_segments:
docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
golds.append(GoldParse(docs[-1], **sent))
sent_annots.append(sent)
if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
doc, gold = _make_gold(nlp, None, sent_annots)
sent_annots = []
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
return docs, golds
if raw_text and sent_annots:
doc, gold = _make_gold(nlp, None, sent_annots)
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
return docs, golds
return docs, golds
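# Usage sketch (file names here are illustrative, not taken from this repo):
#     with open('en_ewt-ud-train.conllu') as conllu_file, \
#             open('en_ewt-ud-train.txt') as text_file:
#         docs, golds = read_data(nlp, conllu_file, text_file,
#                                 max_doc_length=10, limit=100)
# Each returned Doc is paired with a GoldParse carrying the projectivized
# heads/deps, tags, spaces and placeholder entity annotation.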
def read_conllu(file_):
docs = []
sent = []
doc = []
for line in file_:
if line.startswith('# newdoc'):
if doc:
docs.append(doc)
doc = []
elif line.startswith('#'):
continue
elif not line.strip():
if sent:
doc.append(sent)
sent = []
else:
sent.append(list(line.strip().split('\t')))
if len(sent[-1]) != 10:
print(repr(line))
raise ValueError
if sent:
doc.append(sent)
if doc:
docs.append(doc)
return docs
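# _make_gold flattens per-sentence annotations into one document-level
# GoldParse. Head indices are offset by the number of words already collected,
# e.g. if the first sentence has 3 words, a head index of 1 in the second
# sentence becomes 4.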
def _make_gold(nlp, text, sent_annots):
# Flatten the conll annotations, and adjust the head indices
flat = defaultdict(list)
for sent in sent_annots:
flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
flat[field].extend(sent[field])
# Construct text if necessary
assert len(flat['words']) == len(flat['spaces'])
if text is None:
text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces']))
doc = nlp.make_doc(text)
flat.pop('spaces')
gold = GoldParse(doc, **flat)
return doc, gold
#############################
# Data transforms for spaCy #
#############################
def golds_to_gold_tuples(docs, golds):
'''Get out the annoying 'tuples' format used by begin_training, given the
GoldParse objects.'''
tuples = []
for doc, gold in zip(docs, golds):
text = doc.text
ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
sents = [((ids, words, tags, heads, labels, iob), [])]
tuples.append((text, sents))
return tuples
##############
# Evaluation #
##############
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
with text_loc.open('r', encoding='utf8') as text_file:
texts = split_text(text_file.read())
docs = list(nlp.pipe(texts))
with sys_loc.open('w', encoding='utf8') as out_file:
write_conllu(docs, out_file)
with gold_loc.open('r', encoding='utf8') as gold_file:
gold_ud = conll17_ud_eval.load_conllu(gold_file)
with sys_loc.open('r', encoding='utf8') as sys_file:
sys_ud = conll17_ud_eval.load_conllu(sys_file)
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
return scores
def write_conllu(docs, file_):
merger = Matcher(docs[0].vocab)
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
for i, doc in enumerate(docs):
matches = merger(doc)
spans = [doc[start:end+1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans]
for start_char, end_char in offsets:
doc.merge(start_char, end_char)
file_.write("# newdoc id = {i}\n".format(i=i))
for j, sent in enumerate(doc.sents):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
file_.write("# text = {text}\n".format(text=sent.text))
for k, token in enumerate(sent):
file_.write(token._.get_conllu_lines(k) + '\n')
file_.write('\n')
def print_progress(itn, losses, ud_scores):
fields = {
'dep_loss': losses.get('parser', 0.0),
'tag_loss': losses.get('tagger', 0.0),
'words': ud_scores['Words'].f1 * 100,
'sents': ud_scores['Sentences'].f1 * 100,
'tags': ud_scores['XPOS'].f1 * 100,
'uas': ud_scores['UAS'].f1 * 100,
'las': ud_scores['LAS'].f1 * 100,
}
header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
if itn == 0:
print('\t'.join(header))
tpl = '\t'.join((
'{:d}',
'{dep_loss:.1f}',
'{las:.1f}',
'{uas:.1f}',
'{tags:.1f}',
'{sents:.1f}',
'{words:.1f}',
))
print(tpl.format(itn, **fields))
#def get_sent_conllu(sent, sent_id):
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
def get_token_conllu(token, i):
if token._.begins_fused:
n = 1
while token.nbor(n)._.inside_fused:
n += 1
id_ = '%d-%d' % (i, i+n)
lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']
else:
lines = []
if token.head.i == token.i:
head = 0
else:
head = i + (token.head.i - token.i) + 1
fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
str(head), token.dep_.lower(), '_', '_']
lines.append('\t'.join(fields))
return '\n'.join(lines)
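# For an ordinary (non-fused) token at position i=0 whose head is the next
# token, the line built above comes out roughly as (illustrative values only):
#     1\tThe\tthe\tDET\tDT\t_\t2\tdet\t_\t_
# Fused tokens are preceded by an extra "i-j" range line with underscore fields.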
Token.set_extension('get_conllu_lines', method=get_token_conllu)
Token.set_extension('begins_fused', default=False)
Token.set_extension('inside_fused', default=False)
##################
# Initialization #
##################
def load_nlp(corpus, config):
lang = corpus.split('_')[0]
nlp = spacy.blank(lang)
if config.vectors:
nlp.vocab.from_disk(config.vectors / 'vocab')
return nlp
def initialize_pipeline(nlp, docs, golds, config):
nlp.add_pipe(nlp.create_pipe('parser'))
if config.multitask_tag:
nlp.parser.add_multitask_objective('tag')
if config.multitask_sent:
nlp.parser.add_multitask_objective('sent_start')
nlp.parser.moves.add_action(2, 'subtok')
nlp.add_pipe(nlp.create_pipe('tagger'))
for gold in golds:
for tag in gold.tags:
if tag is not None:
nlp.tagger.add_label(tag)
# Replace labels that didn't make the frequency cutoff
actions = set(nlp.parser.labels)
label_set = set([act.split('-')[1] for act in actions if '-' in act])
for gold in golds:
for i, label in enumerate(gold.labels):
if label is not None and label not in label_set:
gold.labels[i] = label.split('||')[0]
return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
########################
# Command line helpers #
########################
@attr.s
class Config(object):
vectors = attr.ib(default=None)
max_doc_length = attr.ib(default=10)
multitask_tag = attr.ib(default=True)
multitask_sent = attr.ib(default=True)
nr_epoch = attr.ib(default=30)
batch_size = attr.ib(default=1000)
dropout = attr.ib(default=0.2)
@classmethod
def load(cls, loc):
with Path(loc).open('r', encoding='utf8') as file_:
cfg = json.load(file_)
return cls(**cfg)
class Dataset(object):
def __init__(self, path, section):
self.path = path
self.section = section
self.conllu = None
self.text = None
for file_path in self.path.iterdir():
name = file_path.parts[-1]
if section in name and name.endswith('conllu'):
self.conllu = file_path
elif section in name and name.endswith('txt'):
self.text = file_path
if self.conllu is None:
msg = "Could not find .conllu file in {path} for {section}"
raise IOError(msg.format(section=section, path=path))
if self.text is None:
msg = "Could not find .txt file in {path} for {section}"
raise IOError(msg.format(section=section, path=path))
self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
class TreebankPaths(object):
def __init__(self, ud_path, treebank, **cfg):
self.train = Dataset(ud_path / treebank, 'train')
self.dev = Dataset(ud_path / treebank, 'dev')
self.lang = self.train.lang
@plac.annotations(
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
"positional", None, str),
parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "positional", None, Config.load),
limit=("Size limit", "option", "n", int)
)
def main(ud_dir, parses_dir, config, corpus, limit=0):
paths = TreebankPaths(ud_dir, corpus)
if not (parses_dir / corpus).exists():
(parses_dir / corpus).mkdir()
print("Train and evaluate", corpus, "using lang", paths.lang)
nlp = load_nlp(paths.lang, config)
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
max_doc_length=config.max_doc_length, limit=limit)
optimizer = initialize_pipeline(nlp, docs, golds, config)
for i in range(config.nr_epoch):
docs = [nlp.make_doc(doc.text) for doc in docs]
batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
losses = {}
n_train_words = sum(len(doc) for doc in docs)
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
for batch in batches:
batch_docs, batch_gold = zip(*batch)
pbar.update(sum(len(doc) for doc in batch_docs))
nlp.update(batch_docs, batch_gold, sgd=optimizer,
drop=config.dropout, losses=losses)
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
with nlp.use_params(optimizer.averages):
scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
print_progress(i, losses, scores)
if __name__ == '__main__':
plac.call(main)
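# Command-line sketch (paths are hypothetical; plac passes positionals in the
# order of main()'s signature, i.e. ud_dir, parses_dir, config, corpus):
#     python conllu.py /data/ud-treebanks-v2.0 /tmp/parses config.json es_ancora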


@@ -1,88 +0,0 @@
#!/usr/bin/env python
# coding: utf8
"""Export spaCy model vectors for use in TensorBoard's standalone embedding projector.
https://github.com/tensorflow/embedding-projector-standalone
Usage:
python vectors_tensorboard_standalone.py ./myVectorModel ./output [name]
This outputs two files that have to be copied into the "oss_data" of the standalone projector:
[name]_labels.tsv - metadata such as human readable labels for vectors
[name]_tensors.bytes - numpy.ndarray of numpy.float32 precision vectors
"""
from __future__ import unicode_literals
import json
import math
from os import path
import numpy
import plac
import spacy
import tqdm
@plac.annotations(
vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str),
out_loc=("Path to output folder writing tensors and labels data", "positional", None, str),
name=("Human readable name for tsv file and vectors tensor", "positional", None, str),
)
def main(vectors_loc, out_loc, name="spaCy_vectors"):
# A tab-separated file that contains information about the vectors for visualization
#
# Learn more: https://www.tensorflow.org/programmers_guide/embedding#metadata
meta_file = "{}_labels.tsv".format(name)
out_meta_file = path.join(out_loc, meta_file)
print('Loading spaCy vectors model: {}'.format(vectors_loc))
model = spacy.load(vectors_loc)
print('Finding lexemes with vectors attached: {}'.format(vectors_loc))
voacb_strings = [
w for w in tqdm.tqdm(model.vocab.strings, total=len(model.vocab.strings), leave=False)
if model.vocab.has_vector(w)
]
vector_count = len(voacb_strings)
print('Building Projector labels for {} vectors: {}'.format(vector_count, out_meta_file))
vector_dimensions = model.vocab.vectors.shape[1]
tf_vectors_variable = numpy.zeros((vector_count, vector_dimensions), dtype=numpy.float32)
# Write a tab-separated file that contains information about the vectors for visualization
#
# Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata
with open(out_meta_file, 'wb') as file_metadata:
# Define columns in the first row
file_metadata.write("Text\tFrequency\n".encode('utf-8'))
# Write out a row for each vector that we add to the tensorflow variable we created
vec_index = 0
for text in tqdm.tqdm(voacb_strings, total=len(voacb_strings), leave=False):
# https://github.com/tensorflow/tensorflow/issues/9094
text = '<Space>' if text.lstrip() == '' else text
lex = model.vocab[text]
# Store vector data and metadata
tf_vectors_variable[vec_index] = numpy.float64(model.vocab.get_vector(text))
file_metadata.write("{}\t{}\n".format(text, math.exp(lex.prob) * len(voacb_strings)).encode('utf-8'))
vec_index += 1
# Write out "[name]_tensors.bytes" file for standalone embeddings projector to load
tensor_path = '{}_tensors.bytes'.format(name)
tf_vectors_variable.tofile(path.join(out_loc, tensor_path))
print('Done.')
print('Add the following entry to "oss_data/oss_demo_projector_config.json"')
print(json.dumps({
"tensorName": name,
"tensorShape": [vector_count, vector_dimensions],
"tensorPath": 'oss_data/{}'.format(tensor_path),
"metadataPath": 'oss_data/{}'.format(meta_file)
}, indent=2))
if __name__ == '__main__':
plac.call(main)

81
fabfile.py vendored

@@ -1,49 +1,92 @@
 # coding: utf-8
 from __future__ import unicode_literals, print_function

+import contextlib
+from pathlib import Path
 from fabric.api import local, lcd, env, settings, prefix
-from fabtools.python import virtualenv
 from os import path, environ
+import shutil

 PWD = path.dirname(__file__)
 ENV = environ['VENV_DIR'] if 'VENV_DIR' in environ else '.env'
-VENV_DIR = path.join(PWD, ENV)
+VENV_DIR = Path(PWD) / ENV


-def env(lang='python2.7'):
-    if path.exists(VENV_DIR):
+@contextlib.contextmanager
+def virtualenv(name, create=False, python='/usr/bin/python3.6'):
+    python = Path(python).resolve()
+    env_path = VENV_DIR
+    if create:
+        if env_path.exists():
+            shutil.rmtree(str(env_path))
+        local('{python} -m venv {env_path}'.format(python=python, env_path=VENV_DIR))
+    def wrapped_local(cmd, env_vars=[], capture=False, direct=False):
+        return local('source {}/bin/activate && {}'.format(env_path, cmd),
+                     shell='/bin/bash', capture=False)
+    yield wrapped_local
+
+
+def env(lang='python3.6'):
+    if VENV_DIR.exists():
         local('rm -rf {env}'.format(env=VENV_DIR))
-    local('pip install virtualenv')
-    local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
+    if lang.startswith('python3'):
+        local('{lang} -m venv {env}'.format(lang=lang, env=VENV_DIR))
+    else:
+        local('{lang} -m pip install virtualenv --no-cache-dir'.format(lang=lang))
+        local('{lang} -m virtualenv {env} --no-cache-dir'.format(lang=lang, env=VENV_DIR))
+    with virtualenv(VENV_DIR) as venv_local:
+        print(venv_local('python --version', capture=True))
+        venv_local('pip install --upgrade setuptools --no-cache-dir')
+        venv_local('pip install pytest --no-cache-dir')
+        venv_local('pip install wheel --no-cache-dir')
+        venv_local('pip install -r requirements.txt --no-cache-dir')
+        venv_local('pip install pex --no-cache-dir')


 def install():
-    with virtualenv(VENV_DIR):
-        local('pip install --upgrade setuptools')
-        local('pip install dist/*.tar.gz')
-        local('pip install pytest')
+    with virtualenv(VENV_DIR) as venv_local:
+        venv_local('pip install dist/*.tar.gz')


 def make():
-    with virtualenv(VENV_DIR):
-        with lcd(path.dirname(__file__)):
-            local('pip install cython')
-            local('pip install murmurhash')
-            local('pip install -r requirements.txt')
-            local('python setup.py build_ext --inplace')
+    with lcd(path.dirname(__file__)):
+        local('export PYTHONPATH=`pwd` && source .env/bin/activate && python setup.py build_ext --inplace',
+              shell='/bin/bash')


 def sdist():
-    with virtualenv(VENV_DIR):
+    with virtualenv(VENV_DIR) as venv_local:
         with lcd(path.dirname(__file__)):
             local('python setup.py sdist')


+def wheel():
+    with virtualenv(VENV_DIR) as venv_local:
+        with lcd(path.dirname(__file__)):
+            venv_local('python setup.py bdist_wheel')
+
+
+def pex():
+    with virtualenv(VENV_DIR) as venv_local:
+        with lcd(path.dirname(__file__)):
+            sha = local('git rev-parse --short HEAD', capture=True)
+            venv_local('pex dist/*.whl -e spacy -o dist/spacy-%s.pex' % sha,
+                       direct=True)
+
+
 def clean():
     with lcd(path.dirname(__file__)):
-        local('python setup.py clean --all')
+        local('rm -f dist/*.whl')
+        local('rm -f dist/*.pex')
+        with virtualenv(VENV_DIR) as venv_local:
+            venv_local('python setup.py clean --all')


 def test():
-    with virtualenv(VENV_DIR):
+    with virtualenv(VENV_DIR) as venv_local:
         with lcd(path.dirname(__file__)):
-            local('py.test -x spacy/tests')
+            venv_local('pytest -x spacy/tests')
+
+
+def train():
+    args = environ.get('SPACY_TRAIN_ARGS', '')
+    with virtualenv(VENV_DIR) as venv_local:
+        venv_local('spacy train {args}'.format(args=args))


@@ -5,8 +5,8 @@ cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
 thinc>=6.11.1.dev10,<6.12.0
 murmurhash>=0.28,<0.29
+cytoolz>=0.9.0,<0.10.0
 plac<1.0.0,>=0.9.6
-six
 ujson>=1.35
 dill>=0.2,<0.3
 requests>=2.13.0,<3.0.0
@@ -16,4 +16,3 @@ pytest>=3.0.6,<4.0.0
 mock>=2.0.0,<3.0.0
 msgpack-python==0.5.4
 msgpack-numpy==0.4.1
-html5lib==1.0b8


@@ -18,6 +18,7 @@ PACKAGES = find_packages()
 MOD_NAMES = [
+    'spacy._align',
     'spacy.parts_of_speech',
     'spacy.strings',
     'spacy.lexeme',
@@ -191,8 +192,6 @@ def setup_package():
             'preshed>=1.0.0,<2.0.0',
             'thinc>=6.11.1.dev10,<6.12.0',
             'plac<1.0.0,>=0.9.6',
-            'six',
-            'html5lib==1.0b8',
             'pathlib',
             'ujson>=1.35',
             'dill>=0.2,<0.3',
@@ -201,6 +200,7 @@ def setup_package():
             'ftfy>=4.4.2,<5.0.0',
             'msgpack-python==0.5.4',
             'msgpack-numpy==0.4.1'],
+        setup_requires=['wheel'],
         classifiers=[
             'Development Status :: 5 - Production/Stable',
             'Environment :: Console',


@@ -8,6 +8,7 @@ if __name__ == '__main__':
     import sys
     from spacy.cli import download, link, info, package, train, convert
     from spacy.cli import vocab, init_model, profile, evaluate, validate
+    from spacy.cli import ud_train, ud_evaluate
     from spacy.util import prints

     commands = {
@@ -15,7 +16,9 @@ if __name__ == '__main__':
         'link': link,
         'info': info,
         'train': train,
+        'ud-train': ud_train,
         'evaluate': evaluate,
+        'ud-evaluate': ud_evaluate,
         'convert': convert,
         'package': package,
         'vocab': vocab,

251
spacy/_align.pyx Normal file

@@ -0,0 +1,251 @@
# cython: infer_types=True
'''Do Levenshtein alignment, for evaluation of tokenized input.
Random notes:
r i n g
0 1 2 3 4
r 1 0 1 2 3
a 2 1 1 2 3
n 3 2 2 1 2
g 4 3 3 2 1
0,0: (1,1)=min(0+0,1+1,1+1)=0 S
1,0: (2,1)=min(1+1,0+1,2+1)=1 D
2,0: (3,1)=min(2+1,3+1,1+1)=2 D
3,0: (4,1)=min(3+1,4+1,2+1)=3 D
0,1: (1,2)=min(1+1,2+1,0+1)=1 D
1,1: (2,2)=min(0+1,1+1,1+1)=1 S
2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
0,2: (1,3)=min(2+1,3+1,1+1)=2 I
1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
2,2: (3,3)
3,2: (4,3)
At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
We know the costs to transition:
S[:i] -> T[:j] (at D[i,j])
S[:i+1] -> T[:j] (at D[i+1,j])
S[:i] -> T[:j+1] (at D[i,j+1])
Further, we know we can transform:
S[:i+1] -> S[:i] (DEL) for 1,
T[:j+1] -> T[:j] (INS) for 1.
S[i+1] -> T[j+1] (SUB) for 0 or 1
Therefore we have the costs:
SUB: Cost(S[:i]->T[:j]) + Cost(S[i+1]->T[j+1])
i.e. D[i, j] + S[i+1] != T[j+1]
INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
i.e. D[i+1,j] + 1
DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i])
i.e. D[i,j+1] + 1
Source string S has length m, with index i
Target string T has length n, with index j
Output two alignment vectors: i2j (length m) and j2i (length n)
# function LevenshteinDistance(char s[1..m], char t[1..n]):
# for all i and j, d[i,j] will hold the Levenshtein distance between
# the first i characters of s and the first j characters of t
# note that d has (m+1)*(n+1) values
# set each element in d to zero
ring rang
- r i n g
- 0 0 0 0 0
r 0 0 0 0 0
a 0 0 0 0 0
n 0 0 0 0 0
g 0 0 0 0 0
# source prefixes can be transformed into empty string by
# dropping all characters
# d[i, 0] := i
ring rang
- r i n g
- 0 0 0 0 0
r 1 0 0 0 0
a 2 0 0 0 0
n 3 0 0 0 0
g 4 0 0 0 0
# target prefixes can be reached from empty source prefix
# by inserting every character
# d[0, j] := j
- r i n g
- 0 1 2 3 4
r 1 0 0 0 0
a 2 0 0 0 0
n 3 0 0 0 0
g 4 0 0 0 0
'''
from __future__ import unicode_literals
from libc.stdint cimport uint32_t
import numpy
cimport numpy as np
from .compat import unicode_
from murmurhash.mrmr cimport hash32
def align(S, T):
cdef int m = len(S)
cdef int n = len(T)
cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')
cdef np.ndarray S_arr = _convert_sequence(S)
cdef np.ndarray T_arr = _convert_sequence(T)
fill_matrix(<int*>matrix.data,
<const int*>S_arr.data, m, <const int*>T_arr.data, n)
fill_i2j(i2j, matrix)
fill_j2i(j2i, matrix)
for i in range(i2j.shape[0]):
if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
i2j[i] = -1
for j in range(j2i.shape[0]):
if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
j2i[j] = -1
return matrix[-1,-1], i2j, j2i, matrix
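# Rough usage sketch (token lists are made up; requires the compiled module):
#     cost, i2j, j2i, matrix = align(['obama', "'", 's', 'podcast'],
#                                    ['obama', "'s", 'podcast'])
# i2j[i] holds the index in T aligned one-to-one with S[i], or -1 where the
# two tokenizations disagree (as around the "'s" split above); multi_align
# below recovers those many-to-one mappings.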
def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
'''Let's say we had:
Guess: [aa bb cc dd]
Truth: [aa bbcc dd]
i2j: [0, None, -2, 2]
j2i: [0, -2, 3]
We want:
i2j_multi: {1: 1, 2: 1}
j2i_multi: {}
'''
i2j_miss = _get_regions(i2j, i_lengths)
j2i_miss = _get_regions(j2i, j_lengths)
i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths)
return i2j_multi, j2i_multi
def _get_regions(alignment, lengths):
regions = {}
start = None
offset = 0
for i in range(len(alignment)):
if alignment[i] < 0:
if start is None:
start = offset
regions.setdefault(start, [])
regions[start].append(i)
else:
start = None
offset += lengths[i]
return regions
def _get_mapping(miss1, miss2, lengths1, lengths2):
i2j = {}
j2i = {}
for start, region1 in miss1.items():
if not region1 or start not in miss2:
continue
region2 = miss2[start]
if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
j = region2.pop(0)
buff = []
# Consume tokens from region 1, until we meet the length of the
# first token in region2. If we do, align the tokens. If
# we exceed the length, break.
while region1:
buff.append(region1.pop(0))
if sum(lengths1[i] for i in buff) == lengths2[j]:
for i in buff:
i2j[i] = j
j2i[j] = buff[-1]
j += 1
buff = []
elif sum(lengths1[i] for i in buff) > lengths2[j]:
break
else:
if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
for i in buff:
i2j[i] = j
j2i[j] = buff[-1]
return i2j, j2i
def _convert_sequence(seq):
if isinstance(seq, numpy.ndarray):
return numpy.ascontiguousarray(seq, dtype='uint32')
cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
cdef bytes item_bytes
for i, item in enumerate(seq):
if isinstance(item, unicode):
item_bytes = item.encode('utf8')
else:
item_bytes = item
output[i] = hash32(<void*><char*>item_bytes, len(item_bytes), 0)
return output
cdef void fill_matrix(int* D,
const int* S, int m, const int* T, int n) nogil:
m1 = m+1
n1 = n+1
for i in range(m1*n1):
D[i] = 0
for i in range(m1):
D[i*n1] = i
for j in range(n1):
D[j] = j
cdef int sub_cost, ins_cost, del_cost
for j in range(n):
for i in range(m):
i_j = i*n1 + j
i1_j1 = (i+1)*n1 + j+1
i1_j = (i+1)*n1 + j
i_j1 = i*n1 + j+1
if S[i] != T[j]:
sub_cost = D[i_j] + 1
else:
sub_cost = D[i_j]
del_cost = D[i_j1] + 1
ins_cost = D[i1_j] + 1
best = min(min(sub_cost, ins_cost), del_cost)
D[i1_j1] = best
cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
j = D.shape[1]-2
cdef int i = D.shape[0]-2
while i >= 0:
while D[i+1, j] < D[i+1, j+1]:
j -= 1
if D[i, j+1] < D[i+1, j+1]:
i2j[i] = -1
else:
i2j[i] = j
j -= 1
i -= 1
cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
i = D.shape[0]-2
cdef int j = D.shape[1]-2
while j >= 0:
while D[i, j+1] < D[i+1, j+1]:
i -= 1
if D[i+1, j] < D[i+1, j+1]:
j2i[j] = -1
else:
j2i[j] = i
i -= 1
j -= 1

251
spacy/_matcher2_notes.py Normal file

@@ -0,0 +1,251 @@
import pytest
class Vocab(object):
pass
class Doc(list):
def __init__(self, vocab, words=None):
list.__init__(self)
self.extend([Token(i, w) for i, w in enumerate(words)])
class Token(object):
def __init__(self, i, word):
self.i = i
self.text = word
def find_matches(patterns, doc):
init_states = [(pattern, 0, None) for pattern in patterns]
curr_states = []
matches = []
for token in doc:
nexts = []
for state in (curr_states + init_states):
matches, nexts = transition(state, token, matches, nexts)
curr_states = nexts
return matches
def transition(state, token, matches, nexts):
action = get_action(state, token)
is_match, keep_state, advance_state = [bool(int(c)) for c in action]
pattern, i, start = state
if start is None:
start = token.i
if is_match:
matches.append((pattern, start, token.i+1))
if advance_state:
nexts.append((pattern, i+1, start))
if keep_state:
# TODO: This needs to be zero-width :(.
nexts.append((pattern, i, start))
return (matches, nexts)
def get_action(state, token):
'''We need to consider:
a) Does the token match the specification? [Yes, No]
b) What's the quantifier? [1, 0+, ?]
c) Is this the last specification? [final, non-final]
We can transition in the following ways:
a) Do we emit a match?
b) Do we add a state with (next state, next token)?
c) Do we add a state with (next state, same token)?
d) Do we add a state with (same state, next token)?
We'll code the actions as boolean strings, so 0000 means no to all 4,
1000 means match but no states added, etc.
1:
Yes, final:
1000
Yes, non-final:
0100
No, final:
0000
No, non-final
0000
0+:
Yes, final:
1001
Yes, non-final:
0111
No, final:
1000 (note: Don't include last token!)
No, non-final:
0010
?:
Yes, final:
1000
Yes, non-final:
0100
No, final:
1000 (note: Don't include last token!)
No, non-final:
0010
Problem: If a quantifier is matching, we're adding a lot of open partials
'''
is_match = get_is_match(state, token)
operator = get_operator(state, token)
is_final = get_is_final(state, token)
raise NotImplementedError
def get_is_match(state, token):
pattern, i, start = state
is_match = token.text == pattern[i]['spec']
if pattern[i].get('invert'):
return not is_match
else:
return is_match
def get_is_final(state, token):
pattern, i, start = state
return i == len(pattern)-1
def get_operator(state, token):
pattern, i, start = state
return pattern[i].get('op', '1')
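# A state is the tuple (pattern, index_into_pattern, start_token_index_or_None);
# see find_matches/transition above. The tests below exercise get_action and
# find_matches on toy patterns.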
########################
# Tests for get_action #
########################
def test_get_action_simple_match():
pattern = [{'spec': 'a', 'op': '1'}]
doc = Doc(Vocab(), words=['a'])
state = (pattern, 0, None)
action = get_action(state, doc[0])
assert action == '100'
def test_get_action_simple_reject():
pattern = [{'spec': 'b', 'op': '1'}]
doc = Doc(Vocab(), words=['a'])
state = (pattern, 0, None)
action = get_action(state, doc[0])
assert action == '000'
def test_get_action_simple_match_match():
pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'a', 'op': '1'}]
doc = Doc(Vocab(), words=['a', 'a'])
state = (pattern, 0, None)
action = get_action(state, doc[0])
assert action == '001'
state = (pattern, 1, 0)
action = get_action(state, doc[1])
assert action == '100'
def test_get_action_simple_match_reject():
pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
doc = Doc(Vocab(), words=['a', 'a'])
state = (pattern, 0, None)
action = get_action(state, doc[0])
assert action == '001'
state = (pattern, 1, 0)
action = get_action(state, doc[1])
assert action == '000'
def test_get_action_plus_match():
pattern = [{'spec': 'a', 'op': '1+'}]
doc = Doc(Vocab(), words=['a'])
state = (pattern, 0, None)
action = get_action(state, doc[0])
assert action == '110'
def test_get_action_plus_match_match():
pattern = [{'spec': 'a', 'op': '1+'}]
doc = Doc(Vocab(), words=['a', 'a'])
state = (pattern, 0, None)
action = get_action(state, doc[0])
assert action == '110'
state = (pattern, 0, 0)
action = get_action(state, doc[1])
assert action == '110'
##########################
# Tests for find_matches #
##########################
def test_find_matches_simple_accept():
pattern = [{'spec': 'a', 'op': '1'}]
doc = Doc(Vocab(), words=['a'])
matches = find_matches([pattern], doc)
assert matches == [(pattern, 0, 1)]
def test_find_matches_simple_reject():
pattern = [{'spec': 'a', 'op': '1'}]
doc = Doc(Vocab(), words=['b'])
matches = find_matches([pattern], doc)
assert matches == []
def test_find_matches_match_twice():
pattern = [{'spec': 'a', 'op': '1'}]
doc = Doc(Vocab(), words=['a', 'a'])
matches = find_matches([pattern], doc)
assert matches == [(pattern, 0, 1), (pattern, 1, 2)]
def test_find_matches_longer_pattern():
pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
doc = Doc(Vocab(), words=['a', 'b'])
matches = find_matches([pattern], doc)
assert matches == [(pattern, 0, 2)]
def test_find_matches_two_patterns():
patterns = [[{'spec': 'a', 'op': '1'}], [{'spec': 'b', 'op': '1'}]]
doc = Doc(Vocab(), words=['a', 'b'])
matches = find_matches(patterns, doc)
assert matches == [(patterns[0], 0, 1), (patterns[1], 1, 2)]
def test_find_matches_two_patterns_overlap():
patterns = [[{'spec': 'a'}, {'spec': 'b'}],
[{'spec': 'b'}, {'spec': 'c'}]]
doc = Doc(Vocab(), words=['a', 'b', 'c'])
matches = find_matches(patterns, doc)
assert matches == [(patterns[0], 0, 2), (patterns[1], 1, 3)]
def test_find_matches_greedy():
patterns = [[{'spec': 'a', 'op': '1+'}]]
doc = Doc(Vocab(), words=['a'])
matches = find_matches(patterns, doc)
assert matches == [(patterns[0], 0, 1)]
doc = Doc(Vocab(), words=['a', 'a'])
matches = find_matches(patterns, doc)
assert matches == [(patterns[0], 0, 1), (patterns[0], 0, 2), (patterns[0], 1, 2)]
def test_find_matches_non_greedy():
patterns = [[{'spec': 'a', 'op': '0+'}, {'spec': 'b', "op": "1"}]]
doc = Doc(Vocab(), words=['b'])
matches = find_matches(patterns, doc)
assert matches == [(patterns[0], 0, 1)]


@@ -64,23 +64,6 @@ def _flatten_add_lengths(seqs, pad=0, drop=0.):
     return (X, lengths), finish_update


-@layerize
-def _logistic(X, drop=0.):
-    xp = get_array_module(X)
-    if not isinstance(X, xp.ndarray):
-        X = xp.asarray(X)
-    # Clip to range (-10, 10)
-    X = xp.minimum(X, 10., X)
-    X = xp.maximum(X, -10., X)
-    Y = 1. / (1. + xp.exp(-X))
-    def logistic_bwd(dY, sgd=None):
-        dX = dY * (Y * (1-Y))
-        return dX
-    return Y, logistic_bwd
-
-
 def _zero_init(model):
     def _zero_init_impl(self, X, y):
         self.W.fill(0)
@@ -144,8 +127,8 @@ class PrecomputableAffine(Model):
         self.nF = nF

     def begin_update(self, X, drop=0.):
-        Yf = self.ops.xp.dot(X,
-            self.W.reshape((self.nF*self.nO*self.nP, self.nI)).T)
+        Yf = self.ops.gemm(X,
+            self.W.reshape((self.nF*self.nO*self.nP, self.nI)), trans2=True)
         Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
         Yf = self._add_padding(Yf)
@@ -161,11 +144,11 @@ class PrecomputableAffine(Model):
             Wopfi = self.W.transpose((1, 2, 0, 3))
             Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
             Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI))
-            dXf = self.ops.dot(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
+            dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
             # Reuse the buffer
             dWopfi = Wopfi; dWopfi.fill(0.)
-            self.ops.xp.dot(dY.T, Xf, out=dWopfi)
+            self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
             dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
             # (o, p, f, i) --> (f, o, p, i)
             self.d_W += dWopfi.transpose((2, 0, 1, 3))
@@ -467,6 +450,7 @@ def SpacyVectors(docs, drop=0.):

 def build_text_classifier(nr_class, width=64, **cfg):
+    depth = cfg.get('depth', 2)
     nr_vector = cfg.get('nr_vector', 5000)
     pretrained_dims = cfg.get('pretrained_dims', 0)
     with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
@@ -518,7 +502,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
                 LN(Maxout(width, vectors_width))
                 >> Residual(
                     (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
-                ) ** 2, pad=2
+                ) ** depth, pad=depth
             )
             >> flatten_add_lengths
             >> ParametricAttention(width)
@@ -531,8 +515,6 @@ def build_text_classifier(nr_class, width=64, **cfg):
             _preprocess_doc
             >> LinearModel(nr_class)
         )
-        #model = linear_model >> logistic
-
         model = (
             (linear_model | cnn_model)
             >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))


@@ -9,7 +9,7 @@ __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
 __email__ = 'contact@explosion.ai'
 __license__ = 'MIT'
-__release__ = True
+__release__ = False
 __docs_models__ = 'https://spacy.io/usage/models'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'


@@ -131,7 +131,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
         'NumValue', 'PartType', 'Polite', 'StyleVariant',
         'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
         'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
-        'Polarity', 'Animacy'  # U20
+        'Polarity', 'PrepCase', 'Animacy'  # U20
     ]
     for key in morph_keys:
         if key in stringy_attrs:


@@ -9,3 +9,5 @@ from .convert import convert
 from .vocab import make_vocab as vocab
 from .init_model import init_model
 from .validate import validate
+from .ud_train import main as ud_train
+from .conll17_ud_eval import main as ud_evaluate


@@ -0,0 +1,571 @@
#!/usr/bin/env python
# CoNLL 2017 UD Parsing evaluation script.
#
# Compatible with Python 2.7 and 3.2+, can be used either as a module
# or a standalone executable.
#
# Copyright 2017 Institute of Formal and Applied Linguistics (UFAL),
# Faculty of Mathematics and Physics, Charles University, Czech Republic.
#
# Changelog:
# - [02 Jan 2017] Version 0.9: Initial release
# - [25 Jan 2017] Version 0.9.1: Fix bug in LCS alignment computation
# - [10 Mar 2017] Version 1.0: Add documentation and test
# Compare HEADs correctly using aligned words
# Allow evaluation with erroneous spaces in forms
# Compare forms in LCS case insensitively
# Detect cycles and multiple root nodes
# Compute AlignedAccuracy
# Command line usage
# ------------------
# conll17_ud_eval.py [-v] [-w weights_file] gold_conllu_file system_conllu_file
#
# - if no -v is given, only the CoNLL17 UD Shared Task evaluation LAS metric
# is printed
# - if -v is given, several metrics are printed (as precision, recall, F1 score,
# and in case the metric is computed on aligned words also accuracy on these):
# - Tokens: how well do the gold tokens match system tokens
# - Sentences: how well do the gold sentences match system sentences
# - Words: how well can the gold words be aligned to system words
# - UPOS: using aligned words, how well does UPOS match
# - XPOS: using aligned words, how well does XPOS match
# - Feats: using aligned words, how well does FEATS match
# - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match
# - Lemmas: using aligned words, how well does LEMMA match
# - UAS: using aligned words, how well does HEAD match
# - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match
# - if weights_file is given (with lines containing deprel-weight pairs),
# one more metric is shown:
# - WeightedLAS: as LAS, but each deprel (ignoring subtypes) has different weight
# API usage
# ---------
# - load_conllu(file)
# - loads CoNLL-U file from given file object to an internal representation
# - the file object should return str on both Python 2 and Python 3
# - raises UDError exception if the given file cannot be loaded
# - evaluate(gold_ud, system_ud)
# - evaluate the given gold and system CoNLL-U files (loaded with load_conllu)
# - raises UDError if the concatenated tokens of gold and system file do not match
# - returns a dictionary with the metrics described above, each metrics having
# three fields: precision, recall and f1
# Description of token matching
# -----------------------------
# In order to match tokens of gold file and system file, we consider the text
# resulting from concatenation of gold tokens and text resulting from
# concatenation of system tokens. These texts should match -- if they do not,
# the evaluation fails.
#
# If the texts do match, every token is represented as a range in this original
# text, and tokens are equal only if their range is the same.
# Description of word matching
# ----------------------------
# When matching words of gold file and system file, we first match the tokens.
# The words which are also tokens are matched as tokens, but words in multi-word
# tokens have to be handled differently.
#
# To handle multi-word tokens, we start by finding "multi-word spans".
# Multi-word span is a span in the original text such that
# - it contains at least one multi-word token
# - all multi-word tokens in the span (considering both gold and system ones)
# are completely inside the span (i.e., they do not "stick out")
# - the multi-word span is as small as possible
#
# For every multi-word span, we align the gold and system words completely
# inside this span using LCS on their FORMs. The words not intersecting
# (even partially) any multi-word span are then aligned as tokens.
from __future__ import division
from __future__ import print_function
import argparse
import io
import sys
import unittest
# CoNLL-U column names
ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)
# UD Error is used when raising exceptions in this module
class UDError(Exception):
pass
# Load given CoNLL-U file into internal representation
def load_conllu(file):
# Internal representation classes
class UDRepresentation:
def __init__(self):
# Characters of all the tokens in the whole file.
# Whitespace between tokens is not included.
self.characters = []
# List of UDSpan instances with start&end indices into `characters`.
self.tokens = []
# List of UDWord instances.
self.words = []
# List of UDSpan instances with start&end indices into `characters`.
self.sentences = []
class UDSpan:
def __init__(self, start, end, characters):
self.start = start
# Note that self.end marks the first position **after the end** of span,
# so we can use characters[start:end] or range(start, end).
self.end = end
self.characters = characters
@property
def text(self):
return ''.join(self.characters[self.start:self.end])
def __str__(self):
return self.text
def __repr__(self):
return self.text
class UDWord:
def __init__(self, span, columns, is_multiword):
# Span of this word (or MWT, see below) within ud_representation.characters.
self.span = span
# 10 columns of the CoNLL-U file: ID, FORM, LEMMA,...
self.columns = columns
# is_multiword==True means that this word is part of a multi-word token.
# In that case, self.span marks the span of the whole multi-word token.
self.is_multiword = is_multiword
# Reference to the UDWord instance representing the HEAD (or None if root).
self.parent = None
# Let's ignore language-specific deprel subtypes.
self.columns[DEPREL] = columns[DEPREL].split(':')[0]
ud = UDRepresentation()
# Load the CoNLL-U file
index, sentence_start = 0, None
linenum = 0
while True:
line = file.readline()
linenum += 1
if not line:
break
line = line.rstrip("\r\n")
# Handle sentence start boundaries
if sentence_start is None:
# Skip comments
if line.startswith("#"):
continue
# Start a new sentence
ud.sentences.append(UDSpan(index, 0, ud.characters))
sentence_start = len(ud.words)
if not line:
# Add parent UDWord links and check there are no cycles
def process_word(word):
if word.parent == "remapping":
raise UDError("There is a cycle in a sentence")
if word.parent is None:
head = int(word.columns[HEAD])
if head > len(ud.words) - sentence_start:
raise UDError("Line {}: HEAD '{}' points outside of the sentence".format(
linenum, word.columns[HEAD]))
if head:
parent = ud.words[sentence_start + head - 1]
word.parent = "remapping"
process_word(parent)
word.parent = parent
for word in ud.words[sentence_start:]:
process_word(word)
# Check there is a single root node
if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
raise UDError("There are multiple roots in a sentence")
# End the sentence
ud.sentences[-1].end = index
sentence_start = None
continue
# Read next token/word
columns = line.split("\t")
if len(columns) != 10:
raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, line))
# Skip empty nodes
if "." in columns[ID]:
continue
# Delete spaces from FORM so gold.characters == system.characters
# even if one of them tokenizes the space.
columns[FORM] = columns[FORM].replace(" ", "")
if not columns[FORM]:
raise UDError("There is an empty FORM in the CoNLL-U file -- line %d" % linenum)
# Save token
ud.characters.extend(columns[FORM])
ud.tokens.append(UDSpan(index, index + len(columns[FORM]), ud.characters))
index += len(columns[FORM])
# Handle multi-word tokens to save word(s)
if "-" in columns[ID]:
try:
start, end = map(int, columns[ID].split("-"))
except:
raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
for _ in range(start, end + 1):
word_line = file.readline().rstrip("\r\n")
word_columns = word_line.split("\t")
if len(word_columns) != 10:
print(columns)
raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, word_line))
ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
# Basic tokens/words
else:
try:
word_id = int(columns[ID])
except:
raise UDError("Cannot parse word ID '{}'".format(columns[ID]))
if word_id != len(ud.words) - sentence_start + 1:
raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1))
try:
head_id = int(columns[HEAD])
except:
raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD]))
if head_id < 0:
raise UDError("HEAD cannot be negative")
ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))
if sentence_start is not None:
raise UDError("The CoNLL-U file does not end with empty line")
return ud
# Evaluate the gold and system treebanks (loaded using load_conllu).
def evaluate(gold_ud, system_ud, deprel_weights=None):
class Score:
def __init__(self, gold_total, system_total, correct, aligned_total=None):
self.precision = correct / system_total if system_total else 0.0
self.recall = correct / gold_total if gold_total else 0.0
self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0
self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total
class AlignmentWord:
def __init__(self, gold_word, system_word):
self.gold_word = gold_word
self.system_word = system_word
self.gold_parent = None
self.system_parent_gold_aligned = None
class Alignment:
def __init__(self, gold_words, system_words):
self.gold_words = gold_words
self.system_words = system_words
self.matched_words = []
self.matched_words_map = {}
def append_aligned_words(self, gold_word, system_word):
self.matched_words.append(AlignmentWord(gold_word, system_word))
self.matched_words_map[system_word] = gold_word
def fill_parents(self):
# We represent root parents in both gold and system data by '0'.
# For gold data, we represent non-root parent by corresponding gold word.
# For system data, we represent non-root parent by either gold word aligned
# to parent system nodes, or by None if no gold word is aligned to the parent.
for words in self.matched_words:
words.gold_parent = words.gold_word.parent if words.gold_word.parent is not None else 0
words.system_parent_gold_aligned = self.matched_words_map.get(words.system_word.parent, None) \
if words.system_word.parent is not None else 0
def lower(text):
if sys.version_info < (3, 0) and isinstance(text, str):
return text.decode("utf-8").lower()
return text.lower()
def spans_score(gold_spans, system_spans):
correct, gi, si = 0, 0, 0
while gi < len(gold_spans) and si < len(system_spans):
if system_spans[si].start < gold_spans[gi].start:
si += 1
elif gold_spans[gi].start < system_spans[si].start:
gi += 1
else:
correct += gold_spans[gi].end == system_spans[si].end
si += 1
gi += 1
return Score(len(gold_spans), len(system_spans), correct)
def alignment_score(alignment, key_fn, weight_fn=lambda w: 1):
gold, system, aligned, correct = 0, 0, 0, 0
for word in alignment.gold_words:
gold += weight_fn(word)
for word in alignment.system_words:
system += weight_fn(word)
for words in alignment.matched_words:
aligned += weight_fn(words.gold_word)
if key_fn is None:
# Return score for whole aligned words
return Score(gold, system, aligned)
for words in alignment.matched_words:
if key_fn(words.gold_word, words.gold_parent) == key_fn(words.system_word, words.system_parent_gold_aligned):
correct += weight_fn(words.gold_word)
return Score(gold, system, correct, aligned)
def beyond_end(words, i, multiword_span_end):
if i >= len(words):
return True
if words[i].is_multiword:
return words[i].span.start >= multiword_span_end
return words[i].span.end > multiword_span_end
def extend_end(word, multiword_span_end):
if word.is_multiword and word.span.end > multiword_span_end:
return word.span.end
return multiword_span_end
def find_multiword_span(gold_words, system_words, gi, si):
# We know gold_words[gi].is_multiword or system_words[si].is_multiword.
# Find the start of the multiword span (gs, ss), so the multiword span is minimal.
# Initialize multiword_span_end characters index.
if gold_words[gi].is_multiword:
multiword_span_end = gold_words[gi].span.end
if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start:
si += 1
else: # if system_words[si].is_multiword
multiword_span_end = system_words[si].span.end
if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start:
gi += 1
gs, ss = gi, si
# Find the end of the multiword span
# (so both gi and si are pointing to the word following the multiword span end).
while not beyond_end(gold_words, gi, multiword_span_end) or \
not beyond_end(system_words, si, multiword_span_end):
if gi < len(gold_words) and (si >= len(system_words) or
gold_words[gi].span.start <= system_words[si].span.start):
multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
gi += 1
else:
multiword_span_end = extend_end(system_words[si], multiword_span_end)
si += 1
return gs, ss, gi, si
def compute_lcs(gold_words, system_words, gi, si, gs, ss):
lcs = [[0] * (si - ss) for i in range(gi - gs)]
for g in reversed(range(gi - gs)):
for s in reversed(range(si - ss)):
if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
return lcs
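# compute_lcs fills the standard longest-common-subsequence DP table over the
# case-folded FORM columns of the two word ranges; align_words below walks the
# table to pair up words inside each multi-word span.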
def align_words(gold_words, system_words):
alignment = Alignment(gold_words, system_words)
gi, si = 0, 0
while gi < len(gold_words) and si < len(system_words):
if gold_words[gi].is_multiword or system_words[si].is_multiword:
# A: Multi-word tokens => align via LCS within the whole "multiword span".
gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)
if si > ss and gi > gs:
lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)
# Store aligned words
s, g = 0, 0
while g < gi - gs and s < si - ss:
if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
g += 1
s += 1
elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0):
g += 1
else:
s += 1
else:
# B: No multi-word token => align according to spans.
if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end):
alignment.append_aligned_words(gold_words[gi], system_words[si])
gi += 1
si += 1
elif gold_words[gi].span.start <= system_words[si].span.start:
gi += 1
else:
si += 1
alignment.fill_parents()
return alignment
# Check that underlying character sequences do match
if gold_ud.characters != system_ud.characters:
index = 0
while gold_ud.characters[index] == system_ud.characters[index]:
index += 1
raise UDError(
"The concatenation of tokens in gold file and in system file differ!\n" +
"First 20 differing characters in gold file: '{}' and system file: '{}'".format(
"".join(gold_ud.characters[index:index + 20]),
"".join(system_ud.characters[index:index + 20])
)
)
# Align words
alignment = align_words(gold_ud.words, system_ud.words)
# Compute the F1-scores
result = {
"Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
"Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
"Words": alignment_score(alignment, None),
"UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]),
"XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]),
"Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
"AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
"Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
"UAS": alignment_score(alignment, lambda w, parent: parent),
"LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])),
}
# Add WeightedLAS if weights are given
if deprel_weights is not None:
def weighted_las(word):
return deprel_weights.get(word.columns[DEPREL], 1.0)
result["WeightedLAS"] = alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL]), weighted_las)
return result
def load_deprel_weights(weights_file):
if weights_file is None:
return None
deprel_weights = {}
for line in weights_file:
# Ignore comments and empty lines
if line.startswith("#") or not line.strip():
continue
columns = line.rstrip("\r\n").split()
if len(columns) != 2:
raise ValueError("Expected two columns in the UD Relations weights file on line '{}'".format(line))
deprel_weights[columns[0]] = float(columns[1])
return deprel_weights
def load_conllu_file(path):
_file = open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}))
return load_conllu(_file)
def evaluate_wrapper(args):
# Load CoNLL-U files
gold_ud = load_conllu_file(args.gold_file)
system_ud = load_conllu_file(args.system_file)
# Load weights if requested
deprel_weights = load_deprel_weights(args.weights)
return evaluate(gold_ud, system_ud, deprel_weights)
def main():
# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("gold_file", type=str,
help="Name of the CoNLL-U file with the gold data.")
parser.add_argument("system_file", type=str,
help="Name of the CoNLL-U file with the predicted data.")
parser.add_argument("--weights", "-w", type=argparse.FileType("r"), default=None,
metavar="deprel_weights_file",
help="Compute WeightedLAS using given weights for Universal Dependency Relations.")
parser.add_argument("--verbose", "-v", default=0, action="count",
help="Print all metrics.")
args = parser.parse_args()
# Use verbose if weights are supplied
if args.weights is not None and not args.verbose:
args.verbose = 1
# Evaluate
evaluation = evaluate_wrapper(args)
# Print the evaluation
if not args.verbose:
print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
else:
metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS"]
if args.weights is not None:
metrics.append("WeightedLAS")
print("Metrics | Precision | Recall | F1 Score | AligndAcc")
print("-----------+-----------+-----------+-----------+-----------")
for metric in metrics:
print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
metric,
100 * evaluation[metric].precision,
100 * evaluation[metric].recall,
100 * evaluation[metric].f1,
"{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else ""
))
if __name__ == "__main__":
main()
# Tests, which can be executed with `python -m unittest conll17_ud_eval`.
class TestAlignment(unittest.TestCase):
@staticmethod
def _load_words(words):
"""Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors."""
lines, num_words = [], 0
for w in words:
parts = w.split(" ")
if len(parts) == 1:
num_words += 1
lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1)))
else:
lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0]))
for part in parts[1:]:
num_words += 1
lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1)))
return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"])))
def _test_exception(self, gold, system):
self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system))
def _test_ok(self, gold, system, correct):
metrics = evaluate(self._load_words(gold), self._load_words(system))
gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold))
system_words = sum((max(1, len(word.split(" ")) - 1) for word in system))
self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1),
(correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words)))
def test_exception(self):
self._test_exception(["a"], ["b"])
def test_equal(self):
self._test_ok(["a"], ["a"], 1)
self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3)
def test_equal_with_multiword(self):
self._test_ok(["abc a b c"], ["a", "b", "c"], 3)
self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4)
self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4)
self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5)
def test_alignment(self):
self._test_ok(["abcd"], ["a", "b", "c", "d"], 0)
self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1)
self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2)
self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2)
self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4)
self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2)
self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)


@ -8,8 +8,8 @@ from thinc.neural._classes.model import Model
from timeit import default_timer as timer from timeit import default_timer as timer
from ..attrs import PROB, IS_OOV, CLUSTER, LANG from ..attrs import PROB, IS_OOV, CLUSTER, LANG
from ..gold import GoldCorpus, minibatch from ..gold import GoldCorpus
from ..util import prints from ..util import prints, minibatch, minibatch_by_words
from .. import util from .. import util
from .. import about from .. import about
from .. import displacy from .. import displacy
@ -51,8 +51,6 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
train_path = util.ensure_path(train_data) train_path = util.ensure_path(train_data)
dev_path = util.ensure_path(dev_data) dev_path = util.ensure_path(dev_data)
meta_path = util.ensure_path(meta_path) meta_path = util.ensure_path(meta_path)
if not output_path.exists():
output_path.mkdir()
if not train_path.exists(): if not train_path.exists():
prints(train_path, title="Training data not found", exits=1) prints(train_path, title="Training data not found", exits=1)
if dev_path and not dev_path.exists(): if dev_path and not dev_path.exists():
@ -66,6 +64,13 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
meta.setdefault('lang', lang) meta.setdefault('lang', lang)
meta.setdefault('name', 'unnamed') meta.setdefault('name', 'unnamed')
if not output_path.exists():
output_path.mkdir()
print("Counting training words (limit=%s" % n_sents)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()
print(n_train_words)
pipeline = ['tagger', 'parser', 'ner'] pipeline = ['tagger', 'parser', 'ner']
if no_tagger and 'tagger' in pipeline: if no_tagger and 'tagger' in pipeline:
pipeline.remove('tagger') pipeline.remove('tagger')
@ -81,13 +86,9 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2), dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
util.env_opt('dropout_to', 0.2), util.env_opt('dropout_to', 0.2),
util.env_opt('dropout_decay', 0.0)) util.env_opt('dropout_decay', 0.0))
batch_sizes = util.compounding(util.env_opt('batch_from', 1), batch_sizes = util.compounding(util.env_opt('batch_from', 1000),
util.env_opt('batch_to', 16), util.env_opt('batch_to', 1000),
util.env_opt('batch_compound', 1.001)) util.env_opt('batch_compound', 1.001))
max_doc_len = util.env_opt('max_doc_len', 5000)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()
lang_class = util.get_lang_class(lang) lang_class = util.get_lang_class(lang)
nlp = lang_class() nlp = lang_class()
meta['pipeline'] = pipeline meta['pipeline'] = pipeline
@ -105,6 +106,7 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
lex.is_oov = False lex.is_oov = False
for name in pipeline: for name in pipeline:
nlp.add_pipe(nlp.create_pipe(name), name=name) nlp.add_pipe(nlp.create_pipe(name), name=name)
nlp.add_pipe(nlp.create_pipe('merge_subtokens'))
if parser_multitasks: if parser_multitasks:
for objective in parser_multitasks.split(','): for objective in parser_multitasks.split(','):
nlp.parser.add_multitask_objective(objective) nlp.parser.add_multitask_objective(objective)
@ -116,21 +118,20 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
try: try:
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
gold_preproc=gold_preproc, max_length=0)
train_docs = list(train_docs)
for i in range(n_iter): for i in range(n_iter):
train_docs = corpus.train_docs(nlp, noise_level=0.0,
gold_preproc=gold_preproc, max_length=0)
words_seen = 0
with tqdm.tqdm(total=n_train_words, leave=False) as pbar: with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
losses = {} losses = {}
for batch in minibatch(train_docs, size=batch_sizes): for batch in minibatch_by_words(train_docs, size=batch_sizes):
batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
if not batch: if not batch:
continue continue
docs, golds = zip(*batch) docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer, nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses) drop=next(dropout_rates), losses=losses)
pbar.update(sum(len(doc) for doc in docs)) pbar.update(sum(len(doc) for doc in docs))
words_seen += sum(len(doc) for doc in docs)
with nlp.use_params(optimizer.averages): with nlp.use_params(optimizer.averages):
util.set_env_log(False) util.set_env_log(False)
epoch_model_path = output_path / ('model%d' % i) epoch_model_path = output_path / ('model%d' % i)
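The hunk above swaps the plain size-based `minibatch` for `util.minibatch_by_words`, so each update sees a roughly constant number of words rather than a constant number of documents. As an illustration only (this is not spaCy's implementation), batching by word count can be sketched as:

def batch_by_words(pairs, max_words=5000):
    # Illustrative sketch: group (doc, gold) pairs so each batch stays under
    # a total word budget. spaCy's util.minibatch_by_words differs in detail.
    batch, n_words = [], 0
    for doc, gold in pairs:
        if batch and n_words + len(doc) > max_words:
            yield batch
            batch, n_words = [], 0
        batch.append((doc, gold))
        n_words += len(doc)
    if batch:
        yield batch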


@ -1,7 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import six
import ftfy import ftfy
import sys import sys
import ujson import ujson
@ -47,8 +46,9 @@ is_windows = sys.platform.startswith('win')
is_linux = sys.platform.startswith('linux') is_linux = sys.platform.startswith('linux')
is_osx = sys.platform == 'darwin' is_osx = sys.platform == 'darwin'
is_python2 = six.PY2 # See: https://github.com/benjaminp/six/blob/master/six.py
is_python3 = six.PY3 is_python2 = sys.version_info[0] == 2
is_python3 = sys.version_info[0] == 3
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5) is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)
if is_python2: if is_python2:


@ -3,16 +3,25 @@
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
import re import re
import ujson
import random import random
import cytoolz import cytoolz
import itertools import itertools
import numpy
import tempfile
import shutil
from pathlib import Path
import msgpack
import ujson
from . import _align
from .syntax import nonproj from .syntax import nonproj
from .tokens import Doc from .tokens import Doc
from . import util from . import util
from .util import minibatch from .util import minibatch, itershuffle
from .compat import json_dumps
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
def tags_to_entities(tags): def tags_to_entities(tags):
entities = [] entities = []
@ -59,196 +68,62 @@ def merge_sents(sents):
return [(m_deps, m_brackets)] return [(m_deps, m_brackets)]
def align(cand_words, gold_words):
cost, edit_path = _min_edit_path(cand_words, gold_words)
alignment = []
i_of_gold = 0
for move in edit_path:
if move == 'M':
alignment.append(i_of_gold)
i_of_gold += 1
elif move == 'S':
alignment.append(None)
i_of_gold += 1
elif move == 'D':
alignment.append(None)
elif move == 'I':
i_of_gold += 1
else:
raise Exception(move)
return alignment
punct_re = re.compile(r'\W') punct_re = re.compile(r'\W')
def align(cand_words, gold_words):
def _min_edit_path(cand_words, gold_words):
cdef:
Pool mem
int i, j, n_cand, n_gold
int* curr_costs
int* prev_costs
# TODO: Fix this --- just do it properly, make the full edit matrix and
# then walk back over it...
# Preprocess inputs
cand_words = [punct_re.sub('', w).lower() for w in cand_words]
gold_words = [punct_re.sub('', w).lower() for w in gold_words]
if cand_words == gold_words: if cand_words == gold_words:
return 0, ''.join(['M' for _ in gold_words]) alignment = numpy.arange(len(cand_words))
mem = Pool() return 0, alignment, alignment, {}, {}
n_cand = len(cand_words) cand_words = [w.replace(' ', '') for w in cand_words]
n_gold = len(gold_words) gold_words = [w.replace(' ', '') for w in gold_words]
# Levenshtein distance, except we need the history, and we may want cost, i2j, j2i, matrix = _align.align(cand_words, gold_words)
# different costs. Mark operations with a string, and score the history i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in cand_words],
# using _edit_cost. [len(w) for w in gold_words])
previous_row = [] for i, j in list(i2j_multi.items()):
prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int)) if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int)) i2j[i] = j
for i in range(n_gold + 1): i2j_multi.pop(i)
cell = '' for j, i in list(j2i_multi.items()):
for j in range(i): if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
cell += 'I' j2i[j] = i
previous_row.append('I' * i) j2i_multi.pop(j)
prev_costs[i] = i return cost, i2j, j2i, i2j_multi, j2i_multi
for i, cand in enumerate(cand_words):
current_row = ['D' * (i + 1)]
curr_costs[0] = i+1
for j, gold in enumerate(gold_words):
if gold.lower() == cand.lower():
s_cost = prev_costs[j]
i_cost = curr_costs[j] + 1
d_cost = prev_costs[j + 1] + 1
else:
s_cost = prev_costs[j] + 1
i_cost = curr_costs[j] + 1
d_cost = prev_costs[j + 1] + (1 if cand else 0)
if s_cost <= i_cost and s_cost <= d_cost:
best_cost = s_cost
best_hist = previous_row[j] + ('M' if gold == cand else 'S')
elif i_cost <= s_cost and i_cost <= d_cost:
best_cost = i_cost
best_hist = current_row[j] + 'I'
else:
best_cost = d_cost
best_hist = previous_row[j + 1] + 'D'
current_row.append(best_hist)
curr_costs[j+1] = best_cost
previous_row = current_row
for j in range(len(gold_words) + 1):
prev_costs[j] = curr_costs[j]
curr_costs[j] = 0
return prev_costs[n_gold], previous_row[-1]
class GoldCorpus(object): class GoldCorpus(object):
"""An annotated corpus, using the JSON file format. Manages """An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER.""" annotations for tagging, dependency parsing and NER."""
def __init__(self, train_path, dev_path, gold_preproc=True, limit=None): def __init__(self, train, dev, gold_preproc=False, limit=None):
"""Create a GoldCorpus. """Create a GoldCorpus.
train_path (unicode or Path): File or directory of training data. train_path (unicode or Path): File or directory of training data.
dev_path (unicode or Path): File or directory of development data. dev_path (unicode or Path): File or directory of development data.
RETURNS (GoldCorpus): The newly created object. RETURNS (GoldCorpus): The newly created object.
""" """
self.train_path = util.ensure_path(train_path)
self.dev_path = util.ensure_path(dev_path)
self.limit = limit self.limit = limit
self.train_locs = self.walk_corpus(self.train_path) if isinstance(train, str) or isinstance(train, Path):
self.dev_locs = self.walk_corpus(self.dev_path) train = self.read_tuples(self.walk_corpus(train))
dev = self.read_tuples(self.walk_corpus(dev))
@property # Write temp directory with one doc per file, so we can shuffle
def train_tuples(self): # and stream
i = 0 self.tmp_dir = Path(tempfile.mkdtemp())
for loc in self.train_locs: self.write_msgpack(self.tmp_dir / 'train', train)
gold_tuples = read_json_file(loc) self.write_msgpack(self.tmp_dir / 'dev', dev)
for item in gold_tuples:
yield item
i += len(item[1])
if self.limit and i >= self.limit:
break
@property def __del__(self):
def dev_tuples(self): shutil.rmtree(self.tmp_dir)
i = 0
for loc in self.dev_locs:
gold_tuples = read_json_file(loc)
for item in gold_tuples:
yield item
i += len(item[1])
if self.limit and i >= self.limit:
break
def count_train(self): @staticmethod
n = 0 def write_msgpack(directory, doc_tuples):
i = 0 if not directory.exists():
for raw_text, paragraph_tuples in self.train_tuples: directory.mkdir()
n += sum([len(s[0][1]) for s in paragraph_tuples]) for i, doc_tuple in enumerate(doc_tuples):
if self.limit and i >= self.limit: with open(directory / '{}.msg'.format(i), 'wb') as file_:
break msgpack.dump([doc_tuple], file_, use_bin_type=True, encoding='utf8')
i += len(paragraph_tuples)
return n
def train_docs(self, nlp, gold_preproc=False,
projectivize=False, max_length=None,
noise_level=0.0):
train_tuples = self.train_tuples
if projectivize:
train_tuples = nonproj.preprocess_training_data(
self.train_tuples, label_freq_cutoff=100)
random.shuffle(train_tuples)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
max_length=max_length,
noise_level=noise_level)
yield from gold_docs
def dev_docs(self, nlp, gold_preproc=False):
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
yield from gold_docs
@classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
noise_level=0.0):
for raw_text, paragraph_tuples in tuples:
if gold_preproc:
raw_text = None
else:
paragraph_tuples = merge_sents(paragraph_tuples)
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
gold_preproc, noise_level=noise_level)
golds = cls._make_golds(docs, paragraph_tuples)
for doc, gold in zip(docs, golds):
if (not max_length) or len(doc) < max_length:
yield doc, gold
@classmethod
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
noise_level=0.0):
if raw_text is not None:
raw_text = add_noise(raw_text, noise_level)
return [nlp.make_doc(raw_text)]
else:
return [Doc(nlp.vocab,
words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples]
@classmethod
def _make_golds(cls, docs, paragraph_tuples):
assert len(docs) == len(paragraph_tuples)
if len(docs) == 1:
return [GoldParse.from_annot_tuples(docs[0],
paragraph_tuples[0][0])]
else:
return [GoldParse.from_annot_tuples(doc, sent_tuples)
for doc, (sent_tuples, brackets)
in zip(docs, paragraph_tuples)]
@staticmethod @staticmethod
def walk_corpus(path): def walk_corpus(path):
path = util.ensure_path(path)
if not path.is_dir(): if not path.is_dir():
return [path] return [path]
paths = [path] paths = [path]
@ -266,6 +141,101 @@ class GoldCorpus(object):
locs.append(path) locs.append(path)
return locs return locs
@staticmethod
def read_tuples(locs, limit=0):
i = 0
for loc in locs:
loc = util.ensure_path(loc)
if loc.parts[-1].endswith('json'):
gold_tuples = read_json_file(loc)
elif loc.parts[-1].endswith('msg'):
with loc.open('rb') as file_:
gold_tuples = msgpack.load(file_, encoding='utf8')
else:
msg = "Cannot read from file: %s. Supported formats: .json, .msg"
raise ValueError(msg % loc)
for item in gold_tuples:
yield item
i += len(item[1])
if limit and i >= limit:
break
@property
def dev_tuples(self):
locs = (self.tmp_dir / 'dev').iterdir()
yield from self.read_tuples(locs, limit=self.limit)
@property
def train_tuples(self):
locs = (self.tmp_dir / 'train').iterdir()
yield from self.read_tuples(locs, limit=self.limit)
def count_train(self):
n = 0
i = 0
for raw_text, paragraph_tuples in self.train_tuples:
for sent_tuples, brackets in paragraph_tuples:
n += len(sent_tuples[1])
if self.limit and i >= self.limit:
break
i += len(paragraph_tuples)
return n
def train_docs(self, nlp, gold_preproc=False, max_length=None,
noise_level=0.0):
locs = list((self.tmp_dir / 'train').iterdir())
random.shuffle(locs)
train_tuples = self.read_tuples(locs, limit=self.limit)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
max_length=max_length,
noise_level=noise_level,
make_projective=True)
yield from gold_docs
def dev_docs(self, nlp, gold_preproc=False):
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples,
gold_preproc=gold_preproc)
yield from gold_docs
@classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
noise_level=0.0, make_projective=False):
for raw_text, paragraph_tuples in tuples:
if gold_preproc:
raw_text = None
else:
paragraph_tuples = merge_sents(paragraph_tuples)
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
gold_preproc, noise_level=noise_level)
golds = cls._make_golds(docs, paragraph_tuples, make_projective)
for doc, gold in zip(docs, golds):
if (not max_length) or len(doc) < max_length:
yield doc, gold
@classmethod
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
noise_level=0.0):
if raw_text is not None:
raw_text = add_noise(raw_text, noise_level)
return [nlp.make_doc(raw_text)]
else:
return [Doc(nlp.vocab,
words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples]
@classmethod
def _make_golds(cls, docs, paragraph_tuples, make_projective):
assert len(docs) == len(paragraph_tuples)
if len(docs) == 1:
return [GoldParse.from_annot_tuples(docs[0],
paragraph_tuples[0][0],
make_projective=make_projective)]
else:
return [GoldParse.from_annot_tuples(doc, sent_tuples,
make_projective=make_projective)
for doc, (sent_tuples, brackets)
in zip(docs, paragraph_tuples)]
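Taken together with the `spacy/cli/train.py` hunks earlier in this commit, the reworked class is consumed roughly as in the following sketch (paths are placeholders; the corpus caches the tuples as msgpack under a temp directory and streams them back each epoch):

from spacy.lang.en import English
from spacy.gold import GoldCorpus

nlp = English()
corpus = GoldCorpus('train.json', 'dev.json')   # placeholder paths
n_train_words = corpus.count_train()
for doc, gold in corpus.train_docs(nlp, gold_preproc=False, max_length=0):
    pass  # each (doc, gold) pair would be fed to nlp.update() during training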
def add_noise(orig, noise_level): def add_noise(orig, noise_level):
if random.random() >= noise_level: if random.random() >= noise_level:
@ -297,11 +267,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
for filename in loc.iterdir(): for filename in loc.iterdir():
yield from read_json_file(loc / filename, limit=limit) yield from read_json_file(loc / filename, limit=limit)
else: else:
with loc.open('r', encoding='utf8') as file_: for doc in _json_iterate(loc):
docs = ujson.load(file_)
if limit is not None:
docs = docs[:limit]
for doc in docs:
if docs_filter is not None and not docs_filter(doc): if docs_filter is not None and not docs_filter(doc):
continue continue
paragraphs = [] paragraphs = []
@ -331,6 +297,56 @@ def read_json_file(loc, docs_filter=None, limit=None):
yield [paragraph.get('raw', None), sents] yield [paragraph.get('raw', None), sents]
def _json_iterate(loc):
# We should've made these files jsonl...But since we didn't, parse out
# the docs one-by-one to reduce memory usage.
# It's okay to read in the whole file -- just don't parse it into JSON.
cdef bytes py_raw
loc = util.ensure_path(loc)
with loc.open('rb') as file_:
py_raw = file_.read()
raw = <char*>py_raw
cdef int square_depth = 0
cdef int curly_depth = 0
cdef int inside_string = 0
cdef int escape = 0
cdef int start = -1
cdef char c
cdef char quote = ord('"')
cdef char backslash = ord('\\')
cdef char open_square = ord('[')
cdef char close_square = ord(']')
cdef char open_curly = ord('{')
cdef char close_curly = ord('}')
for i in range(len(py_raw)):
c = raw[i]
if c == backslash:
escape = True
continue
if escape:
escape = False
continue
if c == quote:
inside_string = not inside_string
continue
if inside_string:
continue
if c == open_square:
square_depth += 1
elif c == close_square:
square_depth -= 1
elif c == open_curly:
if square_depth == 1 and curly_depth == 0:
start = i
curly_depth += 1
elif c == close_curly:
curly_depth -= 1
if square_depth == 1 and curly_depth == 0:
py_str = py_raw[start : i+1].decode('utf8')
yield ujson.loads(py_str)
start = -1
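The Cython helper above avoids loading the whole training file as one JSON object: it scans the raw text, tracks bracket depth and string state, and yields one document dict at a time. A plain-Python sketch of the same idea (simplified, and assuming the file holds a single top-level JSON array of objects):

import io
import ujson  # the stdlib json module would work just as well here

def json_iterate_py(path):
    # Sketch of the bracket-depth scan in _json_iterate above; not the real helper.
    with io.open(path, encoding='utf8') as file_:
        text = file_.read()
    square_depth = curly_depth = 0
    inside_string = escape = False
    start = -1
    for i, char in enumerate(text):
        if escape:
            escape = False
        elif char == '\\':
            escape = True
        elif char == '"':
            inside_string = not inside_string
        elif inside_string:
            continue
        elif char == '[':
            square_depth += 1
        elif char == ']':
            square_depth -= 1
        elif char == '{':
            if square_depth == 1 and curly_depth == 0:
                start = i
            curly_depth += 1
        elif char == '}':
            curly_depth -= 1
            if square_depth == 1 and curly_depth == 0:
                yield ujson.loads(text[start:i + 1])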
def iob_to_biluo(tags): def iob_to_biluo(tags):
out = [] out = []
curr_label = None curr_label = None
@ -434,8 +450,21 @@ cdef class GoldParse:
self.labels = [None] * len(doc) self.labels = [None] * len(doc)
self.ner = [None] * len(doc) self.ner = [None] * len(doc)
self.cand_to_gold = align([t.orth_ for t in doc], words) # This needs to be done before we align the words
self.gold_to_cand = align(words, [t.orth_ for t in doc]) if make_projective and heads is not None and deps is not None:
heads, deps = nonproj.projectivize(heads, deps)
# Do many-to-one alignment for misaligned tokens.
# If we over-segment, we'll have one gold word that covers a sequence
# of predicted words
# If we under-segment, we'll have one predicted word that covers a
# sequence of gold words.
# If we "mis-segment", we'll have a sequence of predicted words covering
# a sequence of gold words. That's many-to-many -- we don't do that.
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
annot_tuples = (range(len(words)), words, tags, heads, deps, entities) annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
self.orig_annot = list(zip(*annot_tuples)) self.orig_annot = list(zip(*annot_tuples))
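The alignment call above returns integer arrays, with -1 marking unaligned positions, plus the `*_multi` dicts for many-to-one cases; the two list comprehensions then convert the one-to-one part into None-padded lists. A toy illustration with invented values, for the case where the tokenizer over-segments one gold token:

# Invented example: candidate tokens ["do", "n't"] vs. gold token ["don't"].
i2j = [-1, -1]            # neither candidate token has a one-to-one partner...
i2j_multi = {0: 0, 1: 0}  # ...but both map many-to-one onto gold token 0
cand_to_gold = [(j if j >= 0 else None) for j in i2j]
assert cand_to_gold == [None, None]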
@ -443,12 +472,47 @@ cdef class GoldParse:
for i, gold_i in enumerate(self.cand_to_gold): for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace(): if doc[i].text.isspace():
self.words[i] = doc[i].text self.words[i] = doc[i].text
self.tags[i] = 'SP' self.tags[i] = '_SP'
self.heads[i] = None self.heads[i] = None
self.labels[i] = None self.labels[i] = None
self.ner[i] = 'O' self.ner[i] = 'O'
if gold_i is None: if gold_i is None:
pass if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
is_first = i2j_multi[i] != i2j_multi.get(i-1)
# Set next word in multi-token span as head, until last
if not is_last:
self.heads[i] = i+1
self.labels[i] = 'subtok'
else:
self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
self.labels[i] = deps[i2j_multi[i]]
                    # Now set NER...This is annoying because if we've got
                    # an entity word split into two, we need to adjust the
# BILOU tags. We can't have BB or LL etc.
# Case 1: O -- easy.
ner_tag = entities[i2j_multi[i]]
if ner_tag == 'O':
self.ner[i] = 'O'
# Case 2: U. This has to become a B I* L sequence.
elif ner_tag.startswith('U-'):
if is_first:
self.ner[i] = ner_tag.replace('U-', 'B-', 1)
elif is_last:
self.ner[i] = ner_tag.replace('U-', 'L-', 1)
else:
self.ner[i] = ner_tag.replace('U-', 'I-', 1)
# Case 3: L. If not last, change to I.
elif ner_tag.startswith('L-'):
if is_last:
self.ner[i] = ner_tag
else:
self.ner[i] = ner_tag.replace('L-', 'I-', 1)
# Case 4: I. Stays correct
elif ner_tag.startswith('I-'):
self.ner[i] = ner_tag
else: else:
self.words[i] = words[gold_i] self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i] self.tags[i] = tags[gold_i]
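The BILOU bookkeeping above is easy to lose track of in the middle of the alignment loop; the following standalone function is a distilled restatement of those cases, for illustration only (tags not covered by the cases above are returned unchanged here):

def retag_split_entity(ner_tag, is_first, is_last):
    # Mirrors cases 1-4 above for a token that is part of a split-up gold word.
    if ner_tag == 'O' or ner_tag.startswith('I-'):
        return ner_tag
    if ner_tag.startswith('U-'):
        if is_first:
            return ner_tag.replace('U-', 'B-', 1)
        return ner_tag.replace('U-', 'L-', 1) if is_last else ner_tag.replace('U-', 'I-', 1)
    if ner_tag.startswith('L-') and not is_last:
        return ner_tag.replace('L-', 'I-', 1)
    return ner_tag

assert retag_split_entity('U-PERSON', is_first=True, is_last=False) == 'B-PERSON'
assert retag_split_entity('U-PERSON', is_first=False, is_last=True) == 'L-PERSON'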
@ -463,10 +527,6 @@ cdef class GoldParse:
if cycle is not None: if cycle is not None:
raise Exception("Cycle found: %s" % cycle) raise Exception("Cycle found: %s" % cycle)
if make_projective:
proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
self.heads = proj_heads
def __len__(self): def __len__(self):
"""Get the number of gold-standard tokens. """Get the number of gold-standard tokens.


@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
must my myself must my myself
name namely neither never nevertheless next nine no nobody none noone nor not name namely neither never nevertheless next nine no nobody none noone nor not
nothing now nowhere nothing now nowhere n't
of off often on once one only onto or other others otherwise our ours ourselves of off often on once one only onto or other others otherwise our ours ourselves
out over own out over own
@ -66,4 +66,6 @@ whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves yet you your yours yourself yourselves
'd 'll 'm 're 's 've
""".split()) """.split())


@ -6,17 +6,19 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX
def noun_chunks(obj): def noun_chunks(obj):
doc = obj.doc doc = obj.doc
np_label = doc.vocab.strings['NP'] if not len(doc):
return
np_label = doc.vocab.strings.add('NP')
left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed'] left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
right_labels = ['flat', 'fixed', 'compound', 'neg'] right_labels = ['flat', 'fixed', 'compound', 'neg']
stop_labels = ['punct'] stop_labels = ['punct']
np_left_deps = [doc.vocab.strings[label] for label in left_labels] np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
np_right_deps = [doc.vocab.strings[label] for label in right_labels] np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
stop_deps = [doc.vocab.strings[label] for label in stop_labels] stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
token = doc[0] token = doc[0]
while token and token.i < len(doc): while token and token.i < len(doc):
if token.pos in [PROPN, NOUN, PRON]: if token.pos in [PROPN, NOUN, PRON]:
left, right = noun_bounds(token) left, right = noun_bounds(doc, token, np_left_deps, np_right_deps, stop_deps)
yield left.i, right.i+1, np_label yield left.i, right.i+1, np_label
token = right token = right
token = next_token(token) token = next_token(token)
@ -33,7 +35,7 @@ def next_token(token):
return None return None
def noun_bounds(root): def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
left_bound = root left_bound = root
for token in reversed(list(root.lefts)): for token in reversed(list(root.lefts)):
if token.dep in np_left_deps: if token.dep in np_left_deps:
@ -41,7 +43,7 @@ def noun_bounds(root):
right_bound = root right_bound = root
for token in root.rights: for token in root.rights:
if (token.dep in np_right_deps): if (token.dep in np_right_deps):
left, right = noun_bounds(token) left, right = noun_bounds(doc, token, np_left_deps, np_right_deps, stop_deps)
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps, if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
doc[left_bound.i: right.i])): doc[left_bound.i: right.i])):
break break

15
spacy/lang/fi/examples.py Normal file

@ -0,0 +1,15 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.fi.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple harkitsee ostavansa startup-yrityksen UK:sta 1 miljardilla dollarilla.",
"Itseajavat autot siirtävät vakuutusriskin valmistajille.",
"San Francisco harkitsee jakelurobottien kieltämistä jalkakäytävillä.",
"Lontoo on iso kaupunki Iso-Britanniassa."
]


@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals
# import the symbols for the attrs you want to overwrite
from ...attrs import LIKE_NUM
# check if token resembles a number
_num_words = ['nolla', 'yksi', 'kaksi', 'kolme', 'neljä', 'viisi', 'kuusi', 'seitsemän', 'kahdeksan', 'yhdeksän', 'kymmenen', 'yksitoista', 'kaksitoista', 'kolmetoista', 'neljätoista', 'viisitoista', 'kuusitoista', 'seitsemäntoista', 'kahdeksantoista', 'yhdeksäntoista', 'kaksikymmentä', 'kolmekymmentä', 'neljäkymmentä', 'viisikymmentä', 'kuusikymmentä', 'seitsemänkymmentä', 'kahdeksankymmentä', 'yhdeksänkymmentä', 'sata', 'tuhat', 'miljoona', 'miljardi', 'triljoona']
def like_num(text):
text = text.replace('.', '').replace(',', '')
if text.isdigit():
return True
if text.count('/') == 1:
num, denom = text.split('/')
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
return True
return False
LEX_ATTRS = {
LIKE_NUM: like_num
}
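A quick sanity check of the helper above (illustrative; in spaCy the function is only reached through the LEX_ATTRS hook):

assert like_num('7')
assert like_num('3/4')
assert like_num('viisi')          # "five"
assert not like_num('Helsinki')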


@ -79,7 +79,7 @@ pienestä pieni pienin poikki puolesta puolestaan päälle
runsaasti runsaasti
saakka sama samaa samaan samalla saman samat samoin sata sataa satojen se saakka sama samaa samaan samalla saman samat samoin satojen se
seitsemän sekä sen seuraavat siellä sieltä siihen siinä siis siitä sijaan siksi seitsemän sekä sen seuraavat siellä sieltä siihen siinä siis siitä sijaan siksi
sille silloin sillä silti siltä sinne sinua sinulla sinulle sinulta sinun sille silloin sillä silti siltä sinne sinua sinulla sinulle sinulta sinun
sinussa sinusta sinut sinuun sinä sisäkkäin sisällä siten sitten sitä ssa sta sinussa sinusta sinut sinuun sinä sisäkkäin sisällä siten sitten sitä ssa sta
@ -89,7 +89,7 @@ taa taas taemmas tahansa tai takaa takaisin takana takia tallä tapauksessa
tarpeeksi tavalla tavoitteena te teidän teidät teihin teille teillä teiltä tarpeeksi tavalla tavoitteena te teidän teidät teihin teille teillä teiltä
teissä teistä teitä tietysti todella toinen toisaalla toisaalle toisaalta teissä teistä teitä tietysti todella toinen toisaalla toisaalle toisaalta
toiseen toiseksi toisella toiselle toiselta toisemme toisen toisensa toisessa toiseen toiseksi toisella toiselle toiselta toisemme toisen toisensa toisessa
toisesta toista toistaiseksi toki tosin tuhannen tuhat tule tulee tulemme tulen toisesta toista toistaiseksi toki tosin tule tulee tulemme tulen
tulet tulette tulevat tulimme tulin tulisi tulisimme tulisin tulisit tulisitte tulet tulette tulevat tulimme tulin tulisi tulisimme tulisin tulisit tulisitte
tulisivat tulit tulitte tulivat tulla tulleet tullut tuntuu tuo tuohon tuoksi tulisivat tulit tulitte tulivat tulla tulleet tullut tuntuu tuo tuohon tuoksi
tuolla tuolle tuolloin tuolta tuon tuona tuonne tuossa tuosta tuota tuskin tykö tuolla tuolle tuolloin tuolta tuon tuona tuonne tuossa tuosta tuota tuskin tykö


@ -35,14 +35,32 @@ class JapaneseTokenizer(object):
def from_disk(self, path, **exclude): def from_disk(self, path, **exclude):
return self return self
class JapaneseCharacterSegmenter(object):
def __init__(self, vocab):
self.vocab = vocab
def __call__(self, text):
words = []
spaces = []
doc = self.tokenizer(text)
for token in self.tokenizer(text):
words.extend(list(token.text))
spaces.extend([False]*len(token.text))
spaces[-1] = bool(token.whitespace_)
return Doc(self.vocab, words=words, spaces=spaces)
class JapaneseDefaults(Language.Defaults): class JapaneseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'ja' lex_attr_getters[LANG] = lambda text: 'ja'
use_janome = True
@classmethod @classmethod
def create_tokenizer(cls, nlp=None): def create_tokenizer(cls, nlp=None):
if cls.use_janome:
return JapaneseTokenizer(cls, nlp) return JapaneseTokenizer(cls, nlp)
else:
return JapaneseCharacterSegmenter(cls, nlp.vocab)
class Japanese(Language): class Japanese(Language):


@ -144,7 +144,7 @@ def is_lower(string): return string.islower()
def is_space(string): return string.isspace() def is_space(string): return string.isspace()
def is_title(string): return string.istitle() def is_title(string): return string.istitle()
def is_upper(string): return string.isupper() def is_upper(string): return string.isupper()
def is_stop(string, stops=set()): return string in stops def is_stop(string, stops=set()): return string.lower() in stops
def is_oov(string): return True def is_oov(string): return True
def get_prob(string): return -20. def get_prob(string): return -20.


@ -2,6 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..tokenizer_exceptions import BASE_EXCEPTIONS
@ -17,6 +18,7 @@ class PolishDefaults(Language.Defaults):
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS stop_words = STOP_WORDS
tag_map = TAG_MAP
class Polish(Language): class Polish(Language):

1628
spacy/lang/pl/tag_map.py Normal file

File diff suppressed because it is too large


@ -1,7 +1,7 @@
# encoding: utf8 # encoding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN, ADP from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN
_exc = {} _exc = {}
@ -12,24 +12,11 @@ for exc_data in [
{ORTH: "mgr.", LEMMA: "magister", POS: NOUN}, {ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV}, {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
{ORTH: "tj.", LEMMA: "to jest", POS: ADV}, {ORTH: "tj.", LEMMA: "to jest", POS: ADV},
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}, {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
{ORTH: "adw.", LEMMA: "adwokat", POS: NOUN},
{ORTH: "afr.", LEMMA: "afrykański", POS: ADJ},
{ORTH: "c.b.d.o.", LEMMA: "co było do okazania", POS: ADV},
{ORTH: "cbdu.", LEMMA: "co było do udowodnienia", POS: ADV},
{ORTH: "mn.w.", LEMMA: "mniej więcej", POS: ADV},
{ORTH: "nt.", LEMMA: "na temat", POS: ADP},
{ORTH: "ok.", LEMMA: "około"},
{ORTH: "n.p.u.", LEMMA: "na psa urok"},
{ORTH: "ww.", LEMMA: "wyżej wymieniony", POS: ADV}]:
_exc[exc_data[ORTH]] = [exc_data] _exc[exc_data[ORTH]] = [exc_data]
for orth in [ for orth in [
"w.", "r.", "br.", "bm.", "b.r.", "amer.", "am.", "bdb.", "św.", "p.", "lit.", "w.", "r."]:
"wym.", "czyt.", "daw.", "d.", "zob.", "gw.", "dn.", "dyr.", "im.", "mł.",
"min.", "dot.", "muz.", "k.k.", "k.p.a.", "k.p.c.", "n.p.m.", "p.p.m.", "nb.",
"ob.", "n.e.", "p.n.e.", "zw.", "zool.", "zach.", "żarg.", "żart.", "wzgl.",
"wyj.", "xx.", "ks.", "x.", "wyd.", "wsch.", "o.o."]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]


@ -24,5 +24,5 @@ TAG_MAP = {
"ADJ": {POS: ADJ}, "ADJ": {POS: ADJ},
"VERB": {POS: VERB}, "VERB": {POS: VERB},
"PART": {POS: PART}, "PART": {POS: PART},
"SP": {POS: SPACE} "_SP": {POS: SPACE}
} }

19
spacy/lang/vi/__init__.py Normal file

@ -0,0 +1,19 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
class VietnameseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'vi' # for pickling
class Vietnamese(Language):
lang = 'vi'
Defaults = VietnameseDefaults # override defaults
__all__ = ['Vietnamese']


@ -9,6 +9,7 @@ from ...tokens import Doc
class ChineseDefaults(Language.Defaults): class ChineseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'zh' # for pickling lex_attr_getters[LANG] = lambda text: 'zh' # for pickling
use_jieba = True
class Chinese(Language): class Chinese(Language):
@ -16,14 +17,25 @@ class Chinese(Language):
Defaults = ChineseDefaults # override defaults Defaults = ChineseDefaults # override defaults
def make_doc(self, text): def make_doc(self, text):
if self.Defaults.use_jieba:
try: try:
import jieba import jieba
except ImportError: except ImportError:
raise ImportError("The Chinese tokenizer requires the Jieba library: " msg = ("Jieba not installed. Either set Chinese.use_jieba = False, "
"https://github.com/fxsjy/jieba") "or install it https://github.com/fxsjy/jieba")
raise ImportError(msg)
words = list(jieba.cut(text, cut_all=False)) words = list(jieba.cut(text, cut_all=False))
words = [x for x in words if x] words = [x for x in words if x]
return Doc(self.vocab, words=words, spaces=[False]*len(words)) return Doc(self.vocab, words=words, spaces=[False]*len(words))
else:
words = []
spaces = []
doc = self.tokenizer(text)
for token in self.tokenizer(text):
words.extend(list(token.text))
spaces.extend([False]*len(token.text))
spaces[-1] = bool(token.whitespace_)
return Doc(self.vocab, words=words, spaces=spaces)
__all__ = ['Chinese'] __all__ = ['Chinese']


@ -17,7 +17,7 @@ from .vocab import Vocab
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
from .pipeline import merge_noun_chunks, merge_entities from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
from .compat import json_dumps, izip, basestring_ from .compat import json_dumps, izip, basestring_
from .gold import GoldParse from .gold import GoldParse
from .scorer import Scorer from .scorer import Scorer
@ -108,7 +108,8 @@ class Language(object):
'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg), 'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg), 'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks, 'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks,
'merge_entities': lambda nlp, **cfg: merge_entities 'merge_entities': lambda nlp, **cfg: merge_entities,
'merge_subtokens': lambda nlp, **cfg: merge_subtokens,
} }
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs): def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):


@ -1,7 +1,7 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from .symbols import POS, NOUN, VERB, ADJ, PUNCT from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
@ -27,11 +27,13 @@ class Lemmatizer(object):
univ_pos = 'adj' univ_pos = 'adj'
elif univ_pos in (PUNCT, 'PUNCT', 'punct'): elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
univ_pos = 'punct' univ_pos = 'punct'
elif univ_pos in (PROPN, 'PROPN'):
return [string]
else: else:
return list(set([string.lower()])) return [string.lower()]
        # See Issue #435 for example of where this logic is required. # See Issue #435 for example of where this logic is required.
if self.is_base_form(univ_pos, morphology): if self.is_base_form(univ_pos, morphology):
return list(set([string.lower()])) return [string.lower()]
lemmas = lemmatize(string, self.index.get(univ_pos, {}), lemmas = lemmatize(string, self.index.get(univ_pos, {}),
self.exc.get(univ_pos, {}), self.exc.get(univ_pos, {}),
self.rules.get(univ_pos, [])) self.rules.get(univ_pos, []))
@ -88,6 +90,7 @@ class Lemmatizer(object):
def lemmatize(string, index, exceptions, rules): def lemmatize(string, index, exceptions, rules):
orig = string
string = string.lower() string = string.lower()
forms = [] forms = []
forms.extend(exceptions.get(string, [])) forms.extend(exceptions.get(string, []))
@ -105,5 +108,5 @@ def lemmatize(string, index, exceptions, rules):
if not forms: if not forms:
forms.extend(oov_forms) forms.extend(oov_forms)
if not forms: if not forms:
forms.append(string) forms.append(orig)
return list(set(forms)) return list(set(forms))


@ -1,24 +1,19 @@
# cython: profile=True
# cython: infer_types=True # cython: infer_types=True
# coding: utf8 # cython: profile=True
from __future__ import unicode_literals from __future__ import unicode_literals
import ujson
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libcpp.pair cimport pair from libc.stdint cimport int32_t, uint64_t, uint16_t
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from libc.stdint cimport int32_t from .typedefs cimport attr_t, hash_t
from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .structs cimport TokenC from .structs cimport TokenC
from .tokens.doc cimport Doc, get_token_attr from .lexeme cimport attr_id_t
from .vocab cimport Vocab from .vocab cimport Vocab
from .tokens.doc cimport Doc
from .tokens.doc cimport get_token_attr
from .attrs cimport ID, attr_id_t, NULL_ATTR
from .attrs import IDS from .attrs import IDS
from .attrs cimport attr_id_t, ID, NULL_ATTR
from .attrs import FLAG61 as U_ENT from .attrs import FLAG61 as U_ENT
from .attrs import FLAG60 as B2_ENT from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT from .attrs import FLAG59 as B3_ENT
@ -48,29 +43,24 @@ from .attrs import FLAG36 as L9_ENT
from .attrs import FLAG35 as L10_ENT from .attrs import FLAG35 as L10_ENT
cpdef enum quantifier_t: cdef enum action_t:
_META REJECT = 0000
ONE MATCH = 1000
ADVANCE = 0100
RETRY = 0010
RETRY_EXTEND = 0011
MATCH_EXTEND = 1001
MATCH_REJECT = 2000
cdef enum quantifier_t:
ZERO ZERO
ZERO_ONE ZERO_ONE
ZERO_PLUS ZERO_PLUS
ONE
ONE_PLUS
cdef enum action_t:
REJECT
ADVANCE
REPEAT
ACCEPT
ADVANCE_ZERO
ACCEPT_PREV
PANIC
# A "match expression" conists of one or more token patterns
# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
# A state is an (int, pattern pointer) pair, where the int is the start
# position, and the pattern pointer shows where we're up to
# in the pattern.
cdef struct AttrValueC: cdef struct AttrValueC:
attr_id_t attr attr_id_t attr
attr_t value attr_t value
@ -80,10 +70,231 @@ cdef struct TokenPatternC:
AttrValueC* attrs AttrValueC* attrs
int32_t nr_attr int32_t nr_attr
quantifier_t quantifier quantifier_t quantifier
hash_t key
ctypedef TokenPatternC* TokenPatternC_ptr cdef struct ActionC:
ctypedef pair[int, TokenPatternC_ptr] StateC char emit_match
char next_state_next_token
char next_state_same_token
char same_state_next_token
cdef struct PatternStateC:
TokenPatternC* pattern
int32_t start
int32_t length
cdef struct MatchC:
attr_t pattern_id
int32_t start
int32_t length
cdef find_matches(TokenPatternC** patterns, int n, Doc doc):
cdef vector[PatternStateC] states
cdef vector[MatchC] matches
cdef PatternStateC state
cdef Pool mem = Pool()
# TODO: Prefill this with the extra attribute values.
extra_attrs = <attr_t**>mem.alloc(len(doc), sizeof(attr_t*))
# Main loop
cdef int i, j
for i in range(doc.length):
for j in range(n):
states.push_back(PatternStateC(patterns[j], i, 0))
transition_states(states, matches, &doc.c[i], extra_attrs[i])
# Handle matches that end in 0-width patterns
finish_states(matches, states)
return [(matches[i].pattern_id, matches[i].start, matches[i].start+matches[i].length)
for i in range(matches.size())]
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
const TokenC* token, const attr_t* extra_attrs) except *:
cdef int q = 0
cdef vector[PatternStateC] new_states
for i in range(states.size()):
action = get_action(states[i], token, extra_attrs)
if action == REJECT:
continue
state = states[i]
states[q] = state
while action in (RETRY, RETRY_EXTEND):
if action == RETRY_EXTEND:
new_states.push_back(
PatternStateC(pattern=state.pattern, start=state.start,
length=state.length+1))
states[q].pattern += 1
action = get_action(states[q], token, extra_attrs)
if action == REJECT:
pass
elif action == ADVANCE:
states[q].pattern += 1
states[q].length += 1
q += 1
else:
ent_id = state.pattern[1].attrs.value
if action == MATCH:
matches.push_back(
MatchC(pattern_id=ent_id, start=state.start,
length=state.length+1))
elif action == MATCH_REJECT:
matches.push_back(
MatchC(pattern_id=ent_id, start=state.start,
length=state.length))
elif action == MATCH_EXTEND:
matches.push_back(
MatchC(pattern_id=ent_id, start=state.start,
length=state.length))
states[q].length += 1
q += 1
states.resize(q)
for i in range(new_states.size()):
states.push_back(new_states[i])
cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *:
'''Handle states that end in zero-width patterns.'''
cdef PatternStateC state
for i in range(states.size()):
state = states[i]
while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE):
is_final = get_is_final(state)
if is_final:
ent_id = state.pattern[1].attrs.value
matches.push_back(
MatchC(pattern_id=ent_id, start=state.start, length=state.length))
break
else:
state.pattern += 1
cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
'''We need to consider:
a) Does the token match the specification? [Yes, No]
b) What's the quantifier? [1, 0+, ?]
c) Is this the last specification? [final, non-final]
We can transition in the following ways:
a) Do we emit a match?
b) Do we add a state with (next state, next token)?
c) Do we add a state with (next state, same token)?
d) Do we add a state with (same state, next token)?
We'll code the actions as boolean strings, so 0000 means no to all 4,
1000 means match but no states added, etc.
1:
Yes, final:
1000
Yes, non-final:
0100
No, final:
0000
No, non-final
0000
0+:
Yes, final:
1001
Yes, non-final:
0011
No, final:
        2000 (note: Don't include last token!)
No, non-final:
0010
?:
Yes, final:
1000
Yes, non-final:
0100
No, final:
        2000 (note: Don't include last token!)
No, non-final:
0010
    Possible combinations: 1000, 0100, 0000, 1001, 0011, 0010, 2000
We'll name the bits "match", "advance", "retry", "extend"
REJECT = 0000
MATCH = 1000
ADVANCE = 0100
RETRY = 0010
MATCH_EXTEND = 1001
RETRY_EXTEND = 0011
MATCH_REJECT = 2000 # Match, but don't include last token
Problem: If a quantifier is matching, we're adding a lot of open partials
'''
cdef char is_match
is_match = get_is_match(state, token, extra_attrs)
quantifier = get_quantifier(state)
is_final = get_is_final(state)
if quantifier == ZERO:
is_match = not is_match
quantifier = ONE
if quantifier == ONE:
if is_match and is_final:
# Yes, final: 1000
return MATCH
elif is_match and not is_final:
# Yes, non-final: 0100
return ADVANCE
elif not is_match and is_final:
# No, final: 0000
return REJECT
else:
return REJECT
elif quantifier == ZERO_PLUS:
if is_match and is_final:
# Yes, final: 1001
return MATCH_EXTEND
elif is_match and not is_final:
# Yes, non-final: 0011
return RETRY_EXTEND
elif not is_match and is_final:
# No, final 2000 (note: Don't include last token!)
return MATCH_REJECT
else:
# No, non-final 0010
return RETRY
elif quantifier == ZERO_ONE:
if is_match and is_final:
# Yes, final: 1000
return MATCH
elif is_match and not is_final:
# Yes, non-final: 0100
return ADVANCE
elif not is_match and is_final:
# No, final 2000 (note: Don't include last token!)
return MATCH_REJECT
else:
# No, non-final 0010
return RETRY
cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
spec = state.pattern
for attr in spec.attrs[:spec.nr_attr]:
if get_token_attr(token, attr.attr) != attr.value:
return 0
else:
return 1
cdef char get_is_final(PatternStateC state) nogil:
if state.pattern[1].attrs[0].attr == ID and state.pattern[1].nr_attr == 0:
return 1
else:
return 0
cdef char get_quantifier(PatternStateC state) nogil:
return state.pattern.quantifier
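For reference, the decision table spelled out in the get_action docstring can be restated as a small pure-Python function (a paraphrase of the Cython code above, using the pattern operator strings in place of the quantifier enum):

def action_for(quantifier, is_match, is_final):
    # quantifier uses the operator strings: '1' (ONE), '*' (ZERO_PLUS),
    # '?' (ZERO_ONE), '!' (ZERO)
    if quantifier == '!':
        is_match, quantifier = not is_match, '1'
    if quantifier == '1':
        if not is_match:
            return 'REJECT'
        return 'MATCH' if is_final else 'ADVANCE'
    if quantifier == '*':
        if is_match:
            return 'MATCH_EXTEND' if is_final else 'RETRY_EXTEND'
        return 'MATCH_REJECT' if is_final else 'RETRY'
    if quantifier == '?':
        if is_match:
            return 'MATCH' if is_final else 'ADVANCE'
        return 'MATCH_REJECT' if is_final else 'RETRY'
    raise ValueError(quantifier)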
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
@ -97,6 +308,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
for j, (attr, value) in enumerate(spec): for j, (attr, value) in enumerate(spec):
pattern[i].attrs[j].attr = attr pattern[i].attrs[j].attr = attr
pattern[i].attrs[j].value = value pattern[i].attrs[j].value = value
pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
i = len(token_specs) i = len(token_specs)
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC)) pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
pattern[i].attrs[0].attr = ID pattern[i].attrs[0].attr = ID
@ -105,48 +317,16 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
return pattern return pattern
cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0: cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil:
while pattern.nr_attr != 0: while pattern.nr_attr != 0:
pattern += 1 pattern += 1
id_attr = pattern[0].attrs[0] id_attr = pattern[0].attrs[0]
assert id_attr.attr == ID
return id_attr.value return id_attr.value
cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
lookahead = &pattern[1]
for attr in pattern.attrs[:pattern.nr_attr]:
if get_token_attr(token, attr.attr) != attr.value:
if pattern.quantifier == ONE:
return REJECT
elif pattern.quantifier == ZERO:
return ACCEPT if lookahead.nr_attr == 0 else ADVANCE
elif pattern.quantifier in (ZERO_ONE, ZERO_PLUS):
return ACCEPT_PREV if lookahead.nr_attr == 0 else ADVANCE_ZERO
else:
return PANIC
if pattern.quantifier == ZERO:
return REJECT
elif lookahead.nr_attr == 0:
return ACCEPT
elif pattern.quantifier in (ONE, ZERO_ONE):
return ADVANCE
elif pattern.quantifier == ZERO_PLUS:
# This is a bandaid over the 'shadowing' problem described here:
# https://github.com/explosion/spaCy/issues/864
next_action = get_action(lookahead, token)
if next_action is REJECT:
return REPEAT
else:
return ADVANCE_ZERO
else:
return PANIC
def _convert_strings(token_specs, string_store): def _convert_strings(token_specs, string_store):
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS), operators = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
'?': (ZERO_ONE,), '1': (ONE,)} '?': (ZERO_ONE,), '1': (ONE,), '!': (ZERO,)}
tokens = [] tokens = []
op = ONE op = ONE
for spec in token_specs: for spec in token_specs:
@ -176,21 +356,6 @@ def _convert_strings(token_specs, string_store):
return tokens return tokens
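The operator sugar handled here is what end users see as the OP key in token patterns. A small usage sketch against the public 2.x API (pattern and text invented):

from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
# '?' marks the punctuation token as optional, i.e. ZERO_ONE above
matcher.add('HELLO_WORLD', None,
            [{'LOWER': 'hello'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'world'}])
doc = nlp(u'Hello, world! Hello world!')
matches = matcher(doc)  # both "Hello, world" and "Hello world" should match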
def merge_phrase(matcher, doc, i, matches):
"""Callback to merge a phrase on match."""
ent_id, label, start, end = matches[i]
span = doc[start:end]
span.merge(ent_type=label, ent_id=ent_id)
def unpickle_matcher(vocab, patterns, callbacks):
matcher = Matcher(vocab)
for key, specs in patterns.items():
callback = callbacks.get(key, None)
matcher.add(key, callback, *specs)
return matcher
cdef class Matcher: cdef class Matcher:
"""Match sequences of tokens, based on pattern rules.""" """Match sequences of tokens, based on pattern rules."""
cdef Pool mem cdef Pool mem
@ -333,85 +498,9 @@ cdef class Matcher:
describing the matches. A match tuple describes a span describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers. `doc[start:end]`. The `label_id` and `key` are both integers.
""" """
cdef vector[StateC] partials matches = find_matches(&self.patterns[0], self.patterns.size(), doc)
cdef int n_partials = 0 for i, (key, start, end) in enumerate(matches):
cdef int q = 0 on_match = self._callbacks.get(key, None)
cdef int i, token_i
cdef const TokenC* token
cdef StateC state
matches = []
for token_i in range(doc.length):
token = &doc.c[token_i]
q = 0
# Go over the open matches, extending or finalizing if able.
# Otherwise, we over-write them (q doesn't advance)
for state in partials:
action = get_action(state.second, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
while action == ADVANCE_ZERO:
state.second += 1
action = get_action(state.second, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
if action == REPEAT:
# Leave the state in the queue, and advance to next slot
# (i.e. we don't overwrite -- we want to greedily match
# more pattern.
q += 1
elif action == REJECT:
pass
elif action == ADVANCE:
partials[q] = state
partials[q].second += 1
q += 1
elif action in (ACCEPT, ACCEPT_PREV):
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
start = state.first
end = token_i+1 if action == ACCEPT else token_i
ent_id = state.second[1].attrs[0].value
label = state.second[1].attrs[1].value
matches.append((ent_id, start, end))
partials.resize(q)
# Check whether we open any new patterns on this token
for pattern in self.patterns:
action = get_action(pattern, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
while action == ADVANCE_ZERO:
pattern += 1
action = get_action(pattern, token)
if action == REPEAT:
state.first = token_i
state.second = pattern
partials.push_back(state)
elif action == ADVANCE:
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
state.first = token_i
state.second = pattern + 1
partials.push_back(state)
elif action in (ACCEPT, ACCEPT_PREV):
start = token_i
end = token_i+1 if action == ACCEPT else token_i
ent_id = pattern[1].attrs[0].value
label = pattern[1].attrs[1].value
matches.append((ent_id, start, end))
# Look for open patterns that are actually satisfied
for state in partials:
while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
state.second += 1
if state.second.nr_attr == 0:
start = state.first
end = len(doc)
ent_id = state.second.attrs[0].value
label = state.second.attrs[0].value
matches.append((ent_id, start, end))
for i, (ent_id, start, end) in enumerate(matches):
on_match = self._callbacks.get(ent_id)
if on_match is not None: if on_match is not None:
on_match(self, doc, i, matches) on_match(self, doc, i, matches)
return matches return matches
@ -423,31 +512,37 @@ cdef class Matcher:
return key return key
def unpickle_matcher(vocab, patterns, callbacks):
matcher = Matcher(vocab)
for key, specs in patterns.items():
callback = callbacks.get(key, None)
matcher.add(key, callback, *specs)
return matcher
def _get_longest_matches(matches):
'''Filter out matches that have a longer equivalent.'''
longest_matches = {}
for pattern_id, start, end in matches:
key = (pattern_id, start)
length = end-start
if key not in longest_matches or length > longest_matches[key]:
longest_matches[key] = length
return [(pattern_id, start, start+length)
for (pattern_id, start), length in longest_matches.items()]
def get_bilou(length): def get_bilou(length):
if length == 1: if length == 0:
raise ValueError("Length must be >= 1")
elif length == 1:
return [U_ENT] return [U_ENT]
elif length == 2: elif length == 2:
return [B2_ENT, L2_ENT] return [B2_ENT, L2_ENT]
elif length == 3: elif length == 3:
return [B3_ENT, I3_ENT, L3_ENT] return [B3_ENT, I3_ENT, L3_ENT]
elif length == 4:
return [B4_ENT, I4_ENT, I4_ENT, L4_ENT]
elif length == 5:
return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT]
elif length == 6:
return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT]
elif length == 7:
return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT]
elif length == 8:
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
elif length == 9:
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
L9_ENT]
elif length == 10:
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
I10_ENT, I10_ENT, L10_ENT]
else: else:
raise ValueError("Max length currently 10 for phrase matching") return [B4_ENT, I4_ENT] + [I4_ENT] * (length-3) + [L4_ENT]
cdef class PhraseMatcher: cdef class PhraseMatcher:
@ -456,21 +551,21 @@ cdef class PhraseMatcher:
cdef Matcher matcher cdef Matcher matcher
cdef PreshMap phrase_ids cdef PreshMap phrase_ids
cdef int max_length cdef int max_length
cdef attr_t* _phrase_key
cdef public object _callbacks cdef public object _callbacks
cdef public object _patterns cdef public object _patterns
def __init__(self, Vocab vocab, max_length=10): def __init__(self, Vocab vocab, max_length=10):
self.mem = Pool() self.mem = Pool()
self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
self.max_length = max_length self.max_length = max_length
self.vocab = vocab self.vocab = vocab
self.matcher = Matcher(self.vocab) self.matcher = Matcher(self.vocab)
self.phrase_ids = PreshMap() self.phrase_ids = PreshMap()
abstract_patterns = [] abstract_patterns = [
for length in range(1, max_length): [{U_ENT: True}],
abstract_patterns.append([{tag: True} [{B2_ENT: True}, {L2_ENT: True}],
for tag in get_bilou(length)]) [{B3_ENT: True}, {I3_ENT: True}, {L3_ENT: True}],
[{B4_ENT: True}, {I4_ENT: True}, {I4_ENT: True, "OP": "+"}, {L4_ENT: True}],
]
self.matcher.add('Candidate', None, *abstract_patterns) self.matcher.add('Candidate', None, *abstract_patterns)
self._callbacks = {} self._callbacks = {}
@ -504,29 +599,24 @@ cdef class PhraseMatcher:
*docs (Doc): `Doc` objects representing match patterns. *docs (Doc): `Doc` objects representing match patterns.
""" """
cdef Doc doc cdef Doc doc
for doc in docs:
if len(doc) >= self.max_length:
msg = (
"Pattern length (%d) >= phrase_matcher.max_length (%d). "
"Length can be set on initialization, up to 10."
)
raise ValueError(msg % (len(doc), self.max_length))
cdef hash_t ent_id = self.matcher._normalize_key(key) cdef hash_t ent_id = self.matcher._normalize_key(key)
self._callbacks[ent_id] = on_match self._callbacks[ent_id] = on_match
cdef int length cdef int length
cdef int i cdef int i
cdef hash_t phrase_hash cdef hash_t phrase_hash
cdef Pool mem = Pool()
for doc in docs: for doc in docs:
length = doc.length length = doc.length
if length == 0:
continue
tags = get_bilou(length) tags = get_bilou(length)
for i in range(self.max_length): phrase_key = <attr_t*>mem.alloc(length, sizeof(attr_t))
self._phrase_key[i] = 0
for i, tag in enumerate(tags): for i, tag in enumerate(tags):
lexeme = self.vocab[doc.c[i].lex.orth] lexeme = self.vocab[doc.c[i].lex.orth]
lexeme.set_flag(tag, True) lexeme.set_flag(tag, True)
self._phrase_key[i] = lexeme.orth phrase_key[i] = lexeme.orth
phrase_hash = hash64(self._phrase_key, phrase_hash = hash64(phrase_key,
self.max_length * sizeof(attr_t), 0) length * sizeof(attr_t), 0)
self.phrase_ids.set(phrase_hash, <void*>ent_id) self.phrase_ids.set(phrase_hash, <void*>ent_id)
def __call__(self, Doc doc): def __call__(self, Doc doc):
@ -548,28 +638,45 @@ cdef class PhraseMatcher:
on_match(self, doc, i, matches) on_match(self, doc, i, matches)
return matches return matches
def pipe(self, stream, batch_size=1000, n_threads=2): def pipe(self, stream, batch_size=1000, n_threads=2, return_matches=False,
as_tuples=False):
"""Match a stream of documents, yielding them in turn. """Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents. docs (iterable): A stream of documents.
batch_size (int): Number of documents to accumulate into a working set. batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer n_threads (int): The number of threads with which to work on the buffer
in parallel, if the implementation supports multi-threading. in parallel, if the implementation supports multi-threading.
return_matches (bool): Yield the match lists along with the docs, making
results (doc, matches) tuples.
as_tuples (bool): Interpret the input stream as (doc, context) tuples,
and yield (result, context) tuples out.
If both return_matches and as_tuples are True, the output will
be a sequence of ((doc, matches), context) tuples.
YIELDS (Doc): Documents, in order. YIELDS (Doc): Documents, in order.
""" """
if as_tuples:
for doc, context in stream:
matches = self(doc)
if return_matches:
yield ((doc, matches), context)
else:
yield (doc, context)
else:
for doc in stream: for doc in stream:
self(doc) matches = self(doc)
if return_matches:
yield (doc, matches)
else:
yield doc yield doc
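
A usage sketch for the new flags (assuming nlp is a loaded pipeline and matcher is a populated PhraseMatcher; both names are illustrative, not part of the patch):

# Hypothetical usage of the return_matches / as_tuples flags added above.
data = [("Barack Obama visited Berlin", {"source": "news"}),
        ("Angela Merkel gave a speech", {"source": "blog"})]
stream = ((nlp(text), ctx) for text, ctx in data)
for (doc, matches), ctx in matcher.pipe(stream, return_matches=True, as_tuples=True):
    print(ctx["source"], [doc[start:end].text for _, start, end in matches])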
def accept_match(self, Doc doc, int start, int end): def accept_match(self, Doc doc, int start, int end):
assert (end - start) < self.max_length
cdef int i, j cdef int i, j
for i in range(self.max_length): cdef Pool mem = Pool()
self._phrase_key[i] = 0 phrase_key = <attr_t*>mem.alloc(end-start, sizeof(attr_t))
for i, j in enumerate(range(start, end)): for i, j in enumerate(range(start, end)):
self._phrase_key[i] = doc.c[j].lex.orth phrase_key[i] = doc.c[j].lex.orth
cdef hash_t key = hash64(self._phrase_key, cdef hash_t key = hash64(phrase_key,
self.max_length * sizeof(attr_t), 0) (end-start) * sizeof(attr_t), 0)
ent_id = <hash_t>self.phrase_ids.get(key) ent_id = <hash_t>self.phrase_ids.get(key)
if ent_id == 0: if ent_id == 0:
return None return None

View File

@ -47,7 +47,9 @@ cdef class Morphology:
cdef enum univ_morph_t: cdef enum univ_morph_t:
NIL = 0 NIL = 0
Animacy_anim = symbols.Animacy_anim Animacy_anim = symbols.Animacy_anim
Animacy_inam Animacy_inan
Animacy_hum
Animacy_nhum
Aspect_freq Aspect_freq
Aspect_imp Aspect_imp
Aspect_mod Aspect_mod

View File

@ -184,7 +184,9 @@ cdef class Morphology:
IDS = { IDS = {
"Animacy_anim": Animacy_anim, "Animacy_anim": Animacy_anim,
"Animacy_inam": Animacy_inam, "Animacy_inan": Animacy_inan,
"Animacy_hum": Animacy_hum, # U20
"Animacy_nhum": Animacy_nhum,
"Aspect_freq": Aspect_freq, "Aspect_freq": Aspect_freq,
"Aspect_imp": Aspect_imp, "Aspect_imp": Aspect_imp,
"Aspect_mod": Aspect_mod, "Aspect_mod": Aspect_mod,

View File

@ -25,6 +25,7 @@ from .morphology cimport Morphology
from .vocab cimport Vocab from .vocab cimport Vocab
from .syntax import nonproj from .syntax import nonproj
from .compat import json_dumps from .compat import json_dumps
from .matcher import Matcher
from .attrs import POS from .attrs import POS
from .parts_of_speech import X from .parts_of_speech import X
@ -97,6 +98,17 @@ def merge_entities(doc):
return doc return doc
def merge_subtokens(doc, label='subtok'):
merger = Matcher(doc.vocab)
merger.add('SUBTOK', None, [{'DEP': label, 'op': '+'}])
matches = merger(doc)
spans = [doc[start:end+1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans]
for start_char, end_char in offsets:
doc.merge(start_char, end_char)
return doc
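
For context, a minimal sketch of how this helper would be applied to a parsed Doc (the nlp pipeline here is assumed, not part of the patch; nothing is merged unless the parser actually produced 'subtok' arcs):

# Illustrative only: collapse runs of tokens the parser labelled 'subtok'
# into single tokens, so downstream components see whole words.
doc = nlp("some text the parser split into sub tokens")
doc = merge_subtokens(doc, label='subtok')
print([t.text for t in doc])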
class Pipe(object): class Pipe(object):
"""This class is not instantiated directly. Components inherit from it, and """This class is not instantiated directly. Components inherit from it, and
it defines the interface that components should follow to function as it defines the interface that components should follow to function as
@ -652,11 +664,13 @@ class MultitaskObjective(Tagger):
self.make_label = self.make_dep_tag_offset self.make_label = self.make_dep_tag_offset
elif target == 'ent_tag': elif target == 'ent_tag':
self.make_label = self.make_ent_tag self.make_label = self.make_ent_tag
elif target == 'sent_start':
self.make_label = self.make_sent_start
elif hasattr(target, '__call__'): elif hasattr(target, '__call__'):
self.make_label = target self.make_label = target
else: else:
raise ValueError("MultitaskObjective target should be function or " raise ValueError("MultitaskObjective target should be function or "
"one of: dep, tag, ent, dep_tag_offset, ent_tag.") "one of: dep, tag, ent, sent_start, dep_tag_offset, ent_tag.")
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2) self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims', self.cfg.setdefault('pretrained_dims',
@ -716,11 +730,7 @@ class MultitaskObjective(Tagger):
for i, gold in enumerate(golds): for i, gold in enumerate(golds):
for j in range(len(docs[i])): for j in range(len(docs[i])):
# Handles alignment for tokenization differences # Handles alignment for tokenization differences
gold_idx = gold.cand_to_gold[j] label = self.make_label(j, gold.words, gold.tags,
if gold_idx is None:
idx += 1
continue
label = self.make_label(gold_idx, gold.words, gold.tags,
gold.heads, gold.labels, gold.ents) gold.heads, gold.labels, gold.ents)
if label is None or label not in self.labels: if label is None or label not in self.labels:
correct[idx] = guesses[idx] correct[idx] = guesses[idx]
@ -765,6 +775,51 @@ class MultitaskObjective(Tagger):
else: else:
return '%s-%s' % (tags[i], ents[i]) return '%s-%s' % (tags[i], ents[i])
@staticmethod
def make_sent_start(target, words, tags, heads, deps, ents, cache=True, _cache={}):
'''A multi-task objective for representing sentence boundaries,
using the BILU scheme (O is impossible).
The implementation of this method uses an internal cache that relies
on the identity of the heads array, to avoid requiring a new piece
of gold data. You can pass cache=False if you know the cache will
do the wrong thing.
'''
assert len(words) == len(heads)
assert target < len(words), (target, len(words))
if cache:
if id(heads) in _cache:
return _cache[id(heads)][target]
else:
for key in list(_cache.keys()):
_cache.pop(key)
sent_tags = ['I-SENT'] * len(words)
_cache[id(heads)] = sent_tags
else:
sent_tags = ['I-SENT'] * len(words)
def _find_root(child):
seen = set([child])
while child is not None and heads[child] != child:
seen.add(child)
child = heads[child]
return child
sentences = {}
for i in range(len(words)):
root = _find_root(i)
if root is None:
sent_tags[i] = None
else:
sentences.setdefault(root, []).append(i)
for root, span in sorted(sentences.items()):
if len(span) == 1:
sent_tags[span[0]] = 'U-SENT'
else:
sent_tags[span[0]] = 'B-SENT'
sent_tags[span[-1]] = 'L-SENT'
return sent_tags[target]
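
A small worked example of the B/I/L/U-SENT scheme, written in plain Python without the cache (the heads array is invented for illustration):

words = ["This", "is", "one", ".", "Another", "!"]
heads = [1, 1, 1, 1, 4, 4]          # each token points at its sentence root

def find_root(i):
    seen = {i}
    while heads[i] != i:
        i = heads[i]
        if i in seen:               # guard against malformed cycles
            return None
        seen.add(i)
    return i

sentences = {}
for i in range(len(words)):
    sentences.setdefault(find_root(i), []).append(i)

sent_tags = ["I-SENT"] * len(words)
for root, span in sorted(sentences.items()):
    if len(span) == 1:
        sent_tags[span[0]] = "U-SENT"
    else:
        sent_tags[span[0]] = "B-SENT"
        sent_tags[span[-1]] = "L-SENT"

print(sent_tags)  # ['B-SENT', 'I-SENT', 'I-SENT', 'L-SENT', 'B-SENT', 'L-SENT']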
class SimilarityHook(Pipe): class SimilarityHook(Pipe):
""" """
@ -823,8 +878,8 @@ class TextCategorizer(Pipe):
name = 'textcat' name = 'textcat'
@classmethod @classmethod
def Model(cls, nr_class=1, width=64, **cfg): def Model(cls, **cfg):
return build_text_classifier(nr_class, width, **cfg) return build_text_classifier(**cfg)
def __init__(self, vocab, model=True, **cfg): def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab self.vocab = vocab
@ -890,6 +945,15 @@ class TextCategorizer(Pipe):
if label in self.labels: if label in self.labels:
return 0 return 0
if self.model not in (None, True, False): if self.model not in (None, True, False):
# This functionality was available previously, but was broken.
# The problem is that we resize the last layer, but the last layer
# is actually just an ensemble. We're not resizing the child layers
# -- a huge problem.
raise ValueError(
"Cannot currently add labels to pre-trained text classifier. "
"Add labels before training begins. This functionality was "
"available in previous versions, but had significant bugs that "
"let to poor performance")
smaller = self.model._layers[-1] smaller = self.model._layers[-1]
larger = Affine(len(self.labels)+1, smaller.nI) larger = Affine(len(self.labels)+1, smaller.nI)
copy_array(larger.W[:smaller.nO], smaller.W) copy_array(larger.W[:smaller.nO], smaller.W)
@ -905,8 +969,9 @@ class TextCategorizer(Pipe):
token_vector_width = 64 token_vector_width = 64
if self.model is True: if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(len(self.labels), token_vector_width, self.cfg['nr_class'] = len(self.labels)
**self.cfg) self.cfg['width'] = token_vector_width
self.model = self.Model(**self.cfg)
link_vectors_to_models(self.vocab) link_vectors_to_models(self.vocab)
if sgd is None: if sgd is None:
sgd = self.create_optimizer() sgd = self.create_optimizer()

View File

@ -1,7 +1,7 @@
# coding: utf8 # coding: utf8
from __future__ import division, print_function, unicode_literals from __future__ import division, print_function, unicode_literals
from .gold import tags_to_entities from .gold import tags_to_entities, GoldParse
class PRFScore(object): class PRFScore(object):
@ -84,6 +84,8 @@ class Scorer(object):
} }
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')): def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
if len(tokens) != len(gold):
gold = GoldParse.from_annot_tuples(tokens, zip(*gold.orig_annot))
assert len(tokens) == len(gold) assert len(tokens) == len(gold)
gold_deps = set() gold_deps = set()
gold_tags = set() gold_tags = set()
@ -100,7 +102,6 @@ class Scorer(object):
continue continue
gold_i = gold.cand_to_gold[token.i] gold_i = gold.cand_to_gold[token.i]
if gold_i is None: if gold_i is None:
if token.dep_.lower() not in punct_labels:
self.tokens.fp += 1 self.tokens.fp += 1
else: else:
self.tokens.tp += 1 self.tokens.tp += 1

View File

@ -85,6 +85,7 @@ cdef enum symbol_t:
SENT_START SENT_START
SPACY SPACY
PROB PROB
LANG
ADJ ADJ
ADP ADP
@ -108,8 +109,9 @@ cdef enum symbol_t:
SPACE SPACE
Animacy_anim Animacy_anim
Animacy_inam Animacy_inan
Animacy_hum # U20 Animacy_hum # U20
Animacy_nhum
Aspect_freq Aspect_freq
Aspect_imp Aspect_imp
Aspect_mod Aspect_mod
@ -393,6 +395,7 @@ cdef enum symbol_t:
EVENT EVENT
WORK_OF_ART WORK_OF_ART
LANGUAGE LANGUAGE
LAW
DATE DATE
TIME TIME
@ -451,10 +454,9 @@ cdef enum symbol_t:
prt prt
punct punct
quantmod quantmod
relcl
rcmod rcmod
root root
xcomp xcomp
acl acl
LAW
LANG

View File

@ -114,8 +114,9 @@ IDS = {
"SPACE": SPACE, "SPACE": SPACE,
"Animacy_anim": Animacy_anim, "Animacy_anim": Animacy_anim,
"Animacy_inam": Animacy_inam, "Animacy_inam": Animacy_inan,
"Animacy_hum": Animacy_hum, # U20 "Animacy_hum": Animacy_hum, # U20
"Animacy_nhum": Animacy_nhum,
"Aspect_freq": Aspect_freq, "Aspect_freq": Aspect_freq,
"Aspect_imp": Aspect_imp, "Aspect_imp": Aspect_imp,
"Aspect_mod": Aspect_mod, "Aspect_mod": Aspect_mod,
@ -458,6 +459,7 @@ IDS = {
"punct": punct, "punct": punct,
"quantmod": quantmod, "quantmod": quantmod,
"rcmod": rcmod, "rcmod": rcmod,
"relcl": relcl,
"root": root, "root": root,
"xcomp": xcomp, "xcomp": xcomp,

View File

@ -108,7 +108,7 @@ cdef cppclass StateC:
ids[1] = this.B(1) ids[1] = this.B(1)
ids[2] = this.S(0) ids[2] = this.S(0)
ids[3] = this.S(1) ids[3] = this.S(1)
ids[4] = this.H(this.S(0)) ids[4] = this.S(2)
ids[5] = this.L(this.B(0), 1) ids[5] = this.L(this.B(0), 1)
ids[6] = this.L(this.S(0), 1) ids[6] = this.L(this.S(0), 1)
ids[7] = this.R(this.S(0), 1) ids[7] = this.R(this.S(0), 1)

View File

@ -6,16 +6,19 @@ from __future__ import unicode_literals
from cpython.ref cimport Py_INCREF from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from collections import OrderedDict from collections import OrderedDict, defaultdict, Counter
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
import json
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
from .nonproj import is_nonproj_tree from . import nonproj
from .transition_system cimport move_cost_func_t, label_cost_func_t from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse, GoldParseC from ..gold cimport GoldParse, GoldParseC
from ..structs cimport TokenC from ..structs cimport TokenC
# Calculate cost as gold/not gold. We don't use the scalar value anyway.
cdef int BINARY_COSTS = 1
DEF NON_MONOTONIC = True DEF NON_MONOTONIC = True
DEF USE_BREAK = True DEF USE_BREAK = True
@ -54,6 +57,8 @@ cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) no
cost += 1 cost += 1
if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)): if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)):
cost += 1 cost += 1
if BINARY_COSTS and cost >= 1:
return cost
cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0 cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0
return cost return cost
@ -67,6 +72,8 @@ cdef weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nog
cost += gold.heads[target] == B_i cost += gold.heads[target] == B_i
if gold.heads[B_i] == B_i or gold.heads[B_i] < target: if gold.heads[B_i] == B_i or gold.heads[B_i] < target:
break break
if BINARY_COSTS and cost >= 1:
return cost
if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0: if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0:
cost += 1 cost += 1
return cost return cost
@ -110,7 +117,8 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
cdef class Shift: cdef class Shift:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and st.B_(0).sent_start != 1 sent_start = st._sent[st.B_(0).l_edge].sent_start
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and sent_start != 1
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
@ -170,7 +178,8 @@ cdef class Reduce:
cdef class LeftArc: cdef class LeftArc:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.B_(0).sent_start != 1 sent_start = st._sent[st.B_(0).l_edge].sent_start
return sent_start != 1
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
@ -205,7 +214,8 @@ cdef class RightArc:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
# If there's (perhaps partial) parse pre-set, don't allow cycle. # If there's (perhaps partial) parse pre-set, don't allow cycle.
return st.B_(0).sent_start != 1 and st.H(st.S(0)) != st.B(0) sent_start = st._sent[st.B_(0).l_edge].sent_start
return sent_start != 1 and st.H(st.S(0)) != st.B(0)
@staticmethod @staticmethod
cdef int transition(StateC* st, attr_t label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
@ -312,39 +322,42 @@ cdef class ArcEager(TransitionSystem):
@classmethod @classmethod
def get_actions(cls, **kwargs): def get_actions(cls, **kwargs):
actions = kwargs.get('actions', OrderedDict(( min_freq = kwargs.get('min_freq', None)
(SHIFT, ['']), actions = defaultdict(lambda: Counter())
(REDUCE, ['']), actions[SHIFT][''] = 1
(RIGHT, []), actions[REDUCE][''] = 1
(LEFT, []),
(BREAK, ['ROOT']))
))
seen_actions = set()
for label in kwargs.get('left_labels', []): for label in kwargs.get('left_labels', []):
if label.upper() != 'ROOT': actions[LEFT][label] = 1
if (LEFT, label) not in seen_actions: actions[SHIFT][label] = 1
actions[LEFT].append(label)
seen_actions.add((LEFT, label))
for label in kwargs.get('right_labels', []): for label in kwargs.get('right_labels', []):
if label.upper() != 'ROOT': actions[RIGHT][label] = 1
if (RIGHT, label) not in seen_actions: actions[REDUCE][label] = 1
actions[RIGHT].append(label)
seen_actions.add((RIGHT, label))
for raw_text, sents in kwargs.get('gold_parses', []): for raw_text, sents in kwargs.get('gold_parses', []):
for (ids, words, tags, heads, labels, iob), ctnts in sents: for (ids, words, tags, heads, labels, iob), ctnts in sents:
heads, labels = nonproj.projectivize(heads, labels)
for child, head, label in zip(ids, heads, labels): for child, head, label in zip(ids, heads, labels):
if label.upper() == 'ROOT' : if label.upper() == 'ROOT' :
label = 'ROOT' label = 'ROOT'
if label != 'ROOT': if head == child:
if head < child: actions[BREAK][label] += 1
if (RIGHT, label) not in seen_actions: elif head < child:
actions[RIGHT].append(label) actions[RIGHT][label] += 1
seen_actions.add((RIGHT, label)) actions[REDUCE][''] += 1
elif head > child: elif head > child:
if (LEFT, label) not in seen_actions: actions[LEFT][label] += 1
actions[LEFT].append(label) actions[SHIFT][''] += 1
seen_actions.add((LEFT, label)) if min_freq is not None:
for action, label_freqs in actions.items():
for label, freq in list(label_freqs.items()):
if freq < min_freq:
label_freqs.pop(label)
# Ensure these actions are present
actions[BREAK].setdefault('ROOT', 0)
actions[RIGHT].setdefault('subtok', 0)
actions[LEFT].setdefault('subtok', 0)
# Used for backoff
actions[RIGHT].setdefault('dep', 0)
actions[LEFT].setdefault('dep', 0)
return actions return actions
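
The rewrite above replaces plain label lists with per-action Counters, so label frequencies can later be thresholded by min_freq. A rough standalone sketch of the counting step (the action names are placeholders for the transition IDs):

from collections import Counter, defaultdict

SHIFT, REDUCE, LEFT, RIGHT, BREAK = 'SHIFT', 'REDUCE', 'LEFT', 'RIGHT', 'BREAK'
actions = defaultdict(Counter)
actions[SHIFT][''] = 1
actions[REDUCE][''] = 1
# (child, head, label) triples; head == child marks a sentence root.
arcs = [(0, 2, 'det'), (1, 2, 'amod'), (2, 2, 'ROOT'), (3, 2, 'punct')]
for child, head, label in arcs:
    if head == child:
        actions[BREAK][label] += 1
    elif head < child:
        actions[RIGHT][label] += 1
        actions[REDUCE][''] += 1
    else:
        actions[LEFT][label] += 1
        actions[SHIFT][''] += 1
print({action: dict(counts) for action, counts in actions.items()})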
property action_types: property action_types:
@ -376,18 +389,34 @@ cdef class ArcEager(TransitionSystem):
def preprocess_gold(self, GoldParse gold): def preprocess_gold(self, GoldParse gold):
if not self.has_gold(gold): if not self.has_gold(gold):
return None return None
for i in range(gold.length): for i, (head, dep) in enumerate(zip(gold.heads, gold.labels)):
# Missing values # Missing values
if gold.heads[i] is None or gold.labels[i] is None: if head is None or dep is None:
gold.c.heads[i] = i gold.c.heads[i] = i
gold.c.has_dep[i] = False gold.c.has_dep[i] = False
else: else:
label = gold.labels[i] if head > i:
action = LEFT
elif head < i:
action = RIGHT
else:
action = BREAK
if dep not in self.labels[action]:
if action == BREAK:
dep = 'ROOT'
elif nonproj.is_decorated(dep):
backoff = nonproj.decompose(dep)[0]
if backoff in self.labels[action]:
dep = backoff
else:
dep = 'dep'
else:
dep = 'dep'
gold.c.has_dep[i] = True gold.c.has_dep[i] = True
if label.upper() == 'ROOT': if dep.upper() == 'ROOT':
label = 'ROOT' dep = 'ROOT'
gold.c.heads[i] = gold.heads[i] gold.c.heads[i] = head
gold.c.labels[i] = self.strings.add(label) gold.c.labels[i] = self.strings.add(dep)
return gold return gold
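
The backoff logic above can be summarised with a small sketch (the label set and the '||' handling are simplified; the real code also distinguishes the BREAK/ROOT case and checks per-action label sets):

known = {'nsubj', 'dobj', 'prep', 'dep'}

def backoff(dep):
    # Unseen labels fall back to their undecorated head label, else to 'dep'.
    if dep in known:
        return dep
    head_label, _, _ = dep.partition('||')
    return head_label if head_label in known else 'dep'

print(backoff('dobj||prep'))   # 'dobj'
print(backoff('weird_label'))  # 'dep'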
def get_beam_parses(self, Beam beam): def get_beam_parses(self, Beam beam):
@ -527,8 +556,13 @@ cdef class ArcEager(TransitionSystem):
is_valid[i] = False is_valid[i] = False
costs[i] = 9000 costs[i] = 9000
if n_gold < 1: if n_gold < 1:
# Check projectivity --- leading cause # Check label set --- leading cause
if is_nonproj_tree(gold.heads): label_set = set([self.strings[self.c[i].label] for i in range(self.n_moves)])
for label_str in gold.labels:
if label_str is not None and label_str not in label_set:
raise ValueError("Cannot get gold parser action: unknown label: %s" % label_str)
# Check projectivity --- other leading cause
if nonproj.is_nonproj_tree(gold.heads):
raise ValueError( raise ValueError(
"Could not find a gold-standard action to supervise the " "Could not find a gold-standard action to supervise the "
"dependency parser. Likely cause: the tree is " "dependency parser. Likely cause: the tree is "

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
from collections import OrderedDict from collections import OrderedDict, Counter
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
@ -64,21 +64,18 @@ cdef class BiluoPushDown(TransitionSystem):
@classmethod @classmethod
def get_actions(cls, **kwargs): def get_actions(cls, **kwargs):
actions = kwargs.get('actions', OrderedDict(( actions = {
(MISSING, ['']), MISSING: Counter(),
(BEGIN, []), BEGIN: Counter(),
(IN, []), IN: Counter(),
(LAST, []), LAST: Counter(),
(UNIT, []), UNIT: Counter(),
(OUT, ['']) OUT: Counter()
))) }
seen_entities = set() actions[OUT][''] = 1
for entity_type in kwargs.get('entity_types', []): for entity_type in kwargs.get('entity_types', []):
if entity_type in seen_entities:
continue
seen_entities.add(entity_type)
for action in (BEGIN, IN, LAST, UNIT): for action in (BEGIN, IN, LAST, UNIT):
actions[action].append(entity_type) actions[action][entity_type] = 1
moves = ('M', 'B', 'I', 'L', 'U') moves = ('M', 'B', 'I', 'L', 'U')
for raw_text, sents in kwargs.get('gold_parses', []): for raw_text, sents in kwargs.get('gold_parses', []):
for (ids, words, tags, heads, labels, biluo), _ in sents: for (ids, words, tags, heads, labels, biluo), _ in sents:
@ -87,10 +84,8 @@ cdef class BiluoPushDown(TransitionSystem):
if ner_tag.count('-') != 1: if ner_tag.count('-') != 1:
raise ValueError(ner_tag) raise ValueError(ner_tag)
_, label = ner_tag.split('-') _, label = ner_tag.split('-')
if label not in seen_entities: for action in (BEGIN, IN, LAST, UNIT):
seen_entities.add(label) actions[action][label] += 1
for move_str in ('B', 'I', 'L', 'U'):
actions[moves.index(move_str)].append(label)
return actions return actions
property action_types: property action_types:
@ -213,7 +208,7 @@ cdef class BiluoPushDown(TransitionSystem):
raise Exception(move) raise Exception(move)
return t return t
def add_action(self, int action, label_name): def add_action(self, int action, label_name, freq=None):
cdef attr_t label_id cdef attr_t label_id
if not isinstance(label_name, (int, long)): if not isinstance(label_name, (int, long)):
label_id = self.strings.add(label_name) label_id = self.strings.add(label_name)
@ -234,6 +229,12 @@ cdef class BiluoPushDown(TransitionSystem):
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
assert self.c[self.n_moves].label == label_id assert self.c[self.n_moves].label == label_id
self.n_moves += 1 self.n_moves += 1
if self.labels.get(action, []):
freq = min(0, min(self.labels[action].values()))
self.labels[action][label_name] = freq-1
else:
self.labels[action] = Counter()
self.labels[action][label_name] = -1
return 1 return 1
cdef int initialize_state(self, StateC* st) nogil: cdef int initialize_state(self, StateC* st) nogil:

View File

@ -15,7 +15,7 @@ cdef class Parser:
cdef readonly object cfg cdef readonly object cfg
cdef public object _multitasks cdef public object _multitasks
cdef void _parseC(self, StateC* state, cdef void _parseC(self, StateC** states, int nr_task,
const float* feat_weights, const float* bias, const float* feat_weights, const float* bias,
const float* hW, const float* hb, const float* hW, const float* hb,
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil

View File

@ -1,7 +1,6 @@
# cython: infer_types=True # cython: infer_types=True
# cython: cdivision=True # cython: cdivision=True
# cython: boundscheck=False # cython: boundscheck=False
# cython: profile=True
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
@ -28,6 +27,8 @@ from thinc.misc import LayerNorm
from thinc.neural.ops import CupyOps from thinc.neural.ops import CupyOps
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module
from thinc.linalg cimport Vec, VecVec from thinc.linalg cimport Vec, VecVec
from thinc cimport openblas
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
from .._ml import link_vectors_to_models, create_default_optimizer from .._ml import link_vectors_to_models, create_default_optimizer
@ -266,7 +267,7 @@ cdef class Parser:
with Model.use_device('cpu'): with Model.use_device('cpu'):
upper = chain( upper = chain(
clone(LayerNorm(Maxout(hidden_width, hidden_width)), depth-1), clone(Maxout(hidden_width, hidden_width), depth-1),
zero_init(Affine(nr_class, hidden_width, drop_factor=0.0)) zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
) )
@ -302,7 +303,7 @@ cdef class Parser:
""" """
self.vocab = vocab self.vocab = vocab
if moves is True: if moves is True:
self.moves = self.TransitionSystem(self.vocab.strings, {}) self.moves = self.TransitionSystem(self.vocab.strings)
else: else:
self.moves = moves self.moves = moves
if 'beam_width' not in cfg: if 'beam_width' not in cfg:
@ -311,12 +312,7 @@ cdef class Parser:
cfg['beam_density'] = util.env_opt('beam_density', 0.0) cfg['beam_density'] = util.env_opt('beam_density', 0.0)
if 'pretrained_dims' not in cfg: if 'pretrained_dims' not in cfg:
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
cfg.setdefault('cnn_maxout_pieces', 3)
self.cfg = cfg self.cfg = cfg
if 'actions' in self.cfg:
for action, labels in self.cfg.get('actions', {}).items():
for label in labels:
self.moves.add_action(action, label)
self.model = model self.model = model
self._multitasks = [] self._multitasks = []
@ -423,69 +419,81 @@ cdef class Parser:
cdef int nr_hidden = hidden_weights.shape[0] cdef int nr_hidden = hidden_weights.shape[0]
cdef int nr_task = states.size() cdef int nr_task = states.size()
with nogil: with nogil:
for i in range(nr_task): self._parseC(&states[0], nr_task, feat_weights, bias, hW, hb,
self._parseC(states[i],
feat_weights, bias, hW, hb,
nr_class, nr_hidden, nr_feat, nr_piece) nr_class, nr_hidden, nr_feat, nr_piece)
PyErr_CheckSignals() PyErr_CheckSignals()
tokvecs = self.model[0].ops.unflatten(tokvecs, tokvecs = self.model[0].ops.unflatten(tokvecs,
[len(doc) for doc in docs]) [len(doc) for doc in docs])
return state_objs, tokvecs return state_objs, tokvecs
cdef void _parseC(self, StateC* state, cdef void _parseC(self, StateC** states, int nr_task,
const float* feat_weights, const float* bias, const float* feat_weights, const float* bias,
const float* hW, const float* hb, const float* hW, const float* hb,
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil: int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
token_ids = <int*>calloc(nr_feat, sizeof(int)) token_ids = <int*>calloc(nr_feat, sizeof(int))
is_valid = <int*>calloc(nr_class, sizeof(int)) is_valid = <int*>calloc(nr_class, sizeof(int))
vectors = <float*>calloc(nr_hidden * nr_piece, sizeof(float)) vectors = <float*>calloc(nr_hidden * nr_task, sizeof(float))
scores = <float*>calloc(nr_class, sizeof(float)) unmaxed = <float*>calloc(nr_hidden * nr_piece, sizeof(float))
scores = <float*>calloc(nr_class*nr_task, sizeof(float))
if not (token_ids and is_valid and vectors and scores): if not (token_ids and is_valid and vectors and scores):
with gil: with gil:
PyErr_SetFromErrno(MemoryError) PyErr_SetFromErrno(MemoryError)
PyErr_CheckSignals() PyErr_CheckSignals()
cdef float feature cdef int nr_todo = nr_task
while not state.is_final(): cdef int i, j
cdef vector[StateC*] unfinished
while nr_todo >= 1:
memset(vectors, 0, nr_todo * nr_hidden * sizeof(float))
memset(scores, 0, nr_todo * nr_class * sizeof(float))
for i in range(nr_todo):
state = states[i]
state.set_context_tokens(token_ids, nr_feat) state.set_context_tokens(token_ids, nr_feat)
memset(vectors, 0, nr_hidden * nr_piece * sizeof(float)) memset(unmaxed, 0, nr_hidden * nr_piece * sizeof(float))
memset(scores, 0, nr_class * sizeof(float)) sum_state_features(unmaxed,
sum_state_features(vectors,
feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece) feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
for i in range(nr_hidden * nr_piece): VecVec.add_i(unmaxed,
vectors[i] += bias[i] bias, 1., nr_hidden*nr_piece)
V = vectors state_vector = &vectors[i*nr_hidden]
W = hW for j in range(nr_hidden):
for i in range(nr_hidden): index = j * nr_piece
if nr_piece == 1: which = Vec.arg_max(&unmaxed[index], nr_piece)
feature = V[0] if V[0] >= 0. else 0. state_vector[j] = unmaxed[index + which]
elif nr_piece == 2: # Compute hidden-to-output
feature = V[0] if V[0] >= V[1] else V[1] openblas.simple_gemm(scores, nr_todo, nr_class,
else: vectors, nr_todo, nr_hidden, hW, nr_hidden, nr_class, 0, 0)
feature = Vec.max(V, nr_piece) # Add bias
for j in range(nr_class): for i in range(nr_todo):
scores[j] += feature * W[j] VecVec.add_i(&scores[i*nr_class],
W += nr_class hb, 1., nr_class)
V += nr_piece # Validate actions, argmax, take action.
for i in range(nr_class): for i in range(nr_todo):
scores[i] += hb[i] state = states[i]
self.moves.set_valid(is_valid, state) self.moves.set_valid(is_valid, state)
guess = arg_max_if_valid(scores, is_valid, nr_class) guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
action = self.moves.c[guess] action = self.moves.c[guess]
action.do(state, action.label) action.do(state, action.label)
state.push_hist(guess) state.push_hist(guess)
if not state.is_final():
unfinished.push_back(state)
for i in range(unfinished.size()):
states[i] = unfinished[i]
nr_todo = unfinished.size()
unfinished.clear()
free(token_ids) free(token_ids)
free(is_valid) free(is_valid)
free(vectors) free(vectors)
free(unmaxed)
free(scores) free(scores)
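
Conceptually, the rewritten loop scores all unfinished states at once: a per-state maxout over the feature pieces, then one matrix multiply against the hidden-to-output weights. A numpy sketch of that shape bookkeeping (sizes are arbitrary):

import numpy as np

nr_task, nr_hidden, nr_piece, nr_class = 4, 8, 2, 5
unmaxed = np.random.rand(nr_task, nr_hidden, nr_piece)  # summed features + bias
vectors = unmaxed.max(axis=-1)                          # maxout per hidden unit
hW = np.random.rand(nr_hidden, nr_class)
hb = np.random.rand(nr_class)
scores = vectors @ hW + hb                              # single GEMM for the batch
print(scores.shape)                                     # (4, 5) == (nr_task, nr_class)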
def beam_parse(self, docs, int beam_width=3, float beam_density=0.001): def beam_parse(self, docs, int beam_width=3, float beam_density=0.001,
float drop=0.):
cdef Beam beam cdef Beam beam
cdef np.ndarray scores cdef np.ndarray scores
cdef Doc doc cdef Doc doc
cdef int nr_class = self.moves.n_moves cdef int nr_class = self.moves.n_moves
cuda_stream = util.get_cuda_stream() cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model( (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
docs, cuda_stream, 0.0) docs, cuda_stream, drop)
cdef int offset = 0 cdef int offset = 0
cdef int j = 0 cdef int j = 0
cdef int k cdef int k
@ -524,8 +532,8 @@ cdef class Parser:
n_states += 1 n_states += 1
if n_states == 0: if n_states == 0:
break break
vectors = state2vec(token_ids[:n_states]) vectors, _ = state2vec.begin_update(token_ids[:n_states], drop)
scores = vec2scores(vectors) scores, _ = vec2scores.begin_update(vectors, drop=drop)
c_scores = <float*>scores.data c_scores = <float*>scores.data
for beam in todo: for beam in todo:
for i in range(beam.size): for i in range(beam.size):
@ -556,7 +564,10 @@ cdef class Parser:
for multitask in self._multitasks: for multitask in self._multitasks:
multitask.update(docs, golds, drop=drop, sgd=sgd) multitask.update(docs, golds, drop=drop, sgd=sgd)
cuda_stream = util.get_cuda_stream() cuda_stream = util.get_cuda_stream()
states, golds, max_steps = self._init_gold_batch(docs, golds) # Chop sequences into lengths of this many transitions, to make the
# batch a uniform length.
cut_gold = numpy.random.choice(range(20, 100))
states, golds, max_steps = self._init_gold_batch(docs, golds, max_length=cut_gold)
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
drop) drop)
todo = [(s, g) for (s, g) in zip(states, golds) todo = [(s, g) for (s, g) in zip(states, golds)
@ -659,8 +670,7 @@ cdef class Parser:
for beam in beams: for beam in beams:
_cleanup(beam) _cleanup(beam)
def _init_gold_batch(self, whole_docs, whole_golds, min_length=5, max_length=500):
def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long """Make a square batch, of length equal to the shortest doc. A long
doc will get multiple states. Let's say we have a doc of length 2*N, doc will get multiple states. Let's say we have a doc of length 2*N,
where N is the shortest doc. We'll make two states, one representing where N is the shortest doc. We'll make two states, one representing
@ -669,7 +679,7 @@ cdef class Parser:
StateClass state StateClass state
Transition action Transition action
whole_states = self.moves.init_batch(whole_docs) whole_states = self.moves.init_batch(whole_docs)
max_length = max(5, min(50, min([len(doc) for doc in whole_docs]))) max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs])))
max_moves = 0 max_moves = 0
states = [] states = []
golds = [] golds = []
@ -791,6 +801,11 @@ cdef class Parser:
for doc in docs: for doc in docs:
hook(doc) hook(doc)
@property
def labels(self):
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
return class_names
@property @property
def tok2vec(self): def tok2vec(self):
'''Return the embedding and convolutional layer of the model.''' '''Return the embedding and convolutional layer of the model.'''
@ -809,9 +824,6 @@ cdef class Parser:
for action in self.moves.action_types: for action in self.moves.action_types:
added = self.moves.add_action(action, label) added = self.moves.add_action(action, label)
if added: if added:
# Important that the labels be stored as a list! We need the
# order, or the model goes out of synch
self.cfg.setdefault('extra_labels', []).append(label)
resized = True resized = True
if self.model not in (True, False, None) and resized: if self.model not in (True, False, None) and resized:
# Weights are stored in (nr_out, nr_in) format, so we're basically # Weights are stored in (nr_out, nr_in) format, so we're basically

View File

@ -9,7 +9,7 @@ from __future__ import unicode_literals
from copy import copy from copy import copy
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc, set_children_from_heads
DELIMITER = '||' DELIMITER = '||'
@ -74,7 +74,21 @@ def decompose(label):
def is_decorated(label): def is_decorated(label):
return label.find(DELIMITER) != -1 return DELIMITER in label
def count_decorated_labels(gold_tuples):
freqs = {}
for raw_text, sents in gold_tuples:
for (ids, words, tags, heads, labels, iob), ctnts in sents:
proj_heads, deco_labels = projectivize(heads, labels)
# set the label to ROOT for each root dependent
deco_labels = ['ROOT' if head == i else deco_labels[i]
for i, head in enumerate(proj_heads)]
# count label frequencies
for label in deco_labels:
if is_decorated(label):
freqs[label] = freqs.get(label, 0) + 1
return freqs
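
A compact illustration of what counts as a "decorated" label here (the '||' delimiter is added by projectivize() when lifting non-projective arcs; the label list is invented):

DELIMITER = '||'
labels = ['nsubj', 'dobj||prep', 'ROOT', 'dobj||prep']
freqs = {}
for label in labels:
    if DELIMITER in label:          # decorated, i.e. pseudo-projective
        freqs[label] = freqs.get(label, 0) + 1
print(freqs)                        # {'dobj||prep': 2}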
def preprocess_training_data(gold_tuples, label_freq_cutoff=30): def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
@ -124,8 +138,9 @@ cpdef deprojectivize(Doc doc):
if DELIMITER in label: if DELIMITER in label:
new_label, head_label = label.split(DELIMITER) new_label, head_label = label.split(DELIMITER)
new_head = _find_new_head(doc[i], head_label) new_head = _find_new_head(doc[i], head_label)
doc[i].head = new_head doc.c[i].head = new_head.i - i
doc.c[i].dep = doc.vocab.strings.add(new_label) doc.c[i].dep = doc.vocab.strings.add(new_label)
set_children_from_heads(doc.c, doc.length)
return doc return doc
@ -191,9 +206,12 @@ def _filter_labels(gold_tuples, cutoff, freqs):
for raw_text, sents in gold_tuples: for raw_text, sents in gold_tuples:
filtered_sents = [] filtered_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents: for (ids, words, tags, heads, labels, iob), ctnts in sents:
filtered_labels = [decompose(label)[0] filtered_labels = []
if freqs.get(label, cutoff) < cutoff for label in labels:
else label for label in labels] if is_decorated(label) and freqs.get(label, 0) < cutoff:
filtered_labels.append(decompose(label)[0])
else:
filtered_labels.append(label)
filtered_sents.append( filtered_sents.append(
((ids, words, tags, heads, filtered_labels, iob), ctnts)) ((ids, words, tags, heads, filtered_labels, iob), ctnts))
filtered.append((raw_text, filtered_sents)) filtered.append((raw_text, filtered_sents))

View File

@ -42,6 +42,7 @@ cdef class TransitionSystem:
cdef public attr_t root_label cdef public attr_t root_label
cdef public freqs cdef public freqs
cdef init_state_t init_beam_state cdef init_state_t init_beam_state
cdef public object labels
cdef int initialize_state(self, StateC* state) nogil cdef int initialize_state(self, StateC* state) nogil
cdef int finalize_state(self, StateC* state) nogil cdef int finalize_state(self, StateC* state) nogil

View File

@ -5,7 +5,7 @@ from __future__ import unicode_literals
from cpython.ref cimport Py_INCREF from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from collections import OrderedDict from collections import OrderedDict, Counter
import ujson import ujson
from ..structs cimport TokenC from ..structs cimport TokenC
@ -28,7 +28,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef class TransitionSystem: cdef class TransitionSystem:
def __init__(self, StringStore string_table, labels_by_action): def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
self.mem = Pool() self.mem = Pool()
self.strings = string_table self.strings = string_table
self.n_moves = 0 self.n_moves = 0
@ -36,21 +36,14 @@ cdef class TransitionSystem:
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition)) self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
for action, label_strs in labels_by_action.items(): self.labels = {}
for label_str in label_strs: if labels_by_action:
self.add_action(int(action), label_str) self.initialize_actions(labels_by_action, min_freq=min_freq)
self.root_label = self.strings.add('ROOT') self.root_label = self.strings.add('ROOT')
self.init_beam_state = _init_state self.init_beam_state = _init_state
def __reduce__(self): def __reduce__(self):
labels_by_action = OrderedDict() return (self.__class__, (self.strings, self.labels), None, None)
cdef Transition t
for trans in self.c[:self.n_moves]:
label_str = self.strings[trans.label]
labels_by_action.setdefault(trans.move, []).append(label_str)
return (self.__class__,
(self.strings, labels_by_action),
None, None)
def init_batch(self, docs): def init_batch(self, docs):
cdef StateClass state cdef StateClass state
@ -146,6 +139,22 @@ cdef class TransitionSystem:
act = self.c[clas] act = self.c[clas]
return self.move_name(act.move, act.label) return self.move_name(act.move, act.label)
def initialize_actions(self, labels_by_action, min_freq=None):
self.labels = {}
self.n_moves = 0
for action, label_freqs in sorted(labels_by_action.items()):
action = int(action)
# Make sure we take a copy here, and that we get a Counter
self.labels[action] = Counter()
# Have to be careful here: Sorting must be stable, or our model
# won't be read back in correctly.
sorted_labels = [(f, L) for L, f in label_freqs.items()]
sorted_labels.sort()
sorted_labels.reverse()
for freq, label_str in sorted_labels:
self.add_action(int(action), label_str)
self.labels[action][label_str] = freq
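
To make the ordering guarantee concrete: labels are added most-frequent first, with ties broken by the label string, so the class order is reproducible when a model is re-loaded. A minimal sketch:

from collections import Counter

label_freqs = Counter({'dobj': 3, 'nsubj': 5, 'amod': 3})
sorted_labels = [(freq, label) for label, freq in label_freqs.items()]
sorted_labels.sort()
sorted_labels.reverse()
print([label for freq, label in sorted_labels])   # ['nsubj', 'dobj', 'amod']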
def add_action(self, int action, label_name): def add_action(self, int action, label_name):
cdef attr_t label_id cdef attr_t label_id
if not isinstance(label_name, int) and \ if not isinstance(label_name, int) and \
@ -164,6 +173,14 @@ cdef class TransitionSystem:
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
assert self.c[self.n_moves].label == label_id assert self.c[self.n_moves].label == label_id
self.n_moves += 1 self.n_moves += 1
if self.labels.get(action, []):
new_freq = min(self.labels[action].values())
else:
self.labels[action] = Counter()
new_freq = -1
if new_freq > 0:
new_freq = 0
self.labels[action][label_name] = new_freq-1
return 1 return 1
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
@ -178,26 +195,18 @@ cdef class TransitionSystem:
def to_bytes(self, **exclude): def to_bytes(self, **exclude):
transitions = [] transitions = []
for trans in self.c[:self.n_moves]:
transitions.append({
'clas': trans.clas,
'move': trans.move,
'label': self.strings[trans.label],
'name': self.move_name(trans.move, trans.label)
})
serializers = { serializers = {
'transitions': lambda: json_dumps(transitions), 'moves': lambda: json_dumps(self.labels),
'strings': lambda: self.strings.to_bytes() 'strings': lambda: self.strings.to_bytes()
} }
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, **exclude):
transitions = [] labels = {}
deserializers = { deserializers = {
'transitions': lambda b: transitions.extend(ujson.loads(b)), 'moves': lambda b: labels.update(ujson.loads(b)),
'strings': lambda b: self.strings.from_bytes(b) 'strings': lambda b: self.strings.from_bytes(b)
} }
msg = util.from_bytes(bytes_data, deserializers, exclude) msg = util.from_bytes(bytes_data, deserializers, exclude)
for trans in transitions: self.initialize_actions(labels)
self.add_action(trans['move'], trans['label'])
return self return self

View File

@ -19,6 +19,15 @@ def doc(en_tokenizer):
return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps) return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
@pytest.fixture
def doc_not_parsed(en_tokenizer):
text = "This is a sentence. This is another sentence. And a third."
tokens = en_tokenizer(text)
d = get_doc(tokens.vocab, [t.text for t in tokens])
d.is_parsed = False
return d
def test_spans_sent_spans(doc): def test_spans_sent_spans(doc):
sents = list(doc.sents) sents = list(doc.sents)
assert sents[0].start == 0 assert sents[0].start == 0
@ -34,6 +43,7 @@ def test_spans_root(doc):
assert span.root.text == 'sentence' assert span.root.text == 'sentence'
assert span.root.head.text == 'is' assert span.root.head.text == 'is'
def test_spans_string_fn(doc): def test_spans_string_fn(doc):
span = doc[0:4] span = doc[0:4]
assert len(span) == 4 assert len(span) == 4
@ -41,6 +51,7 @@ def test_spans_string_fn(doc):
assert span.upper_ == 'THIS IS A SENTENCE' assert span.upper_ == 'THIS IS A SENTENCE'
assert span.lower_ == 'this is a sentence' assert span.lower_ == 'this is a sentence'
def test_spans_root2(en_tokenizer): def test_spans_root2(en_tokenizer):
text = "through North and South Carolina" text = "through North and South Carolina"
heads = [0, 3, -1, -2, -4] heads = [0, 3, -1, -2, -4]
@ -49,12 +60,17 @@ def test_spans_root2(en_tokenizer):
assert doc[-2:].root.text == 'Carolina' assert doc[-2:].root.text == 'Carolina'
def test_spans_span_sent(doc): def test_spans_span_sent(doc, doc_not_parsed):
"""Test span.sent property""" """Test span.sent property"""
assert len(list(doc.sents)) assert len(list(doc.sents))
assert doc[:2].sent.root.text == 'is' assert doc[:2].sent.root.text == 'is'
assert doc[:2].sent.text == 'This is a sentence .' assert doc[:2].sent.text == 'This is a sentence .'
assert doc[6:7].sent.root.left_edge.text == 'This' assert doc[6:7].sent.root.left_edge.text == 'This'
# test on manual sbd
doc_not_parsed[0].is_sent_start = True
doc_not_parsed[5].is_sent_start = True
assert doc_not_parsed[1:3].sent == doc_not_parsed[0:5]
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
def test_spans_lca_matrix(en_tokenizer): def test_spans_lca_matrix(en_tokenizer):
@ -129,7 +145,7 @@ def test_span_to_array(doc):
assert arr[0, 1] == len(span[0]) assert arr[0, 1] == len(span[0])
def test_span_as_doc(doc): #def test_span_as_doc(doc):
span = doc[4:10] # span = doc[4:10]
span_doc = span.as_doc() # span_doc = span.as_doc()
assert span.text == span_doc.text.strip() # assert span.text == span_doc.text.strip()

View File

@ -1,36 +0,0 @@
# coding: utf-8
"""Find the min-cost alignment between two tokenizations"""
from __future__ import unicode_literals
from ...gold import _min_edit_path as min_edit_path
from ...gold import align
import pytest
@pytest.mark.parametrize('cand,gold,path', [
(["U.S", ".", "policy"], ["U.S.", "policy"], (0, 'MDM')),
(["U.N", ".", "policy"], ["U.S.", "policy"], (1, 'SDM')),
(["The", "cat", "sat", "down"], ["The", "cat", "sat", "down"], (0, 'MMMM')),
(["cat", "sat", "down"], ["The", "cat", "sat", "down"], (1, 'IMMM')),
(["The", "cat", "down"], ["The", "cat", "sat", "down"], (1, 'MMIM')),
(["The", "cat", "sag", "down"], ["The", "cat", "sat", "down"], (1, 'MMSM'))])
def test_gold_lev_align_edit_path(cand, gold, path):
assert min_edit_path(cand, gold) == path
def test_gold_lev_align_edit_path2():
cand = ["your", "stuff"]
gold = ["you", "r", "stuff"]
assert min_edit_path(cand, gold) in [(2, 'ISM'), (2, 'SIM')]
@pytest.mark.parametrize('cand,gold,result', [
(["U.S", ".", "policy"], ["U.S.", "policy"], [0, None, 1]),
(["your", "stuff"], ["you", "r", "stuff"], [None, 2]),
(["i", "like", "2", "guys", " ", "well", "id", "just", "come", "straight", "out"],
["i", "like", "2", "guys", "well", "i", "d", "just", "come", "straight", "out"],
[0, 1, 2, 3, None, 4, None, 7, 8, 9, 10])])
def test_gold_lev_align(cand, gold, result):
assert align(cand, gold) == result

View File

@ -2,9 +2,9 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from ....parts_of_speech import SPACE from ....parts_of_speech import SPACE
from ....compat import unicode_
from ...util import get_doc from ...util import get_doc
import six
import pytest import pytest
@ -24,8 +24,8 @@ def test_tag_names(EN):
text = "I ate pizzas with anchovies." text = "I ate pizzas with anchovies."
doc = EN(text, disable=['parser']) doc = EN(text, disable=['parser'])
assert type(doc[2].pos) == int assert type(doc[2].pos) == int
assert isinstance(doc[2].pos_, six.text_type) assert isinstance(doc[2].pos_, unicode_)
assert isinstance(doc[2].dep_, six.text_type) assert isinstance(doc[2].dep_, unicode_)
assert doc[2].tag_ == u'NNS' assert doc[2].tag_ == u'NNS'

View File

@ -0,0 +1,75 @@
from __future__ import unicode_literals
from ...vocab import Vocab
from ...pipeline import DependencyParser
from ...tokens import Doc
from ...gold import GoldParse
from ...syntax.nonproj import projectivize
annot_tuples = [
(0, 'When', 'WRB', 11, 'advmod', 'O'),
(1, 'Walter', 'NNP', 2, 'compound', 'B-PERSON'),
(2, 'Rodgers', 'NNP', 11, 'nsubj', 'L-PERSON'),
(3, ',', ',', 2, 'punct', 'O'),
(4, 'our', 'PRP$', 6, 'poss', 'O'),
(5, 'embedded', 'VBN', 6, 'amod', 'O'),
(6, 'reporter', 'NN', 2, 'appos', 'O'),
(7, 'with', 'IN', 6, 'prep', 'O'),
(8, 'the', 'DT', 10, 'det', 'B-ORG'),
(9, '3rd', 'NNP', 10, 'compound', 'I-ORG'),
(10, 'Cavalry', 'NNP', 7, 'pobj', 'L-ORG'),
(11, 'says', 'VBZ', 44, 'advcl', 'O'),
(12, 'three', 'CD', 13, 'nummod', 'U-CARDINAL'),
(13, 'battalions', 'NNS', 16, 'nsubj', 'O'),
(14, 'of', 'IN', 13, 'prep', 'O'),
(15, 'troops', 'NNS', 14, 'pobj', 'O'),
(16, 'are', 'VBP', 11, 'ccomp', 'O'),
(17, 'on', 'IN', 16, 'prep', 'O'),
(18, 'the', 'DT', 19, 'det', 'O'),
(19, 'ground', 'NN', 17, 'pobj', 'O'),
(20, ',', ',', 17, 'punct', 'O'),
(21, 'inside', 'IN', 17, 'prep', 'O'),
(22, 'Baghdad', 'NNP', 21, 'pobj', 'U-GPE'),
(23, 'itself', 'PRP', 22, 'appos', 'O'),
(24, ',', ',', 16, 'punct', 'O'),
(25, 'have', 'VBP', 26, 'aux', 'O'),
(26, 'taken', 'VBN', 16, 'dep', 'O'),
(27, 'up', 'RP', 26, 'prt', 'O'),
(28, 'positions', 'NNS', 26, 'dobj', 'O'),
(29, 'they', 'PRP', 31, 'nsubj', 'O'),
(30, "'re", 'VBP', 31, 'aux', 'O'),
(31, 'going', 'VBG', 26, 'parataxis', 'O'),
(32, 'to', 'TO', 33, 'aux', 'O'),
(33, 'spend', 'VB', 31, 'xcomp', 'O'),
(34, 'the', 'DT', 35, 'det', 'B-TIME'),
(35, 'night', 'NN', 33, 'dobj', 'L-TIME'),
(36, 'there', 'RB', 33, 'advmod', 'O'),
(37, 'presumably', 'RB', 33, 'advmod', 'O'),
(38, ',', ',', 44, 'punct', 'O'),
(39, 'how', 'WRB', 40, 'advmod', 'O'),
(40, 'many', 'JJ', 41, 'amod', 'O'),
(41, 'soldiers', 'NNS', 44, 'pobj', 'O'),
(42, 'are', 'VBP', 44, 'aux', 'O'),
(43, 'we', 'PRP', 44, 'nsubj', 'O'),
(44, 'talking', 'VBG', 44, 'ROOT', 'O'),
(45, 'about', 'IN', 44, 'prep', 'O'),
(46, 'right', 'RB', 47, 'advmod', 'O'),
(47, 'now', 'RB', 44, 'advmod', 'O'),
(48, '?', '.', 44, 'punct', 'O')]
def test_get_oracle_actions():
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
parser = DependencyParser(doc.vocab)
parser.moves.add_action(0, '')
parser.moves.add_action(1, '')
parser.moves.add_action(1, '')
parser.moves.add_action(4, 'ROOT')
for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
if head > i:
parser.moves.add_action(2, dep)
elif head < i:
parser.moves.add_action(3, dep)
ids, words, tags, heads, deps, ents = zip(*annot_tuples)
heads, deps = projectivize(heads, deps)
gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
parser.moves.preprocess_gold(gold)
actions = parser.moves.get_oracle_sequence(doc, gold)

View File

@ -13,8 +13,8 @@ from ...vocab import Vocab
('a b', 0, 2), ('a b', 0, 2),
('a c', 0, 1), ('a c', 0, 1),
('a b c', 0, 2), ('a b c', 0, 2),
('a b b c', 0, 2), ('a b b c', 0, 3),
('a b b', 0, 2), ('a b b', 0, 3),
] ]
) )
def test_issue1450_matcher_end_zero_plus(string, start, end): def test_issue1450_matcher_end_zero_plus(string, start, end):
@ -54,5 +54,6 @@ def test_issue1450_matcher_end_zero_plus(string, start, end):
if start is None or end is None: if start is None or end is None:
assert matches == [] assert matches == []
assert matches[0][1] == start print(matches)
assert matches[0][2] == end assert matches[-1][1] == start
assert matches[-1][2] == end

View File

@ -0,0 +1,65 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from ...matcher import Matcher
import pytest
pattern1 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}]
pattern2 = [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}]
pattern3 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}]
pattern4 = [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
pattern5 = [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
re_pattern1 = 'AA*'
re_pattern2 = 'A*A'
re_pattern3 = 'AA'
re_pattern4 = 'BA*B'
re_pattern5 = 'B*A*B'
@pytest.fixture
def text():
return "(ABBAAAAAB)."
@pytest.fixture
def doc(en_tokenizer,text):
doc = en_tokenizer(' '.join(text))
return doc
@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern',[
(pattern1,re_pattern1),
(pattern2,re_pattern2),
(pattern3,re_pattern3),
(pattern4,re_pattern4),
(pattern5,re_pattern5)])
def test_greedy_matching(doc,text,pattern,re_pattern):
"""
Test that the greedy matching behavior of the * op
is consistent with other re implementations
"""
matcher = Matcher(doc.vocab)
matcher.add(re_pattern,None,pattern)
matches = matcher(doc)
re_matches = [m.span() for m in re.finditer(re_pattern,text)]
for match,re_match in zip(matches,re_matches):
assert match[1:]==re_match
@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern',[
(pattern1,re_pattern1),
(pattern2,re_pattern2),
(pattern3,re_pattern3),
(pattern4,re_pattern4),
(pattern5,re_pattern5)])
def test_match_consuming(doc,text,pattern,re_pattern):
"""
Test that matcher.__call__ consumes tokens on a match
similar to re.findall
"""
matcher = Matcher(doc.vocab)
matcher.add(re_pattern,None,pattern)
matches = matcher(doc)
re_matches = [m.span() for m in re.finditer(re_pattern,text)]
assert len(matches)==len(re_matches)

View File

@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
from ...lang.lex_attrs import is_stop
from ...lang.en.stop_words import STOP_WORDS
import pytest
@pytest.mark.parametrize('word', ['the'])
def test_lex_attrs_stop_words_case_sensitivity(word):
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)

View File

@ -6,7 +6,6 @@ from ...vocab import Vocab
from ...tokens import Doc from ...tokens import Doc
from ...matcher import Matcher from ...matcher import Matcher
@pytest.mark.xfail
def test_issue1945(): def test_issue1945():
text = "a a a" text = "a a a"
matcher = Matcher(Vocab()) matcher = Matcher(Vocab())

View File

@ -22,10 +22,9 @@ def test_basic_case():
assert end == 4 assert end == 4
@pytest.mark.xfail
def test_issue850(): def test_issue850():
"""The problem here is that the variable-length pattern matches the """The variable-length pattern matches the
succeeding token. We then don't handle the ambiguity correctly.""" succeeding token. Check we handle the ambiguity correctly."""
matcher = Matcher(Vocab( matcher = Matcher(Vocab(
lex_attr_getters={LOWER: lambda string: string.lower()})) lex_attr_getters={LOWER: lambda string: string.lower()}))
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True) IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)

66
spacy/tests/test_align.py Normal file
View File

@ -0,0 +1,66 @@
from __future__ import unicode_literals
import pytest
from .._align import align, multi_align
@pytest.mark.parametrize('string1,string2,cost', [
('hello', 'hell', 1),
('rat', 'cat', 1),
('rat', 'rat', 0),
('rat', 'catsie', 4),
('t', 'catsie', 5),
])
def test_align_costs(string1, string2, cost):
output_cost, i2j, j2i, matrix = align(string1, string2)
assert output_cost == cost
@pytest.mark.parametrize('string1,string2,i2j', [
('hello', 'hell', [0,1,2,3,-1]),
('rat', 'cat', [0,1,2]),
('rat', 'rat', [0,1,2]),
('rat', 'catsie', [0,1,2]),
('t', 'catsie', [2]),
])
def test_align_i2j(string1, string2, i2j):
output_cost, output_i2j, j2i, matrix = align(string1, string2)
assert list(output_i2j) == i2j
@pytest.mark.parametrize('string1,string2,j2i', [
('hello', 'hell', [0,1,2,3]),
('rat', 'cat', [0,1,2]),
('rat', 'rat', [0,1,2]),
('rat', 'catsie', [0,1,2, -1, -1, -1]),
('t', 'catsie', [-1, -1, 0, -1, -1, -1]),
])
def test_align_j2i(string1, string2, j2i):
output_cost, output_i2j, output_j2i, matrix = align(string1, string2)
assert list(output_j2i) == j2i
def test_align_strings():
words1 = ['hello', 'this', 'is', 'test!']
words2 = ['hellothis', 'is', 'test', '!']
cost, i2j, j2i, matrix = align(words1, words2)
assert cost == 4
assert list(i2j) == [-1, -1, 1, -1]
assert list(j2i) == [-1, 2, -1, -1]
def test_align_many_to_one():
words1 = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
words2 = ['ab', 'bc', 'e', 'fg', 'h']
cost, i2j, j2i, matrix = align(words1, words2)
assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4]
lengths1 = [len(w) for w in words1]
lengths2 = [len(w) for w in words2]
i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2)
assert i2j_multi[0] == 0
assert i2j_multi[1] == 0
assert i2j_multi[2] == 1
assert i2j_multi[3] == 1
assert i2j_multi[3] == 1
assert i2j_multi[5] == 3
assert i2j_multi[6] == 3
assert j2i_multi[0] == 1
assert j2i_multi[1] == 3

View File

@ -3,12 +3,17 @@ from __future__ import unicode_literals
from ..matcher import Matcher, PhraseMatcher from ..matcher import Matcher, PhraseMatcher
from .util import get_doc from .util import get_doc
from ..util import get_lang_class
from ..tokens import Doc from ..tokens import Doc
import pytest import pytest
@pytest.fixture(scope="session")
def en_vocab():
return get_lang_class('en').Defaults.create_vocab()
@pytest.fixture
@pytest.fixture(scope="session")
def matcher(en_vocab): def matcher(en_vocab):
rules = { rules = {
'JS': [[{'ORTH': 'JavaScript'}]], 'JS': [[{'ORTH': 'JavaScript'}]],
@ -21,187 +26,196 @@ def matcher(en_vocab):
return matcher return matcher
def test_matcher_from_api_docs(en_vocab): #def test_matcher_from_api_docs(en_vocab):
matcher = Matcher(en_vocab) # matcher = Matcher(en_vocab)
pattern = [{'ORTH': 'test'}] # pattern = [{'ORTH': 'test'}]
assert len(matcher) == 0 # assert len(matcher) == 0
matcher.add('Rule', None, pattern) # matcher.add('Rule', None, pattern)
assert len(matcher) == 1 # assert len(matcher) == 1
matcher.remove('Rule') # matcher.remove('Rule')
assert 'Rule' not in matcher # assert 'Rule' not in matcher
matcher.add('Rule', None, pattern) # matcher.add('Rule', None, pattern)
assert 'Rule' in matcher # assert 'Rule' in matcher
on_match, patterns = matcher.get('Rule') # on_match, patterns = matcher.get('Rule')
assert len(patterns[0]) # assert len(patterns[0])
#
#
#def test_matcher_from_usage_docs(en_vocab):
# text = "Wow 😀 This is really cool! 😂 😂"
# doc = get_doc(en_vocab, words=text.split(' '))
# pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']
# pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
#
# def label_sentiment(matcher, doc, i, matches):
# match_id, start, end = matches[i]
# if doc.vocab.strings[match_id] == 'HAPPY':
# doc.sentiment += 0.1
# span = doc[start : end]
# token = span.merge()
# token.vocab[token.text].norm_ = 'happy emoji'
#
# matcher = Matcher(en_vocab)
# matcher.add('HAPPY', label_sentiment, *pos_patterns)
# matches = matcher(doc)
# assert doc.sentiment != 0
# assert doc[1].norm_ == 'happy emoji'
def test_matcher_from_usage_docs(en_vocab): #@pytest.mark.parametrize('words', [["Some", "words"]])
text = "Wow 😀 This is really cool! 😂 😂" #def test_matcher_init(en_vocab, words):
doc = get_doc(en_vocab, words=text.split(' ')) # matcher = Matcher(en_vocab)
pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍'] # doc = get_doc(en_vocab, words)
pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji] # assert len(matcher) == 0
# assert matcher(doc) == []
def label_sentiment(matcher, doc, i, matches): #
match_id, start, end = matches[i] #
if doc.vocab.strings[match_id] == 'HAPPY': #def test_matcher_contains(matcher):
doc.sentiment += 0.1 # matcher.add('TEST', None, [{'ORTH': 'test'}])
span = doc[start : end] # assert 'TEST' in matcher
token = span.merge() # assert 'TEST2' not in matcher
token.vocab[token.text].norm_ = 'happy emoji' #
#
matcher = Matcher(en_vocab) #def test_matcher_no_match(matcher):
matcher.add('HAPPY', label_sentiment, *pos_patterns) # words = ["I", "like", "cheese", "."]
matches = matcher(doc) # doc = get_doc(matcher.vocab, words)
assert doc.sentiment != 0 # assert matcher(doc) == []
assert doc[1].norm_ == 'happy emoji' #
#
#def test_matcher_compile(en_vocab):
@pytest.mark.parametrize('words', [["Some", "words"]]) # rules = {
def test_matcher_init(en_vocab, words): # 'JS': [[{'ORTH': 'JavaScript'}]],
matcher = Matcher(en_vocab) # 'GoogleNow': [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]],
doc = get_doc(en_vocab, words) # 'Java': [[{'LOWER': 'java'}]]
assert len(matcher) == 0 # }
assert matcher(doc) == [] # matcher = Matcher(en_vocab)
# for key, patterns in rules.items():
# matcher.add(key, None, *patterns)
def test_matcher_contains(matcher): # assert len(matcher) == 3
matcher.add('TEST', None, [{'ORTH': 'test'}]) #
assert 'TEST' in matcher #
assert 'TEST2' not in matcher #def test_matcher_match_start(matcher):
# words = ["JavaScript", "is", "good"]
# doc = get_doc(matcher.vocab, words)
def test_matcher_no_match(matcher): # assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)]
words = ["I", "like", "cheese", "."] #
doc = get_doc(matcher.vocab, words) #
assert matcher(doc) == [] #def test_matcher_match_end(matcher):
# words = ["I", "like", "java"]
# doc = get_doc(matcher.vocab, words)
def test_matcher_compile(matcher): # assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)]
assert len(matcher) == 3 #
#
#def test_matcher_match_middle(matcher):
def test_matcher_match_start(matcher): # words = ["I", "like", "Google", "Now", "best"]
words = ["JavaScript", "is", "good"] # doc = get_doc(matcher.vocab, words)
doc = get_doc(matcher.vocab, words) # assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)]
assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)] #
#
#def test_matcher_match_multi(matcher):
def test_matcher_match_end(matcher): # words = ["I", "like", "Google", "Now", "and", "java", "best"]
words = ["I", "like", "java"] # doc = get_doc(matcher.vocab, words)
doc = get_doc(matcher.vocab, words) # assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4),
assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)] # (doc.vocab.strings['Java'], 5, 6)]
#
#
def test_matcher_match_middle(matcher): #def test_matcher_empty_dict(en_vocab):
words = ["I", "like", "Google", "Now", "best"] # '''Test matcher allows empty token specs, meaning match on any token.'''
doc = get_doc(matcher.vocab, words) # matcher = Matcher(en_vocab)
assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)] # abc = ["a", "b", "c"]
# doc = get_doc(matcher.vocab, abc)
# matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
def test_matcher_match_multi(matcher): # matches = matcher(doc)
words = ["I", "like", "Google", "Now", "and", "java", "best"] # assert len(matches) == 1
doc = get_doc(matcher.vocab, words) # assert matches[0][1:] == (0, 3)
assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4), # matcher = Matcher(en_vocab)
(doc.vocab.strings['Java'], 5, 6)] # matcher.add('A.', None, [{'ORTH': 'a'}, {}])
# matches = matcher(doc)
# assert matches[0][1:] == (0, 2)
def test_matcher_empty_dict(en_vocab): #
'''Test matcher allows empty token specs, meaning match on any token.''' #
matcher = Matcher(en_vocab) #def test_matcher_operator_shadow(en_vocab):
abc = ["a", "b", "c"] # matcher = Matcher(en_vocab)
doc = get_doc(matcher.vocab, abc) # abc = ["a", "b", "c"]
matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}]) # doc = get_doc(matcher.vocab, abc)
matches = matcher(doc) # matcher.add('A.C', None, [{'ORTH': 'a'},
assert len(matches) == 1 # {"IS_ALPHA": True, "OP": "+"},
assert matches[0][1:] == (0, 3) # {'ORTH': 'c'}])
matcher = Matcher(en_vocab) # matches = matcher(doc)
matcher.add('A.', None, [{'ORTH': 'a'}, {}]) # assert len(matches) == 1
matches = matcher(doc) # assert matches[0][1:] == (0, 3)
assert matches[0][1:] == (0, 2) #
#
#def test_matcher_phrase_matcher(en_vocab):
def test_matcher_operator_shadow(en_vocab): # words = ["Google", "Now"]
matcher = Matcher(en_vocab) # doc = get_doc(en_vocab, words)
abc = ["a", "b", "c"] # matcher = PhraseMatcher(en_vocab)
doc = get_doc(matcher.vocab, abc) # matcher.add('COMPANY', None, doc)
matcher.add('A.C', None, [{'ORTH': 'a'}, # words = ["I", "like", "Google", "Now", "best"]
{"IS_ALPHA": True, "OP": "+"}, # doc = get_doc(en_vocab, words)
{'ORTH': 'c'}]) # assert len(matcher(doc)) == 1
matches = matcher(doc) #
assert len(matches) == 1 #
assert matches[0][1:] == (0, 3) #def test_phrase_matcher_length(en_vocab):
# matcher = PhraseMatcher(en_vocab)
# assert len(matcher) == 0
def test_matcher_phrase_matcher(en_vocab): # matcher.add('TEST', None, get_doc(en_vocab, ['test']))
words = ["Google", "Now"] # assert len(matcher) == 1
doc = get_doc(en_vocab, words) # matcher.add('TEST2', None, get_doc(en_vocab, ['test2']))
matcher = PhraseMatcher(en_vocab) # assert len(matcher) == 2
matcher.add('COMPANY', None, doc) #
words = ["I", "like", "Google", "Now", "best"] #
doc = get_doc(en_vocab, words) #def test_phrase_matcher_contains(en_vocab):
assert len(matcher(doc)) == 1 # matcher = PhraseMatcher(en_vocab)
# matcher.add('TEST', None, get_doc(en_vocab, ['test']))
# assert 'TEST' in matcher
def test_phrase_matcher_length(en_vocab): # assert 'TEST2' not in matcher
matcher = PhraseMatcher(en_vocab) #
assert len(matcher) == 0 #
matcher.add('TEST', None, get_doc(en_vocab, ['test'])) #def test_matcher_match_zero(matcher):
assert len(matcher) == 1 # words1 = 'He said , " some words " ...'.split()
matcher.add('TEST2', None, get_doc(en_vocab, ['test2'])) # words2 = 'He said , " some three words " ...'.split()
assert len(matcher) == 2 # pattern1 = [{'ORTH': '"'},
# {'OP': '!', 'IS_PUNCT': True},
# {'OP': '!', 'IS_PUNCT': True},
def test_phrase_matcher_contains(en_vocab): # {'ORTH': '"'}]
matcher = PhraseMatcher(en_vocab) # pattern2 = [{'ORTH': '"'},
matcher.add('TEST', None, get_doc(en_vocab, ['test'])) # {'IS_PUNCT': True},
assert 'TEST' in matcher # {'IS_PUNCT': True},
assert 'TEST2' not in matcher # {'IS_PUNCT': True},
# {'ORTH': '"'}]
#
def test_matcher_match_zero(matcher): # matcher.add('Quote', None, pattern1)
words1 = 'He said , " some words " ...'.split() # doc = get_doc(matcher.vocab, words1)
words2 = 'He said , " some three words " ...'.split() # assert len(matcher(doc)) == 1
pattern1 = [{'ORTH': '"'}, #
{'OP': '!', 'IS_PUNCT': True}, # doc = get_doc(matcher.vocab, words2)
{'OP': '!', 'IS_PUNCT': True}, # assert len(matcher(doc)) == 0
{'ORTH': '"'}] # matcher.add('Quote', None, pattern2)
pattern2 = [{'ORTH': '"'}, # assert len(matcher(doc)) == 0
{'IS_PUNCT': True}, #
{'IS_PUNCT': True}, #
{'IS_PUNCT': True}, #def test_matcher_match_zero_plus(matcher):
{'ORTH': '"'}] # words = 'He said , " some words " ...'.split()
# pattern = [{'ORTH': '"'},
matcher.add('Quote', None, pattern1) # {'OP': '*', 'IS_PUNCT': False},
doc = get_doc(matcher.vocab, words1) # {'ORTH': '"'}]
assert len(matcher(doc)) == 1 # matcher = Matcher(matcher.vocab)
# matcher.add('Quote', None, pattern)
doc = get_doc(matcher.vocab, words2) # doc = get_doc(matcher.vocab, words)
assert len(matcher(doc)) == 0 # assert len(matcher(doc)) == 1
matcher.add('Quote', None, pattern2) #
assert len(matcher(doc)) == 0 #
#def test_matcher_match_one_plus(matcher):
# control = Matcher(matcher.vocab)
def test_matcher_match_zero_plus(matcher): # control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
words = 'He said , " some words " ...'.split() # doc = get_doc(control.vocab, ['Philippe', 'Philippe'])
pattern = [{'ORTH': '"'}, # m = control(doc)
{'OP': '*', 'IS_PUNCT': False}, # assert len(m) == 2
{'ORTH': '"'}] # matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
matcher.add('Quote', None, pattern) # {'ORTH': 'Philippe', 'OP': '+'}])
doc = get_doc(matcher.vocab, words) # m = matcher(doc)
assert len(matcher(doc)) == 1 # assert len(m) == 1
#
def test_matcher_match_one_plus(matcher):
control = Matcher(matcher.vocab)
control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
doc = get_doc(control.vocab, ['Philippe', 'Philippe'])
m = control(doc)
assert len(m) == 2
matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
{'ORTH': 'Philippe', 'OP': '+'}])
m = matcher(doc)
assert len(m) == 1
def test_operator_combos(matcher): def test_operator_combos(matcher):
cases = [ cases = [
@ -252,9 +266,8 @@ def test_matcher_end_zero_plus(matcher):
) )
nlp = lambda string: Doc(matcher.vocab, words=string.split()) nlp = lambda string: Doc(matcher.vocab, words=string.split())
assert len(matcher(nlp(u'a'))) == 1 assert len(matcher(nlp(u'a'))) == 1
assert len(matcher(nlp(u'a b'))) == 1 assert len(matcher(nlp(u'a b'))) == 2
assert len(matcher(nlp(u'a b'))) == 1
assert len(matcher(nlp(u'a c'))) == 1 assert len(matcher(nlp(u'a c'))) == 1
assert len(matcher(nlp(u'a b c'))) == 1 assert len(matcher(nlp(u'a b c'))) == 2
assert len(matcher(nlp(u'a b b c'))) == 1 assert len(matcher(nlp(u'a b b c'))) == 3
assert len(matcher(nlp(u'a b b'))) == 1 assert len(matcher(nlp(u'a b b'))) == 3

View File

@ -0,0 +1,44 @@
from __future__ import unicode_literals
import random
import numpy.random
from ..pipeline import TextCategorizer
from ..lang.en import English
from ..vocab import Vocab
from ..tokens import Doc
from ..gold import GoldParse
def test_textcat_learns_multilabel():
random.seed(0)
numpy.random.seed(0)
docs = []
nlp = English()
vocab = nlp.vocab
letters = ['a', 'b', 'c']
for w1 in letters:
for w2 in letters:
cats = {letter: float(w2==letter) for letter in letters}
docs.append((Doc(vocab, words=['d']*3 + [w1, w2] + ['d']*3), cats))
random.shuffle(docs)
model = TextCategorizer(vocab, width=8)
for letter in letters:
model.add_label(letter)
optimizer = model.begin_training()
for i in range(30):
losses = {}
Ys = [GoldParse(doc, cats=cats) for doc, cats in docs]
Xs = [doc for doc, cats in docs]
model.update(Xs, Ys, sgd=optimizer, losses=losses)
random.shuffle(docs)
for w1 in letters:
for w2 in letters:
doc = Doc(vocab, words=['d']*3 + [w1, w2] + ['d']*3)
truth = {letter: w2==letter for letter in letters}
model(doc)
for cat, score in doc.cats.items():
if not truth[cat]:
assert score < 0.5
else:
assert score > 0.5

View File

@ -19,6 +19,9 @@ ctypedef fused LexemeOrToken:
const_TokenC_ptr const_TokenC_ptr
cdef int set_children_from_heads(TokenC* tokens, int length) except -1
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2

View File

@ -186,6 +186,20 @@ cdef class Doc:
def _(self): def _(self):
return Underscore(Underscore.doc_extensions, self) return Underscore(Underscore.doc_extensions, self)
@property
def is_sentenced(self):
# Check if the document has sentence boundaries,
        # i.e. at least one token has sent_start set to -1 or 1
if 'sents' in self.user_hooks:
return True
if self.is_parsed:
return True
for i in range(self.length):
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
return True
else:
return False
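A short usage sketch for the property added above (hedged: it assumes a plain unparsed Doc and that the token-level sent_start setter referenced in the error message further down is available):

from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=['Hello', 'world', '.', 'Bye', '.'])
doc[3].sent_start = True    # mark a boundary by hand (assumed setter)
print(doc.is_sentenced)     # True once at least one token carries a boundary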
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a `Token` or `Span` object. """Get a `Token` or `Span` object.
@ -517,22 +531,16 @@ cdef class Doc:
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"] >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
""" """
def __get__(self): def __get__(self):
if 'sents' in self.user_hooks: if not self.is_sentenced:
yield from self.user_hooks['sents'](self)
return
cdef int i
if not self.is_parsed:
for i in range(1, self.length):
if self.c[i].sent_start != 0:
break
else:
raise ValueError( raise ValueError(
"Sentence boundaries unset. You can add the 'sentencizer' " "Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: " "component to the pipeline with: "
"nlp.add_pipe(nlp.create_pipe('sentencizer')) " "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
"Alternatively, add the dependency parser, or set " "Alternatively, add the dependency parser, or set "
"sentence boundaries by setting doc[i].sent_start") "sentence boundaries by setting doc[i].sent_start")
if 'sents' in self.user_hooks:
yield from self.user_hooks['sents'](self)
else:
start = 0 start = 0
for i in range(1, self.length): for i in range(1, self.length):
if self.c[i].sent_start == 1: if self.c[i].sent_start == 1:
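For reference, the minimal way to satisfy this check is the one the error message itself suggests (a sketch assuming a blank English pipeline):

import spacy

nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe('sentencizer'))
doc = nlp(u"This is a sentence. This is another one.")
print([sent.text for sent in doc.sents])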

View File

@ -285,16 +285,42 @@ cdef class Span:
def __get__(self): def __get__(self):
if 'sent' in self.doc.user_span_hooks: if 'sent' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sent'](self) return self.doc.user_span_hooks['sent'](self)
# This should raise if we're not parsed. # This should raise if we're not parsed
        # or doesn't have any sbd component :)
self.doc.sents self.doc.sents
# if doc is parsed we can use the deps to find the sentence
# otherwise we use the `sent_start` token attribute
cdef int n = 0 cdef int n = 0
cdef int i
if self.doc.is_parsed:
root = &self.doc.c[self.start] root = &self.doc.c[self.start]
n = 0
while root.head != 0: while root.head != 0:
root += root.head root += root.head
n += 1 n += 1
if n >= self.doc.length: if n >= self.doc.length:
raise RuntimeError raise RuntimeError
return self.doc[root.l_edge:root.r_edge + 1] return self.doc[root.l_edge:root.r_edge + 1]
elif self.doc.is_sentenced:
# find start of the sentence
start = self.start
while self.doc.c[start].sent_start != 1 and start > 0:
start += -1
# find end of the sentence
end = self.end
n = 0
while end < self.doc.length and self.doc.c[end].sent_start != 1:
end += 1
n += 1
if n >= self.doc.length:
break
#
return self.doc[start:end]
else:
raise ValueError(
"Access to sentence requires either the dependency parse "
"or sentence boundaries to be set by setting " +
"doc[i].is_sent_start = True")
property has_vector: property has_vector:
"""RETURNS (bool): Whether a word vector is associated with the object. """RETURNS (bool): Whether a word vector is associated with the object.

View File

@ -34,11 +34,11 @@ cdef class Token:
@classmethod @classmethod
def get_extension(cls, name): def get_extension(cls, name):
return Underscore.token_extensions.get(name) return Underscore.span_extensions.get(name)
@classmethod @classmethod
def has_extension(cls, name): def has_extension(cls, name):
return name in Underscore.token_extensions return name in Underscore.span_extensions
def __cinit__(self, Vocab vocab, Doc doc, int offset): def __cinit__(self, Vocab vocab, Doc doc, int offset):
"""Construct a `Token` object. """Construct a `Token` object.

View File

@ -442,6 +442,29 @@ def decaying(start, stop, decay):
nr_upd += 1 nr_upd += 1
def minibatch_by_words(items, size, count_words=len):
'''Create minibatches of a given number of words.'''
if isinstance(size, int):
size_ = itertools.repeat(size)
else:
size_ = size
items = iter(items)
while True:
batch_size = next(size_)
batch = []
while batch_size >= 0:
try:
doc, gold = next(items)
except StopIteration:
if batch:
yield batch
return
batch_size -= count_words(doc)
batch.append((doc, gold))
if batch:
yield batch
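A hedged usage sketch of the helper (assuming it is importable from spacy.util once the hunk above lands): batches are cut as soon as the running word count passes size, so the last document in a batch may push it over.

from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.util import minibatch_by_words   # assumed import path

vocab = Vocab()
docs = [Doc(vocab, words=['word'] * n) for n in (3, 5, 2, 8)]
train_data = [(doc, None) for doc in docs]  # gold can be any per-doc annotation object

for batch in minibatch_by_words(train_data, size=6):
    print(sum(len(doc) for doc, _ in batch))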
def itershuffle(iterable, bufsize=1000): def itershuffle(iterable, bufsize=1000):
"""Shuffle an iterator. This works by holding `bufsize` items back """Shuffle an iterator. This works by holding `bufsize` items back
and yielding them sometime later. Obviously, this is not unbiased and yielding them sometime later. Obviously, this is not unbiased
@ -457,7 +480,7 @@ def itershuffle(iterable, bufsize=1000):
try: try:
while True: while True:
for i in range(random.randint(1, bufsize-len(buf))): for i in range(random.randint(1, bufsize-len(buf))):
buf.append(iterable.next()) buf.append(next(iterable))
random.shuffle(buf) random.shuffle(buf)
for i in range(random.randint(1, bufsize)): for i in range(random.randint(1, bufsize)):
if buf: if buf:
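The buffering idea in the docstring can be sketched independently of the helper above (hypothetical buffered_shuffle, not spaCy's implementation):

import random

def buffered_shuffle(iterable, bufsize=1000):
    # Hold up to `bufsize` items back and emit them in random order.
    buf = []
    for item in iterable:
        buf.append(item)
        if len(buf) >= bufsize:
            yield buf.pop(random.randrange(len(buf)))
    random.shuffle(buf)
    for item in buf:
        yield item

print(list(buffered_shuffle(range(10), bufsize=4)))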

View File

@ -120,9 +120,6 @@ include ../_includes/_mixins
| A Practical Real-World Approach to Gaining Actionable Insights | A Practical Real-World Approach to Gaining Actionable Insights
| from your Data | from your Data
+card("Practical Machine Learning with Python", "", "Dipanjan Sarkar et al. (Apress, 2017)", "book")
| A Problem-Solver's Guide to Building Real-World Intelligent Systems
+section("notebooks") +section("notebooks")
+h(2, "notebooks") Jupyter notebooks +h(2, "notebooks") Jupyter notebooks