@ -32,7 +32,7 @@ test_script:
# Note that you must use the environment variable %PYTHON% to refer to
# the interpreter you're using - Appveyor does not do anything special
# to put the Python version you want to use on PATH.
- "%PYTHON%\\python.exe -m pytest spacy/"
- "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs"
# This step builds your wheels.
@ -0,0 +1,11 @@
command: "fab env clean make test wheel"
label: ":dizzy: :python:"
artifact_paths: "dist/*.whl"
- wait
- trigger: "spacy-train-from-wheel"
label: ":dizzy: :train:"
@ -182,7 +182,7 @@ If you've made a contribution to spaCy, you should fill in the
[spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that
your contribution can be used across the project. If you agree to be bound by
the terms of the agreement, fill in the [template](.github/CONTRIBUTOR_AGREEMENT.md)
and include it with your pull request, or submit it separately to
[`.github/contributors/`](/.github/contributors). The name of the file should be
your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
@ -0,0 +1,392 @@
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
.conllu format for development data, allowing the official scorer to be used.
from __future__ import unicode_literals
import plac
import tqdm
import attr
from pathlib import Path
import re
import sys
import json
import spacy
import spacy.util
from spacy.tokens import Token, Doc
from spacy.gold import GoldParse
from spacy.syntax.nonproj import projectivize
from collections import defaultdict, Counter
from timeit import default_timer as timer
from spacy.matcher import Matcher
import itertools
import random
import numpy.random
import cytoolz
import conll17_ud_eval
import spacy.lang.zh
import spacy.lang.ja
spacy.lang.zh.Chinese.Defaults.use_jieba = False
spacy.lang.ja.Japanese.Defaults.use_janome = False
def minibatch_by_words(items, size=5000):
if isinstance(size, int):
size_ = itertools.repeat(size)
size_ = size
items = iter(items)
while True:
batch_size = next(size_)
batch = []
while batch_size >= 0:
doc, gold = next(items)
except StopIteration:
if batch:
yield batch
batch_size -= len(doc)
batch.append((doc, gold))
if batch:
yield batch
# Data reading #
# Matches any run of whitespace. Written as a raw string so that the
# backslash reaches the regex engine instead of being parsed as an (invalid)
# Python string escape.
space_re = re.compile(r'\s+')


def split_text(text):
    '''Split raw text into paragraphs with normalised whitespace.

    Paragraphs are delimited by a blank line (two consecutive newlines).
    Each paragraph is stripped, and every internal run of whitespace
    (including single newlines and tabs) is collapsed to one space.

    text: The raw text to split.
    RETURNS: A list with one string per paragraph. An empty input yields
        a list holding a single empty string.
    '''
    return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
max_doc_length=None, limit=None):
'''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
include Doc objects created using nlp.make_doc and then aligned against
the gold-standard sequences. If oracle_segments=True, include Doc objects
created from the gold-standard segments. At least one must be True.'''
if not raw_text and not oracle_segments:
raise ValueError("At least one of raw_text or oracle_segments must be True")
paragraphs = split_text(text_file.read())
conllu = read_conllu(conllu_file)
# sd is spacy doc; cd is conllu doc
# cs is conllu sent, ct is conllu token
docs = []
golds = []
for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)):
sent_annots = []
for cs in cd:
sent = defaultdict(list)
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
if '.' in id_:
if '-' in id_:
id_ = int(id_)-1
head = int(head)-1 if head != '0' else id_
sent['deps'].append('ROOT' if dep == 'root' else dep)
sent['spaces'].append(space_after == '_')
sent['entities'] = ['-'] * len(sent['words'])
sent['heads'], sent['deps'] = projectivize(sent['heads'],
if oracle_segments:
docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
golds.append(GoldParse(docs[-1], **sent))
if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
doc, gold = _make_gold(nlp, None, sent_annots)
sent_annots = []
if limit and len(docs) >= limit:
return docs, golds
if raw_text and sent_annots:
doc, gold = _make_gold(nlp, None, sent_annots)
if limit and len(docs) >= limit:
return docs, golds
return docs, golds
def read_conllu(file_):
docs = []
sent = []
doc = []
for line in file_:
if line.startswith('# newdoc'):
if doc:
doc = []
elif line.startswith('#'):
elif not line.strip():
if sent:
sent = []
if len(sent[-1]) != 10:
raise ValueError
if sent:
if doc:
return docs
def _make_gold(nlp, text, sent_annots):
# Flatten the conll annotations, and adjust the head indices
flat = defaultdict(list)
for sent in sent_annots:
flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
# Construct text if necessary
assert len(flat['words']) == len(flat['spaces'])
if text is None:
text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces']))
doc = nlp.make_doc(text)
gold = GoldParse(doc, **flat)
return doc, gold
# Data transforms for spaCy #
def golds_to_gold_tuples(docs, golds):
    '''Build the 'tuples' training format from parallel docs and golds.

    Every (doc, gold) pair becomes one entry of the form
    (text, [((ids, words, tags, heads, labels, iob), [])]): the document
    text plus a single "sentence" whose six columns come from transposing
    gold.orig_annot, followed by an empty list.

    docs: Objects exposing a .text attribute.
    golds: Objects exposing .orig_annot, an iterable of six-field rows.
    RETURNS: A list with one (text, sentences) tuple per pair.
    '''
    def _convert(doc, gold):
        text = doc.text
        # Transpose the per-token rows into columns. The explicit six-way
        # unpacking keeps the requirement that rows have exactly six fields.
        ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
        columns = (ids, words, tags, heads, labels, iob)
        return (text, [(columns, [])])

    return [_convert(doc, gold) for doc, gold in zip(docs, golds)]
# Evaluation #
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
    '''Parse the raw text, write the parses to disk and score them.

    nlp: Pipeline object; its .pipe() is applied to the paragraphs.
    text_loc: Path-like location of the raw text (opened as UTF-8).
    gold_loc: Path-like location of the gold-standard CoNLL-U file.
    sys_loc: Path-like location the system output is written to, and
        then read back from for scoring.
    limit: Accepted but not used by this function.
    RETURNS: Whatever conll17_ud_eval.evaluate returns for the gold and
        system files.
    '''
    def _load_ud(location):
        # Read one CoNLL-U file through the evaluation script's loader.
        with location.open('r', encoding='utf8') as handle:
            return conll17_ud_eval.load_conllu(handle)

    with text_loc.open('r', encoding='utf8') as text_file:
        paragraphs = split_text(text_file.read())
    parsed = list(nlp.pipe(paragraphs))
    with sys_loc.open('w', encoding='utf8') as out_file:
        write_conllu(parsed, out_file)
    gold_ud = _load_ud(gold_loc)
    sys_ud = _load_ud(sys_loc)
    return conll17_ud_eval.evaluate(gold_ud, sys_ud)
def write_conllu(docs, file_):
merger = Matcher(docs[0].vocab)
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
for i, doc in enumerate(docs):
matches = merger(doc)
spans = [doc[start:end+1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans]
for start_char, end_char in offsets:
doc.merge(start_char, end_char)
file_.write("# newdoc id = {i}\n".format(i=i))
for j, sent in enumerate(doc.sents):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
file_.write("# text = {text}\n".format(text=sent.text))
for k, token in enumerate(sent):
file_.write(token._.get_conllu_lines(k) + '\n')
def print_progress(itn, losses, ud_scores):
fields = {
'dep_loss': losses.get('parser', 0.0),
'tag_loss': losses.get('tagger', 0.0),
'words': ud_scores['Words'].f1 * 100,
'sents': ud_scores['Sentences'].f1 * 100,
'tags': ud_scores['XPOS'].f1 * 100,
'uas': ud_scores['UAS'].f1 * 100,
'las': ud_scores['LAS'].f1 * 100,
header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
if itn == 0:
tpl = '\t'.join((
print(tpl.format(itn, **fields))
#def get_sent_conllu(sent, sent_id):
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
def get_token_conllu(token, i):
if token._.begins_fused:
n = 1
while token.nbor(n)._.inside_fused:
n += 1
id_ = '%d-%d' % (i, i+n)
lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']
lines = []
if token.head.i == token.i:
head = 0
head = i + (token.head.i - token.i) + 1
fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
str(head), token.dep_.lower(), '_', '_']
return '\n'.join(lines)
Token.set_extension('get_conllu_lines', method=get_token_conllu)
Token.set_extension('begins_fused', default=False)
Token.set_extension('inside_fused', default=False)
# Initialization #
def load_nlp(corpus, config):
    '''Create a blank pipeline for the language named by the corpus prefix.

    corpus: Corpus identifier; the part before the first underscore is
        used as the language code (e.g. 'es_ancora' -> 'es').
    config: Object with a .vectors attribute. When it is truthy, the
        vocab is loaded from the 'vocab' entry beneath it.
    RETURNS: The pipeline created by spacy.blank.
    '''
    nlp = spacy.blank(corpus.split('_')[0])
    if not config.vectors:
        return nlp
    nlp.vocab.from_disk(config.vectors / 'vocab')
    return nlp
def initialize_pipeline(nlp, docs, golds, config):
if config.multitask_tag:
if config.multitask_sent:
nlp.parser.moves.add_action(2, 'subtok')
for gold in golds:
for tag in gold.tags:
if tag is not None:
# Replace labels that didn't make the frequency cutoff
actions = set(nlp.parser.labels)
label_set = set([act.split('-')[1] for act in actions if '-' in act])
for gold in golds:
for i, label in enumerate(gold.labels):
if label is not None and label not in label_set:
gold.labels[i] = label.split('||')[0]
return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
# Command line helpers #
class Config(object):
vectors = attr.ib(default=None)
max_doc_length = attr.ib(default=10)
multitask_tag = attr.ib(default=True)
multitask_sent = attr.ib(default=True)
nr_epoch = attr.ib(default=30)
batch_size = attr.ib(default=1000)
dropout = attr.ib(default=0.2)
def load(cls, loc):
with Path(loc).open('r', encoding='utf8') as file_:
cfg = json.load(file_)
return cls(**cfg)
class Dataset(object):
    '''Locate the .conllu and .txt files for one section of a treebank.

    path: Directory to scan (a pathlib.Path, or anything with iterdir()).
    section: Substring identifying the split, e.g. 'train' or 'dev'. A file
        belongs to the section when its name contains this substring.

    After construction the instance exposes:
        path, section -- the constructor arguments
        conllu -- path of the matching file ending in 'conllu'
        text -- path of the matching file ending in 'txt'
        lang -- language code taken from the .conllu file name: the text
            before the first '-', then before the first '_'

    RAISES: IOError if either the .conllu or the .txt file is missing.
    '''
    def __init__(self, path, section):
        self.path = path
        self.section = section
        self.conllu = None
        self.text = None
        for file_path in self.path.iterdir():
            name = file_path.parts[-1]
            if section not in name:
                continue
            if name.endswith('conllu'):
                self.conllu = file_path
            elif name.endswith('txt'):
                self.text = file_path
        if self.conllu is None:
            # Name the file type that is actually missing.
            msg = "Could not find .conllu file in {path} for {section}"
            raise IOError(msg.format(section=section, path=path))
        if self.text is None:
            # Previously the message was built but never raised, so a
            # missing text file only surfaced later as an error on None.
            msg = "Could not find .txt file in {path} for {section}"
            raise IOError(msg.format(section=section, path=path))
        self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
class TreebankPaths(object):
    '''Bundle the 'train' and 'dev' Dataset objects of one treebank.

    ud_path: Root directory of the corpus; joined with treebank via '/'.
    treebank: Name of the treebank sub-directory.
    **cfg: Accepted but not used.

    Exposes .train, .dev and .lang (the language of the training set).
    '''
    def __init__(self, ud_path, treebank, **cfg):
        treebank_dir = ud_path / treebank
        self.train = Dataset(treebank_dir, 'train')
        self.dev = Dataset(treebank_dir, 'dev')
        self.lang = self.train.lang
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
"positional", None, str),
parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "positional", None, Config.load),
limit=("Size limit", "option", "n", int)
def main(ud_dir, parses_dir, config, corpus, limit=0):
paths = TreebankPaths(ud_dir, corpus)
if not (parses_dir / corpus).exists():
(parses_dir / corpus).mkdir()
print("Train and evaluate", corpus, "using lang", paths.lang)
nlp = load_nlp(paths.lang, config)
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
max_doc_length=config.max_doc_length, limit=limit)
optimizer = initialize_pipeline(nlp, docs, golds, config)
for i in range(config.nr_epoch):
docs = [nlp.make_doc(doc.text) for doc in docs]
batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
losses = {}
n_train_words = sum(len(doc) for doc in docs)
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
for batch in batches:
batch_docs, batch_gold = zip(*batch)
pbar.update(sum(len(doc) for doc in batch_docs))
nlp.update(batch_docs, batch_gold, sgd=optimizer,
drop=config.dropout, losses=losses)
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
with nlp.use_params(optimizer.averages):
scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
print_progress(i, losses, scores)
if __name__ == '__main__':
#!/usr/bin/env python
# coding: utf8
"""Export spaCy model vectors for use in TensorBoard's standalone embedding projector.
python vectors_tensorboard_standalone.py ./myVectorModel ./output [name]
This outputs two files that have to be copied into the "oss_data" of the standalone projector:
[name]_labels.tsv - metadata such as human readable labels for vectors
[name]_tensors.bytes - numpy.ndarray of numpy.float32 precision vectors
from __future__ import unicode_literals
import json
import math
from os import path
import numpy
import plac
import spacy
import tqdm
vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str),
out_loc=("Path to output folder writing tensors and labels data", "positional", None, str),
name=("Human readable name for tsv file and vectors tensor", "positional", None, str),
def main(vectors_loc, out_loc, name="spaCy_vectors"):
# A tab-separated file that contains information about the vectors for visualization
# Learn more: https://www.tensorflow.org/programmers_guide/embedding#metadata
meta_file = "{}_labels.tsv".format(name)
out_meta_file = path.join(out_loc, meta_file)
print('Loading spaCy vectors model: {}'.format(vectors_loc))
model = spacy.load(vectors_loc)
print('Finding lexemes with vectors attached: {}'.format(vectors_loc))
voacb_strings = [
w for w in tqdm.tqdm(model.vocab.strings, total=len(model.vocab.strings), leave=False)
if model.vocab.has_vector(w)
vector_count = len(voacb_strings)
print('Building Projector labels for {} vectors: {}'.format(vector_count, out_meta_file))
vector_dimensions = model.vocab.vectors.shape[1]
tf_vectors_variable = numpy.zeros((vector_count, vector_dimensions), dtype=numpy.float32)
# Write a tab-separated file that contains information about the vectors for visualization
# Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata
with open(out_meta_file, 'wb') as file_metadata:
# Define columns in the first row
# Write out a row for each vector that we add to the tensorflow variable we created
vec_index = 0
for text in tqdm.tqdm(voacb_strings, total=len(voacb_strings), leave=False):
# https://github.com/tensorflow/tensorflow/issues/9094
text = '<Space>' if text.lstrip() == '' else text
lex = model.vocab[text]
# Store vector data and metadata
tf_vectors_variable[vec_index] = numpy.float64(model.vocab.get_vector(text))
file_metadata.write("{}\t{}\n".format(text, math.exp(lex.prob) * len(voacb_strings)).encode('utf-8'))
vec_index += 1
# Write out "[name]_tensors.bytes" file for standalone embeddings projector to load
tensor_path = '{}_tensors.bytes'.format(name)
tf_vectors_variable.tofile(path.join(out_loc, tensor_path))
print('Add the following entry to "oss_data/oss_demo_projector_config.json"')
"tensorName": name,
"tensorShape": [vector_count, vector_dimensions],
"tensorPath": 'oss_data/{}'.format(tensor_path),
"metadataPath": 'oss_data/{}'.format(meta_file)
}, indent=2))
if __name__ == '__main__':
@ -1,49 +1,92 @@
# coding: utf-8
from __future__ import unicode_literals, print_function
import contextlib
from pathlib import Path
from fabric.api import local, lcd, env, settings, prefix
from fabtools.python import virtualenv
from os import path, environ
import shutil
PWD = path.dirname(__file__)
ENV = environ['VENV_DIR'] if 'VENV_DIR' in environ else '.env'
VENV_DIR = path.join(PWD, ENV)
def env(lang='python2.7'):
if path.exists(VENV_DIR):
def virtualenv(name, create=False, python='/usr/bin/python3.6'):
python = Path(python).resolve()
env_path = VENV_DIR
if create:
if env_path.exists():
local('{python} -m venv {env_path}'.format(python=python, env_path=VENV_DIR))
def wrapped_local(cmd, env_vars=[], capture=False, direct=False):
return local('source {}/bin/activate && {}'.format(env_path, cmd),
shell='/bin/bash', capture=False)
yield wrapped_local
def env(lang='python3.6'):
if VENV_DIR.exists():
local('rm -rf {env}'.format(env=VENV_DIR))
local('pip install virtualenv')
local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
if lang.startswith('python3'):
local('{lang} -m venv {env}'.format(lang=lang, env=VENV_DIR))
local('{lang} -m pip install virtualenv --no-cache-dir'.format(lang=lang))
local('{lang} -m virtualenv {env} --no-cache-dir'.format(lang=lang, env=VENV_DIR))
with virtualenv(VENV_DIR) as venv_local:
print(venv_local('python --version', capture=True))
venv_local('pip install --upgrade setuptools --no-cache-dir')
venv_local('pip install pytest --no-cache-dir')
venv_local('pip install wheel --no-cache-dir')
venv_local('pip install -r requirements.txt --no-cache-dir')
venv_local('pip install pex --no-cache-dir')
def install():
with virtualenv(VENV_DIR):
local('pip install --upgrade setuptools')
local('pip install dist/*.tar.gz')
local('pip install pytest')
with virtualenv(VENV_DIR) as venv_local:
venv_local('pip install dist/*.tar.gz')
def make():
with virtualenv(VENV_DIR):
with lcd(path.dirname(__file__)):
local('pip install cython')
local('pip install murmurhash')
local('pip install -r requirements.txt')
local('python setup.py build_ext --inplace')
with lcd(path.dirname(__file__)):
local('export PYTHONPATH=`pwd` && source .env/bin/activate && python setup.py build_ext --inplace',
def sdist():
with virtualenv(VENV_DIR):
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
local('python setup.py sdist')
def wheel():
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
venv_local('python setup.py bdist_wheel')
def pex():
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
sha = local('git rev-parse --short HEAD', capture=True)
venv_local('pex dist/*.whl -e spacy -o dist/spacy-%s.pex' % sha,
def clean():
with lcd(path.dirname(__file__)):
local('python setup.py clean --all')
local('rm -f dist/*.whl')
local('rm -f dist/*.pex')
with virtualenv(VENV_DIR) as venv_local:
venv_local('python setup.py clean --all')
def test():
with virtualenv(VENV_DIR):
with virtualenv(VENV_DIR) as venv_local:
with lcd(path.dirname(__file__)):
local('py.test -x spacy/tests')
venv_local('pytest -x spacy/tests')
def train():
args = environ.get('SPACY_TRAIN_ARGS', '')
with virtualenv(VENV_DIR) as venv_local:
venv_local('spacy train {args}'.format(args=args))
cymem>=1.30,<1.32
@ -16,4 +16,3 @@ pytest>=3.0.6,<4.0.0
@ -18,6 +18,7 @@ PACKAGES = find_packages()
@ -191,8 +192,6 @@ def setup_package():
@ -201,6 +200,7 @@ def setup_package():
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
@ -8,6 +8,7 @@ if __name__ == '__main__':
import sys
from spacy.cli import download, link, info, package, train, convert
from spacy.cli import vocab, init_model, profile, evaluate, validate
from spacy.cli import ud_train, ud_evaluate
from spacy.util import prints
commands = {
@ -15,7 +16,9 @@ if __name__ == '__main__':
'link': link,
'info': info,
'train': train,
'ud-train': ud_train,
'evaluate': evaluate,
'ud-evaluate': ud_evaluate,
'convert': convert,
'package': package,
'vocab': vocab,
@ -0,0 +1,251 @@
# cython: infer_types=True
'''Do Levenshtein alignment, for evaluation of tokenized input.
Random notes:
r i n g
0 1 2 3 4
r 1 0 1 2 3
a 2 1 1 2 3
n 3 2 2 1 2
g 4 3 3 2 1
0,0: (1,1)=min(0+0,1+1,1+1)=0 S
1,0: (2,1)=min(1+1,0+1,2+1)=1 D
2,0: (3,1)=min(2+1,3+1,1+1)=2 D
3,0: (4,1)=min(3+1,4+1,2+1)=3 D
0,1: (1,2)=min(1+1,2+1,0+1)=1 D
1,1: (2,2)=min(0+1,1+1,1+1)=1 S
2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
0,2: (1,3)=min(2+1,3+1,1+1)=2 I
1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
2,2: (3,3)
3,2: (4,3)
At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
We know the costs to transition:
S[:i] -> T[:j] (at D[i,j])
S[:i+1] -> T[:j] (at D[i+1,j])
S[:i] -> T[:j+1] (at D[i,j+1])
Further, we know we can transform:
S[:i+1] -> S[:i] (DEL) for 1,
T[:j+1] -> T[:j] (INS) for 1.
S[i+1] -> T[j+1] (SUB) for 0 or 1
Therefore we have the costs:
SUB: Cost(S[:i]->T[:j]) + Cost(S[i]->T[j])
i.e. D[i, j] + S[i+1] != T[j+1]
INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
i.e. D[i+1,j] + 1
DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i])
i.e. D[i,j+1] + 1
Source string S has length m, with index i
Target string T has length n, with index j
Output two alignment vectors: i2j (length m) and j2i (length n)
# function LevenshteinDistance(char s[1..m], char t[1..n]):
# for all i and j, d[i,j] will hold the Levenshtein distance between
# the first i characters of s and the first j characters of t
# note that d has (m+1)*(n+1) values
# set each element in d to zero
ring rang
- r i n g
- 0 0 0 0 0
r 0 0 0 0 0
a 0 0 0 0 0
n 0 0 0 0 0
g 0 0 0 0 0
# source prefixes can be transformed into empty string by
# dropping all characters
# d[i, 0] := i
ring rang
- r i n g
- 0 0 0 0 0
r 1 0 0 0 0
a 2 0 0 0 0
n 3 0 0 0 0
g 4 0 0 0 0
# target prefixes can be reached from empty source prefix
# by inserting every character
# d[0, j] := j
- r i n g
- 0 1 2 3 4
r 1 0 0 0 0
a 2 0 0 0 0
n 3 0 0 0 0
g 4 0 0 0 0
from __future__ import unicode_literals
from libc.stdint cimport uint32_t
import numpy
cimport numpy as np
from .compat import unicode_
from murmurhash.mrmr cimport hash32
def align(S, T):
cdef int m = len(S)
cdef int n = len(T)
cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')
cdef np.ndarray S_arr = _convert_sequence(S)
cdef np.ndarray T_arr = _convert_sequence(T)
<const int*>S_arr.data, m, <const int*>T_arr.data, n)
fill_i2j(i2j, matrix)
fill_j2i(j2i, matrix)
for i in range(i2j.shape[0]):
if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
i2j[i] = -1
for j in range(j2i.shape[0]):
if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
j2i[j] = -1
return matrix[-1,-1], i2j, j2i, matrix
def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
'''Let's say we had:
Guess: [aa bb cc dd]
Truth: [aa bbcc dd]
i2j: [0, None, -2, 2]
j2i: [0, -2, 3]
We want:
i2j_multi: {1: 1, 2: 1}
j2i_multi: {}
i2j_miss = _get_regions(i2j, i_lengths)
j2i_miss = _get_regions(j2i, j_lengths)
i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths)
return i2j_multi, j2i_multi
def _get_regions(alignment, lengths):
regions = {}
start = None
offset = 0
for i in range(len(alignment)):
if alignment[i] < 0:
if start is None:
start = offset
regions.setdefault(start, [])
start = None
offset += lengths[i]
return regions
def _get_mapping(miss1, miss2, lengths1, lengths2):
i2j = {}
j2i = {}
for start, region1 in miss1.items():
if not region1 or start not in miss2:
region2 = miss2[start]
if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
j = region2.pop(0)
buff = []
# Consume tokens from region 1, until we meet the length of the
# first token in region2. If we do, align the tokens. If
# we exceed the length, break.
while region1:
if sum(lengths1[i] for i in buff) == lengths2[j]:
for i in buff:
i2j[i] = j
j2i[j] = buff[-1]
j += 1
buff = []
elif sum(lengths1[i] for i in buff) > lengths2[j]:
if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
for i in buff:
i2j[i] = j
j2i[j] = buff[-1]
return i2j, j2i
def _convert_sequence(seq):
if isinstance(seq, numpy.ndarray):
return numpy.ascontiguousarray(seq, dtype='uint32_t')
cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
cdef bytes item_bytes
for i, item in enumerate(seq):
if isinstance(item, unicode):
item_bytes = item.encode('utf8')
item_bytes = item
output[i] = hash32(<void*><char*>item_bytes, len(item_bytes), 0)
return output
cdef void fill_matrix(int* D,
const int* S, int m, const int* T, int n) nogil:
m1 = m+1
n1 = n+1
for i in range(m1*n1):
D[i] = 0
for i in range(m1):
D[i*n1] = i
for j in range(n1):
D[j] = j
cdef int sub_cost, ins_cost, del_cost
for j in range(n):
for i in range(m):
i_j = i*n1 + j
i1_j1 = (i+1)*n1 + j+1
i1_j = (i+1)*n1 + j
i_j1 = i*n1 + j+1
if S[i] != T[j]:
sub_cost = D[i_j] + 1
sub_cost = D[i_j]
del_cost = D[i_j1] + 1
ins_cost = D[i1_j] + 1
best = min(min(sub_cost, ins_cost), del_cost)
D[i1_j1] = best
cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
j = D.shape[1]-2
cdef int i = D.shape[0]-2
while i >= 0:
while D[i+1, j] < D[i+1, j+1]:
j -= 1
if D[i, j+1] < D[i+1, j+1]:
i2j[i] = -1
i2j[i] = j
j -= 1
i -= 1
cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
i = D.shape[0]-2
cdef int j = D.shape[1]-2
while j >= 0:
while D[i, j+1] < D[i+1, j+1]:
i -= 1
if D[i+1, j] < D[i+1, j+1]:
j2i[j] = -1
j2i[j] = i
i -= 1
j -= 1
@ -0,0 +1,251 @@
import pytest
class Vocab(object):
class Doc(list):
    '''Minimal stand-in for a document: a list of Token objects.

    vocab: Accepted for signature compatibility; not used.
    words: Iterable of token strings. Defaults to None, which produces an
        empty document.
    '''
    def __init__(self, vocab, words=None):
        # The default used to be passed straight to enumerate(), which
        # raises TypeError on None.
        if words is None:
            words = []
        self.extend([Token(i, w) for i, w in enumerate(words)])
class Token(object):
    '''Minimal stand-in for a token: an index plus its text.'''
    def __init__(self, i, word):
        # The word is exposed under the attribute name .text.
        self.i, self.text = i, word
def find_matches(patterns, doc):
    '''Run every pattern over the doc and collect the matches.

    For each token, all states carried over from the previous token plus
    one fresh start state per pattern are passed through transition().

    patterns: List of patterns (each a list of spec dicts).
    doc: Iterable of tokens.
    RETURNS: The list of matches accumulated by transition().
    '''
    start_states = [(pattern, 0, None) for pattern in patterns]
    active = []
    found = []
    for token in doc:
        carried = []
        # Carried-over states are tried before fresh start states.
        for state in active + start_states:
            found, carried = transition(state, token, found, carried)
        active = carried
    return found
def transition(state, token, matches, nexts):
    '''Apply the action for one (state, token) pair.

    The action from get_action() is a three-character string of 0/1
    flags, read in order as: emit a match, keep the same pattern
    position, advance to the next pattern position.

    state: (pattern, position, start) tuple; start is None for a state
        that has not consumed a token yet.
    token: Current token; its .i attribute is used for offsets.
    matches: List that matches are appended to (mutated in place).
    nexts: List that follow-up states are appended to (mutated in place).
    RETURNS: The (matches, nexts) pair.
    '''
    action = get_action(state, token)
    emit_match, stay, advance = (bool(int(flag)) for flag in action)
    pattern, position, start = state
    # A state without a start offset begins at the current token.
    begin = token.i if start is None else start
    if emit_match:
        matches.append((pattern, begin, token.i+1))
    if advance:
        nexts.append((pattern, position+1, begin))
    if stay:
        # TODO: This needs to be zero-width :(.
        nexts.append((pattern, position, begin))
    return (matches, nexts)
def get_action(state, token):
'''We need to consider:
a) Does the token match the specification? [Yes, No]
b) What's the quantifier? [1, 0+, ?]
c) Is this the last specification? [final, non-final]
We can transition in the following ways:
a) Do we emit a match?
b) Do we add a state with (next state, next token)?
c) Do we add a state with (next state, same token)?
d) Do we add a state with (same state, next token)?
We'll code the actions as boolean strings, so 0000 means no to all 4,
1000 means match but no states added, etc.
Yes, final:
Yes, non-final:
No, final:
No, non-final
Yes, final:
Yes, non-final:
No, final:
1000 (note: Don't include last token!)
No, non-final:
Yes, final:
Yes, non-final:
No, final:
1000 (note: Don't include last token!)
No, non-final:
Problem: If a quantifier is matching, we're adding a lot of open partials
is_match = get_is_match(state, token)
operator = get_operator(state, token)
is_final = get_is_final(state, token)
raise NotImplementedError
def get_is_match(state, token):
    '''Check whether the token satisfies the state's current spec.

    The token matches when its .text equals the spec's 'spec' value; a
    truthy 'invert' entry in the spec flips the result.
    '''
    pattern, i, start = state
    spec = pattern[i]
    hit = token.text == spec['spec']
    return (not hit) if spec.get('invert') else hit
def get_is_final(state, token):
    '''Return True when the state points at the last spec of its pattern.

    The token argument is not used.
    '''
    pattern, i, start = state
    last_index = len(pattern)-1
    return last_index == i
def get_operator(state, token):
    '''Return the quantifier of the state's current spec.

    Falls back to '1' when the spec has no 'op' entry. The token argument
    is not used.
    '''
    pattern, i, start = state
    spec = pattern[i]
    return spec.get('op', '1')
# Tests for get_action #
def test_get_action_simple_match():
    '''A lone 'a' spec with op '1' against token 'a' expects action '100'.'''
    state = ([{'spec': 'a', 'op': '1'}], 0, None)
    token = Doc(Vocab(), words=['a'])[0]
    assert get_action(state, token) == '100'
def test_get_action_simple_reject():
    '''A lone 'b' spec with op '1' against token 'a' expects action '000'.'''
    state = ([{'spec': 'b', 'op': '1'}], 0, None)
    token = Doc(Vocab(), words=['a'])[0]
    assert get_action(state, token) == '000'
def test_get_action_simple_match_match():
    '''Two 'a' specs over tokens 'a', 'a': expects '001', then '100'.'''
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'a', 'op': '1'}]
    first, second = Doc(Vocab(), words=['a', 'a'])
    assert get_action((pattern, 0, None), first) == '001'
    assert get_action((pattern, 1, 0), second) == '100'
def test_get_action_simple_match_reject():
    '''Specs 'a', 'b' over tokens 'a', 'a': expects '001', then '000'.'''
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
    first, second = Doc(Vocab(), words=['a', 'a'])
    assert get_action((pattern, 0, None), first) == '001'
    assert get_action((pattern, 1, 0), second) == '000'
def test_get_action_simple_match_reject():
    '''Specs 'a', 'b' over tokens 'a', 'a': expects '001', then '000'.'''
    # NOTE(review): a test with this exact name is defined twice in this
    # file; the later definition replaces the earlier one at import time.
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
    tokens = Doc(Vocab(), words=['a', 'a'])
    expectations = [((pattern, 0, None), tokens[0], '001'),
                    ((pattern, 1, 0), tokens[1], '000')]
    for state, token, expected in expectations:
        assert get_action(state, token) == expected
def test_get_action_plus_match():
    '''A lone 'a' spec with op '1+' against token 'a' expects '110'.'''
    state = ([{'spec': 'a', 'op': '1+'}], 0, None)
    token = Doc(Vocab(), words=['a'])[0]
    assert get_action(state, token) == '110'
def test_get_action_plus_match_match():
    '''An 'a' spec with op '1+' over tokens 'a', 'a' expects '110' twice.'''
    pattern = [{'spec': 'a', 'op': '1+'}]
    first, second = Doc(Vocab(), words=['a', 'a'])
    assert get_action((pattern, 0, None), first) == '110'
    # Same pattern position, now with a recorded start offset.
    assert get_action((pattern, 0, 0), second) == '110'
# Tests for find_matches #
def test_find_matches_simple_accept():
    '''Pattern 'a' over doc ['a'] expects one match spanning 0 to 1.'''
    pattern = [{'spec': 'a', 'op': '1'}]
    found = find_matches([pattern], Doc(Vocab(), words=['a']))
    assert found == [(pattern, 0, 1)]
def test_find_matches_simple_reject():
    '''Pattern 'a' over doc ['b'] expects no matches.'''
    pattern = [{'spec': 'a', 'op': '1'}]
    found = find_matches([pattern], Doc(Vocab(), words=['b']))
    assert found == []
def test_find_matches_match_twice():
    '''Pattern 'a' over doc ['a', 'a'] expects one match per token.'''
    pattern = [{'spec': 'a', 'op': '1'}]
    found = find_matches([pattern], Doc(Vocab(), words=['a', 'a']))
    assert found == [(pattern, 0, 1), (pattern, 1, 2)]
def test_find_matches_longer_pattern():
    '''Pattern 'a' 'b' over doc ['a', 'b'] expects one match from 0 to 2.'''
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
    found = find_matches([pattern], Doc(Vocab(), words=['a', 'b']))
    assert found == [(pattern, 0, 2)]
def test_find_matches_two_patterns():
    '''Patterns 'a' and 'b' over doc ['a', 'b'] expect one match each.'''
    pattern_a = [{'spec': 'a', 'op': '1'}]
    pattern_b = [{'spec': 'b', 'op': '1'}]
    found = find_matches([pattern_a, pattern_b], Doc(Vocab(), words=['a', 'b']))
    assert found == [(pattern_a, 0, 1), (pattern_b, 1, 2)]
def test_find_matches_two_patterns_overlap():
    '''Patterns 'a b' and 'b c' over ['a', 'b', 'c'] expect overlapping matches.'''
    pattern_ab = [{'spec': 'a'}, {'spec': 'b'}]
    pattern_bc = [{'spec': 'b'}, {'spec': 'c'}]
    tokens = Doc(Vocab(), words=['a', 'b', 'c'])
    found = find_matches([pattern_ab, pattern_bc], tokens)
    assert found == [(pattern_ab, 0, 2), (pattern_bc, 1, 3)]
def test_find_matches_greedy():
    '''An 'a' spec with op '1+' expects every run of 'a' tokens as a match.'''
    pattern = [{'spec': 'a', 'op': '1+'}]
    # One token: a single match.
    found = find_matches([pattern], Doc(Vocab(), words=['a']))
    assert found == [(pattern, 0, 1)]
    # Two tokens: each single token plus the span covering both.
    found = find_matches([pattern], Doc(Vocab(), words=['a', 'a']))
    assert found == [(pattern, 0, 1), (pattern, 0, 2), (pattern, 1, 2)]
def test_find_matches_non_greedy():
    '''Pattern 'a' (op '0+') then 'b' over doc ['b'] expects one match.'''
    pattern = [{'spec': 'a', 'op': '0+'}, {'spec': 'b', "op": "1"}]
    found = find_matches([pattern], Doc(Vocab(), words=['b']))
    assert found == [(pattern, 0, 1)]
@ -64,23 +64,6 @@ def _flatten_add_lengths(seqs, pad=0, drop=0.):
return (X, lengths), finish_update
def _logistic(X, drop=0.):
    '''Apply the logistic function element-wise and return a backward pass.

    X: Input array (converted with xp.asarray if it is not already an
        array of the detected array module). The clipping step passes X
        as the third argument of xp.minimum / xp.maximum.
    drop: Accepted but not used.
    RETURNS: (Y, logistic_bwd), where Y = 1 / (1 + exp(-X)) and
        logistic_bwd(dY, sgd=None) returns dY * Y * (1 - Y).
    '''
    xp = get_array_module(X)
    already_array = isinstance(X, xp.ndarray)
    if not already_array:
        X = xp.asarray(X)
    # Clip to range (-10, 10)
    X = xp.minimum(X, 10., X)
    X = xp.maximum(X, -10., X)
    Y = 1. / (1. + xp.exp(-X))

    def logistic_bwd(dY, sgd=None):
        # Derivative of the logistic function in terms of its output Y.
        return dY * (Y * (1-Y))

    return Y, logistic_bwd
def _zero_init(model):
def _zero_init_impl(self, X, y):
@ -144,8 +127,8 @@ class PrecomputableAffine(Model):
self.nF = nF
def begin_update(self, X, drop=0.):
Yf = self.ops.xp.dot(X,
self.W.reshape((self.nF*self.nO*self.nP, self.nI)).T)
Yf = self.ops.gemm(X,
self.W.reshape((self.nF*self.nO*self.nP, self.nI)), trans2=True)
Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
Yf = self._add_padding(Yf)
@ -161,11 +144,11 @@ class PrecomputableAffine(Model):
Wopfi = self.W.transpose((1, 2, 0, 3))
Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI))
dXf = self.ops.dot(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
# Reuse the buffer
dWopfi = Wopfi; dWopfi.fill(0.)
self.ops.xp.dot(dY.T, Xf, out=dWopfi)
self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
# (o, p, f, i) --> (f, o, p, i)
self.d_W += dWopfi.transpose((2, 0, 1, 3))
@ -467,6 +450,7 @@ def SpacyVectors(docs, drop=0.):
def build_text_classifier(nr_class, width=64, **cfg):
depth = cfg.get('depth', 2)
nr_vector = cfg.get('nr_vector', 5000)
pretrained_dims = cfg.get('pretrained_dims', 0)
with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
@ -518,7 +502,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
LN(Maxout(width, vectors_width))
>> Residual(
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
) ** 2, pad=2
) ** depth, pad=depth
>> flatten_add_lengths
>> ParametricAttention(width)
@ -531,8 +515,6 @@ def build_text_classifier(nr_class, width=64, **cfg):
>> LinearModel(nr_class)
#model = linear_model >> logistic
model = (
(linear_model | cnn_model)
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
@ -9,7 +9,7 @@ __uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'
__email__ = 'contact@explosion.ai'
__license__ = 'MIT'
__release__ = True
__release__ = False
__docs_models__ = 'https://spacy.io/usage/models'
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
@ -131,7 +131,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
'NumValue', 'PartType', 'Polite', 'StyleVariant',
'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
'Polarity', 'Animacy' # U20
'Polarity', 'PrepCase', 'Animacy' # U20
for key in morph_keys:
if key in stringy_attrs:
@ -9,3 +9,5 @@ from .convert import convert
from .vocab import make_vocab as vocab
from .init_model import init_model
from .validate import validate
from .ud_train import main as ud_train
from .conll17_ud_eval import main as ud_evaluate
@ -0,0 +1,571 @@
#!/usr/bin/env python
# CoNLL 2017 UD Parsing evaluation script.
# Compatible with Python 2.7 and 3.2+, can be used either as a module
# or a standalone executable.
# Copyright 2017 Institute of Formal and Applied Linguistics (UFAL),
# Faculty of Mathematics and Physics, Charles University, Czech Republic.
# Changelog:
# - [02 Jan 2017] Version 0.9: Initial release
# - [25 Jan 2017] Version 0.9.1: Fix bug in LCS alignment computation
# - [10 Mar 2017] Version 1.0: Add documentation and test
# Compare HEADs correctly using aligned words
# Allow evaluation with erroneous spaces in forms
# Compare forms in LCS case insensitively
# Detect cycles and multiple root nodes
# Compute AlignedAccuracy
# Command line usage
# ------------------
# conll17_ud_eval.py [-v] [-w weights_file] gold_conllu_file system_conllu_file
# - if no -v is given, only the CoNLL17 UD Shared Task evaluation LAS metrics
# is printed
# - if -v is given, several metrics are printed (as precision, recall, F1 score,
# and in case the metric is computed on aligned words also accuracy on these):
# - Tokens: how well do the gold tokens match system tokens
# - Sentences: how well do the gold sentences match system sentences
# - Words: how well can the gold words be aligned to system words
# - UPOS: using aligned words, how well does UPOS match
# - XPOS: using aligned words, how well does XPOS match
# - Feats: using aligned words, how well does FEATS match
# - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match
# - Lemmas: using aligned words, how well does LEMMA match
# - UAS: using aligned words, how well does HEAD match
# - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match
# - if weights_file is given (with lines containing deprel-weight pairs),
# one more metric is shown:
# - WeightedLAS: as LAS, but each deprel (ignoring subtypes) has different weight
# API usage
# ---------
# - load_conllu(file)
# - loads CoNLL-U file from given file object to an internal representation
# - the file object should return str on both Python 2 and Python 3
# - raises UDError exception if the given file cannot be loaded
# - evaluate(gold_ud, system_ud)
# - evaluate the given gold and system CoNLL-U files (loaded with load_conllu)
# - raises UDError if the concatenated tokens of gold and system file do not match
# - returns a dictionary with the metrics described above, each metric having
# three fields: precision, recall and f1
# Description of token matching
# -----------------------------
# In order to match tokens of gold file and system file, we consider the text
# resulting from concatenation of gold tokens and text resulting from
# concatenation of system tokens. These texts should match -- if they do not,
# the evaluation fails.
# If the texts do match, every token is represented as a range in this original
# text, and tokens are equal only if their range is the same.
# Description of word matching
# ----------------------------
# When matching words of gold file and system file, we first match the tokens.
# The words which are also tokens are matched as tokens, but words in multi-word
# tokens have to be handled differently.
# To handle multi-word tokens, we start by finding "multi-word spans".
# Multi-word span is a span in the original text such that
# - it contains at least one multi-word token
# - all multi-word tokens in the span (considering both gold and system ones)
# are completely inside the span (i.e., they do not "stick out")
# - the multi-word span is as small as possible
# For every multi-word span, we align the gold and system words completely
# inside this span using LCS on their FORMs. The words not intersecting
# (even partially) any multi-word span are then aligned as tokens.
from __future__ import division
from __future__ import print_function
import argparse
import io
import sys
import unittest
# CoNLL-U column names
# UD Error is used when raising exceptions in this module
class UDError(Exception):
    """Exception type used for every error raised by this module."""
# Load given CoNLL-U file into internal representation
def load_conllu(file):
# Internal representation classes
class UDRepresentation:
    """Container for the contents of one loaded CoNLL-U file.

    All four attributes start out as empty lists and are filled by the loader.
    """

    def __init__(self):
        # Characters of all the tokens in the whole file.
        # Whitespace between tokens is not included.
        self.characters = []
        # List of UDSpan instances with start&end indices into `characters`.
        self.tokens = []
        # List of UDWord instances.
        self.words = []
        # List of UDSpan instances with start&end indices into `characters`.
        self.sentences = []
class UDSpan:
    """Half-open range ``[start, end)`` over a shared list of characters.

    start: index of the first character of the span.
    end: index one past the last character of the span.
    characters: the list the indices refer to (stored by reference, not copied).
    """

    def __init__(self, start, end, characters):
        self.start = start
        # Note that self.end marks the first position **after the end** of span,
        # so we can use characters[start:end] or range(start, end).
        self.end = end
        self.characters = characters

    def text(self):
        """Return the covered characters joined into one string."""
        return ''.join(self.characters[self.start:self.end])

    def __str__(self):
        # `text` is a method: it must be called, otherwise str() receives a
        # bound method instead of a string and raises TypeError.
        return self.text()

    def __repr__(self):
        return self.text()
class UDWord:
    """One syntactic word of a CoNLL-U file.

    span: UDSpan of this word (or of the whole multi-word token it belongs to).
    columns: list of the 10 CoNLL-U column values for this word. The list is
        stored by reference and its DEPREL entry is overwritten in place.
    is_multiword: True when the word is part of a multi-word token.
    """

    def __init__(self, span, columns, is_multiword):
        # Span of this word (or MWT, see below) within ud_representation.characters.
        self.span = span
        # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,...
        self.columns = columns
        # is_multiword==True means that this word is part of a multi-word token.
        # In that case, self.span marks the span of the whole multi-word token.
        self.is_multiword = is_multiword
        # Reference to the UDWord instance representing the HEAD (or None if root).
        self.parent = None
        # Let's ignore language-specific deprel subtypes.
        # (Keeps only the text before the first ':'.)
        self.columns[DEPREL] = columns[DEPREL].split(':')[0]
# Body of load_conllu: read `file` line by line and fill `ud`.
# NOTE(review): indentation and several control-flow lines (loop exits,
# `try`/`except`, `else`, some calls) are not visible in this excerpt; the
# comments below describe only what the visible statements do.
ud = UDRepresentation()
# Load the CoNLL-U file
# `index` is advanced by len(FORM) for every token read; `sentence_start` is
# the position in ud.words where the current sentence begins, or None when no
# sentence is open.
index, sentence_start = 0, None
linenum = 0
while True:
line = file.readline()
linenum += 1
# NOTE(review): no body is visible for this end-of-input check.
if not line:
line = line.rstrip("\r\n")
# Handle sentence start boundaries
if sentence_start is None:
# Skip comments
if line.startswith("#"):
# Start a new sentence
# The span is opened with end=0; the real end is written when the sentence
# is closed further down.
ud.sentences.append(UDSpan(index, 0, ud.characters))
sentence_start = len(ud.words)
# An empty line (after stripping the newline) closes the current sentence.
if not line:
# Add parent UDWord links and check there are no cycles
# "remapping" is a temporary marker stored in word.parent while a word's
# head is being resolved; meeting it again means the HEAD links form a cycle.
def process_word(word):
if word.parent == "remapping":
raise UDError("There is a cycle in a sentence")
if word.parent is None:
head = int(word.columns[HEAD])
if head > len(ud.words) - sentence_start:
raise UDError("Line {}: HEAD '{}' points outside of the sentence".format(
linenum, word.columns[HEAD]))
# HEAD == 0 leaves word.parent as None (root).
if head:
# HEAD is 1-based within the sentence.
parent = ud.words[sentence_start + head - 1]
word.parent = "remapping"
word.parent = parent
# NOTE(review): the body of this loop is not visible in this excerpt.
for word in ud.words[sentence_start:]:
# Check there is a single root node
if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
raise UDError("There are multiple roots in a sentence")
# End the sentence
ud.sentences[-1].end = index
sentence_start = None
# Read next token/word
columns = line.split("\t")
if len(columns) != 10:
raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, line))
# Skip empty nodes
# NOTE(review): no body is visible for this check.
if "." in columns[ID]:
# Delete spaces from FORM so gold.characters == system.characters
# even if one of them tokenizes the space.
columns[FORM] = columns[FORM].replace(" ", "")
if not columns[FORM]:
raise UDError("There is an empty FORM in the CoNLL-U file -- line %d" % linenum)
# Save token
ud.tokens.append(UDSpan(index, index + len(columns[FORM]), ud.characters))
index += len(columns[FORM])
# Handle multi-word tokens to save word(s)
# An ID of the form "a-b" announces a multi-word token covering words a..b.
if "-" in columns[ID]:
start, end = map(int, columns[ID].split("-"))
# NOTE(review): this raise presumably sits in an `except` clause that is not
# visible here -- confirm against the full file.
raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
# One extra line is read for each word of the range; each word shares the
# span of the multi-word token saved just above.
for _ in range(start, end + 1):
word_line = file.readline().rstrip("\r\n")
word_columns = word_line.split("\t")
if len(word_columns) != 10:
raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, word_line))
ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
# Basic tokens/words
word_id = int(columns[ID])
# NOTE(review): presumably inside an `except` clause not visible here.
raise UDError("Cannot parse word ID '{}'".format(columns[ID]))
# Word IDs must count up from 1 within each sentence.
if word_id != len(ud.words) - sentence_start + 1:
raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1))
head_id = int(columns[HEAD])
# NOTE(review): presumably inside an `except` clause not visible here.
raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD]))
if head_id < 0:
raise UDError("HEAD cannot be negative")
ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))
# A sentence still open at this point was never closed by an empty line.
if sentence_start is not None:
raise UDError("The CoNLL-U file does not end with empty line")
return ud
# Evaluate the gold and system treebanks (loaded using load_conllu).
def evaluate(gold_ud, system_ud, deprel_weights=None):
class Score:
    """Precision / recall / F1 (and optional aligned accuracy) of one metric.

    gold_total: number (or total weight) of gold items.
    system_total: number (or total weight) of system items.
    correct: number (or total weight) of correct items.
    aligned_total: number (or total weight) of aligned items, or None when
        the metric has no notion of alignment.

    Every ratio falls back to 0.0 when its denominator is zero.
    ``aligned_accuracy`` is ``correct / aligned_total`` when ``aligned_total``
    is truthy; otherwise it is ``aligned_total`` itself (None or 0).
    """

    def __init__(self, gold_total, system_total, correct, aligned_total=None):
        self.precision = correct / system_total if system_total else 0.0
        self.recall = correct / gold_total if gold_total else 0.0
        self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0
        self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total
class AlignmentWord:
    """A gold word paired with the system word aligned to it.

    The two parent fields start as None and are filled in later by
    ``Alignment.fill_parents``.
    """

    def __init__(self, gold_word, system_word):
        self.gold_word = gold_word
        self.system_word = system_word
        self.gold_parent = None
        self.system_parent_gold_aligned = None
class Alignment:
    """Result of aligning gold words with system words.

    gold_words / system_words: the complete word lists that were aligned.
    matched_words: list of AlignmentWord pairs, in the order they were added.
    matched_words_map: maps each aligned system word to its gold word.
    """

    def __init__(self, gold_words, system_words):
        self.gold_words = gold_words
        self.system_words = system_words
        self.matched_words = []
        self.matched_words_map = {}

    def append_aligned_words(self, gold_word, system_word):
        """Record that `gold_word` and `system_word` are aligned."""
        self.matched_words.append(AlignmentWord(gold_word, system_word))
        self.matched_words_map[system_word] = gold_word

    def fill_parents(self):
        """Fill the parent fields of every matched pair."""
        # We represent root parents in both gold and system data by '0'.
        # For gold data, we represent non-root parent by corresponding gold word.
        # For system data, we represent non-root parent by either the gold word
        # aligned to the parent system node, or by None if no gold word is
        # aligned to the parent.
        for words in self.matched_words:
            words.gold_parent = words.gold_word.parent if words.gold_word.parent is not None else 0
            words.system_parent_gold_aligned = self.matched_words_map.get(words.system_word.parent, None) \
                if words.system_word.parent is not None else 0
def lower(text):
    """Return `text` lowercased.

    On Python 2 a byte `str` is first decoded as UTF-8 so that non-ASCII
    letters are lowercased too; the result is then a unicode string.
    """
    if sys.version_info < (3, 0) and isinstance(text, str):
        return text.decode("utf-8").lower()
    return text.lower()
def spans_score(gold_spans, system_spans):
    """Score how many gold spans are reproduced exactly by the system.

    Both lists are walked in parallel by `start` offset. A span counts as
    correct only when a gold span and a system span share the same `start`
    and the same `end`.

    gold_spans / system_spans: sequences of objects with `start` and `end`
        attributes (the merge-style walk assumes both are ordered by `start`).
    RETURNS: a Score built from the two list lengths and the match count.
    """
    correct, gi, si = 0, 0, 0
    while gi < len(gold_spans) and si < len(system_spans):
        if system_spans[si].start < gold_spans[gi].start:
            si += 1
        elif gold_spans[gi].start < system_spans[si].start:
            gi += 1
        else:
            # Same start: compare the ends only in this branch, then advance
            # both sides. Outside an `else`, the comparison would run after a
            # cursor had already moved, possibly past the end of its list.
            correct += gold_spans[gi].end == system_spans[si].end
            si += 1
            gi += 1

    return Score(len(gold_spans), len(system_spans), correct)
def alignment_score(alignment, key_fn, weight_fn=lambda w: 1):
    """Compute a Score over aligned words.

    alignment: object with `gold_words`, `system_words` and `matched_words`.
    key_fn: callable `(word, parent) -> key`; a matched pair is correct when
        the key of the gold side equals the key of the system side. When
        None, every matched pair counts as correct.
    weight_fn: callable `word -> weight`; defaults to weight 1 per word.
    RETURNS: Score. `aligned_total` is passed only when `key_fn` is given.
    """
    gold, system, aligned, correct = 0, 0, 0, 0

    for word in alignment.gold_words:
        gold += weight_fn(word)

    for word in alignment.system_words:
        system += weight_fn(word)

    # Aligned weight is always taken from the gold side of each pair.
    for words in alignment.matched_words:
        aligned += weight_fn(words.gold_word)

    if key_fn is None:
        # Return score for whole aligned words
        return Score(gold, system, aligned)

    for words in alignment.matched_words:
        if key_fn(words.gold_word, words.gold_parent) == key_fn(words.system_word, words.system_parent_gold_aligned):
            correct += weight_fn(words.gold_word)

    return Score(gold, system, correct, aligned)
def beyond_end(words, i, multiword_span_end):
    """Tell whether `words[i]` lies past the current multi-word span.

    Returns True when `i` is out of range. A multi-word word is beyond the
    span when it starts at or after `multiword_span_end`; any other word is
    beyond the span when it ends after `multiword_span_end`.
    """
    if i >= len(words):
        return True
    if words[i].is_multiword:
        return words[i].span.start >= multiword_span_end
    return words[i].span.end > multiword_span_end
def extend_end(word, multiword_span_end):
    """Return the span end, pushed out if `word` sticks past it.

    Only multi-word words can extend the span; for any other word the
    current `multiword_span_end` is returned unchanged.
    """
    if word.is_multiword and word.span.end > multiword_span_end:
        return word.span.end
    return multiword_span_end
def find_multiword_span(gold_words, system_words, gi, si):
    """Find the minimal multi-word span starting around positions gi / si.

    gold_words / system_words: lists of words with `is_multiword` and `span`.
    gi / si: current positions in the two lists; at least one of the words
        at these positions is part of a multi-word token.
    RETURNS: `(gs, ss, gi, si)` -- the start positions of the span in both
        lists and the positions of the first words after the span.
    """
    # We know gold_words[gi].is_multiword or system_words[si].is_multiword.
    # Find the start of the multiword span (gs, ss), so the multiword span is minimal.
    # Initialize multiword_span_end characters index.
    if gold_words[gi].is_multiword:
        multiword_span_end = gold_words[gi].span.end
        if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start:
            si += 1
    else:  # if system_words[si].is_multiword
        multiword_span_end = system_words[si].span.end
        if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start:
            gi += 1
    gs, ss = gi, si

    # Find the end of the multiword span
    # (so both gi and si are pointing to the word following the multiword span end).
    while not beyond_end(gold_words, gi, multiword_span_end) or \
            not beyond_end(system_words, si, multiword_span_end):
        if gi < len(gold_words) and (si >= len(system_words) or
                                     gold_words[gi].span.start <= system_words[si].span.start):
            multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
            gi += 1
        else:
            # Exactly one side advances per iteration. Without this `else`
            # the system side would also be read after the gold side moved,
            # which can index past the end of system_words.
            multiword_span_end = extend_end(system_words[si], multiword_span_end)
            si += 1

    return gs, ss, gi, si
def compute_lcs(gold_words, system_words, gi, si, gs, ss):
    """Build the longest-common-subsequence table for one multi-word span.

    The table covers gold_words[gs:gi] against system_words[ss:si], comparing
    the FORM column case-insensitively. `lcs[g][s]` is the LCS length of the
    suffixes starting at offsets g and s; it is filled from the end backwards.
    """
    lcs = [[0] * (si - ss) for _ in range(gi - gs)]
    for g in reversed(range(gi - gs)):
        for s in reversed(range(si - ss)):
            if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
                lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
            # Cells outside the table count as 0.
            lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
            lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
    return lcs
def align_words(gold_words, system_words):
    """Align gold words with system words and return the Alignment.

    Words inside a multi-word span are aligned by LCS over their FORMs; all
    other words are aligned when their character spans are identical.
    Parent information of the matched pairs is filled in before returning.
    """
    alignment = Alignment(gold_words, system_words)

    gi, si = 0, 0
    while gi < len(gold_words) and si < len(system_words):
        if gold_words[gi].is_multiword or system_words[si].is_multiword:
            # A: Multi-word tokens => align via LCS within the whole "multiword span".
            gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)

            if si > ss and gi > gs:
                lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)

                # Store aligned words
                s, g = 0, 0
                while g < gi - gs and s < si - ss:
                    if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
                        alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
                        g += 1
                        s += 1
                    elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0):
                        # Skipping the gold word loses nothing of the LCS.
                        g += 1
                    else:
                        s += 1
        else:
            # B: No multi-word token => align according to spans.
            if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end):
                alignment.append_aligned_words(gold_words[gi], system_words[si])
                gi += 1
                si += 1
            elif gold_words[gi].span.start <= system_words[si].span.start:
                gi += 1
            else:
                si += 1

    # Resolve the parents of all matched pairs; without this call every
    # parent field keeps its initial None and head comparisons are vacuous.
    alignment.fill_parents()

    return alignment
# Final part of evaluate: verify both files cover the same text, align the
# words and compute every metric.
# NOTE(review): indentation and the closing brackets of the `raise` call and
# of the `result` dict are not visible in this excerpt.
# Check that underlying character sequences do match
if gold_ud.characters != system_ud.characters:
# Scan to the first position where the two sequences differ.
# NOTE(review): if one sequence is a strict prefix of the other, `index`
# reaches the shorter length and the comparison below raises IndexError
# instead of the UDError -- consider bounding the loop by both lengths.
index = 0
while gold_ud.characters[index] == system_ud.characters[index]:
index += 1
raise UDError(
"The concatenation of tokens in gold file and in system file differ!\n" +
"First 20 differing characters in gold file: '{}' and system file: '{}'".format(
"".join(gold_ud.characters[index:index + 20]),
"".join(system_ud.characters[index:index + 20])
# Align words
alignment = align_words(gold_ud.words, system_ud.words)
# Compute the F1-scores
# Tokens and Sentences compare spans; all other metrics compare aligned
# words through a key function of (word, parent).
result = {
"Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
"Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
"Words": alignment_score(alignment, None),
"UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]),
"XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]),
"Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
"AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
"Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
"UAS": alignment_score(alignment, lambda w, parent: parent),
"LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])),
# Add WeightedLAS if weights are given
if deprel_weights is not None:
# Deprels missing from the weights mapping get weight 1.0.
def weighted_las(word):
return deprel_weights.get(word.columns[DEPREL], 1.0)
result["WeightedLAS"] = alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL]), weighted_las)
return result
def load_deprel_weights(weights_file):
    """Read deprel weights from an iterable of text lines.

    weights_file: iterable of lines (e.g. an open text file), or None.
        Lines starting with '#' and blank lines are ignored; every other
        line must hold exactly two whitespace-separated columns,
        `deprel weight`.
    RETURNS: dict mapping deprel to float weight, or None when
        `weights_file` is None.
    RAISES: ValueError when a line does not have two columns or the weight
        is not a valid float.
    """
    if weights_file is None:
        return None

    deprel_weights = {}
    for line in weights_file:
        # Ignore comments and empty lines
        if line.startswith("#") or not line.strip():
            continue

        columns = line.rstrip("\r\n").split()
        if len(columns) != 2:
            raise ValueError("Expected two columns in the UD Relations weights file on line '{}'".format(line))

        deprel_weights[columns[0]] = float(columns[1])

    return deprel_weights
def load_conllu_file(path):
    """Open the CoNLL-U file at `path` and load it with load_conllu.

    The file is opened in text mode, as UTF-8 on Python 3, and is closed
    again before returning.
    """
    # The built-in open() of Python 2 takes no `encoding` argument.
    open_kwargs = {"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}
    with open(path, mode="r", **open_kwargs) as _file:
        return load_conllu(_file)
def evaluate_wrapper(args):
    """Load the files named in `args` and evaluate them.

    args: object with `gold_file` and `system_file` (paths) and `weights`
        (an iterable of weight lines, or None).
    RETURNS: whatever evaluate returns for the loaded data.
    """
    # Load CoNLL-U files
    gold_ud = load_conllu_file(args.gold_file)
    system_ud = load_conllu_file(args.system_file)

    # Load weights if requested
    deprel_weights = load_deprel_weights(args.weights)

    return evaluate(gold_ud, system_ud, deprel_weights)
# Command-line entry point: parse the arguments, run the evaluation and print
# either the LAS F1 score alone or a table of all metrics.
# NOTE(review): indentation, an `else:` and some closing brackets are not
# visible in this excerpt.
def main():
# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("gold_file", type=str,
help="Name of the CoNLL-U file with the gold data.")
parser.add_argument("system_file", type=str,
help="Name of the CoNLL-U file with the predicted data.")
# The weights file is opened by argparse itself (FileType("r")).
parser.add_argument("--weights", "-w", type=argparse.FileType("r"), default=None,
help="Compute WeightedLAS using given weights for Universal Dependency Relations.")
parser.add_argument("--verbose", "-v", default=0, action="count",
help="Print all metrics.")
args = parser.parse_args()
# Use verbose if weights are supplied
if args.weights is not None and not args.verbose:
args.verbose = 1
# Evaluate
evaluation = evaluate_wrapper(args)
# Print the evaluation
if not args.verbose:
print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
# NOTE(review): the lines below presumably form the verbose (`else`) branch.
metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS"]
# NOTE(review): no body is visible for this check.
if args.weights is not None:
print("Metrics | Precision | Recall | F1 Score | AligndAcc")
for metric in metrics:
# NOTE(review): the format string has five fields but only four arguments
# are visible; the first one (the metric name) appears to be missing here.
print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
100 * evaluation[metric].precision,
100 * evaluation[metric].recall,
100 * evaluation[metric].f1,
"{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else ""
if __name__ == "__main__":
# Tests, which can be executed with `python -m unittest conll17_ud_eval`.
# Unit tests for word alignment: each case builds a gold and a system file
# from compact word lists and checks the resulting "Words" metric.
# NOTE(review): indentation and an `else:` are not visible in this excerpt.
class TestAlignment(unittest.TestCase):
# NOTE(review): this helper takes no `self` yet is called as
# `self._load_words(...)`, so it presumably carries a @staticmethod decorator
# that is not visible here -- confirm.
# Each entry of `words` is split on spaces: a single part is a plain word;
# with several parts, the first is the multi-word token FORM and the rest
# are the words it consists of.
def _load_words(words):
"""Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors."""
lines, num_words = [], 0
for w in words:
parts = w.split(" ")
if len(parts) == 1:
num_words += 1
# HEAD is 0 for the first word and 1 for every later word.
lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1)))
# Multi-word token: one "a-b" range line followed by one line per word.
lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0]))
for part in parts[1:]:
num_words += 1
lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1)))
return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"])))
# Expect evaluate() to raise UDError for the given gold/system pair.
def _test_exception(self, gold, system):
self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system))
# Expect `correct` aligned words; precision, recall and F1 of "Words" are
# recomputed here from the word counts implied by the compact notation.
def _test_ok(self, gold, system, correct):
metrics = evaluate(self._load_words(gold), self._load_words(system))
gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold))
system_words = sum((max(1, len(word.split(" ")) - 1) for word in system))
self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1),
(correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words)))
def test_exception(self):
self._test_exception(["a"], ["b"])
def test_equal(self):
self._test_ok(["a"], ["a"], 1)
self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3)
def test_equal_with_multiword(self):
self._test_ok(["abc a b c"], ["a", "b", "c"], 3)
self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4)
self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4)
self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5)
def test_alignment(self):
self._test_ok(["abcd"], ["a", "b", "c", "d"], 0)
self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1)
self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2)
self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2)
self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4)
self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2)
self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)
@ -8,8 +8,8 @@ from thinc.neural._classes.model import Model
from timeit import default_timer as timer
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
from ..gold import GoldCorpus, minibatch
from ..util import prints
from ..gold import GoldCorpus
from ..util import prints, minibatch, minibatch_by_words
from .. import util
from .. import about
from .. import displacy
@ -51,8 +51,6 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
train_path = util.ensure_path(train_data)
dev_path = util.ensure_path(dev_data)
meta_path = util.ensure_path(meta_path)
if not output_path.exists():
if not train_path.exists():
prints(train_path, title="Training data not found", exits=1)
if dev_path and not dev_path.exists():
@ -65,7 +63,14 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
title="Not a valid meta.json format", exits=1)
meta.setdefault('lang', lang)
meta.setdefault('name', 'unnamed')
if not output_path.exists():
print("Counting training words (limit=%s" % n_sents)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()
pipeline = ['tagger', 'parser', 'ner']
if no_tagger and 'tagger' in pipeline:
@ -81,13 +86,9 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
util.env_opt('dropout_to', 0.2),
util.env_opt('dropout_decay', 0.0))
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
util.env_opt('batch_to', 16),
batch_sizes = util.compounding(util.env_opt('batch_from', 1000),
util.env_opt('batch_to', 1000),
util.env_opt('batch_compound', 1.001))
max_doc_len = util.env_opt('max_doc_len', 5000)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()
lang_class = util.get_lang_class(lang)
nlp = lang_class()
meta['pipeline'] = pipeline
@ -105,6 +106,7 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
lex.is_oov = False
for name in pipeline:
nlp.add_pipe(nlp.create_pipe(name), name=name)
if parser_multitasks:
for objective in parser_multitasks.split(','):
@ -116,21 +118,20 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
gold_preproc=gold_preproc, max_length=0)
train_docs = list(train_docs)
for i in range(n_iter):
train_docs = corpus.train_docs(nlp, noise_level=0.0,
gold_preproc=gold_preproc, max_length=0)
words_seen = 0
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
losses = {}
for batch in minibatch(train_docs, size=batch_sizes):
batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
for batch in minibatch_by_words(train_docs, size=batch_sizes):
if not batch:
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses)
pbar.update(sum(len(doc) for doc in docs))
words_seen += sum(len(doc) for doc in docs)
with nlp.use_params(optimizer.averages):
epoch_model_path = output_path / ('model%d' % i)
@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import six
import ftfy
import sys
import ujson
@ -47,9 +46,10 @@ is_windows = sys.platform.startswith('win')
is_linux = sys.platform.startswith('linux')
is_osx = sys.platform == 'darwin'
is_python2 = six.PY2
is_python3 = six.PY3
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1]<5)
# See: https://github.com/benjaminp/six/blob/master/six.py
is_python2 = sys.version_info[0] == 2
is_python3 = sys.version_info[0] == 3
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)
if is_python2:
bytes_ = str
@ -3,16 +3,25 @@
from __future__ import unicode_literals, print_function
import re
import ujson
import random
import cytoolz
import itertools
import numpy
import tempfile
import shutil
from pathlib import Path
import msgpack
import ujson
from . import _align
from .syntax import nonproj
from .tokens import Doc
from . import util
from .util import minibatch
from .util import minibatch, itershuffle
from .compat import json_dumps
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
def tags_to_entities(tags):
entities = []
@ -59,196 +68,62 @@ def merge_sents(sents):
return [(m_deps, m_brackets)]
def align(cand_words, gold_words):
cost, edit_path = _min_edit_path(cand_words, gold_words)
alignment = []
i_of_gold = 0
for move in edit_path:
if move == 'M':
i_of_gold += 1
elif move == 'S':
i_of_gold += 1
elif move == 'D':
elif move == 'I':
i_of_gold += 1
raise Exception(move)
return alignment
punct_re = re.compile(r'\W')
def _min_edit_path(cand_words, gold_words):
Pool mem
int i, j, n_cand, n_gold
int* curr_costs
int* prev_costs
# TODO: Fix this --- just do it properly, make the full edit matrix and
# then walk back over it...
# Preprocess inputs
cand_words = [punct_re.sub('', w).lower() for w in cand_words]
gold_words = [punct_re.sub('', w).lower() for w in gold_words]
def align(cand_words, gold_words):
if cand_words == gold_words:
return 0, ''.join(['M' for _ in gold_words])
mem = Pool()
n_cand = len(cand_words)
n_gold = len(gold_words)
# Levenshtein distance, except we need the history, and we may want
# different costs. Mark operations with a string, and score the history
# using _edit_cost.
previous_row = []
prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
for i in range(n_gold + 1):
cell = ''
for j in range(i):
cell += 'I'
previous_row.append('I' * i)
prev_costs[i] = i
for i, cand in enumerate(cand_words):
current_row = ['D' * (i + 1)]
curr_costs[0] = i+1
for j, gold in enumerate(gold_words):
if gold.lower() == cand.lower():
s_cost = prev_costs[j]
i_cost = curr_costs[j] + 1
d_cost = prev_costs[j + 1] + 1
s_cost = prev_costs[j] + 1
i_cost = curr_costs[j] + 1
d_cost = prev_costs[j + 1] + (1 if cand else 0)
if s_cost <= i_cost and s_cost <= d_cost:
best_cost = s_cost
best_hist = previous_row[j] + ('M' if gold == cand else 'S')
elif i_cost <= s_cost and i_cost <= d_cost:
best_cost = i_cost
best_hist = current_row[j] + 'I'
best_cost = d_cost
best_hist = previous_row[j + 1] + 'D'
curr_costs[j+1] = best_cost
previous_row = current_row
for j in range(len(gold_words) + 1):
prev_costs[j] = curr_costs[j]
curr_costs[j] = 0
return prev_costs[n_gold], previous_row[-1]
alignment = numpy.arange(len(cand_words))
return 0, alignment, alignment, {}, {}
cand_words = [w.replace(' ', '') for w in cand_words]
gold_words = [w.replace(' ', '') for w in gold_words]
cost, i2j, j2i, matrix = _align.align(cand_words, gold_words)
i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in cand_words],
[len(w) for w in gold_words])
for i, j in list(i2j_multi.items()):
if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
i2j[i] = j
for j, i in list(j2i_multi.items()):
if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
j2i[j] = i
return cost, i2j, j2i, i2j_multi, j2i_multi
class GoldCorpus(object):
"""An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER."""
def __init__(self, train_path, dev_path, gold_preproc=True, limit=None):
def __init__(self, train, dev, gold_preproc=False, limit=None):
"""Create a GoldCorpus.
train_path (unicode or Path): File or directory of training data.
dev_path (unicode or Path): File or directory of development data.
RETURNS (GoldCorpus): The newly created object.
self.train_path = util.ensure_path(train_path)
self.dev_path = util.ensure_path(dev_path)
self.limit = limit
self.train_locs = self.walk_corpus(self.train_path)
self.dev_locs = self.walk_corpus(self.dev_path)
if isinstance(train, str) or isinstance(train, Path):
train = self.read_tuples(self.walk_corpus(train))
dev = self.read_tuples(self.walk_corpus(dev))
def train_tuples(self):
i = 0
for loc in self.train_locs:
gold_tuples = read_json_file(loc)
for item in gold_tuples:
yield item
i += len(item[1])
if self.limit and i >= self.limit:
# Write temp directory with one doc per file, so we can shuffle
# and stream
self.tmp_dir = Path(tempfile.mkdtemp())
self.write_msgpack(self.tmp_dir / 'train', train)
self.write_msgpack(self.tmp_dir / 'dev', dev)
def dev_tuples(self):
i = 0
for loc in self.dev_locs:
gold_tuples = read_json_file(loc)
for item in gold_tuples:
yield item
i += len(item[1])
if self.limit and i >= self.limit:
def count_train(self):
n = 0
i = 0
for raw_text, paragraph_tuples in self.train_tuples:
n += sum([len(s[0][1]) for s in paragraph_tuples])
if self.limit and i >= self.limit:
i += len(paragraph_tuples)
return n
def train_docs(self, nlp, gold_preproc=False,
projectivize=False, max_length=None,
train_tuples = self.train_tuples
if projectivize:
train_tuples = nonproj.preprocess_training_data(
self.train_tuples, label_freq_cutoff=100)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
yield from gold_docs
def dev_docs(self, nlp, gold_preproc=False):
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
yield from gold_docs
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
for raw_text, paragraph_tuples in tuples:
if gold_preproc:
raw_text = None
paragraph_tuples = merge_sents(paragraph_tuples)
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
gold_preproc, noise_level=noise_level)
golds = cls._make_golds(docs, paragraph_tuples)
for doc, gold in zip(docs, golds):
if (not max_length) or len(doc) < max_length:
yield doc, gold
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
if raw_text is not None:
raw_text = add_noise(raw_text, noise_level)
return [nlp.make_doc(raw_text)]
return [Doc(nlp.vocab,
words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples]
def _make_golds(cls, docs, paragraph_tuples):
assert len(docs) == len(paragraph_tuples)
if len(docs) == 1:
return [GoldParse.from_annot_tuples(docs[0],
return [GoldParse.from_annot_tuples(doc, sent_tuples)
for doc, (sent_tuples, brackets)
in zip(docs, paragraph_tuples)]
def __del__(self):
def write_msgpack(directory, doc_tuples):
if not directory.exists():
for i, doc_tuple in enumerate(doc_tuples):
with open(directory / '{}.msg'.format(i), 'wb') as file_:
msgpack.dump([doc_tuple], file_, use_bin_type=True, encoding='utf8')
def walk_corpus(path):
path = util.ensure_path(path)
if not path.is_dir():
return [path]
paths = [path]
@ -266,6 +141,101 @@ class GoldCorpus(object):
return locs
def read_tuples(locs, limit=0):
i = 0
for loc in locs:
loc = util.ensure_path(loc)
if loc.parts[-1].endswith('json'):
gold_tuples = read_json_file(loc)
elif loc.parts[-1].endswith('msg'):
with loc.open('rb') as file_:
gold_tuples = msgpack.load(file_, encoding='utf8')
msg = "Cannot read from file: %s. Supported formats: .json, .msg"
raise ValueError(msg % loc)
for item in gold_tuples:
yield item
i += len(item[1])
if limit and i >= limit:
def dev_tuples(self):
locs = (self.tmp_dir / 'dev').iterdir()
yield from self.read_tuples(locs, limit=self.limit)
def train_tuples(self):
locs = (self.tmp_dir / 'train').iterdir()
yield from self.read_tuples(locs, limit=self.limit)
def count_train(self):
n = 0
i = 0
for raw_text, paragraph_tuples in self.train_tuples:
for sent_tuples, brackets in paragraph_tuples:
n += len(sent_tuples[1])
if self.limit and i >= self.limit:
i += len(paragraph_tuples)
return n
def train_docs(self, nlp, gold_preproc=False, max_length=None,
locs = list((self.tmp_dir / 'train').iterdir())
train_tuples = self.read_tuples(locs, limit=self.limit)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
yield from gold_docs
def dev_docs(self, nlp, gold_preproc=False):
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples,
yield from gold_docs
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
noise_level=0.0, make_projective=False):
for raw_text, paragraph_tuples in tuples:
if gold_preproc:
raw_text = None
paragraph_tuples = merge_sents(paragraph_tuples)
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
gold_preproc, noise_level=noise_level)
golds = cls._make_golds(docs, paragraph_tuples, make_projective)
for doc, gold in zip(docs, golds):
if (not max_length) or len(doc) < max_length:
yield doc, gold
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
if raw_text is not None:
raw_text = add_noise(raw_text, noise_level)
return [nlp.make_doc(raw_text)]
return [Doc(nlp.vocab,
words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples]
def _make_golds(cls, docs, paragraph_tuples, make_projective):
assert len(docs) == len(paragraph_tuples)
if len(docs) == 1:
return [GoldParse.from_annot_tuples(docs[0],
return [GoldParse.from_annot_tuples(doc, sent_tuples,
for doc, (sent_tuples, brackets)
in zip(docs, paragraph_tuples)]
def add_noise(orig, noise_level):
if random.random() >= noise_level:
@ -297,11 +267,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
for filename in loc.iterdir():
yield from read_json_file(loc / filename, limit=limit)
with loc.open('r', encoding='utf8') as file_:
docs = ujson.load(file_)
if limit is not None:
docs = docs[:limit]
for doc in docs:
for doc in _json_iterate(loc):
if docs_filter is not None and not docs_filter(doc):
paragraphs = []
@ -331,6 +297,56 @@ def read_json_file(loc, docs_filter=None, limit=None):
yield [paragraph.get('raw', None), sents]
def _json_iterate(loc):
# Generator: yields one parsed JSON object per `{...}` span found directly
# inside the outermost `[...]` of the file at `loc`.
# We should've made these files jsonl...But since we didn't, parse out
# the docs one-by-one to reduce memory usage.
# It's okay to read in the whole file -- just don't parse it into JSON.
cdef bytes py_raw
loc = util.ensure_path(loc)
with loc.open('rb') as file_:
py_raw = file_.read()
# View the bytes object through a C char pointer for per-byte access.
raw = <char*>py_raw
# Scanner state: bracket nesting depths, string/escape flags, and the byte
# offset where the current document began (-1 when no document is open).
cdef int square_depth = 0
cdef int curly_depth = 0
cdef int inside_string = 0
cdef int escape = 0
cdef int start = -1
cdef char c
# Byte values of the structural characters the scanner reacts to.
cdef char quote = ord('"')
cdef char backslash = ord('\\')
cdef char open_square = ord('[')
cdef char close_square = ord(']')
cdef char open_curly = ord('{')
cdef char close_curly = ord('}')
for i in range(len(py_raw)):
c = raw[i]
# NOTE(review): as shown here there is no `continue` after the escape,
# quote and inside-string checks below -- confirm against the full file
# that escaped characters and string contents are skipped as intended.
if c == backslash:
escape = True
if escape:
escape = False
if c == quote:
inside_string = not inside_string
if inside_string:
if c == open_square:
square_depth += 1
elif c == close_square:
square_depth -= 1
elif c == open_curly:
# A '{' seen at square depth 1 with no open '{' starts a new document.
if square_depth == 1 and curly_depth == 0:
start = i
curly_depth += 1
elif c == close_curly:
curly_depth -= 1
# Back at curly depth 0 inside the outer list: the document is complete,
# so decode only that byte span and parse it.
if square_depth == 1 and curly_depth == 0:
py_str = py_raw[start : i+1].decode('utf8')
yield ujson.loads(py_str)
start = -1
def iob_to_biluo(tags):
out = []
curr_label = None
@ -434,8 +450,21 @@ cdef class GoldParse:
self.labels = [None] * len(doc)
self.ner = [None] * len(doc)
self.cand_to_gold = align([t.orth_ for t in doc], words)
self.gold_to_cand = align(words, [t.orth_ for t in doc])
# This needs to be done before we align the words
if make_projective and heads is not None and deps is not None:
heads, deps = nonproj.projectivize(heads, deps)
# Do many-to-one alignment for misaligned tokens.
# If we over-segment, we'll have one gold word that covers a sequence
# of predicted words
# If we under-segment, we'll have one predicted word that covers a
# sequence of gold words.
# If we "mis-segment", we'll have a sequence of predicted words covering
# a sequence of gold words. That's many-to-many -- we don't do that.
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
self.orig_annot = list(zip(*annot_tuples))
@ -443,12 +472,47 @@ cdef class GoldParse:
for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace():
self.words[i] = doc[i].text
self.tags[i] = 'SP'
self.tags[i] = '_SP'
self.heads[i] = None
self.labels[i] = None
self.ner[i] = 'O'
if gold_i is None:
if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
is_first = i2j_multi[i] != i2j_multi.get(i-1)
# Set next word in multi-token span as head, until last
if not is_last:
self.heads[i] = i+1
self.labels[i] = 'subtok'
self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
self.labels[i] = deps[i2j_multi[i]]
# Now set NER. This is annoying: if an entity word has been split
# into several tokens, we need to adjust the BILOU tags, because
# we can't have sequences such as BB or LL.
# Case 1: O -- easy.
ner_tag = entities[i2j_multi[i]]
if ner_tag == 'O':
self.ner[i] = 'O'
# Case 2: U. This has to become a B I* L sequence.
elif ner_tag.startswith('U-'):
if is_first:
self.ner[i] = ner_tag.replace('U-', 'B-', 1)
elif is_last:
self.ner[i] = ner_tag.replace('U-', 'L-', 1)
self.ner[i] = ner_tag.replace('U-', 'I-', 1)
# Case 3: L. If not last, change to I.
elif ner_tag.startswith('L-'):
if is_last:
self.ner[i] = ner_tag
self.ner[i] = ner_tag.replace('L-', 'I-', 1)
# Case 4: I. Stays correct
elif ner_tag.startswith('I-'):
self.ner[i] = ner_tag
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
@ -463,10 +527,6 @@ cdef class GoldParse:
if cycle is not None:
raise Exception("Cycle found: %s" % cycle)
if make_projective:
proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
self.heads = proj_heads
def __len__(self):
"""Get the number of gold-standard tokens.
@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
must my myself
name namely neither never nevertheless next nine no nobody none noone nor not
nothing now nowhere
nothing now nowhere n't
of off often on once one only onto or other others otherwise our ours ourselves
out over own
@ -66,4 +66,6 @@ whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves
'd 'll 'm 're 's 've
@ -6,17 +6,19 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX
def noun_chunks(obj):
doc = obj.doc
np_label = doc.vocab.strings['NP']
if not len(doc):
np_label = doc.vocab.strings.add('NP')
left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
right_labels = ['flat', 'fixed', 'compound', 'neg']
stop_labels = ['punct']
np_left_deps = [doc.vocab.strings[label] for label in left_labels]
np_right_deps = [doc.vocab.strings[label] for label in right_labels]
stop_deps = [doc.vocab.strings[label] for label in stop_labels]
np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
token = doc[0]
while token and token.i < len(doc):
if token.pos in [PROPN, NOUN, PRON]:
left, right = noun_bounds(token)
left, right = noun_bounds(doc, token, np_left_deps, np_right_deps, stop_deps)
yield left.i, right.i+1, np_label
token = right
token = next_token(token)
@ -33,7 +35,7 @@ def next_token(token):
return None
def noun_bounds(root):
def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
left_bound = root
for token in reversed(list(root.lefts)):
if token.dep in np_left_deps:
@ -41,7 +43,7 @@ def noun_bounds(root):
right_bound = root
for token in root.rights:
if (token.dep in np_right_deps):
left, right = noun_bounds(token)
left, right = noun_bounds(doc, token, np_left_deps, np_right_deps, stop_deps)
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
doc[left_bound.i: right.i])):
Normal file
Normal file
@ -0,0 +1,15 @@
# coding: utf8
from __future__ import unicode_literals
Example sentences to test spaCy and its language models.
>>> from spacy.lang.fi.examples import sentences
>>> docs = nlp.pipe(sentences)
sentences = [
"Apple harkitsee ostavansa startup-yrityksen UK:sta 1 miljardilla dollarilla.",
"Itseajavat autot siirtävät vakuutusriskin valmistajille.",
"San Francisco harkitsee jakelurobottien kieltämistä jalkakäytävillä.",
"Lontoo on iso kaupunki Iso-Britanniassa."
Normal file
Normal file
@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals
# import the symbols for the attrs you want to overwrite
from ...attrs import LIKE_NUM
# check if token resembles a number
# Finnish number words recognised by like_num(). Each entry needs its own
# comma: adjacent string literals are silently concatenated into one string.
_num_words = [
    'nolla', 'yksi', 'kaksi', 'kolme', 'neljä', 'viisi', 'kuusi', 'seitsemän',
    'kahdeksan', 'yhdeksän', 'kymmenen', 'yksitoista', 'kaksitoista',
    'kolmetoista', 'neljätoista', 'viisitoista', 'kuusitoista',
    'seitsemäntoista', 'kahdeksantoista', 'yhdeksäntoista', 'kaksikymmentä',
    'kolmekymmentä', 'neljäkymmentä', 'viisikymmentä', 'kuusikymmentä',
    'seitsemänkymmentä', 'kahdeksankymmentä', 'yhdeksänkymmentä', 'sata',
    'tuhat', 'miljoona', 'miljardi', 'triljoona']


def like_num(text):
    """Check whether a token's text resembles a number.

    text (unicode): The token text.
    RETURNS (bool): True if, after removing '.' and ',' separators, the text
        is all digits, is a simple digit/digit fraction, or is one of the
        Finnish number words in `_num_words`; False otherwise.
    """
    # Drop thousands/decimal separators so "1.000,5" is judged on its digits.
    text = text.replace('.', '').replace(',', '')
    if text.isdigit():
        return True
    # Exactly one slash: accept only if both sides are plain digits.
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    return text in _num_words
LIKE_NUM: like_num
@ -79,7 +79,7 @@ pienestä pieni pienin poikki puolesta puolestaan päälle
saakka sama samaa samaan samalla saman samat samoin sata sataa satojen se
saakka sama samaa samaan samalla saman samat samoin satojen se
seitsemän sekä sen seuraavat siellä sieltä siihen siinä siis siitä sijaan siksi
sille silloin sillä silti siltä sinne sinua sinulla sinulle sinulta sinun
sinussa sinusta sinut sinuun sinä sisäkkäin sisällä siten sitten sitä ssa sta
@ -89,7 +89,7 @@ taa taas taemmas tahansa tai takaa takaisin takana takia tallä tapauksessa
tarpeeksi tavalla tavoitteena te teidän teidät teihin teille teillä teiltä
teissä teistä teitä tietysti todella toinen toisaalla toisaalle toisaalta
toiseen toiseksi toisella toiselle toiselta toisemme toisen toisensa toisessa
toisesta toista toistaiseksi toki tosin tuhannen tuhat tule tulee tulemme tulen
toisesta toista toistaiseksi toki tosin tule tulee tulemme tulen
tulet tulette tulevat tulimme tulin tulisi tulisimme tulisin tulisit tulisitte
tulisivat tulit tulitte tulivat tulla tulleet tullut tuntuu tuo tuohon tuoksi
tuolla tuolle tuolloin tuolta tuon tuona tuonne tuossa tuosta tuota tuskin tykö
@ -35,14 +35,32 @@ class JapaneseTokenizer(object):
def from_disk(self, path, **exclude):
return self
class JapaneseCharacterSegmenter(object):
# Callable that turns a text into a Doc built from parallel `words` and
# `spaces` lists.
def __init__(self, vocab):
# The vocab is stored so __call__ can construct the output Doc with it.
self.vocab = vocab
def __call__(self, text):
# NOTE(review): `self.tokenizer` is used below but is not assigned in the
# __init__ shown above -- confirm where it is expected to come from.
# NOTE(review): `doc` is assigned and never read in the visible code, and
# the tokenizer is invoked a second time in the loop header.
words = []
spaces = []
doc = self.tokenizer(text)
for token in self.tokenizer(text):
# Record on the last `spaces` entry whether this token has trailing
# whitespace. NOTE(review): nothing visible here appends to `words` or
# `spaces` first; lines appear to be missing from this view.
spaces[-1] = bool(token.whitespace_)
return Doc(self.vocab, words=words, spaces=spaces)
class JapaneseDefaults(Language.Defaults):
# Copy the inherited getters so the LANG override below does not modify the
# dict on Language.Defaults.
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'ja'
# Class-level switch read by create_tokenizer: True selects JapaneseTokenizer,
# False selects JapaneseCharacterSegmenter.
use_janome = True
def create_tokenizer(cls, nlp=None):
# Return the tokenizer object selected by `cls.use_janome`.
return JapaneseTokenizer(cls, nlp)
if cls.use_janome:
return JapaneseTokenizer(cls, nlp)
# NOTE(review): JapaneseCharacterSegmenter.__init__ as shown takes only
# `vocab`, but two positional arguments are passed here -- confirm.
# NOTE(review): `nlp` defaults to None, so `nlp.vocab` fails on this path
# when no nlp object is supplied.
return JapaneseCharacterSegmenter(cls, nlp.vocab)
class Japanese(Language):
@ -144,7 +144,7 @@ def is_lower(string): return string.islower()
def is_space(string): return string.isspace()
def is_title(string): return string.istitle()
def is_upper(string): return string.isupper()
def is_stop(string, stops=set()): return string in stops
def is_stop(string, stops=set()): return string.lower() in stops
def is_oov(string): return True
def get_prob(string): return -20.
@ -2,6 +2,7 @@
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
@ -17,6 +18,7 @@ class PolishDefaults(Language.Defaults):
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
tag_map = TAG_MAP
class Polish(Language):
@ -1,7 +1,7 @@
# encoding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN, ADP
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN
_exc = {}
@ -12,24 +12,11 @@ for exc_data in [
{ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ},
{ORTH: "adw.", LEMMA: "adwokat", POS: NOUN},
{ORTH: "afr.", LEMMA: "afrykański", POS: ADJ},
{ORTH: "c.b.d.o.", LEMMA: "co było do okazania", POS: ADV},
{ORTH: "cbdu.", LEMMA: "co było do udowodnienia", POS: ADV},
{ORTH: "mn.w.", LEMMA: "mniej więcej", POS: ADV},
{ORTH: "nt.", LEMMA: "na temat", POS: ADP},
{ORTH: "ok.", LEMMA: "około"},
{ORTH: "n.p.u.", LEMMA: "na psa urok"},
{ORTH: "ww.", LEMMA: "wyżej wymieniony", POS: ADV}]:
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
_exc[exc_data[ORTH]] = [exc_data]
for orth in [
"w.", "r.", "br.", "bm.", "b.r.", "amer.", "am.", "bdb.", "św.", "p.", "lit.",
"wym.", "czyt.", "daw.", "d.", "zob.", "gw.", "dn.", "dyr.", "im.", "mł.",
"min.", "dot.", "muz.", "k.k.", "k.p.a.", "k.p.c.", "n.p.m.", "p.p.m.", "nb.",
"ob.", "n.e.", "p.n.e.", "zw.", "zool.", "zach.", "żarg.", "żart.", "wzgl.",
"wyj.", "xx.", "ks.", "x.", "wyd.", "wsch.", "o.o."]:
"w.", "r."]:
_exc[orth] = [{ORTH: orth}]
@ -24,5 +24,5 @@ TAG_MAP = {
"ADJ": {POS: ADJ},
@ -0,0 +1,19 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
class VietnameseDefaults(Language.Defaults):
# Defaults for Vietnamese; only the LANG getter is overridden here.
# Copy the inherited getters so the assignment below does not modify the
# dict on Language.Defaults.
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'vi' # for pickling
class Vietnamese(Language):
# Language subclass identified by the code 'vi', using VietnameseDefaults.
lang = 'vi'
Defaults = VietnameseDefaults # override defaults
__all__ = ['Vietnamese']
@ -9,6 +9,7 @@ from ...tokens import Doc
class ChineseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'zh' # for pickling
use_jieba = True
class Chinese(Language):
@ -16,14 +17,25 @@ class Chinese(Language):
Defaults = ChineseDefaults # override defaults
def make_doc(self, text):
import jieba
except ImportError:
raise ImportError("The Chinese tokenizer requires the Jieba library: "
words = list(jieba.cut(text, cut_all=False))
words = [x for x in words if x]
return Doc(self.vocab, words=words, spaces=[False]*len(words))
if self.Defaults.use_jieba:
import jieba
except ImportError:
msg = ("Jieba not installed. Either set Chinese.use_jieba = False, "
"or install it https://github.com/fxsjy/jieba")
raise ImportError(msg)
words = list(jieba.cut(text, cut_all=False))
words = [x for x in words if x]
return Doc(self.vocab, words=words, spaces=[False]*len(words))
words = []
spaces = []
doc = self.tokenizer(text)
for token in self.tokenizer(text):
spaces[-1] = bool(token.whitespace_)
return Doc(self.vocab, words=words, spaces=spaces)
__all__ = ['Chinese']
@ -17,7 +17,7 @@ from .vocab import Vocab
from .lemmatizer import Lemmatizer
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
from .pipeline import merge_noun_chunks, merge_entities
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
from .compat import json_dumps, izip, basestring_
from .gold import GoldParse
from .scorer import Scorer
@ -108,7 +108,8 @@ class Language(object):
'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks,
'merge_entities': lambda nlp, **cfg: merge_entities
'merge_entities': lambda nlp, **cfg: merge_entities,
'merge_subtokens': lambda nlp, **cfg: merge_subtokens,
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
@ -27,11 +27,13 @@ class Lemmatizer(object):
univ_pos = 'adj'
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
univ_pos = 'punct'
elif univ_pos in (PROPN, 'PROPN'):
return [string]
return list(set([string.lower()]))
return [string.lower()]
# See Issue #435 for example of where this logic is required.
if self.is_base_form(univ_pos, morphology):
return list(set([string.lower()]))
return [string.lower()]
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
self.exc.get(univ_pos, {}),
self.rules.get(univ_pos, []))
@ -88,6 +90,7 @@ class Lemmatizer(object):
def lemmatize(string, index, exceptions, rules):
orig = string
string = string.lower()
forms = []
forms.extend(exceptions.get(string, []))
@ -105,5 +108,5 @@ def lemmatize(string, index, exceptions, rules):
if not forms:
if not forms:
return list(set(forms))
@ -1,24 +1,19 @@
# cython: profile=True
# cython: infer_types=True
# coding: utf8
# cython: profile=True
from __future__ import unicode_literals
import ujson
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libcpp.pair cimport pair
from libc.stdint cimport int32_t, uint64_t, uint16_t
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from libc.stdint cimport int32_t
from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .typedefs cimport attr_t, hash_t
from .structs cimport TokenC
from .tokens.doc cimport Doc, get_token_attr
from .lexeme cimport attr_id_t
from .vocab cimport Vocab
from .tokens.doc cimport Doc
from .tokens.doc cimport get_token_attr
from .attrs cimport ID, attr_id_t, NULL_ATTR
from .attrs import IDS
from .attrs cimport attr_id_t, ID, NULL_ATTR
from .attrs import FLAG61 as U_ENT
from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT
@ -48,29 +43,24 @@ from .attrs import FLAG36 as L9_ENT
from .attrs import FLAG35 as L10_ENT
cpdef enum quantifier_t:
cdef enum action_t:
REJECT = 0000
MATCH = 1000
ADVANCE = 0100
RETRY = 0010
cdef enum quantifier_t:
cdef enum action_t:
# A "match expression" consists of one or more token patterns
# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
# A state is an (int, pattern pointer) pair, where the int is the start
# position, and the pattern pointer shows where we're up to
# in the pattern.
cdef struct AttrValueC:
attr_id_t attr
attr_t value
@ -80,10 +70,231 @@ cdef struct TokenPatternC:
AttrValueC* attrs
int32_t nr_attr
quantifier_t quantifier
hash_t key
ctypedef TokenPatternC* TokenPatternC_ptr
ctypedef pair[int, TokenPatternC_ptr] StateC
cdef struct ActionC:
char emit_match
char next_state_next_token
char next_state_same_token
char same_state_next_token
cdef struct PatternStateC:
TokenPatternC* pattern
int32_t start
int32_t length
cdef struct MatchC:
attr_t pattern_id
int32_t start
int32_t length
cdef find_matches(TokenPatternC** patterns, int n, Doc doc):
'''Run `n` compiled patterns over `doc` and collect the matches.

patterns: array of `n` pointers, each to the first TokenPatternC of a pattern.
n: number of entries in `patterns`.
doc: the Doc whose tokens (`doc.c[i]`) are scanned from first to last.
RETURNS (list): (pattern_id, start, end) tuples, with end = start + length.
'''
cdef vector[PatternStateC] states
cdef vector[MatchC] matches
cdef PatternStateC state
# Pool from which the per-call extra_attrs array below is allocated.
cdef Pool mem = Pool()
# TODO: Prefill this with the extra attribute values.
# One attr_t* slot per token. NOTE(review): nothing visible here fills these
# slots, so transition_states receives whatever the allocator left in them --
# confirm this is safe.
extra_attrs = <attr_t**>mem.alloc(len(doc), sizeof(attr_t*))
# Main loop
cdef int i, j
for i in range(doc.length):
# Open a fresh state for every pattern, starting at token i with length 0,
# then step all open states over token i.
for j in range(n):
states.push_back(PatternStateC(patterns[j], i, 0))
transition_states(states, matches, &doc.c[i], extra_attrs[i])
# Handle matches that end in 0-width patterns
finish_states(matches, states)
# Convert the C match structs into Python (pattern_id, start, end) tuples.
return [(matches[i].pattern_id, matches[i].start, matches[i].start+matches[i].length)
for i in range(matches.size())]
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
const TokenC* token, const attr_t* extra_attrs) except *:
cdef int q = 0
cdef vector[PatternStateC] new_states
for i in range(states.size()):
action = get_action(states[i], token, extra_attrs)
if action == REJECT:
state = states[i]
states[q] = state
while action in (RETRY, RETRY_EXTEND):
if action == RETRY_EXTEND:
PatternStateC(pattern=state.pattern, start=state.start,
states[q].pattern += 1
action = get_action(states[q], token, extra_attrs)
if action == REJECT:
elif action == ADVANCE:
states[q].pattern += 1
states[q].length += 1
q += 1
ent_id = state.pattern[1].attrs.value
if action == MATCH:
MatchC(pattern_id=ent_id, start=state.start,
elif action == MATCH_REJECT:
MatchC(pattern_id=ent_id, start=state.start,
elif action == MATCH_EXTEND:
MatchC(pattern_id=ent_id, start=state.start,
states[q].length += 1
q += 1
for i in range(new_states.size()):
cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *:
'''Handle states that end in zero-width patterns.'''
cdef PatternStateC state
for i in range(states.size()):
state = states[i]
while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE):
is_final = get_is_final(state)
if is_final:
ent_id = state.pattern[1].attrs.value
MatchC(pattern_id=ent_id, start=state.start, length=state.length))
state.pattern += 1
cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
'''We need to consider:
a) Does the token match the specification? [Yes, No]
b) What's the quantifier? [1, 0+, ?]
c) Is this the last specification? [final, non-final]
We can transition in the following ways:
a) Do we emit a match?
b) Do we add a state with (next state, next token)?
c) Do we add a state with (next state, same token)?
d) Do we add a state with (same state, next token)?
We'll code the actions as boolean strings, so 0000 means no to all 4,
1000 means match but no states added, etc.
Yes, final:
Yes, non-final:
No, final:
No, non-final
Yes, final:
Yes, non-final:
No, final:
1000 (note: Don't include last token!)
No, non-final:
Yes, final:
Yes, non-final:
No, final:
1000 (note: Don't include last token!)
No, non-final:
Possible combinations: 1000, 0100, 0000, 1001, 0011, 0010,
We'll name the bits "match", "advance", "retry", "extend"
REJECT = 0000
MATCH = 1000
ADVANCE = 0100
RETRY = 0010
MATCH_REJECT = 2000 # Match, but don't include last token
Problem: If a quantifier is matching, we're adding a lot of open partials
cdef char is_match
is_match = get_is_match(state, token, extra_attrs)
quantifier = get_quantifier(state)
is_final = get_is_final(state)
if quantifier == ZERO:
is_match = not is_match
quantifier = ONE
if quantifier == ONE:
if is_match and is_final:
# Yes, final: 1000
return MATCH
elif is_match and not is_final:
# Yes, non-final: 0100
return ADVANCE
elif not is_match and is_final:
# No, final: 0000
return REJECT
return REJECT
elif quantifier == ZERO_PLUS:
if is_match and is_final:
# Yes, final: 1001
elif is_match and not is_final:
# Yes, non-final: 0011
elif not is_match and is_final:
# No, final 2000 (note: Don't include last token!)
# No, non-final 0010
return RETRY
elif quantifier == ZERO_ONE:
if is_match and is_final:
# Yes, final: 1000
return MATCH
elif is_match and not is_final:
# Yes, non-final: 0100
return ADVANCE
elif not is_match and is_final:
# No, final 2000 (note: Don't include last token!)
# No, non-final 0010
return RETRY
cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
# Return 1 if `token` satisfies every (attr, value) pair of the token pattern
# that `state` currently points at, otherwise 0. A pattern with nr_attr == 0
# imposes no constraints and so returns 1. `extra_attrs` is not read here.
spec = state.pattern
for attr in spec.attrs[:spec.nr_attr]:
# The first attribute whose value differs rejects the token.
if get_token_attr(token, attr.attr) != attr.value:
return 0
return 1
cdef char get_is_final(PatternStateC state) nogil:
# Return 1 if the slot after the current token pattern (pattern[1]) has
# nr_attr == 0 and its first attr is ID, otherwise 0.
# NOTE(review): presumably that slot is the terminal entry init_pattern writes
# after the last token spec -- confirm.
if state.pattern[1].attrs[0].attr == ID and state.pattern[1].nr_attr == 0:
return 1
return 0
cdef char get_quantifier(PatternStateC state) nogil:
# Return the quantifier of the token pattern that `state` currently points at.
return state.pattern.quantifier
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
@ -97,6 +308,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
for j, (attr, value) in enumerate(spec):
pattern[i].attrs[j].attr = attr
pattern[i].attrs[j].value = value
pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
i = len(token_specs)
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
pattern[i].attrs[0].attr = ID
@ -105,48 +317,16 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
return pattern
cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil:
while pattern.nr_attr != 0:
pattern += 1
id_attr = pattern[0].attrs[0]
assert id_attr.attr == ID
return id_attr.value
cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
lookahead = &pattern[1]
for attr in pattern.attrs[:pattern.nr_attr]:
if get_token_attr(token, attr.attr) != attr.value:
if pattern.quantifier == ONE:
return REJECT
elif pattern.quantifier == ZERO:
return ACCEPT if lookahead.nr_attr == 0 else ADVANCE
elif pattern.quantifier in (ZERO_ONE, ZERO_PLUS):
return ACCEPT_PREV if lookahead.nr_attr == 0 else ADVANCE_ZERO
return PANIC
if pattern.quantifier == ZERO:
return REJECT
elif lookahead.nr_attr == 0:
return ACCEPT
elif pattern.quantifier in (ONE, ZERO_ONE):
return ADVANCE
elif pattern.quantifier == ZERO_PLUS:
# This is a bandaid over the 'shadowing' problem described here:
# https://github.com/explosion/spaCy/issues/864
next_action = get_action(lookahead, token)
if next_action is REJECT:
return REPEAT
return PANIC
def _convert_strings(token_specs, string_store):
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
'?': (ZERO_ONE,), '1': (ONE,)}
operators = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
'?': (ZERO_ONE,), '1': (ONE,), '!': (ZERO,)}
tokens = []
op = ONE
for spec in token_specs:
@ -176,21 +356,6 @@ def _convert_strings(token_specs, string_store):
return tokens
def merge_phrase(matcher, doc, i, matches):
"""Callback to merge a phrase on match."""
ent_id, label, start, end = matches[i]
span = doc[start:end]
span.merge(ent_type=label, ent_id=ent_id)
def unpickle_matcher(vocab, patterns, callbacks):
matcher = Matcher(vocab)
for key, specs in patterns.items():
callback = callbacks.get(key, None)
matcher.add(key, callback, *specs)
return matcher
cdef class Matcher:
"""Match sequences of tokens, based on pattern rules."""
cdef Pool mem
@ -311,7 +476,7 @@ cdef class Matcher:
if key not in self._patterns:
return default
return (self._callbacks[key], self._patterns[key])
def pipe(self, docs, batch_size=1000, n_threads=2):
"""Match a stream of documents, yielding them in turn.
@ -333,85 +498,9 @@ cdef class Matcher:
describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers.
cdef vector[StateC] partials
cdef int n_partials = 0
cdef int q = 0
cdef int i, token_i
cdef const TokenC* token
cdef StateC state
matches = []
for token_i in range(doc.length):
token = &doc.c[token_i]
q = 0
# Go over the open matches, extending or finalizing if able.
# Otherwise, we over-write them (q doesn't advance)
for state in partials:
action = get_action(state.second, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
while action == ADVANCE_ZERO:
state.second += 1
action = get_action(state.second, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
if action == REPEAT:
# Leave the state in the queue, and advance to the next slot
# (i.e. we don't overwrite -- we want to greedily match
# more of the pattern).
q += 1
elif action == REJECT:
elif action == ADVANCE:
partials[q] = state
partials[q].second += 1
q += 1
elif action in (ACCEPT, ACCEPT_PREV):
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
start = state.first
end = token_i+1 if action == ACCEPT else token_i
ent_id = state.second[1].attrs[0].value
label = state.second[1].attrs[1].value
matches.append((ent_id, start, end))
# Check whether we open any new patterns on this token
for pattern in self.patterns:
action = get_action(pattern, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
while action == ADVANCE_ZERO:
pattern += 1
action = get_action(pattern, token)
if action == REPEAT:
state.first = token_i
state.second = pattern
elif action == ADVANCE:
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
state.first = token_i
state.second = pattern + 1
elif action in (ACCEPT, ACCEPT_PREV):
start = token_i
end = token_i+1 if action == ACCEPT else token_i
ent_id = pattern[1].attrs[0].value
label = pattern[1].attrs[1].value
matches.append((ent_id, start, end))
# Look for open patterns that are actually satisfied
for state in partials:
while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
state.second += 1
if state.second.nr_attr == 0:
start = state.first
end = len(doc)
ent_id = state.second.attrs[0].value
label = state.second.attrs[0].value
matches.append((ent_id, start, end))
for i, (ent_id, start, end) in enumerate(matches):
on_match = self._callbacks.get(ent_id)
matches = find_matches(&self.patterns[0], self.patterns.size(), doc)
for i, (key, start, end) in enumerate(matches):
on_match = self._callbacks.get(key, None)
if on_match is not None:
on_match(self, doc, i, matches)
return matches
@ -423,31 +512,37 @@ cdef class Matcher:
return key
def unpickle_matcher(vocab, patterns, callbacks):
    '''Rebuild a Matcher from its pickled parts.

    vocab: the vocabulary the new Matcher is created with.
    patterns (dict): maps each key to its sequence of pattern specs.
    callbacks (dict): maps keys to on-match callbacks; keys that are absent
        are registered with None.
    RETURNS (Matcher): a matcher with every key in `patterns` added.
    '''
    restored = Matcher(vocab)
    for key in patterns:
        on_match = callbacks.get(key, None)
        restored.add(key, on_match, *patterns[key])
    return restored
def _get_longest_matches(matches):
    '''Keep only the longest match for each (pattern_id, start) pair.

    matches: iterable of (pattern_id, start, end) tuples.
    RETURNS (list): one (pattern_id, start, end) tuple per distinct
        (pattern_id, start), with `end` taken from the longest match seen.
    '''
    best_length = {}
    for pattern_id, start, end in matches:
        anchor = (pattern_id, start)
        span_length = end - start
        known = best_length.get(anchor)
        # Only a strictly longer match replaces one that is already recorded.
        if known is None or span_length > known:
            best_length[anchor] = span_length
    longest = []
    for (pattern_id, start), span_length in best_length.items():
        longest.append((pattern_id, start, start + span_length))
    return longest
def get_bilou(length):
if length == 1:
if length == 0:
raise ValueError("Length must be >= 1")
elif length == 1:
return [U_ENT]
elif length == 2:
return [B2_ENT, L2_ENT]
elif length == 3:
return [B3_ENT, I3_ENT, L3_ENT]
elif length == 4:
return [B4_ENT, I4_ENT, I4_ENT, L4_ENT]
elif length == 5:
return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT]
elif length == 6:
return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT]
elif length == 7:
return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT]
elif length == 8:
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
elif length == 9:
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
elif length == 10:
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
I10_ENT, I10_ENT, L10_ENT]
raise ValueError("Max length currently 10 for phrase matching")
return [B4_ENT, I4_ENT] + [I4_ENT] * (length-3) + [L4_ENT]
cdef class PhraseMatcher:
@ -456,21 +551,21 @@ cdef class PhraseMatcher:
cdef Matcher matcher
cdef PreshMap phrase_ids
cdef int max_length
cdef attr_t* _phrase_key
cdef public object _callbacks
cdef public object _patterns
def __init__(self, Vocab vocab, max_length=10):
self.mem = Pool()
self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
self.max_length = max_length
self.vocab = vocab
self.matcher = Matcher(self.vocab)
self.phrase_ids = PreshMap()
abstract_patterns = []
for length in range(1, max_length):
abstract_patterns.append([{tag: True}
for tag in get_bilou(length)])
abstract_patterns = [
[{U_ENT: True}],
[{B2_ENT: True}, {L2_ENT: True}],
[{B3_ENT: True}, {I3_ENT: True}, {L3_ENT: True}],
[{B4_ENT: True}, {I4_ENT: True}, {I4_ENT: True, "OP": "+"}, {L4_ENT: True}],
self.matcher.add('Candidate', None, *abstract_patterns)
self._callbacks = {}
@ -504,29 +599,24 @@ cdef class PhraseMatcher:
*docs (Doc): `Doc` objects representing match patterns.
cdef Doc doc
for doc in docs:
if len(doc) >= self.max_length:
msg = (
"Pattern length (%d) >= phrase_matcher.max_length (%d). "
"Length can be set on initialization, up to 10."
raise ValueError(msg % (len(doc), self.max_length))
cdef hash_t ent_id = self.matcher._normalize_key(key)
self._callbacks[ent_id] = on_match
cdef int length
cdef int i
cdef hash_t phrase_hash
cdef Pool mem = Pool()
for doc in docs:
length = doc.length
if length == 0:
tags = get_bilou(length)
for i in range(self.max_length):
self._phrase_key[i] = 0
phrase_key = <attr_t*>mem.alloc(length, sizeof(attr_t))
for i, tag in enumerate(tags):
lexeme = self.vocab[doc.c[i].lex.orth]
lexeme.set_flag(tag, True)
self._phrase_key[i] = lexeme.orth
phrase_hash = hash64(self._phrase_key,
self.max_length * sizeof(attr_t), 0)
phrase_key[i] = lexeme.orth
phrase_hash = hash64(phrase_key,
length * sizeof(attr_t), 0)
self.phrase_ids.set(phrase_hash, <void*>ent_id)
def __call__(self, Doc doc):
@ -548,28 +638,45 @@ cdef class PhraseMatcher:
on_match(self, doc, i, matches)
return matches
def pipe(self, stream, batch_size=1000, n_threads=2):
def pipe(self, stream, batch_size=1000, n_threads=2, return_matches=False,
"""Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents.
batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the implementation supports multi-threading.
return_matches (bool): Yield the match lists along with the docs, making
results (doc, matches) tuples.
as_tuples (bool): Interpret the input stream as (doc, context) tuples,
and yield (result, context) tuples out.
If both return_matches and as_tuples are True, the output will
be a sequence of ((doc, matches), context) tuples.
YIELDS (Doc): Documents, in order.
for doc in stream:
yield doc
if as_tuples:
for doc, context in stream:
matches = self(doc)
if return_matches:
yield ((doc, matches), context)
yield (doc, context)
for doc in stream:
matches = self(doc)
if return_matches:
yield (doc, matches)
yield doc
def accept_match(self, Doc doc, int start, int end):
assert (end - start) < self.max_length
cdef int i, j
for i in range(self.max_length):
self._phrase_key[i] = 0
cdef Pool mem = Pool()
phrase_key = <attr_t*>mem.alloc(end-start, sizeof(attr_t))
for i, j in enumerate(range(start, end)):
self._phrase_key[i] = doc.c[j].lex.orth
cdef hash_t key = hash64(self._phrase_key,
self.max_length * sizeof(attr_t), 0)
phrase_key[i] = doc.c[j].lex.orth
cdef hash_t key = hash64(phrase_key,
(end-start) * sizeof(attr_t), 0)
ent_id = <hash_t>self.phrase_ids.get(key)
if ent_id == 0:
return None
@ -47,7 +47,9 @@ cdef class Morphology:
cdef enum univ_morph_t:
NIL = 0
Animacy_anim = symbols.Animacy_anim
@ -184,7 +184,9 @@ cdef class Morphology:
IDS = {
"Animacy_anim": Animacy_anim,
"Animacy_inam": Animacy_inam,
"Animacy_inan": Animacy_inan,
"Animacy_hum": Animacy_hum, # U20
"Animacy_nhum": Animacy_nhum,
"Aspect_freq": Aspect_freq,
"Aspect_imp": Aspect_imp,
"Aspect_mod": Aspect_mod,
@ -25,6 +25,7 @@ from .morphology cimport Morphology
from .vocab cimport Vocab
from .syntax import nonproj
from .compat import json_dumps
from .matcher import Matcher
from .attrs import POS
from .parts_of_speech import X
@ -97,6 +98,17 @@ def merge_entities(doc):
return doc
def merge_subtokens(doc, label='subtok'):
    """Merge stretches of tokens whose dependency label is `label`.

    doc: The document to modify; its vocab is used to build the matcher.
    label (unicode): Dependency label that marks a subtoken.
    RETURNS: The same `doc` object, after the merges were applied.
    """
    subtok_matcher = Matcher(doc.vocab)
    subtok_matcher.add('SUBTOK', None, [{'DEP': label, 'op': '+'}])
    # Character offsets for every match are collected before any merge is
    # performed, so the merges below never look at token indices again.
    char_offsets = []
    for _, start, end in subtok_matcher(doc):
        # The slice deliberately extends one token past the reported end.
        span = doc[start:end+1]
        char_offsets.append((span.start_char, span.end_char))
    for start_char, end_char in char_offsets:
        doc.merge(start_char, end_char)
    return doc
class Pipe(object):
"""This class is not instantiated directly. Components inherit from it, and
it defines the interface that components should follow to function as
@ -167,7 +179,7 @@ class Pipe(object):
raise NotImplementedError
def create_optimizer(self):
    """Create the default optimizer for this component's model.

    Any settings stored under the 'optimizer' key of `self.cfg` are passed
    through as keyword arguments; without that key no extras are passed.

    RETURNS: Whatever `create_default_optimizer` returns.
    """
    ops = self.model.ops
    optimizer_settings = self.cfg.get('optimizer', {})
    return create_default_optimizer(ops, **optimizer_settings)
@ -652,11 +664,13 @@ class MultitaskObjective(Tagger):
self.make_label = self.make_dep_tag_offset
elif target == 'ent_tag':
self.make_label = self.make_ent_tag
elif target == 'sent_start':
self.make_label = self.make_sent_start
elif hasattr(target, '__call__'):
self.make_label = target
raise ValueError("MultitaskObjective target should be function or "
"one of: dep, tag, ent, dep_tag_offset, ent_tag.")
"one of: dep, tag, ent, sent_start, dep_tag_offset, ent_tag.")
self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2)
@ -716,11 +730,7 @@ class MultitaskObjective(Tagger):
for i, gold in enumerate(golds):
for j in range(len(docs[i])):
# Handles alignment for tokenization differences
gold_idx = gold.cand_to_gold[j]
if gold_idx is None:
idx += 1
label = self.make_label(gold_idx, gold.words, gold.tags,
label = self.make_label(j, gold.words, gold.tags,
gold.heads, gold.labels, gold.ents)
if label is None or label not in self.labels:
correct[idx] = guesses[idx]
@ -765,6 +775,51 @@ class MultitaskObjective(Tagger):
return '%s-%s' % (tags[i], ents[i])
def make_sent_start(target, words, tags, heads, deps, ents, cache=True, _cache={}):
'''A multi-task objective for representing sentence boundaries,
using BILU scheme. (O is impossible)
The implementation of this method uses an internal cache that relies
on the identity of the heads array, to avoid requiring a new piece
of gold data. You can pass cache=False if you know the cache will
do the wrong thing.
assert len(words) == len(heads)
assert target < len(words), (target, len(words))
if cache:
if id(heads) in _cache:
return _cache[id(heads)][target]
for key in list(_cache.keys()):
sent_tags = ['I-SENT'] * len(words)
_cache[id(heads)] = sent_tags
sent_tags = ['I-SENT'] * len(words)
def _find_root(child):
seen = set([child])
while child is not None and heads[child] != child:
child = heads[child]
return child
sentences = {}
for i in range(len(words)):
root = _find_root(i)
if root is None:
sent_tags[i] = None
sentences.setdefault(root, []).append(i)
for root, span in sorted(sentences.items()):
if len(span) == 1:
sent_tags[span[0]] = 'U-SENT'
sent_tags[span[0]] = 'B-SENT'
sent_tags[span[-1]] = 'L-SENT'
return sent_tags[target]
class SimilarityHook(Pipe):
@ -823,8 +878,8 @@ class TextCategorizer(Pipe):
name = 'textcat'
def Model(cls, nr_class=1, width=64, **cfg):
return build_text_classifier(nr_class, width, **cfg)
def Model(cls, **cfg):
return build_text_classifier(**cfg)
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
@ -890,6 +945,15 @@ class TextCategorizer(Pipe):
if label in self.labels:
return 0
if self.model not in (None, True, False):
# This functionality was available previously, but was broken.
# The problem is that we resize the last layer, but the last layer
# is actually just an ensemble. We're not resizing the child layers
# -- a huge problem.
raise ValueError(
"Cannot currently add labels to pre-trained text classifier. "
"Add labels before training begins. This functionality was "
"available in previous versions, but had significant bugs that "
"let to poor performance")
smaller = self.model._layers[-1]
larger = Affine(len(self.labels)+1, smaller.nI)
copy_array(larger.W[:smaller.nO], smaller.W)
@ -905,8 +969,9 @@ class TextCategorizer(Pipe):
token_vector_width = 64
if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(len(self.labels), token_vector_width,
self.cfg['nr_class'] = len(self.labels)
self.cfg['width'] = token_vector_width
self.model = self.Model(**self.cfg)
if sgd is None:
sgd = self.create_optimizer()
@ -920,7 +985,7 @@ cdef class DependencyParser(Parser):
def postprocesses(self):
return [nonproj.deprojectivize]
def add_multitask_objective(self, target):
labeller = MultitaskObjective(self.vocab, target=target)
@ -941,7 +1006,7 @@ cdef class EntityRecognizer(Parser):
TransitionSystem = BiluoPushDown
nr_feature = 6
def add_multitask_objective(self, target):
labeller = MultitaskObjective(self.vocab, target=target)
@ -1,7 +1,7 @@
# coding: utf8
from __future__ import division, print_function, unicode_literals
from .gold import tags_to_entities
from .gold import tags_to_entities, GoldParse
class PRFScore(object):
@ -84,6 +84,8 @@ class Scorer(object):
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
if len(tokens) != len(gold):
gold = GoldParse.from_annot_tuples(tokens, zip(*gold.orig_annot))
assert len(tokens) == len(gold)
gold_deps = set()
gold_tags = set()
@ -100,8 +102,7 @@ class Scorer(object):
gold_i = gold.cand_to_gold[token.i]
if gold_i is None:
if token.dep_.lower() not in punct_labels:
self.tokens.fp += 1
self.tokens.fp += 1
self.tokens.tp += 1
cand_tags.add((gold_i, token.tag_))
@ -85,6 +85,7 @@ cdef enum symbol_t:
@ -108,8 +109,9 @@ cdef enum symbol_t:
Animacy_hum # U20
@ -393,6 +395,7 @@ cdef enum symbol_t:
@ -451,10 +454,9 @@ cdef enum symbol_t:
@ -114,8 +114,9 @@ IDS = {
"Animacy_anim": Animacy_anim,
"Animacy_inam": Animacy_inam,
"Animacy_inam": Animacy_inan,
"Animacy_hum": Animacy_hum, # U20
"Animacy_nhum": Animacy_nhum,
"Aspect_freq": Aspect_freq,
"Aspect_imp": Aspect_imp,
"Aspect_mod": Aspect_mod,
@ -458,6 +459,7 @@ IDS = {
"punct": punct,
"quantmod": quantmod,
"rcmod": rcmod,
"relcl": relcl,
"root": root,
"xcomp": xcomp,
@ -108,7 +108,7 @@ cdef cppclass StateC:
ids[1] = this.B(1)
ids[2] = this.S(0)
ids[3] = this.S(1)
ids[4] = this.H(this.S(0))
ids[4] = this.S(2)
ids[5] = this.L(this.B(0), 1)
ids[6] = this.L(this.S(0), 1)
ids[7] = this.R(this.S(0), 1)
@ -6,16 +6,19 @@ from __future__ import unicode_literals
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from collections import OrderedDict
from collections import OrderedDict, defaultdict, Counter
from thinc.extra.search cimport Beam
import json
from .stateclass cimport StateClass
from ._state cimport StateC
from .nonproj import is_nonproj_tree
from . import nonproj
from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse, GoldParseC
from ..structs cimport TokenC
# Calculate cost as gold/not gold. We don't use scalar value anyway.
cdef int BINARY_COSTS = 1
@ -54,6 +57,8 @@ cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) no
cost += 1
if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)):
cost += 1
if BINARY_COSTS and cost >= 1:
return cost
cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0
return cost
@ -67,6 +72,8 @@ cdef weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nog
cost += gold.heads[target] == B_i
if gold.heads[B_i] == B_i or gold.heads[B_i] < target:
if BINARY_COSTS and cost >= 1:
return cost
if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0:
cost += 1
return cost
@ -110,7 +117,8 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
cdef class Shift:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and st.B_(0).sent_start != 1
sent_start = st._sent[st.B_(0).l_edge].sent_start
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and sent_start != 1
cdef int transition(StateC* st, attr_t label) nogil:
@ -170,7 +178,8 @@ cdef class Reduce:
cdef class LeftArc:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.B_(0).sent_start != 1
sent_start = st._sent[st.B_(0).l_edge].sent_start
return sent_start != 1
cdef int transition(StateC* st, attr_t label) nogil:
@ -205,7 +214,8 @@ cdef class RightArc:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
# If there's (perhaps partial) parse pre-set, don't allow cycle.
return st.B_(0).sent_start != 1 and st.H(st.S(0)) != st.B(0)
sent_start = st._sent[st.B_(0).l_edge].sent_start
return sent_start != 1 and st.H(st.S(0)) != st.B(0)
cdef int transition(StateC* st, attr_t label) nogil:
@ -312,39 +322,42 @@ cdef class ArcEager(TransitionSystem):
def get_actions(cls, **kwargs):
actions = kwargs.get('actions', OrderedDict((
(SHIFT, ['']),
(REDUCE, ['']),
(RIGHT, []),
(LEFT, []),
(BREAK, ['ROOT']))
seen_actions = set()
min_freq = kwargs.get('min_freq', None)
actions = defaultdict(lambda: Counter())
actions[SHIFT][''] = 1
actions[REDUCE][''] = 1
for label in kwargs.get('left_labels', []):
if label.upper() != 'ROOT':
if (LEFT, label) not in seen_actions:
seen_actions.add((LEFT, label))
actions[LEFT][label] = 1
actions[SHIFT][label] = 1
for label in kwargs.get('right_labels', []):
if label.upper() != 'ROOT':
if (RIGHT, label) not in seen_actions:
seen_actions.add((RIGHT, label))
actions[RIGHT][label] = 1
actions[REDUCE][label] = 1
for raw_text, sents in kwargs.get('gold_parses', []):
for (ids, words, tags, heads, labels, iob), ctnts in sents:
heads, labels = nonproj.projectivize(heads, labels)
for child, head, label in zip(ids, heads, labels):
if label.upper() == 'ROOT':
if label.upper() == 'ROOT' :
label = 'ROOT'
if label != 'ROOT':
if head < child:
if (RIGHT, label) not in seen_actions:
seen_actions.add((RIGHT, label))
elif head > child:
if (LEFT, label) not in seen_actions:
seen_actions.add((LEFT, label))
if head == child:
actions[BREAK][label] += 1
elif head < child:
actions[RIGHT][label] += 1
actions[REDUCE][''] += 1
elif head > child:
actions[LEFT][label] += 1
actions[SHIFT][''] += 1
if min_freq is not None:
for action, label_freqs in actions.items():
for label, freq in list(label_freqs.items()):
if freq < min_freq:
# Ensure these actions are present
actions[BREAK].setdefault('ROOT', 0)
actions[RIGHT].setdefault('subtok', 0)
actions[LEFT].setdefault('subtok', 0)
# Used for backoff
actions[RIGHT].setdefault('dep', 0)
actions[LEFT].setdefault('dep', 0)
return actions
property action_types:
@ -376,18 +389,34 @@ cdef class ArcEager(TransitionSystem):
def preprocess_gold(self, GoldParse gold):
if not self.has_gold(gold):
return None
for i in range(gold.length):
for i, (head, dep) in enumerate(zip(gold.heads, gold.labels)):
# Missing values
if gold.heads[i] is None or gold.labels[i] is None:
if head is None or dep is None:
gold.c.heads[i] = i
gold.c.has_dep[i] = False
label = gold.labels[i]
if head > i:
action = LEFT
elif head < i:
action = RIGHT
action = BREAK
if dep not in self.labels[action]:
if action == BREAK:
dep = 'ROOT'
elif nonproj.is_decorated(dep):
backoff = nonproj.decompose(dep)[0]
if backoff in self.labels[action]:
dep = backoff
dep = 'dep'
dep = 'dep'
gold.c.has_dep[i] = True
if label.upper() == 'ROOT':
label = 'ROOT'
gold.c.heads[i] = gold.heads[i]
gold.c.labels[i] = self.strings.add(label)
if dep.upper() == 'ROOT':
dep = 'ROOT'
gold.c.heads[i] = head
gold.c.labels[i] = self.strings.add(dep)
return gold
def get_beam_parses(self, Beam beam):
@ -527,8 +556,13 @@ cdef class ArcEager(TransitionSystem):
is_valid[i] = False
costs[i] = 9000
if n_gold < 1:
# Check projectivity --- leading cause
if is_nonproj_tree(gold.heads):
# Check label set --- leading cause
label_set = set([self.strings[self.c[i].label] for i in range(self.n_moves)])
for label_str in gold.labels:
if label_str is not None and label_str not in label_set:
raise ValueError("Cannot get gold parser action: unknown label: %s" % label_str)
# Check projectivity --- other leading cause
if nonproj.is_nonproj_tree(gold.heads):
raise ValueError(
"Could not find a gold-standard action to supervise the "
"dependency parser. Likely cause: the tree is "
@ -3,7 +3,7 @@ from __future__ import unicode_literals
from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam
from collections import OrderedDict
from collections import OrderedDict, Counter
from .stateclass cimport StateClass
from ._state cimport StateC
@ -64,21 +64,18 @@ cdef class BiluoPushDown(TransitionSystem):
def get_actions(cls, **kwargs):
actions = kwargs.get('actions', OrderedDict((
(MISSING, ['']),
(BEGIN, []),
(IN, []),
(LAST, []),
(UNIT, []),
(OUT, [''])
seen_entities = set()
actions = {
MISSING: Counter(),
BEGIN: Counter(),
IN: Counter(),
LAST: Counter(),
UNIT: Counter(),
OUT: Counter()
actions[OUT][''] = 1
for entity_type in kwargs.get('entity_types', []):
if entity_type in seen_entities:
for action in (BEGIN, IN, LAST, UNIT):
actions[action][entity_type] = 1
moves = ('M', 'B', 'I', 'L', 'U')
for raw_text, sents in kwargs.get('gold_parses', []):
for (ids, words, tags, heads, labels, biluo), _ in sents:
@ -87,10 +84,8 @@ cdef class BiluoPushDown(TransitionSystem):
if ner_tag.count('-') != 1:
raise ValueError(ner_tag)
_, label = ner_tag.split('-')
if label not in seen_entities:
for move_str in ('B', 'I', 'L', 'U'):
for action in (BEGIN, IN, LAST, UNIT):
actions[action][label] += 1
return actions
property action_types:
@ -213,7 +208,7 @@ cdef class BiluoPushDown(TransitionSystem):
raise Exception(move)
return t
def add_action(self, int action, label_name):
def add_action(self, int action, label_name, freq=None):
cdef attr_t label_id
if not isinstance(label_name, (int, long)):
label_id = self.strings.add(label_name)
@ -234,6 +229,12 @@ cdef class BiluoPushDown(TransitionSystem):
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
assert self.c[self.n_moves].label == label_id
self.n_moves += 1
if self.labels.get(action, []):
freq = min(0, min(self.labels[action].values()))
self.labels[action][label_name] = freq-1
self.labels[action] = Counter()
self.labels[action][label_name] = -1
return 1
cdef int initialize_state(self, StateC* st) nogil:
@ -15,7 +15,7 @@ cdef class Parser:
cdef readonly object cfg
cdef public object _multitasks
cdef void _parseC(self, StateC* state,
cdef void _parseC(self, StateC** states, int nr_task,
const float* feat_weights, const float* bias,
const float* hW, const float* hb,
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil
@ -1,7 +1,6 @@
# cython: infer_types=True
# cython: cdivision=True
# cython: boundscheck=False
# cython: profile=True
# coding: utf-8
from __future__ import unicode_literals, print_function
@ -28,6 +27,8 @@ from thinc.misc import LayerNorm
from thinc.neural.ops import CupyOps
from thinc.neural.util import get_array_module
from thinc.linalg cimport Vec, VecVec
from thinc cimport openblas
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
from .._ml import link_vectors_to_models, create_default_optimizer
@ -266,7 +267,7 @@ cdef class Parser:
with Model.use_device('cpu'):
upper = chain(
clone(LayerNorm(Maxout(hidden_width, hidden_width)), depth-1),
clone(Maxout(hidden_width, hidden_width), depth-1),
zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
@ -302,7 +303,7 @@ cdef class Parser:
self.vocab = vocab
if moves is True:
self.moves = self.TransitionSystem(self.vocab.strings, {})
self.moves = self.TransitionSystem(self.vocab.strings)
self.moves = moves
if 'beam_width' not in cfg:
@ -311,12 +312,7 @@ cdef class Parser:
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
if 'pretrained_dims' not in cfg:
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
cfg.setdefault('cnn_maxout_pieces', 3)
self.cfg = cfg
if 'actions' in self.cfg:
for action, labels in self.cfg.get('actions', {}).items():
for label in labels:
self.moves.add_action(action, label)
self.model = model
self._multitasks = []
@ -423,69 +419,81 @@ cdef class Parser:
cdef int nr_hidden = hidden_weights.shape[0]
cdef int nr_task = states.size()
with nogil:
for i in range(nr_task):
feat_weights, bias, hW, hb,
nr_class, nr_hidden, nr_feat, nr_piece)
self._parseC(&states[0], nr_task, feat_weights, bias, hW, hb,
nr_class, nr_hidden, nr_feat, nr_piece)
tokvecs = self.model[0].ops.unflatten(tokvecs,
[len(doc) for doc in docs])
return state_objs, tokvecs
cdef void _parseC(self, StateC* state,
cdef void _parseC(self, StateC** states, int nr_task,
const float* feat_weights, const float* bias,
const float* hW, const float* hb,
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
token_ids = <int*>calloc(nr_feat, sizeof(int))
is_valid = <int*>calloc(nr_class, sizeof(int))
vectors = <float*>calloc(nr_hidden * nr_piece, sizeof(float))
scores = <float*>calloc(nr_class, sizeof(float))
vectors = <float*>calloc(nr_hidden * nr_task, sizeof(float))
unmaxed = <float*>calloc(nr_hidden * nr_piece, sizeof(float))
scores = <float*>calloc(nr_class*nr_task, sizeof(float))
if not (token_ids and is_valid and vectors and scores):
with gil:
cdef float feature
while not state.is_final():
state.set_context_tokens(token_ids, nr_feat)
memset(vectors, 0, nr_hidden * nr_piece * sizeof(float))
memset(scores, 0, nr_class * sizeof(float))
feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
for i in range(nr_hidden * nr_piece):
vectors[i] += bias[i]
V = vectors
W = hW
for i in range(nr_hidden):
if nr_piece == 1:
feature = V[0] if V[0] >= 0. else 0.
elif nr_piece == 2:
feature = V[0] if V[0] >= V[1] else V[1]
feature = Vec.max(V, nr_piece)
for j in range(nr_class):
scores[j] += feature * W[j]
W += nr_class
V += nr_piece
for i in range(nr_class):
scores[i] += hb[i]
self.moves.set_valid(is_valid, state)
guess = arg_max_if_valid(scores, is_valid, nr_class)
action = self.moves.c[guess]
action.do(state, action.label)
cdef int nr_todo = nr_task
cdef int i, j
cdef vector[StateC*] unfinished
while nr_todo >= 1:
memset(vectors, 0, nr_todo * nr_hidden * sizeof(float))
memset(scores, 0, nr_todo * nr_class * sizeof(float))
for i in range(nr_todo):
state = states[i]
state.set_context_tokens(token_ids, nr_feat)
memset(unmaxed, 0, nr_hidden * nr_piece * sizeof(float))
feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
bias, 1., nr_hidden*nr_piece)
state_vector = &vectors[i*nr_hidden]
for j in range(nr_hidden):
index = j * nr_piece
which = Vec.arg_max(&unmaxed[index], nr_piece)
state_vector[j] = unmaxed[index + which]
# Compute hidden-to-output
openblas.simple_gemm(scores, nr_todo, nr_class,
vectors, nr_todo, nr_hidden, hW, nr_hidden, nr_class, 0, 0)
# Add bias
for i in range(nr_todo):
hb, 1., nr_class)
# Validate actions, argmax, take action.
for i in range(nr_todo):
state = states[i]
self.moves.set_valid(is_valid, state)
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
action = self.moves.c[guess]
action.do(state, action.label)
if not state.is_final():
for i in range(unfinished.size()):
states[i] = unfinished[i]
nr_todo = unfinished.size()
def beam_parse(self, docs, int beam_width=3, float beam_density=0.001):
def beam_parse(self, docs, int beam_width=3, float beam_density=0.001,
float drop=0.):
cdef Beam beam
cdef np.ndarray scores
cdef Doc doc
cdef int nr_class = self.moves.n_moves
cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
docs, cuda_stream, 0.0)
docs, cuda_stream, drop)
cdef int offset = 0
cdef int j = 0
cdef int k
@ -524,8 +532,8 @@ cdef class Parser:
n_states += 1
if n_states == 0:
vectors = state2vec(token_ids[:n_states])
scores = vec2scores(vectors)
vectors, _ = state2vec.begin_update(token_ids[:n_states], drop)
scores, _ = vec2scores.begin_update(vectors, drop=drop)
c_scores = <float*>scores.data
for beam in todo:
for i in range(beam.size):
@ -556,7 +564,10 @@ cdef class Parser:
for multitask in self._multitasks:
multitask.update(docs, golds, drop=drop, sgd=sgd)
cuda_stream = util.get_cuda_stream()
states, golds, max_steps = self._init_gold_batch(docs, golds)
# Chop sequences into lengths of this many transitions, to make the
# batch uniform length.
cut_gold = numpy.random.choice(range(20, 100))
states, golds, max_steps = self._init_gold_batch(docs, golds, max_length=cut_gold)
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
todo = [(s, g) for (s, g) in zip(states, golds)
@ -659,8 +670,7 @@ cdef class Parser:
for beam in beams:
def _init_gold_batch(self, whole_docs, whole_golds):
def _init_gold_batch(self, whole_docs, whole_golds, min_length=5, max_length=500):
"""Make a square batch, of length equal to the shortest doc. A long
doc will get multiple states. Let's say we have a doc of length 2*N,
where N is the shortest doc. We'll make two states, one representing
@ -669,7 +679,7 @@ cdef class Parser:
StateClass state
Transition action
whole_states = self.moves.init_batch(whole_docs)
max_length = max(5, min(50, min([len(doc) for doc in whole_docs])))
max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs])))
max_moves = 0
states = []
golds = []
@ -791,6 +801,11 @@ cdef class Parser:
for doc in docs:
def labels(self):
    """List the class name of every transition, in index order.

    RETURNS (list): `self.moves.get_class_name(i)` for each `i` from 0 up
        to (not including) `self.moves.n_moves`.
    """
    names = []
    for clas in range(self.moves.n_moves):
        names.append(self.moves.get_class_name(clas))
    return names
def tok2vec(self):
'''Return the embedding and convolutional layer of the model.'''
@ -809,9 +824,6 @@ cdef class Parser:
for action in self.moves.action_types:
added = self.moves.add_action(action, label)
if added:
# Important that the labels be stored as a list! We need the
# order, or the model goes out of synch
self.cfg.setdefault('extra_labels', []).append(label)
resized = True
if self.model not in (True, False, None) and resized:
# Weights are stored in (nr_out, nr_in) format, so we're basically
@ -9,7 +9,7 @@ from __future__ import unicode_literals
from copy import copy
from ..tokens.doc cimport Doc
from ..tokens.doc cimport Doc, set_children_from_heads
@ -74,7 +74,21 @@ def decompose(label):
def is_decorated(label):
    """Check whether a dependency label contains the decoration delimiter.

    label (unicode): The dependency label to inspect.
    RETURNS (bool): True if `DELIMITER` occurs anywhere in `label`.
    """
    # A single membership test; the block previously carried a second,
    # unreachable return statement after an equivalent `find() != -1` check.
    return DELIMITER in label
def count_decorated_labels(gold_tuples):
    """Count how often each decorated label occurs after projectivization.

    gold_tuples (iterable): `(raw_text, sents)` pairs, where each item of
        `sents` is `((ids, words, tags, heads, labels, iob), brackets)`.
    RETURNS (dict): Maps each label for which `is_decorated` is true to the
        number of times it was seen.
    """
    counts = {}
    for _raw_text, sents in gold_tuples:
        for annotations, _brackets in sents:
            _ids, _words, _tags, heads, labels, _iob = annotations
            proj_heads, proj_labels = projectivize(heads, labels)
            for index, head in enumerate(proj_heads):
                # A token that is its own head is a root, whatever label
                # projectivization assigned to it.
                label = 'ROOT' if head == index else proj_labels[index]
                if is_decorated(label):
                    counts[label] = counts.get(label, 0) + 1
    return counts
def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
@ -124,8 +138,9 @@ cpdef deprojectivize(Doc doc):
if DELIMITER in label:
new_label, head_label = label.split(DELIMITER)
new_head = _find_new_head(doc[i], head_label)
doc[i].head = new_head
doc.c[i].head = new_head.i - i
doc.c[i].dep = doc.vocab.strings.add(new_label)
set_children_from_heads(doc.c, doc.length)
return doc
@ -191,9 +206,12 @@ def _filter_labels(gold_tuples, cutoff, freqs):
for raw_text, sents in gold_tuples:
filtered_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
filtered_labels = [decompose(label)[0]
if freqs.get(label, cutoff) < cutoff
else label for label in labels]
filtered_labels = []
for label in labels:
if is_decorated(label) and freqs.get(label, 0) < cutoff:
((ids, words, tags, heads, filtered_labels, iob), ctnts))
filtered.append((raw_text, filtered_sents))
@ -42,6 +42,7 @@ cdef class TransitionSystem:
cdef public attr_t root_label
cdef public freqs
cdef init_state_t init_beam_state
cdef public object labels
cdef int initialize_state(self, StateC* state) nogil
cdef int finalize_state(self, StateC* state) nogil
@ -5,7 +5,7 @@ from __future__ import unicode_literals
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from collections import OrderedDict
from collections import OrderedDict, Counter
import ujson
from ..structs cimport TokenC
@ -28,7 +28,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef class TransitionSystem:
def __init__(self, StringStore string_table, labels_by_action):
def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
self.mem = Pool()
self.strings = string_table
self.n_moves = 0
@ -36,21 +36,14 @@ cdef class TransitionSystem:
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
for action, label_strs in labels_by_action.items():
for label_str in label_strs:
self.add_action(int(action), label_str)
self.labels = {}
if labels_by_action:
self.initialize_actions(labels_by_action, min_freq=min_freq)
self.root_label = self.strings.add('ROOT')
self.init_beam_state = _init_state
def __reduce__(self):
labels_by_action = OrderedDict()
cdef Transition t
for trans in self.c[:self.n_moves]:
label_str = self.strings[trans.label]
labels_by_action.setdefault(trans.move, []).append(label_str)
return (self.__class__,
(self.strings, labels_by_action),
None, None)
return (self.__class__, (self.strings, self.labels), None, None)
def init_batch(self, docs):
cdef StateClass state
@ -146,6 +139,22 @@ cdef class TransitionSystem:
act = self.c[clas]
return self.move_name(act.move, act.label)
def initialize_actions(self, labels_by_action, min_freq=None):
    """Reset the transition table and register every (action, label) pair.

    labels_by_action (dict): Maps an action ID (anything `int()` accepts)
        to a mapping of label string -> frequency.
    min_freq: Not consulted by this method.
    """
    self.labels = {}
    self.n_moves = 0
    for action_key, label_freqs in sorted(labels_by_action.items()):
        move = int(action_key)
        # Always start from a fresh Counter rather than reusing the
        # caller's mapping.
        self.labels[move] = Counter()
        # Labels are registered in the iteration order of `label_freqs`.
        # NOTE(review): that order fixes the transition indices, so it
        # presumably has to be reproducible for a saved model to load
        # correctly -- confirm whether an explicit sort is intended here.
        for label_str, freq in list(label_freqs.items()):
            self.add_action(move, label_str)
            # Record the supplied frequency after the action is added.
            self.labels[move][label_str] = freq
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, int) and \
@ -164,6 +173,14 @@ cdef class TransitionSystem:
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
assert self.c[self.n_moves].label == label_id
self.n_moves += 1
if self.labels.get(action, []):
new_freq = min(self.labels[action].values())
self.labels[action] = Counter()
new_freq = -1
if new_freq > 0:
new_freq = 0
self.labels[action][label_name] = new_freq-1
return 1
def to_disk(self, path, **exclude):
@ -178,26 +195,18 @@ cdef class TransitionSystem:
def to_bytes(self, **exclude):
transitions = []
for trans in self.c[:self.n_moves]:
'clas': trans.clas,
'move': trans.move,
'label': self.strings[trans.label],
'name': self.move_name(trans.move, trans.label)
serializers = {
'transitions': lambda: json_dumps(transitions),
'moves': lambda: json_dumps(self.labels),
'strings': lambda: self.strings.to_bytes()
return util.to_bytes(serializers, exclude)
def from_bytes(self, bytes_data, **exclude):
transitions = []
labels = {}
deserializers = {
'transitions': lambda b: transitions.extend(ujson.loads(b)),
'moves': lambda b: labels.update(ujson.loads(b)),
'strings': lambda b: self.strings.from_bytes(b)
msg = util.from_bytes(bytes_data, deserializers, exclude)
for trans in transitions:
self.add_action(trans['move'], trans['label'])
return self
@ -19,6 +19,15 @@ def doc(en_tokenizer):
return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
def doc_not_parsed(en_tokenizer):
    """Build a Doc from three short sentences, with no heads or deps passed
    to `get_doc`, and with `is_parsed` explicitly set to False."""
    tokens = en_tokenizer("This is a sentence. This is another sentence. And a third.")
    vocab = tokens.vocab
    words = [token.text for token in tokens]
    unparsed = get_doc(vocab, words)
    unparsed.is_parsed = False
    return unparsed
def test_spans_sent_spans(doc):
sents = list(doc.sents)
assert sents[0].start == 0
@ -34,6 +43,7 @@ def test_spans_root(doc):
assert span.root.text == 'sentence'
assert span.root.head.text == 'is'
def test_spans_string_fn(doc):
span = doc[0:4]
assert len(span) == 4
@ -41,6 +51,7 @@ def test_spans_string_fn(doc):
assert span.upper_ == 'THIS IS A SENTENCE'
assert span.lower_ == 'this is a sentence'
def test_spans_root2(en_tokenizer):
text = "through North and South Carolina"
heads = [0, 3, -1, -2, -4]
@ -49,12 +60,17 @@ def test_spans_root2(en_tokenizer):
assert doc[-2:].root.text == 'Carolina'
def test_spans_span_sent(doc):
def test_spans_span_sent(doc, doc_not_parsed):
"""Test span.sent property"""
assert len(list(doc.sents))
assert doc[:2].sent.root.text == 'is'
assert doc[:2].sent.text == 'This is a sentence .'
assert doc[6:7].sent.root.left_edge.text == 'This'
# test on manual sbd
doc_not_parsed[0].is_sent_start = True
doc_not_parsed[5].is_sent_start = True
assert doc_not_parsed[1:3].sent == doc_not_parsed[0:5]
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
def test_spans_lca_matrix(en_tokenizer):
@ -129,7 +145,7 @@ def test_span_to_array(doc):
assert arr[0, 1] == len(span[0])
def test_span_as_doc(doc):
span = doc[4:10]
span_doc = span.as_doc()
assert span.text == span_doc.text.strip()
#def test_span_as_doc(doc):
# span = doc[4:10]
# span_doc = span.as_doc()
# assert span.text == span_doc.text.strip()
@ -24,8 +24,8 @@ def test_tag_names(EN):
text = "I ate pizzas with anchovies."
doc = EN(text, disable=['parser'])
assert type(doc[2].pos) == int
assert isinstance(doc[2].pos_, six.text_type)
assert isinstance(doc[2].dep_, six.text_type)
assert isinstance(doc[2].pos_, unicode_)
assert isinstance(doc[2].dep_, unicode_)
assert doc[2].tag_ == u'NNS'
Normal file
Normal file
@ -0,0 +1,75 @@
from __future__ import unicode_literals
from ...vocab import Vocab
from ...pipeline import DependencyParser
from ...tokens import Doc
from ...gold import GoldParse
from ...syntax.nonproj import projectivize
annot_tuples = [
(0, 'When', 'WRB', 11, 'advmod', 'O'),
(1, 'Walter', 'NNP', 2, 'compound', 'B-PERSON'),
(2, 'Rodgers', 'NNP', 11, 'nsubj', 'L-PERSON'),
(3, ',', ',', 2, 'punct', 'O'),
(4, 'our', 'PRP$', 6, 'poss', 'O'),
(5, 'embedded', 'VBN', 6, 'amod', 'O'),
(6, 'reporter', 'NN', 2, 'appos', 'O'),
(7, 'with', 'IN', 6, 'prep', 'O'),
(8, 'the', 'DT', 10, 'det', 'B-ORG'),
(9, '3rd', 'NNP', 10, 'compound', 'I-ORG'),
(10, 'Cavalry', 'NNP', 7, 'pobj', 'L-ORG'),
(11, 'says', 'VBZ', 44, 'advcl', 'O'),
(12, 'three', 'CD', 13, 'nummod', 'U-CARDINAL'),
(13, 'battalions', 'NNS', 16, 'nsubj', 'O'),
(14, 'of', 'IN', 13, 'prep', 'O'),
(15, 'troops', 'NNS', 14, 'pobj', 'O'),
(16, 'are', 'VBP', 11, 'ccomp', 'O'),
(17, 'on', 'IN', 16, 'prep', 'O'),
(18, 'the', 'DT', 19, 'det', 'O'),
(19, 'ground', 'NN', 17, 'pobj', 'O'),
(20, ',', ',', 17, 'punct', 'O'),
(21, 'inside', 'IN', 17, 'prep', 'O'),
(22, 'Baghdad', 'NNP', 21, 'pobj', 'U-GPE'),
(23, 'itself', 'PRP', 22, 'appos', 'O'),
(24, ',', ',', 16, 'punct', 'O'),
(25, 'have', 'VBP', 26, 'aux', 'O'),
(26, 'taken', 'VBN', 16, 'dep', 'O'),
(27, 'up', 'RP', 26, 'prt', 'O'),
(28, 'positions', 'NNS', 26, 'dobj', 'O'),
(29, 'they', 'PRP', 31, 'nsubj', 'O'),
(30, "'re", 'VBP', 31, 'aux', 'O'),
(31, 'going', 'VBG', 26, 'parataxis', 'O'),
(32, 'to', 'TO', 33, 'aux', 'O'),
(33, 'spend', 'VB', 31, 'xcomp', 'O'),
(34, 'the', 'DT', 35, 'det', 'B-TIME'),
(35, 'night', 'NN', 33, 'dobj', 'L-TIME'),
(36, 'there', 'RB', 33, 'advmod', 'O'),
(37, 'presumably', 'RB', 33, 'advmod', 'O'),
(38, ',', ',', 44, 'punct', 'O'),
(39, 'how', 'WRB', 40, 'advmod', 'O'),
(40, 'many', 'JJ', 41, 'amod', 'O'),
(41, 'soldiers', 'NNS', 44, 'pobj', 'O'),
(42, 'are', 'VBP', 44, 'aux', 'O'),
(43, 'we', 'PRP', 44, 'nsubj', 'O'),
(44, 'talking', 'VBG', 44, 'ROOT', 'O'),
(45, 'about', 'IN', 44, 'prep', 'O'),
(46, 'right', 'RB', 47, 'advmod', 'O'),
(47, 'now', 'RB', 44, 'advmod', 'O'),
(48, '?', '.', 44, 'punct', 'O')]
def test_get_oracle_actions():
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
parser = DependencyParser(doc.vocab)
parser.moves.add_action(0, '')
parser.moves.add_action(1, '')
parser.moves.add_action(1, '')
parser.moves.add_action(4, 'ROOT')
for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
if head > i:
parser.moves.add_action(2, dep)
elif head < i:
parser.moves.add_action(3, dep)
ids, words, tags, heads, deps, ents = zip(*annot_tuples)
heads, deps = projectivize(heads, deps)
gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
actions = parser.moves.get_oracle_sequence(doc, gold)
@ -13,8 +13,8 @@ from ...vocab import Vocab
('a b', 0, 2),
('a c', 0, 1),
('a b c', 0, 2),
('a b b c', 0, 2),
('a b b', 0, 2),
('a b b c', 0, 3),
('a b b', 0, 3),
def test_issue1450_matcher_end_zero_plus(string, start, end):
@ -54,5 +54,6 @@ def test_issue1450_matcher_end_zero_plus(string, start, end):
if start is None or end is None:
assert matches == []
assert matches[0][1] == start
assert matches[0][2] == end
assert matches[-1][1] == start
assert matches[-1][2] == end
Normal file
Normal file
@ -0,0 +1,65 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from ...matcher import Matcher
import pytest
pattern1 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}]
pattern2 = [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}]
pattern3 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}]
pattern4 = [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
pattern5 = [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
re_pattern1 = 'AA*'
re_pattern2 = 'A*A'
re_pattern3 = 'AA'
re_pattern4 = 'BA*B'
re_pattern5 = 'B*A*B'
def text():
return "(ABBAAAAAB)."
def doc(en_tokenizer,text):
doc = en_tokenizer(' '.join(text))
return doc
def test_greedy_matching(doc,text,pattern,re_pattern):
Test that the greedy matching behavior of the * op
is consistent with other re implementations
matcher = Matcher(doc.vocab)
matches = matcher(doc)
re_matches = [m.span() for m in re.finditer(re_pattern,text)]
for match,re_match in zip(matches,re_matches):
assert match[1:]==re_match
def test_match_consuming(doc,text,pattern,re_pattern):
Test that matcher.__call__ consumes tokens on a match
similar to re.findall
matcher = Matcher(doc.vocab)
matches = matcher(doc)
re_matches = [m.span() for m in re.finditer(re_pattern,text)]
assert len(matches)==len(re_matches)
Normal file
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
@ -6,7 +6,6 @@ from ...vocab import Vocab
from ...tokens import Doc
from ...matcher import Matcher
def test_issue1945():
text = "a a a"
matcher = Matcher(Vocab())
@ -22,10 +22,9 @@ def test_basic_case():
assert end == 4
def test_issue850():
"""The problem here is that the variable-length pattern matches the
succeeding token. We then don't handle the ambiguity correctly."""
"""The variable-length pattern matches the
succeeding token. Check we handle the ambiguity correctly."""
matcher = Matcher(Vocab(
lex_attr_getters={LOWER: lambda string: string.lower()}))
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
Normal file
@ -21,187 +26,196 @@ def matcher(en_vocab):
return matcher
def test_matcher_from_api_docs(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{'ORTH': 'test'}]
assert len(matcher) == 0
matcher.add('Rule', None, pattern)
assert len(matcher) == 1
assert 'Rule' not in matcher
matcher.add('Rule', None, pattern)
assert 'Rule' in matcher
on_match, patterns = matcher.get('Rule')
assert len(patterns[0])
#def test_matcher_from_api_docs(en_vocab):
# matcher = Matcher(en_vocab)
# pattern = [{'ORTH': 'test'}]
# assert len(matcher) == 0
# matcher.add('Rule', None, pattern)
# assert len(matcher) == 1
# matcher.remove('Rule')
# assert 'Rule' not in matcher
# matcher.add('Rule', None, pattern)
# assert 'Rule' in matcher
# on_match, patterns = matcher.get('Rule')
# assert len(patterns[0])
#def test_matcher_from_usage_docs(en_vocab):
# text = "Wow 😀 This is really cool! 😂 😂"
# doc = get_doc(en_vocab, words=text.split(' '))
# pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']
# pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
# def label_sentiment(matcher, doc, i, matches):
# match_id, start, end = matches[i]
# if doc.vocab.strings[match_id] == 'HAPPY':
# doc.sentiment += 0.1
# span = doc[start : end]
# token = span.merge()
# token.vocab[token.text].norm_ = 'happy emoji'
# matcher = Matcher(en_vocab)
# matcher.add('HAPPY', label_sentiment, *pos_patterns)
# matches = matcher(doc)
# assert doc.sentiment != 0
# assert doc[1].norm_ == 'happy emoji'
#@pytest.mark.parametrize('words', [["Some", "words"]])
#def test_matcher_init(en_vocab, words):
# matcher = Matcher(en_vocab)
# doc = get_doc(en_vocab, words)
# assert len(matcher) == 0
# assert matcher(doc) == []
#def test_matcher_contains(matcher):
# matcher.add('TEST', None, [{'ORTH': 'test'}])
# assert 'TEST' in matcher
# assert 'TEST2' not in matcher
#def test_matcher_no_match(matcher):
# words = ["I", "like", "cheese", "."]
# doc = get_doc(matcher.vocab, words)
# assert matcher(doc) == []
#def test_matcher_compile(en_vocab):
# rules = {
# 'JS': [[{'ORTH': 'JavaScript'}]],
# 'GoogleNow': [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]],
# 'Java': [[{'LOWER': 'java'}]]
# }
# matcher = Matcher(en_vocab)
# for key, patterns in rules.items():
# matcher.add(key, None, *patterns)
# assert len(matcher) == 3
#def test_matcher_match_start(matcher):
# words = ["JavaScript", "is", "good"]
# doc = get_doc(matcher.vocab, words)
# assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)]
#def test_matcher_match_end(matcher):
# words = ["I", "like", "java"]
# doc = get_doc(matcher.vocab, words)
# assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)]
#def test_matcher_match_middle(matcher):
# words = ["I", "like", "Google", "Now", "best"]
# doc = get_doc(matcher.vocab, words)
# assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)]
#def test_matcher_match_multi(matcher):
# words = ["I", "like", "Google", "Now", "and", "java", "best"]
# doc = get_doc(matcher.vocab, words)
# assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4),
# (doc.vocab.strings['Java'], 5, 6)]
#def test_matcher_empty_dict(en_vocab):
# '''Test matcher allows empty token specs, meaning match on any token.'''
# matcher = Matcher(en_vocab)
# abc = ["a", "b", "c"]
# doc = get_doc(matcher.vocab, abc)
# matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
# matches = matcher(doc)
# assert len(matches) == 1
# assert matches[0][1:] == (0, 3)
# matcher = Matcher(en_vocab)
# matcher.add('A.', None, [{'ORTH': 'a'}, {}])
# matches = matcher(doc)
# assert matches[0][1:] == (0, 2)
#def test_matcher_operator_shadow(en_vocab):
# matcher = Matcher(en_vocab)
# abc = ["a", "b", "c"]
# doc = get_doc(matcher.vocab, abc)
# matcher.add('A.C', None, [{'ORTH': 'a'},
# {"IS_ALPHA": True, "OP": "+"},
# {'ORTH': 'c'}])
# matches = matcher(doc)
# assert len(matches) == 1
# assert matches[0][1:] == (0, 3)
#def test_matcher_phrase_matcher(en_vocab):
# words = ["Google", "Now"]
# doc = get_doc(en_vocab, words)
# matcher = PhraseMatcher(en_vocab)
# matcher.add('COMPANY', None, doc)
# words = ["I", "like", "Google", "Now", "best"]
# doc = get_doc(en_vocab, words)
# assert len(matcher(doc)) == 1
#def test_phrase_matcher_length(en_vocab):
# matcher = PhraseMatcher(en_vocab)
# assert len(matcher) == 0
# matcher.add('TEST', None, get_doc(en_vocab, ['test']))
# assert len(matcher) == 1
# matcher.add('TEST2', None, get_doc(en_vocab, ['test2']))
# assert len(matcher) == 2
#def test_phrase_matcher_contains(en_vocab):
# matcher = PhraseMatcher(en_vocab)
# matcher.add('TEST', None, get_doc(en_vocab, ['test']))
# assert 'TEST' in matcher
# assert 'TEST2' not in matcher
#def test_matcher_match_zero(matcher):
# words1 = 'He said , " some words " ...'.split()
# words2 = 'He said , " some three words " ...'.split()
# pattern1 = [{'ORTH': '"'},
# {'OP': '!', 'IS_PUNCT': True},
# {'OP': '!', 'IS_PUNCT': True},
# {'ORTH': '"'}]
# pattern2 = [{'ORTH': '"'},
# {'IS_PUNCT': True},
# {'IS_PUNCT': True},
# {'IS_PUNCT': True},
# {'ORTH': '"'}]
# matcher.add('Quote', None, pattern1)
# doc = get_doc(matcher.vocab, words1)
# assert len(matcher(doc)) == 1
# doc = get_doc(matcher.vocab, words2)
# assert len(matcher(doc)) == 0
# matcher.add('Quote', None, pattern2)
# assert len(matcher(doc)) == 0
#def test_matcher_match_zero_plus(matcher):
# words = 'He said , " some words " ...'.split()
# pattern = [{'ORTH': '"'},
# {'OP': '*', 'IS_PUNCT': False},
# {'ORTH': '"'}]
# matcher = Matcher(matcher.vocab)
# matcher.add('Quote', None, pattern)
# doc = get_doc(matcher.vocab, words)
# assert len(matcher(doc)) == 1
#def test_matcher_match_one_plus(matcher):
# control = Matcher(matcher.vocab)
# control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
# doc = get_doc(control.vocab, ['Philippe', 'Philippe'])
# m = control(doc)
# assert len(m) == 2
# matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
# {'ORTH': 'Philippe', 'OP': '+'}])
# m = matcher(doc)
# assert len(m) == 1
def test_operator_combos(matcher):
cases = [
@ -252,9 +266,8 @@ def test_matcher_end_zero_plus(matcher):
nlp = lambda string: Doc(matcher.vocab, words=string.split())
assert len(matcher(nlp(u'a'))) == 1
assert len(matcher(nlp(u'a b'))) == 1
assert len(matcher(nlp(u'a b'))) == 1
assert len(matcher(nlp(u'a b'))) == 2
assert len(matcher(nlp(u'a c'))) == 1
assert len(matcher(nlp(u'a b c'))) == 1
assert len(matcher(nlp(u'a b b c'))) == 1
assert len(matcher(nlp(u'a b b'))) == 1
assert len(matcher(nlp(u'a b c'))) == 2
assert len(matcher(nlp(u'a b b c'))) == 3
assert len(matcher(nlp(u'a b b'))) == 3
Normal file
@ -19,6 +19,9 @@ ctypedef fused LexemeOrToken:
cdef int set_children_from_heads(TokenC* tokens, int length) except -1
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2
@ -186,6 +186,20 @@ cdef class Doc:
def _(self):
return Underscore(Underscore.doc_extensions, self)
def is_sentenced(self):
    """Check whether the document has sentence boundaries.

    The document counts as sentenced when a custom 'sents' hook is
    registered in `user_hooks`, when it is parsed, or when at least one
    token has `sent_start` set to -1 or 1.

    RETURNS (bool): True if any of the conditions above holds.
    """
    # NOTE(review): other code in this file reads `self.is_sentenced`
    # without calling it, so this is presumably exposed as a property;
    # confirm the decorator is present in the real source.
    if 'sents' in self.user_hooks or self.is_parsed:
        return True
    for i in range(self.length):
        # 0 is treated as "unset"; -1 and 1 both count as an explicit
        # boundary annotation.
        if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
            return True
    return False
def __getitem__(self, object i):
"""Get a `Token` or `Span` object.
@ -517,29 +531,23 @@ cdef class Doc:
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
def __get__(self):
if not self.is_sentenced:
raise ValueError(
"Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: "
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
"Alternatively, add the dependency parser, or set "
"sentence boundaries by setting doc[i].sent_start")
if 'sents' in self.user_hooks:
yield from self.user_hooks['sents'](self)
cdef int i
if not self.is_parsed:
start = 0
for i in range(1, self.length):
if self.c[i].sent_start != 0:
raise ValueError(
"Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: "
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
"Alternatively, add the dependency parser, or set "
"sentence boundaries by setting doc[i].sent_start")
start = 0
for i in range(1, self.length):
if self.c[i].sent_start == 1:
yield Span(self, start, i)
start = i
if start != self.length:
yield Span(self, start, self.length)
if self.c[i].sent_start == 1:
yield Span(self, start, i)
start = i
if start != self.length:
yield Span(self, start, self.length)
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
if self.length == 0:
@ -285,16 +285,42 @@ cdef class Span:
def __get__(self):
if 'sent' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sent'](self)
# This should raise if we're not parsed.
# This should raise if we're not parsed
# or don't have any sbd component :)
# if doc is parsed we can use the deps to find the sentence
# otherwise we use the `sent_start` token attribute
cdef int n = 0
root = &self.doc.c[self.start]
while root.head != 0:
root += root.head
n += 1
if n >= self.doc.length:
raise RuntimeError
return self.doc[root.l_edge:root.r_edge + 1]
cdef int i
if self.doc.is_parsed:
root = &self.doc.c[self.start]
n = 0
while root.head != 0:
root += root.head
n += 1
if n >= self.doc.length:
raise RuntimeError
return self.doc[root.l_edge:root.r_edge + 1]
elif self.doc.is_sentenced:
# find start of the sentence
start = self.start
while self.doc.c[start].sent_start != 1 and start > 0:
start += -1
# find end of the sentence
end = self.end
n = 0
while end < self.doc.length and self.doc.c[end].sent_start != 1:
end += 1
n += 1
if n >= self.doc.length:
return self.doc[start:end]
raise ValueError(
"Access to sentence requires either the dependency parse "
"or sentence boundaries to be set by setting " +
"doc[i].is_sent_start = True")
property has_vector:
"""RETURNS (bool): Whether a word vector is associated with the object.
@ -34,11 +34,11 @@ cdef class Token:
def get_extension(cls, name):
return Underscore.token_extensions.get(name)
return Underscore.span_extensions.get(name)
def has_extension(cls, name):
return name in Underscore.token_extensions
return name in Underscore.span_extensions
def __cinit__(self, Vocab vocab, Doc doc, int offset):
"""Construct a `Token` object.
@ -442,6 +442,29 @@ def decaying(start, stop, decay):
nr_upd += 1
def minibatch_by_words(items, size, count_words=len):
    '''Create minibatches of a given number of words.

    items (iterable): Yields `(doc, gold)` pairs.
    size (int or iterator): Word budget per batch. An int is repeated for
        every batch; anything else is consumed with `next()`, one value
        per batch.
    count_words (callable): Returns the number of words in a doc.
    YIELDS (list): Non-empty lists of `(doc, gold)` tuples.
    '''
    if isinstance(size, int):
        size_ = itertools.repeat(size)
    else:
        size_ = size
    items = iter(items)
    while True:
        batch_size = next(size_)
        batch = []
        # The budget is checked *before* each item is taken, so the item
        # that pushes the remaining budget below zero is still included.
        while batch_size >= 0:
            try:
                doc, gold = next(items)
            except StopIteration:
                # Input exhausted: flush the partial batch and stop.
                if batch:
                    yield batch
                return
            batch_size -= count_words(doc)
            batch.append((doc, gold))
        if batch:
            yield batch
def itershuffle(iterable, bufsize=1000):
"""Shuffle an iterator. This works by holding `bufsize` items back
and yielding them sometime later. Obviously, this is not unbiased –
@ -457,7 +480,7 @@ def itershuffle(iterable, bufsize=1000):
while True:
for i in range(random.randint(1, bufsize-len(buf))):
for i in range(random.randint(1, bufsize)):
if buf:
@ -120,9 +120,6 @@ include ../_includes/_mixins
| A Practical Real-World Approach to Gaining Actionable Insights
| from your Data
+card("Practical Machine Learning with Python", "", "Dipanjan Sarkar et al. (Apress, 2017)", "book")
| A Problem-Solver's Guide to Building Real-World Intelligent Systems
+h(2, "notebooks") Jupyter notebooks
Reference in New Issue
Block a user