Merge branch 'refactor' (and serialization)

Add Huffman-code serialization, and do a lot of
refactoring. Highlights include:

* Much more efficient StringStore
* Vocab maintains a by-orth mapping of Lexemes
* Avoid manually slicing Py_UNICODE buffers,
  simplifying tokenizer and vocab C APIs
* Remove various bits of dead code
* Work on removing GIL around parser
* Work on bridge to Theano
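
To make the Huffman-code idea concrete, here is a toy, pure-Python sketch of
building a prefix code from token frequencies and encoding a token sequence
with it. It is illustrative only and not part of this commit: the real
implementation added here lives in spacy/serialize/huffman.pyx, packer.pyx and
bits.pyx, is written in Cython, and works over per-attribute frequency tables
(see the end_training() method that dumps serializer.json further down in this
diff) rather than raw strings.

import heapq
from collections import Counter

def huffman_codes(freqs):
    # Merge the two lightest groups repeatedly, prepending a bit to every
    # symbol in each group; the running counter is only a heapq tie-breaker.
    heap = [(weight, i, [sym]) for i, (sym, weight) in enumerate(freqs.items())]
    heapq.heapify(heap)
    codes = dict((sym, '') for sym in freqs)
    tie = len(heap)
    while len(heap) > 1:
        w1, _, group1 = heapq.heappop(heap)
        w2, _, group2 = heapq.heappop(heap)
        for sym in group1:
            codes[sym] = '0' + codes[sym]
        for sym in group2:
            codes[sym] = '1' + codes[sym]
        heapq.heappush(heap, (w1 + w2, tie, group1 + group2))
        tie += 1
    return codes

words = u'the cat sat on the mat near the cat'.split()
codes = huffman_codes(Counter(words))
bits = ''.join(codes[w] for w in words)
print('%d tokens -> %d bits: %s' % (len(words), len(bits), bits))

Frequent symbols get the shortest codes, which is what makes packing token
attribute IDs this way much smaller than storing a fixed-width integer per
attribute per token.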

Conflicts:
	spacy/strings.pxd
	spacy/strings.pyx
	spacy/structs.pxd
Matthew Honnibal 2015-07-23 02:18:35 +02:00
commit df01a88763
97 changed files with 2987 additions and 3243 deletions

.gitignore (2 changes)

@ -17,6 +17,8 @@ models/
spacy/syntax/*.cpp
spacy/syntax/*.html
spacy/en/*.cpp
spacy/tokens/*.cpp
spacy/serialize/*.cpp
spacy/en/data/*
spacy/*.cpp
spacy/ner/*.cpp

bin/get_freqs.py (new executable file, 103 lines)

@ -0,0 +1,103 @@
#!/usr/bin/env python
from __future__ import unicode_literals
import plac
import joblib
from os import path
import os
import bz2
import ujson
import codecs
from preshed.counter import PreshCounter
from joblib import Parallel, delayed
import spacy.en
from spacy.strings import StringStore
from spacy.en.attrs import ORTH
def iter_comments(loc):
with bz2.BZ2File(loc) as file_:
for line in file_:
yield ujson.loads(line)
def null_props(string):
return {
'flags': 0,
'length': len(string),
'orth': string,
'lower': string,
'norm': string,
'shape': string,
'prefix': string,
'suffix': string,
'cluster': 0,
'prob': -22,
'sentiment': 0
}
def count_freqs(input_loc, output_loc):
nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
nlp.vocab.lexeme_props_getter = null_props
counts = PreshCounter()
tokenizer = nlp.tokenizer
for json_comment in iter_comments(input_loc):
doc = tokenizer(json_comment['body'])
doc.count_by(ORTH, counts=counts)
with codecs.open(output_loc, 'w', 'utf8') as file_:
for orth, freq in counts:
string = nlp.vocab.strings[orth]
file_.write('%d\t%s\n' % (freq, repr(string)))
def parallelize(func, iterator, n_jobs):
Parallel(n_jobs=n_jobs)(delayed(func)(*item) for item in iterator)
def merge_counts(locs, out_loc):
string_map = StringStore()
counts = PreshCounter()
for loc in locs:
with codecs.open(loc, 'r', 'utf8') as file_:
for line in file_:
freq, word = line.strip().split('\t', 1)
orth = string_map[word]
counts.inc(orth, int(freq))
with codecs.open(out_loc, 'w', 'utf8') as file_:
for orth, count in counts:
string = string_map[orth]
file_.write('%d\t%s\n' % (count, string))
@plac.annotations(
input_loc=("Location of input file list"),
freqs_dir=("Directory for frequency files"),
output_loc=("Location for output file"),
n_jobs=("Number of workers", "option", "n", int),
skip_existing=("Skip inputs where an output file exists", "flag", "s", bool),
)
def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
tasks = []
outputs = []
for input_path in open(input_loc):
input_path = input_path.strip()
if not input_path:
continue
filename = input_path.split('/')[-1]
output_path = path.join(freqs_dir, filename.replace('bz2', 'freq'))
outputs.append(output_path)
if not path.exists(output_path) or not skip_existing:
tasks.append((input_path, output_path))
parallelize(count_freqs, tasks, n_jobs)
merge_counts(outputs, output_loc)
if __name__ == '__main__':
plac.call(main)
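
Usage note (not part of the file above): because the script is driven by plac,
the command line mirrors main()'s signature, roughly
python bin/get_freqs.py input_list.txt freqs/ en_freqs.txt -n 8 -s, where the
file names are placeholders, input_list.txt lists one .bz2 path per line, and
each .bz2 file holds one JSON object per line with a 'body' field, as
iter_comments() expects. The merge step rests on the StringStore/PreshCounter
round-trip; a minimal standalone sketch of that round-trip, using the same API
as above on toy data:

from preshed.counter import PreshCounter
from spacy.strings import StringStore

strings = StringStore()
counts = PreshCounter()
for word, freq in [(u'the', 5), (u'cat', 2), (u'the', 3)]:
    orth = strings[word]      # unicode string -> integer ID, interned on first use
    counts.inc(orth, freq)    # accumulate counts keyed by the integer ID
for orth, freq in counts:
    print('%d\t%s' % (freq, strings[orth]))   # integer ID -> unicode string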

View File

@ -30,8 +30,6 @@ from spacy.vocab import write_binary_vectors
from spacy.parts_of_speech import NOUN, VERB, ADJ
import spacy.senses
def setup_tokenizer(lang_data_dir, tok_dir):
if not tok_dir.exists():
@ -46,6 +44,9 @@ def setup_tokenizer(lang_data_dir, tok_dir):
def _read_clusters(loc):
if not loc.exists():
print "Warning: Clusters file not found"
return {}
clusters = {}
for line in codecs.open(str(loc), 'r', 'utf8'):
try:
@ -70,6 +71,9 @@ def _read_clusters(loc):
def _read_probs(loc):
if not loc.exists():
print "Warning: Probabilities file not found"
return {}
probs = {}
for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
prob, word = line.split()
@ -80,6 +84,9 @@ def _read_probs(loc):
def _read_senses(loc):
lexicon = defaultdict(lambda: defaultdict(list))
if not loc.exists():
print "Warning: WordNet senses not found"
return lexicon
sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
for line in codecs.open(str(loc), 'r', 'utf8'):
@ -101,13 +108,11 @@ def setup_vocab(src_dir, dst_dir):
vectors_src = src_dir / 'vectors.tgz'
if vectors_src.exists():
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
else:
print "Warning: Word vectors file not found"
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
clusters = _read_clusters(src_dir / 'clusters.txt')
senses = _read_senses(src_dir / 'supersenses.txt')
probs = _read_probs(src_dir / 'words.sgt.prob')
for word in set(clusters).union(set(senses)):
if word not in probs:
probs[word] = -17.0
lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
lexicon = []
for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
@ -120,15 +125,6 @@ def setup_vocab(src_dir, dst_dir):
entry['cluster'] = int(cluster[::-1], 2)
orth_senses = set()
lemmas = []
for pos in [NOUN, VERB, ADJ]:
for lemma in lemmatizer(word.lower(), pos):
lemmas.append(lemma)
orth_senses.update(senses[lemma][pos])
if word.lower() == 'dogging':
print word
print lemmas
print [spacy.senses.STRINGS[si] for si in orth_senses]
entry['senses'] = list(sorted(orth_senses))
vocab[word] = entry
vocab.dump(str(dst_dir / 'lexemes.bin'))
vocab.strings.dump(str(dst_dir / 'strings.txt'))

bin/parser/nn_train.py (new executable file, 261 lines)

@ -0,0 +1,261 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
import os
from os import path
import shutil
import codecs
import random
import plac
import cProfile
import pstats
import re
import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.syntax.parser import Parser, get_templates
from spacy._theano import TheanoModel
import theano
import theano.tensor as T
from theano.printing import Print
import numpy
from collections import OrderedDict, defaultdict
theano.config.profile = False
theano.config.floatX = 'float32'
floatX = theano.config.floatX
def L1(L1_reg, *weights):
return L1_reg * sum(abs(w).sum() for w in weights)
def L2(L2_reg, *weights):
return L2_reg * sum((w ** 2).sum() for w in weights)
def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
updates = OrderedDict()
for param in params:
value = param.get_value(borrow=True)
accu = theano.shared(numpy.zeros(value.shape, dtype=value.dtype),
broadcastable=param.broadcastable)
grad = T.grad(loss, param)
accu_new = rho * accu + (1 - rho) * grad ** 2
updates[accu] = accu_new
updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
return updates
def relu(x):
return x * (x > 0)
def feed_layer(activation, weights, bias, input_):
return activation(T.dot(input_, weights) + bias)
def init_weights(n_in, n_out):
rng = numpy.random.RandomState(1235)
weights = numpy.asarray(
rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in),
dtype=theano.config.floatX
)
bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
return [theano.shared(weights, name='W'), theano.shared(bias, name='b')]
def compile_model(n_classes, n_hidden, n_in, optimizer):
x = T.vector('x')
costs = T.ivector('costs')
loss = T.scalar('loss')
maxent_W, maxent_b = init_weights(n_hidden, n_classes)
hidden_W, hidden_b = init_weights(n_in, n_hidden)
# Feed the inputs forward through the network
p_y_given_x = feed_layer(
T.nnet.softmax,
maxent_W,
maxent_b,
feed_layer(
relu,
hidden_W,
hidden_b,
x))
loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)
train_model = theano.function(
name='train_model',
inputs=[x, costs],
outputs=[p_y_given_x[0], T.grad(loss, x), loss],
updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
on_unused_input='warn'
)
evaluate_model = theano.function(
name='evaluate_model',
inputs=[x],
outputs=[
feed_layer(
T.nnet.softmax,
maxent_W,
maxent_b,
feed_layer(
relu,
hidden_W,
hidden_b,
x
)
)[0]
]
)
return train_model, evaluate_model
def score_model(scorer, nlp, annot_tuples, verbose=False):
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
nlp.tagger(tokens)
nlp.parser(tokens)
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=verbose)
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
seed=0, n_sents=0, verbose=False):
dep_model_dir = path.join(model_dir, 'deps')
pos_model_dir = path.join(model_dir, 'pos')
if path.exists(dep_model_dir):
shutil.rmtree(dep_model_dir)
if path.exists(pos_model_dir):
shutil.rmtree(pos_model_dir)
os.mkdir(dep_model_dir)
os.mkdir(pos_model_dir)
setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)
Config.write(dep_model_dir, 'config',
seed=seed,
templates=tuple(),
labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
vector_lengths=(nv_word, nv_tag, nv_label),
hidden_nodes=nv_hidden,
eta=eta,
mu=mu
)
# Bake-in hyper-parameters
optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps)
nlp = Language(data_dir=model_dir)
n_classes = nlp.parser.model.n_classes
train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
nlp.parser.model = TheanoModel(n_classes, input_spec, train,
predict, model_loc)
if n_sents > 0:
gold_tuples = gold_tuples[:n_sents]
print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
log_loc = path.join(model_dir, 'job.log')
for itn in range(n_iter):
scorer = Scorer()
loss = 0
for _, sents in gold_tuples:
for annot_tuples, ctnt in sents:
if len(annot_tuples[1]) == 1:
continue
score_model(scorer, nlp, annot_tuples)
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
nlp.tagger(tokens)
gold = GoldParse(tokens, annot_tuples, make_projective=True)
assert gold.is_projective
loss += nlp.parser.train(tokens, gold)
nlp.tagger.train(tokens, gold.tags)
random.shuffle(gold_tuples)
logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
scorer.tags_acc,
scorer.token_acc)
print logline
with open(log_loc, 'aw') as file_:
file_.write(logline + '\n')
nlp.parser.model.end_training()
nlp.tagger.model.end_training()
nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
return nlp
def evaluate(nlp, gold_tuples, gold_preproc=True):
scorer = Scorer()
for raw_text, sents in gold_tuples:
for annot_tuples, brackets in sents:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
nlp.tagger(tokens)
nlp.parser(tokens)
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold)
return scorer
@plac.annotations(
train_loc=("Location of training file or directory"),
dev_loc=("Location of development file or directory"),
model_dir=("Location of output model directory",),
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
n_sents=("Number of training sentences", "option", "n", int),
n_iter=("Number of training iterations", "option", "i", int),
verbose=("Verbose error reporting", "flag", "v", bool),
nv_word=("Word vector length", "option", "W", int),
nv_tag=("Tag vector length", "option", "T", int),
nv_label=("Label vector length", "option", "L", int),
nv_hidden=("Hidden nodes length", "option", "H", int),
eta=("Learning rate", "option", "E", float),
mu=("Momentum", "option", "M", float),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False,
nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10,
eta=0.1, mu=0.9, eval_only=False):
gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id']))
nlp = train(English, gold_train, model_dir,
feat_set='embed',
eta=eta, mu=mu,
nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden,
n_sents=n_sents, n_iter=n_iter,
verbose=verbose)
scorer = evaluate(nlp, list(read_json_file(dev_loc)))
print 'TOK', 100-scorer.token_acc
print 'POS', scorer.tags_acc
print 'UAS', scorer.uas
print 'LAS', scorer.las
print 'NER P', scorer.ents_p
print 'NER R', scorer.ents_r
print 'NER F', scorer.ents_f
if __name__ == '__main__':
plac.call(main)

View File

@ -139,13 +139,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
nlp.tagger.train(tokens, gold.tags)
random.shuffle(gold_tuples)
print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
scorer.tags_acc,
scorer.token_acc)
nlp.parser.model.end_training()
nlp.entity.model.end_training()
nlp.tagger.model.end_training()
nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
scorer.tags_acc,
scorer.token_acc)
nlp.end_training()
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
beam_width=None):
@ -207,29 +203,22 @@ def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
out_loc=("Out location", "option", "o", str),
n_sents=("Number of training sentences", "option", "n", int),
n_iter=("Number of training iterations", "option", "i", int),
beam_width=("Number of candidates to maintain in the beam", "option", "k", int),
verbose=("Verbose error reporting", "flag", "v", bool),
debug=("Debug mode", "flag", "d", bool),
use_orig_arc_eager=("Use the original, monotonic arc-eager system", "flag", "m", bool)
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
debug=False, corruption_level=0.0, gold_preproc=False, beam_width=1,
eval_only=False, use_orig_arc_eager=False):
if use_orig_arc_eager:
English.ParserTransitionSystem = TreeArcEager
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
if not eval_only:
gold_train = list(read_json_file(train_loc))
train(English, gold_train, model_dir,
feat_set='basic' if not debug else 'debug',
gold_preproc=gold_preproc, n_sents=n_sents,
corruption_level=corruption_level, n_iter=n_iter,
beam_width=beam_width, verbose=verbose,
use_orig_arc_eager=use_orig_arc_eager)
verbose=verbose)
#if out_loc:
# write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
scorer = evaluate(English, list(read_json_file(dev_loc)),
model_dir, gold_preproc=gold_preproc, verbose=verbose,
beam_width=beam_width)
model_dir, gold_preproc=gold_preproc, verbose=verbose)
print 'TOK', scorer.token_acc
print 'POS', scorer.tags_acc
print 'UAS', scorer.uas

View File

@ -0,0 +1,116 @@
====================
Annotation Standards
====================
This document describes the target annotations spaCy is trained to predict.
This is currently a work in progress. Please ask questions on the issue tracker,
so that the answers can be integrated here to improve the documentation.
https://github.com/honnibal/spaCy/issues
English
=======
Tokenization
------------
Tokenization standards are based on the OntoNotes 5 corpus.
The tokenizer differs from most others in that it includes tokens for significant whitespace.
Any sequence of whitespace characters beyond a single space (' ') is included
as a token. For instance:
>>> from spacy.en import English
>>> nlp = English(parse=False)
>>> tokens = nlp(u'Some\nspaces and\ttab characters')
>>> print [t.orth_ for t in tokens]
[u'Some', u'\n', u'spaces', u' ', u'and', u'\t', u'tab', u'characters']
The whitespace tokens are useful for much the same reason punctuation is --- it's
often an important delimiter in the text. By preserving it in the token output,
we are able to maintain a simple alignment between the tokens and the original
string, and we ensure that the token stream does not lose information.
Sentence boundary detection
---------------------------
Sentence boundaries are calculated from the syntactic parse tree, so features
such as punctuation and capitalisation play an important but non-decisive role
in determining the sentence boundaries. Usually this means that the sentence
boundaries will at least coincide with clause boundaries, even given poorly
punctuated text.
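A rough sketch of inspecting the resulting boundaries (the sents iterator and
the exact type it yields may differ in this version; output omitted):
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp(u'This is one sentence. Here is another.')
>>> for sent in tokens.sents:
...     print sent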
Part-of-speech Tagging
----------------------
The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank
tag set. We also map the tags to the simpler Google Universal POS Tag set.
Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124
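For instance (a sketch; tag_ is assumed to hold the Treebank tag and pos_ the
universal tag, and output is omitted):
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp(u'They told us to duck.')
>>> print [(t.orth_, t.tag_, t.pos_) for t in tokens]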
Lemmatization
-------------
A "lemma" is the uninflected form of a word. In English, this means:
* Adjectives: The form like "happy", not "happier" or "happiest"
* Adverbs: The form like "badly", not "worse" or "worst"
* Nouns: The form like "dog", not "dogs"; like "child", not "children"
* Verbs: The form like "write", not "writes", "writing", "wrote" or "written"
The lemmatization data is taken from WordNet. However, we also add a special
case for pronouns: all pronouns are lemmatized to the special token -PRON-.
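For instance (a sketch, output omitted), one would expect "reading" to come out
as "read", "papers" as "paper", and the pronoun "I" as -PRON-:
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp(u'I was reading the papers.')
>>> print [(t.orth_, t.lemma_) for t in tokens]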
Syntactic Dependency Parsing
----------------------------
The parser is trained on data produced by the ClearNLP converter. Details of
the annotation scheme can be found here:
http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf
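For instance (a sketch, assuming the dep_ and head token attributes; output
omitted):
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp(u'Google bought a small startup.')
>>> print [(t.orth_, t.dep_, t.head.orth_) for t in tokens]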
Named Entity Recognition
------------------------
+--------------+-----------------------------------------------------+
| PERSON | People, including fictional |
+--------------+-----------------------------------------------------+
| NORP | Nationalities or religious or political groups |
+--------------+-----------------------------------------------------+
| FACILITY | Buildings, airports, highways, bridges, etc. |
+--------------+-----------------------------------------------------+
| ORGANIZATION | Companies, agencies, institutions, etc. |
+--------------+-----------------------------------------------------+
| GPE | Countries, cities, states |
+--------------+-----------------------------------------------------+
| LOCATION | Non-GPE locations, mountain ranges, bodies of water |
+--------------+-----------------------------------------------------+
| PRODUCT | Vehicles, weapons, foods, etc. (Not services) |
+--------------+-----------------------------------------------------+
| EVENT | Named hurricanes, battles, wars, sports events, etc.|
+--------------+-----------------------------------------------------+
| WORK OF ART | Titles of books, songs, etc. |
+--------------+-----------------------------------------------------+
| LAW | Named documents made into laws |
+--------------+-----------------------------------------------------+
| LANGUAGE | Any named language |
+--------------+-----------------------------------------------------+
The following values are also annotated in a style similar to names:
+--------------+---------------------------------------------+
| DATE | Absolute or relative dates or periods |
+--------------+---------------------------------------------+
| TIME | Times smaller than a day |
+--------------+---------------------------------------------+
| PERCENT | Percentage (including “%”) |
+--------------+---------------------------------------------+
| MONEY | Monetary values, including unit |
+--------------+---------------------------------------------+
| QUANTITY | Measurements, as of weight or distance |
+--------------+---------------------------------------------+
| ORDINAL | "first", "second" |
+--------------+---------------------------------------------+
| CARDINAL | Numerals that do not fall under another type|
+--------------+---------------------------------------------+
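A sketch of reading the entity annotations off a parsed document (accessor
names here follow later conventions and may differ slightly in this version;
output omitted):
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp(u'Apple opened an office in Berlin in March.')
>>> for ent in tokens.ents:
...     print ent.label_, ent.orth_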

View File

@ -1,3 +1,3 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z0-9])-(?=[a-zA-z])
(?<=[a-zA-Z])-(?=[0-9a-zA-z])
(?<=[a-zA-Z])-(?=[a-zA-z])

View File

@ -6,21 +6,21 @@
"ain't": [{"F": "ai", "L": "be", "pos": "VBP", "number": 2},
{"F": "n't", "L": "not", "pos": "RB"}],
"aint": [{"F": "ai", "L": "be", "pos": "VBP", "number": 2},
{"F": "n't", "L": "not", "pos": "RB"}],
{"F": "nt", "L": "not", "pos": "RB"}],
"Ain't": [{"F": "Ai", "L": "be", "pos": "VBP", "number": 2},
{"F": "n't", "L": "not", "pos": "RB"}],
"aren't": [{"F": "are", "L": "be", "pos": "VBP", "number": 2},
{"F": "n't", "L": "not"}],
"arent": [{"F": "are", "L": "be", "pos": "VBP", "number": 2},
{"F": "n't", "L": "not"}],
{"F": "nt", "L": "not"}],
"Aren't": [{"F": "Are", "L": "be", "pos": "VBP", "number": 2},
{"F": "n't", "L": "not"}],
"can't": [{"F": "ca", "L": "can", "pos": "MD"},
{"F": "n't", "L": "not", "pos": "RB"}],
"cant": [{"F": "ca", "L": "can", "pos": "MD"},
{"F": "n't", "L": "not", "pos": "RB"}],
{"F": "nt", "L": "not", "pos": "RB"}],
"Can't": [{"F": "Ca", "L": "can", "pos": "MD"},
{"F": "n't", "L": "not", "pos": "RB"}],
@ -32,14 +32,14 @@
"could've": [{"F": "could", "pos": "MD"},
{"F": "'ve", "L": "have", "pos": "VB"}],
"couldve": [{"F": "could", "pos": "MD"},
{"F": "'ve", "L": "have", "pos": "VB"}],
{"F": "ve", "L": "have", "pos": "VB"}],
"Could've": [{"F": "Could", "pos": "MD"},
{"F": "'ve", "L": "have", "pos": "VB"}],
"couldn't": [{"F": "could", "pos": "MD"},
{"F": "n't", "L": "not", "pos": "RB"}],
"couldnt": [{"F": "could", "pos": "MD"},
{"F": "n't", "L": "not", "pos": "RB"}],
{"F": "nt", "L": "not", "pos": "RB"}],
"Couldn't": [{"F": "Could", "pos": "MD"},
{"F": "n't", "L": "not", "pos": "RB"}],
@ -47,8 +47,8 @@
{"F": "n't", "L": "not", "pos": "RB"},
{"F": "'ve", "pos": "VB"}],
"couldntve": [{"F": "could", "pos": "MD"},
{"F": "n't", "L": "not", "pos": "RB"},
{"F": "'ve", "pos": "VB"}],
{"F": "nt", "L": "not", "pos": "RB"},
{"F": "ve", "pos": "VB"}],
"Couldn't've": [{"F": "Could", "pos": "MD"},
{"F": "n't", "L": "not", "pos": "RB"},
{"F": "'ve", "pos": "VB"}],
@ -56,28 +56,28 @@
"didn't": [{"F": "did", "pos": "VBD", "L": "do"},
{"F": "n't", "L": "not", "pos": "RB"}],
"didnt": [{"F": "did", "pos": "VBD", "L": "do"},
{"F": "n't", "L": "not", "pos": "RB"}],
{"F": "nt", "L": "not", "pos": "RB"}],
"Didn't": [{"F": "Did", "pos": "VBD", "L": "do"},
{"F": "n't", "L": "not", "pos": "RB"}],
"doesn't": [{"F": "does", "L": "do", "pos": "VBZ"},
{"F": "n't", "L": "not", "pos": "RB"}],
"doesnt": [{"F": "does", "L": "do", "pos": "VBZ"},
{"F": "n't", "L": "not", "pos": "RB"}],
{"F": "nt", "L": "not", "pos": "RB"}],
"Doesn't": [{"F": "Does", "L": "do", "pos": "VBZ"},
{"F": "n't", "L": "not", "pos": "RB"}],
"don't": [{"F": "do", "L": "do"},
{"F": "n't", "L": "not", "pos": "RB"}],
"dont": [{"F": "do", "L": "do"},
{"F": "n't", "L": "not", "pos": "RB"}],
{"F": "nt", "L": "not", "pos": "RB"}],
"Don't": [{"F": "Do", "L": "do"},
{"F": "n't", "L": "not", "pos": "RB"}],
"hadn't": [{"F": "had", "L": "have", "pos": "VBD"},
{"F": "n't", "L": "not", "pos": "RB"}],
"hadnt": [{"F": "had", "L": "have", "pos": "VBD"},
{"F": "n't", "L": "not", "pos": "RB"}],
{"F": "nt", "L": "not", "pos": "RB"}],
"Hadn't": [{"F": "Had", "L": "have", "pos": "VBD"},
{"F": "n't", "L": "not", "pos": "RB"}],
@ -88,25 +88,25 @@
"hasn't": [{"F": "has"},
{"F": "n't", "L": "not", "pos": "RB"}],
"hasnt": [{"F": "has"},
{"F": "n't", "L": "not", "pos": "RB"}],
{"F": "nt", "L": "not", "pos": "RB"}],
"haven't": [{"F": "have", "pos": "VB"},
{"F": "n't", "L": "not", "pos": "RB"}],
"havent": [{"F": "have", "pos": "VB"},
{"F": "n't", "L": "not", "pos": "RB"}],
{"F": "nt", "L": "not", "pos": "RB"}],
"he'd": [{"F": "he", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"}],
"hed": [{"F": "he", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"}],
{"F": "d", "L": "would", "pos": "MD"}],
"he'd've": [{"F": "he", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"},
{"F": "'ve", "pos": "VB"}],
"hedve": [{"F": "he", "L": "-PRON-"},
{"F": "'d", "L": "would", "pos": "MD"},
{"F": "'ve", "pos": "VB"}],
{"F": "d", "L": "would", "pos": "MD"},
{"F": "ve", "pos": "VB"}],
"he'll": [{"F": "he", "L": "-PRON-"},
@ -116,25 +116,25 @@
{"F": "'s"}],
"hes": [{"F": "he", "L": "-PRON-"},
{"F": "'s"}],
{"F": "s"}],
"how'd": [{"F": "how"},
{"F": "'d", "L": "would", "pos": "MD"}],
"howd": [{"F": "how"},
{"F": "'d", "L": "would", "pos": "MD"}],
{"F": "d", "L": "would", "pos": "MD"}],
"how'll": [{"F": "how"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"howll": [{"F": "how"},
{"F": "'ll", "L": "will", "pos": "MD"}],
{"F": "ll", "L": "will", "pos": "MD"}],
"how's": [{"F": "how"},
{"F": "'s"}],
"hows": [{"F": "how"},
{"F": "'s"}],
{"F": "s"}],
"I'd": [{"F": "I", "L": "-PRON-"},
@ -150,9 +150,9 @@
"I'm": [{"F": "I", "L": "-PRON-"},
{"F": "'m", "L": "be", "pos": "VBP", "number": 1, "tenspect": 1}],
"Im": [{"F": "I", "L": "-PRON-"},
{"F": "'m", "L": "be", "pos": "VBP", "number": 1, "tenspect": 1}],
"im": [{"F": "m", "L": "-PRON-"},
{"F": "'m", "L": "be", "pos": "VBP", "number": 1, "tenspect": 1}],
{"F": "m", "L": "be", "pos": "VBP", "number": 1, "tenspect": 1}],
"im": [{"F": "i", "L": "-PRON-"},
{"F": "m", "L": "be", "pos": "VBP", "number": 1, "tenspect": 1}],
"I'ma": [{"F": "I", "L": "-PRON-"},
{"F": "'ma"}],
@ -163,7 +163,7 @@
"isn't": [{"F": "is", "L": "be", "pos": "VBZ"},
{"F": "n't", "L": "not", "pos": "RB"}],
"isnt": [{"F": "is", "L": "be", "pos": "VBZ"},
{"F": "n't", "L": "not", "pos": "RB"}],
{"F": "nt", "L": "not", "pos": "RB"}],
"Isn't": [{"F": "Is", "L": "be", "pos": "VBZ"},
{"F": "n't", "L": "not", "pos": "RB"}],
@ -179,7 +179,7 @@
"it'll": [{"F": "it", "L": "-PRON-"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"itll": [{"F": "it", "L": "-PRON-"},
{"F": "'ll", "L": "will", "pos": "MD"}],
{"F": "ll", "L": "will", "pos": "MD"}],
"it's": [{"F": "it", "L": "-PRON-"},
@ -188,7 +188,7 @@
"let's": [{"F": "let"},
{"F": "'s"}],
"lets": [{"F": "let"},
{"F": "'s"}],
{"F": "s", "L": "'s"}],
"mightn't": [{"F": "might"},
@ -224,7 +224,7 @@
{"F": "'ve", "pos": "VB"}],
"she'll": [{"F": "she", "L": "-PRON-"},
{"F": "will"}],
{"F": "'ll", "L": "will"}],
"she's": [{"F": "she", "L": "-PRON-"},
{"F": "'s"}],
@ -243,7 +243,7 @@
{"F": "'s"}],
"thats": [{"F": "that"},
{"F": "'s"}],
{"F": "s", "L": "'s"}],
"there'd": [{"F": "there"},
@ -369,7 +369,7 @@
"won't": [{"F": "wo"},
{"F": "n't", "L": "not", "pos": "RB"}],
"wont": [{"F": "wo"},
{"F": "n't", "L": "not", "pos": "RB"}],
{"F": "nt", "L": "not", "pos": "RB"}],
"would've": [{"F": "would"},
@ -392,6 +392,10 @@
"you'll": [{"F": "you", "L": "-PRON-"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"You'll": [{"F": "You", "L": "-PRON-"},
{"F": "'ll", "L": "will", "pos": "MD"}],
"you're": [{"F": "you", "L": "-PRON-"},
{"F": "'re"}],
"You're": [{"F": "You", "L": "-PRON-"},
@ -401,6 +405,10 @@
"you've": [{"F": "you", "L": "-PRON-"},
{"F": "'ve", "L": "have", "pos": "VB"}],
"You've": [{"F": "You", "L": "-PRON-"},
{"F": "'ve", "L": "have", "pos": "VB"}],
"'em": [{"F": "'em"}],
"'ol": [{"F": "'ol"}],

View File

@ -93,6 +93,8 @@ def cython_setup(mod_names, language, includes, compile_args, link_args):
"data/wordnet/*", "data/tokenizer/*",
"data/vocab/lexemes.bin",
"data/vocab/strings.txt"],
"spacy.tokens": ["*.pxd"],
"spacy.serialize": ["*.pxd"],
"spacy.syntax": ["*.pxd"]},
ext_modules=exts,
cmdclass={'build_ext': Cython.Distutils.build_ext},
@ -103,7 +105,7 @@ def cython_setup(mod_names, language, includes, compile_args, link_args):
def run_setup(exts):
setup(
name='spacy',
packages=['spacy', 'spacy.en', 'spacy.syntax', 'spacy.munge'],
packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.syntax', 'spacy.munge'],
description="Industrial-strength NLP",
author='Matthew Honnibal',
author_email='honnibal@gmail.com',
@ -148,15 +150,19 @@ def main(modules, is_pypy):
MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.lexeme', 'spacy.vocab', 'spacy.tokens', 'spacy.spans',
'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
'spacy.morphology',
'spacy.syntax.stateclass',
'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs',
'spacy._ml', 'spacy._theano',
'spacy.tokenizer', 'spacy.en.attrs',
'spacy.en.pos', 'spacy.syntax.parser',
'spacy.syntax.transition_system',
'spacy.syntax.arc_eager',
'spacy.syntax._parse_features',
'spacy.gold', 'spacy.orth',
'spacy.gold', 'spacy.orth',
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
'spacy.cfile',
'spacy.syntax.ner']

View File

@ -5,20 +5,26 @@ from cymem.cymem cimport Pool
from thinc.learner cimport LinearModel
from thinc.features cimport Extractor, Feature
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from thinc.api cimport ExampleC
from preshed.maps cimport PreshMapArray
from .typedefs cimport hash_t, id_t
from .typedefs cimport hash_t
cdef int arg_max(const weight_t* scores, const int n_classes) nogil
cdef int arg_max_if_true(const weight_t* scores, const int* is_valid, int n_classes) nogil
cdef int arg_max_if_zero(const weight_t* scores, const int* costs, int n_classes) nogil
cdef class Model:
cdef int n_classes
cdef readonly int n_classes
cdef readonly int n_feats
cdef const weight_t* score(self, atom_t* context) except NULL
cdef int set_scores(self, weight_t* scores, atom_t* context) except -1
cdef int set_scores(self, weight_t* scores, atom_t* context) nogil
cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1

View File

@ -10,6 +10,7 @@ import cython
import numpy.random
from thinc.features cimport Feature, count_feats
from thinc.api cimport Example
cdef int arg_max(const weight_t* scores, const int n_classes) nogil:
@ -23,23 +24,58 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil:
return best
cdef int arg_max_if_true(const weight_t* scores, const int* is_valid,
const int n_classes) nogil:
cdef int i
cdef int best = 0
cdef weight_t mode = -900000
for i in range(n_classes):
if is_valid[i] and scores[i] > mode:
mode = scores[i]
best = i
return best
cdef int arg_max_if_zero(const weight_t* scores, const int* costs,
const int n_classes) nogil:
cdef int i
cdef int best = 0
cdef weight_t mode = -900000
for i in range(n_classes):
if costs[i] == 0 and scores[i] > mode:
mode = scores[i]
best = i
return best
cdef class Model:
def __init__(self, n_classes, templates, model_loc=None):
if model_loc is not None and path.isdir(model_loc):
model_loc = path.join(model_loc, 'model')
self.n_classes = n_classes
self._extractor = Extractor(templates)
self.n_feats = self._extractor.n_templ
self._model = LinearModel(n_classes, self._extractor.n_templ)
self.model_loc = model_loc
if self.model_loc and path.exists(self.model_loc):
self._model.load(self.model_loc, freq_thresh=0)
def predict(self, Example eg):
self.set_scores(eg.c.scores, eg.c.atoms)
eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
def train(self, Example eg):
self.predict(eg)
eg.c.best = arg_max_if_zero(eg.c.scores, eg.c.costs, self.n_classes)
eg.c.cost = eg.c.costs[eg.c.guess]
self.update(eg.c.atoms, eg.c.guess, eg.c.best, eg.c.cost)
cdef const weight_t* score(self, atom_t* context) except NULL:
cdef int n_feats
feats = self._extractor.get_feats(context, &n_feats)
return self._model.get_scores(feats, n_feats)
cdef int set_scores(self, weight_t* scores, atom_t* context) except -1:
cdef int set_scores(self, weight_t* scores, atom_t* context) nogil:
cdef int n_feats
feats = self._extractor.get_feats(context, &n_feats)
self._model.set_scores(scores, feats, n_feats)

spacy/_nn.py (new file, 3 lines)

@ -0,0 +1,3 @@
"""Feed-forward neural network, using Thenao."""

spacy/_nn.pyx (new file, 146 lines)

@ -0,0 +1,146 @@
"""Feed-forward neural network, using Thenao."""
import os
import sys
import time
import numpy
import theano
import theano.tensor as T
import plac
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
def build_model(n_classes, n_vocab, n_hidden, n_word_embed, n_tag_embed):
# allocate symbolic variables for the data
words = T.vector('words')
tags = T.vector('tags')
word_e = _init_embedding(n_words, n_word_embed)
tag_e = _init_embedding(n_tags, n_tag_embed)
label_e = _init_embedding(n_labels, n_label_embed)
maxent_W, maxent_b = _init_maxent_weights(n_hidden, n_classes)
hidden_W, hidden_b = _init_hidden_weights(28*28, n_hidden, T.tanh)
params = [hidden_W, hidden_b, maxent_W, maxent_b, word_e, tag_e, label_e]
x = T.concatenate([
T.flatten(word_e[word_indices], outdim=1),
T.flatten(tag_e[tag_indices], outdim=1)])
p_y_given_x = feed_layer(
T.nnet.softmax,
maxent_W,
maxent_b,
feed_layer(
T.tanh,
hidden_W,
hidden_b,
x))[0]
guess = T.argmax(p_y_given_x)
cost = (
-T.log(p_y_given_x[y])
+ L1(L1_reg, maxent_W, hidden_W, word_e, tag_e)
+ L2(L2_reg, maxent_W, hidden_W, word_e, tag_e)
)
train_model = theano.function(
inputs=[words, tags, y],
outputs=guess,
updates=[update(learning_rate, param, cost) for param in params]
)
evaluate_model = theano.function(
inputs=[x, y],
outputs=T.neq(y, T.argmax(p_y_given_x[0])),
)
return train_model, evaluate_model
def _init_embedding(vocab_size, n_dim):
embedding = 0.2 * numpy.random.uniform(-1.0, 1.0, (vocab_size+1, n_dim))
return theano.shared(embedding).astype(theano.config.floatX)
def _init_maxent_weights(n_hidden, n_out):
weights = numpy.zeros((n_hidden, 10), dtype=theano.config.floatX)
bias = numpy.zeros((10,), dtype=theano.config.floatX)
return (
theano.shared(name='W', borrow=True, value=weights),
theano.shared(name='b', borrow=True, value=bias)
)
def _init_hidden_weights(n_in, n_out, activation=T.tanh):
rng = numpy.random.RandomState(1234)
weights = numpy.asarray(
rng.uniform(
low=-numpy.sqrt(6. / (n_in + n_out)),
high=numpy.sqrt(6. / (n_in + n_out)),
size=(n_in, n_out)
),
dtype=theano.config.floatX
)
bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
return (
theano.shared(value=weights, name='W', borrow=True),
theano.shared(value=bias, name='b', borrow=True)
)
def feed_layer(activation, weights, bias, input):
return activation(T.dot(input, weights) + bias)
def L1(L1_reg, w1, w2):
return L1_reg * (abs(w1).sum() + abs(w2).sum())
def L2(L2_reg, w1, w2):
return L2_reg * ((w1 ** 2).sum() + (w2 ** 2).sum())
def update(eta, param, cost):
return (param, param - (eta * T.grad(cost, param)))
def main(train_loc, eval_loc, model_dir):
learning_rate = 0.01
L1_reg = 0.00
L2_reg = 0.0001
print "... reading the data"
gold_train = list(read_json_file(train_loc))
print '... building the model'
pos_model_dir = path.join(model_dir, 'pos')
if path.exists(pos_model_dir):
shutil.rmtree(pos_model_dir)
os.mkdir(pos_model_dir)
setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)
train_model, evaluate_model = build_model(n_hidden, len(POS_TAGS), learning_rate,
L1_reg, L2_reg)
print '... training'
for epoch in range(1, n_epochs+1):
for raw_text, sents in gold_tuples:
for (ids, words, tags, ner, heads, deps), _ in sents:
tokens = nlp.tokenizer.tokens_from_list(words)
for t in tokens:
guess = train_model([t.orth], [t.tag])
loss += guess != t.tag
print loss
# compute zero-one loss on validation set
#error = numpy.mean([evaluate_model(x, y) for x, y in dev_examples])
#print('epoch %i, validation error %f %%' % (epoch, error * 100))
if __name__ == '__main__':
plac.call(main)

spacy/_theano.pxd (new file, 13 lines)

@ -0,0 +1,13 @@
from ._ml cimport Model
from thinc.nn cimport InputLayer
cdef class TheanoModel(Model):
cdef InputLayer input_layer
cdef object train_func
cdef object predict_func
cdef object debug
cdef public float eta
cdef public float mu
cdef public float t

spacy/_theano.pyx (new file, 52 lines)

@ -0,0 +1,52 @@
from thinc.api cimport Example, ExampleC
from thinc.typedefs cimport weight_t
from ._ml cimport arg_max_if_true
from ._ml cimport arg_max_if_zero
import numpy
from os import path
cdef class TheanoModel(Model):
def __init__(self, n_classes, input_spec, train_func, predict_func, model_loc=None,
eta=0.001, mu=0.9, debug=None):
if model_loc is not None and path.isdir(model_loc):
model_loc = path.join(model_loc, 'model')
self.eta = eta
self.mu = mu
self.t = 1
initializer = lambda: 0.2 * numpy.random.uniform(-1.0, 1.0)
self.input_layer = InputLayer(input_spec, initializer)
self.train_func = train_func
self.predict_func = predict_func
self.debug = debug
self.n_classes = n_classes
self.n_feats = len(self.input_layer)
self.model_loc = model_loc
def predict(self, Example eg):
self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=True)
theano_scores = self.predict_func(eg.embeddings)[0]
cdef int i
for i in range(self.n_classes):
eg.c.scores[i] = theano_scores[i]
eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
def train(self, Example eg):
self.input_layer.fill(eg.embeddings, eg.atoms, use_avg=False)
theano_scores, update, y, loss = self.train_func(eg.embeddings, eg.costs,
self.eta, self.mu)
self.input_layer.update(update, eg.atoms, self.t, self.eta, self.mu)
for i in range(self.n_classes):
eg.c.scores[i] = theano_scores[i]
eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
eg.c.best = arg_max_if_zero(eg.c.scores, eg.c.costs, self.n_classes)
eg.c.cost = eg.c.costs[eg.c.guess]
eg.c.loss = loss
self.t += 1
def end_training(self):
pass

View File

@ -79,3 +79,7 @@ cpdef enum attr_id_t:
POS
TAG
DEP
ENT_IOB
ENT_TYPE
HEAD
SPACY

spacy/cfile.pxd (new file, 12 lines)

@ -0,0 +1,12 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool
cdef class CFile:
cdef FILE* fp
cdef bint is_open
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *

spacy/cfile.pyx (new file, 38 lines)

@ -0,0 +1,38 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
cdef class CFile:
def __init__(self, loc, bytes mode):
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self.fp = fopen(<char*>bytes_loc, mode)
if self.fp == NULL:
raise IOError("Could not open binary file %s" % bytes_loc)
self.is_open = True
def __dealloc__(self):
if self.is_open:
fclose(self.fp)
def close(self):
fclose(self.fp)
self.is_open = False
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
st = fread(dest, elem_size, number, self.fp)
if st != number:
raise IOError
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
st = fwrite(src, elem_size, number, self.fp)
if st != number:
raise IOError
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
cdef void* dest = mem.alloc(number, elem_size)
self.read_into(dest, number, elem_size)
return dest
def write_unicode(self, unicode value):
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)

View File

@ -1,6 +1,8 @@
from __future__ import unicode_literals
from os import path
import re
import struct
import json
from .. import orth
from ..vocab import Vocab
@ -8,6 +10,7 @@ from ..tokenizer import Tokenizer
from ..syntax.arc_eager import ArcEager
from ..syntax.ner import BiluoPushDown
from ..syntax.parser import ParserFactory
from ..serialize.bits import BitArray
from ..tokens import Doc
from ..multi_words import RegexMerger
@ -19,6 +22,8 @@ from . import regexes
from ..util import read_lang_data
from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
def get_lex_props(string):
return {
@ -70,10 +75,11 @@ class English(object):
Tagger=EnPosTagger,
Parser=ParserFactory(ParserTransitionSystem),
Entity=ParserFactory(EntityTransitionSystem),
Packer=None,
load_vectors=True
):
self._data_dir = data_dir
self.data_dir = data_dir
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
get_lex_props=get_lex_props, load_vectors=load_vectors,
@ -101,6 +107,10 @@ class English(object):
self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
else:
self.entity = None
if Packer:
self.packer = Packer(self.vocab, data_dir)
else:
self.packer = None
self.mwe_merger = RegexMerger([
('IN', 'O', regexes.MW_PREPOSITIONS_RE),
('CD', 'TIME', regexes.TIME_RE),
@ -135,7 +145,24 @@ class English(object):
self.mwe_merger(tokens)
return tokens
def end_training(self, data_dir=None):
if data_dir is None:
data_dir = self.data_dir
self.parser.model.end_training()
self.entity.model.end_training()
self.tagger.model.end_training()
self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
file_.write(
json.dumps([
(TAG, self.tagger.freqs[TAG].items()),
(DEP, self.parser.moves.freqs[DEP].items()),
(ENT_IOB, self.entity.moves.freqs[ENT_IOB].items()),
(ENT_TYPE, self.entity.moves.freqs[ENT_TYPE].items()),
(HEAD, self.parser.moves.freqs[HEAD].items())]))
@property
def tags(self):
"""List of part-of-speech tag names."""
"""Deprecated. List of part-of-speech tag names."""
return self.tagger.tag_names

View File

@ -14,6 +14,9 @@ from ..attrs cimport LEMMA as _LEMMA
from ..attrs cimport POS as _POS
from ..attrs cimport TAG as _TAG
from ..attrs cimport DEP as _DEP
from ..attrs cimport HEAD as _HEAD
from ..attrs cimport ENT_IOB as _ENT_IOB
from ..attrs cimport ENT_TYPE as _ENT_TYPE
cpdef enum:

View File

@ -1,4 +1,5 @@
from preshed.maps cimport PreshMapArray
from preshed.counter cimport PreshCounter
from cymem.cymem cimport Pool
from .._ml cimport Model
@ -14,6 +15,7 @@ cdef class EnPosTagger:
cdef readonly Model model
cdef public object lemmatizer
cdef PreshMapArray _morph_cache
cdef public dict freqs
cdef PosTag* tags
cdef readonly object tag_names

View File

@ -7,18 +7,19 @@ from libc.string cimport memset
from cymem.cymem cimport Address
from thinc.typedefs cimport atom_t, weight_t
from collections import defaultdict
from ..parts_of_speech cimport univ_pos_t
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
from ..typedefs cimport id_t
from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens cimport Doc
from ..tokens.doc cimport Doc
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max
from .attrs cimport IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
from ..typedefs cimport attr_t
from .lemmatizer import Lemmatizer
@ -260,6 +261,10 @@ cdef class EnPosTagger:
self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer',
'morphs.json'))))
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
self.freqs = {TAG: defaultdict(int)}
for tag in self.tag_names:
self.freqs[TAG][self.strings[tag]] = 1
self.freqs[TAG][0] = 1
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.
@ -309,6 +314,7 @@ cdef class EnPosTagger:
tokens.data[i].tag = self.strings[self.tag_names[guess]]
self.set_morph(i, &self.tags[guess], tokens.data)
correct += loss == 0
self.freqs[TAG][tokens.data[i].tag] += 1
return correct
cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:
@ -342,7 +348,7 @@ cdef class EnPosTagger:
cdef dict entries
cdef dict props
cdef int lemma
cdef id_t orth
cdef attr_t orth
cdef int pos
for pos_str, entries in exc.items():
pos = self.tag_names.index(pos_str)

View File

@ -217,8 +217,9 @@ cdef class GoldParse:
self.orig_annot = zip(*annot_tuples)
words = [w.orth_ for w in tokens]
for i, gold_i in enumerate(self.cand_to_gold):
if self.words[i].isspace():
if words[i].isspace():
self.tags[i] = 'SP'
self.heads[i] = None
self.labels[i] = None

View File

@ -1,5 +1,7 @@
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
from .typedefs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
from .attrs cimport attr_id_t
from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .structs cimport LexemeC
from .strings cimport StringStore

View File

@ -1,169 +0,0 @@
from spacy.context cimport FIELD_IDS, Token
cdef Token P4 = FIELD_IDS.P4
cdef Token P3 = FIELD_IDS.P3
cdef Token P2 = FIELD_IDS.P2
cdef Token P1 = FIELD_IDS.P1
cdef Token N0 = FIELD_IDS.N0
cdef Token N1 = FIELD_IDS.N1
cdef Token N2 = FIELD_IDS.N2
cdef Token N3 = FIELD_IDS.N3
cdef Token N4 = FIELD_IDS.N4
"""
TEMPLATES = (
(N0.sic,),
(N0.cluster,),
(P1.pos,),
(P1.sic,),
(N1.norm,),
(N1.pos,),
(P1.ner,),
(P2.ner,),
(N0.cluster,),
(P1.cluster,),
(N1.cluster,),
(N0.is_alpha,),
(N0.is_digit,),
(N0.is_title,),
(N0.is_upper,),
(N0.is_title, N0.oft_title),
(N0.is_upper, N0.oft_upper),
(P1.cluster, N0.norm),
(N0.norm, N1.cluster),
(P1.ner, N0.pos),
(P2.ner, P1.ner, N0.pos),
(P2.pos, P1.pos, N0.sic),
(N0.sic, N1.pos, N2.pos)
)
"""
LOCAL = (
(N0.sic,),
(P1.sic,),
(N1.sic,),
(P2.sic,),
(N2.sic,),
(P3.sic,),
(N3.sic,),
(P4.sic,),
(N4.sic,),
(P1.sic, N0.sic,),
(N0.sic, N1.sic),
(N0.prefix,),
(N0.suffix,),
(P1.shape,),
(N0.shape,),
(N1.shape,),
(P1.shape, N0.shape,),
(N0.shape, P1.shape,),
(P1.shape, N0.shape, N1.shape),
(N2.shape,),
(P2.shape,),
(P3.shape,),
(N3.shape,),
(P4.shape,),
(N4.shape,),
(P2.norm, P1.norm, N0.norm),
(P1.norm, N0.norm, N1.norm),
(N0.norm, N1.norm, N2.norm)
)
BOOLS = (
(N0.is_title,),
)
HISTORY = (
(P1.ner,),
(P1.ner, N0.sic,),
(P2.ner,),
(P2.ner, P1.ner),
(P2.ner, P1.ner, N0.sic),
(P2.pos, P1.ner, N0.pos),
(P2.ner, P1.pos, N0.pos),
(P3.ner,),
(P4.ner,),
)
POS = (
(P4.pos,),
(P3.pos,),
(P2.pos,),
(P1.pos,),
(N0.pos,),
(N1.pos,),
(N2.pos,),
(N3.pos,),
(N4.pos,),
(P1.pos, N0.pos),
(N0.pos, N1.pos),
(P2.pos, P1.pos, N0.pos),
(P1.pos, N0.pos, N1.pos),
(N0.pos, N1.pos, N2.pos)
)
CLUSTERS = (
(P4.cluster,),
(P3.cluster,),
(P2.cluster,),
(P1.cluster,),
(N0.cluster,),
(N1.cluster,),
(N2.cluster,),
(N3.cluster,),
(N4.cluster,),
(P1.cluster, N0.cluster),
(N0.cluster, N1.cluster),
)
CLUSTER_POS = (
(P1.cluster, N0.pos),
(N0.pos, P1.cluster),
(N0.cluster, N1.pos),
(N0.pos, N1.cluster)
)
GAZ = (
(N0.in_males,),
(N0.in_females,),
(N0.in_surnames,),
(N0.in_places,),
(N0.in_games,),
(N0.in_celebs,),
(N0.in_names,),
(P1.in_males,),
(P1.in_females,),
(P1.in_surnames,),
(P1.in_places,),
(P1.in_games,),
(P1.in_celebs,),
(P1.in_names,),
(N1.in_males,),
(N1.in_females,),
(N1.in_surnames,),
(N1.in_places,),
(N1.in_games,),
(N1.in_celebs,),
(N1.in_names,),
)
TEMPLATES = LOCAL + HISTORY + CLUSTERS + POS + CLUSTER_POS + GAZ + BOOLS

View File

@ -1,12 +0,0 @@
from cymem.cymem cimport Pool
from .structs cimport State, Entity, Move
cdef int begin_entity(State* s, label) except -1
cdef int end_entity(State* s) except -1
cdef State* init_state(Pool mem, int sent_length) except NULL
cdef bint entity_is_open(State *s) except -1
cdef bint entity_is_sunk(State *s, Move* golds) except -1

View File

@ -1,44 +0,0 @@
from .bilou_moves cimport BEGIN, UNIT
cdef int begin_entity(State* s, label) except -1:
s.curr.start = s.i
s.curr.label = label
cdef int end_entity(State* s) except -1:
s.curr.end = s.i
s.ents[s.j] = s.curr
s.j += 1
s.curr.start = 0
s.curr.label = -1
s.curr.end = 0
cdef State* init_state(Pool mem, int sent_length) except NULL:
s = <State*>mem.alloc(1, sizeof(State))
s.j = 0
s.ents = <Entity*>mem.alloc(sent_length, sizeof(Entity))
for i in range(sent_length):
s.ents[i].label = -1
s.curr.label = -1
s.tags = <int*>mem.alloc(sent_length, sizeof(int))
s.length = sent_length
return s
cdef bint entity_is_open(State *s) except -1:
return s.curr.label != -1
cdef bint entity_is_sunk(State *s, Move* golds) except -1:
if not entity_is_open(s):
return False
cdef Move* gold = &golds[s.curr.start]
if gold.action != BEGIN and gold.action != UNIT:
return True
elif gold.label != s.curr.label:
return True
else:
return False

View File

@ -1,8 +0,0 @@
from cymem.cymem cimport Pool
cdef class NERAnnotation:
cdef Pool mem
cdef int* starts
cdef int* ends
cdef int* labels
cdef readonly list entities

View File

@ -1,94 +0,0 @@
from libc.string cimport memset
cdef class NERAnnotation:
def __init__(self, entities, length, entity_types):
self.mem = Pool()
self.starts = <int*>self.mem.alloc(length, sizeof(int))
self.ends = <int*>self.mem.alloc(length, sizeof(int))
self.labels = <int*>self.mem.alloc(length, sizeof(int))
self.entities = entities
memset(self.starts, -1, sizeof(int) * length)
memset(self.ends, -1, sizeof(int) * length)
memset(self.labels, -1, sizeof(int) * length)
cdef int start, end, label
for start, end, label in entities:
for i in range(start, end):
self.starts[i] = start
self.ends[i] = end
self.labels[i] = label
@classmethod
def from_bilous(cls, tag_strs, entity_types):
entities = []
start = None
for i, tag_str in enumerate(tag_strs):
if tag_str == 'O' or tag_str == '-':
continue
move, label_str = tag_str.split('-')
label = entity_types.index(label_str)
if label == -1:
label = len(entity_types)
entity_types.append(label)
if move == 'U':
assert start is None
entities.append((i, i+1, label))
elif move == 'B':
assert start is None
start = i
elif move == 'L':
assert start is not None
entities.append((start, i+1, label))
start = None
return cls(entities, len(tag_strs), entity_types)
def read_iob(file_, entity_types, create_tokens):
sent_strs = file_.read().strip().split('\n\n')
sents = []
for sent_str in sent_strs:
if sent_str.startswith('-DOCSTART-'):
continue
words = []
iob = []
for token_str in sent_str.split('\n'):
word, pos, chunk, ner = token_str.split()
words.append(word)
iob.append(ner)
bilou = iob_to_bilou(iob)
tokens = create_tokens(words)
sents.append((tokens, NERAnnotation.from_bilous(bilou, entity_types)))
return sents
def iob_to_bilou(tags):
out = []
curr_label = None
tags = list(tags)
while tags:
out.extend(_consume_os(tags))
out.extend(_consume_ent(tags))
return out
def _consume_os(tags):
while tags and tags[0] == 'O':
yield tags.pop(0)
def _consume_ent(tags):
if not tags:
return []
target = tags.pop(0).replace('B', 'I')
length = 1
while tags and tags[0] == target:
length += 1
tags.pop(0)
label = target[2:]
if length == 1:
return ['U-' + label]
else:
start = 'B-' + label
end = 'L-' + label
middle = ['I-%s' % label for _ in range(1, length - 1)]
return [start] + middle + [end]

View File

@ -1,27 +0,0 @@
from cymem.cymem cimport Pool
from thinc.typedefs cimport class_t
from thinc.typedefs cimport weight_t
from .structs cimport State, Move
cpdef enum ActionType:
MISSING
BEGIN
IN
LAST
UNIT
OUT
N_ACTIONS
cdef int set_accept_if_oracle(Move* moves, Move* golds, int n, State* s) except 0
cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
cdef int transition(State *s, Move* m) except -1
cdef int fill_moves(Move* moves, list tag_names) except -1

View File

@ -1,207 +0,0 @@
from __future__ import unicode_literals
from ._state cimport begin_entity
from ._state cimport end_entity
from ._state cimport entity_is_open
from ._state cimport entity_is_sunk
ACTION_NAMES = ['' for _ in range(N_ACTIONS)]
ACTION_NAMES[<int>MISSING] = '?'
ACTION_NAMES[<int>BEGIN] = 'B'
ACTION_NAMES[<int>IN] = 'I'
ACTION_NAMES[<int>LAST] = 'L'
ACTION_NAMES[<int>UNIT] = 'U'
ACTION_NAMES[<int>OUT] = 'O'
cdef bint can_begin(State* s, int label):
return not entity_is_open(s)
cdef bint can_in(State* s, int label):
return entity_is_open(s) and s.curr.label == label
cdef bint can_last(State* s, int label):
return entity_is_open(s) and s.curr.label == label
cdef bint can_unit(State* s, int label):
return not entity_is_open(s)
cdef bint can_out(State* s, int label):
return not entity_is_open(s)
cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag,
ActionType next_act, bint is_sunk):
if g_act == MISSING:
return True
if act == BEGIN:
if g_act == BEGIN:
# B, Gold B --> Label match
return tag == g_tag
else:
# B, Gold I --> False (P)
# B, Gold L --> False (P)
# B, Gold O --> False (P)
# B, Gold U --> False (P)
return False
elif act == IN:
if g_act == BEGIN:
# I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
return True
elif g_act == IN:
# I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
return True
elif g_act == LAST:
# I, Gold L --> True iff this entity sunk and next tag == O
return is_sunk and (next_act == OUT or next_act == MISSING)
elif g_act == OUT:
# I, Gold O --> True iff next tag == O
return next_act == OUT or next_act == MISSING
elif g_act == UNIT:
# I, Gold U --> True iff next tag == O
return next_act == OUT
elif act == LAST:
if g_act == BEGIN:
# L, Gold B --> True
return True
elif g_act == IN:
# L, Gold I --> True iff this entity sunk
return is_sunk
elif g_act == LAST:
# L, Gold L --> True
return True
elif g_act == OUT:
# L, Gold O --> True
return True
elif g_act == UNIT:
# L, Gold U --> True
return True
elif act == OUT:
if g_act == BEGIN:
# O, Gold B --> False
return False
elif g_act == IN:
# O, Gold I --> True
return True
elif g_act == LAST:
# O, Gold L --> True
return True
elif g_act == OUT:
# O, Gold O --> True
return True
elif g_act == UNIT:
# O, Gold U --> False
return False
elif act == UNIT:
if g_act == UNIT:
# U, Gold U --> True iff tag match
return tag == g_tag
else:
# U, Gold B --> False
# U, Gold I --> False
# U, Gold L --> False
# U, Gold O --> False
return False
cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0:
cdef int n_accept = 0
cdef Move* m
moves[0].accept = False
for i in range(1, n_classes):
m = &moves[i]
if m.action == BEGIN:
m.accept = can_begin(s, m.label)
elif m.action == IN:
m.accept = can_in(s, m.label)
elif m.action == LAST:
m.accept = can_last(s, m.label)
elif m.action == UNIT:
m.accept = can_unit(s, m.label)
elif m.action == OUT:
m.accept = can_out(s, m.label)
n_accept += m.accept
assert n_accept != 0
return n_accept
cdef int set_accept_if_oracle(Move* moves, Move* golds, int n_classes, State* s) except 0:
cdef Move* g = &golds[s.i]
cdef ActionType next_act = <ActionType>golds[s.i+1].action if s.i < s.length else OUT
cdef bint is_sunk = entity_is_sunk(s, golds)
cdef Move* m
cdef int n_accept = 0
set_accept_if_valid(moves, n_classes, s)
for i in range(1, n_classes):
m = &moves[i]
if not m.accept:
continue
m.accept = is_oracle(<ActionType>m.action, m.label, <ActionType>g.action,
g.label, next_act, is_sunk)
n_accept += m.accept
assert n_accept != 0
return n_accept
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
cdef int first_accept = -1
for first_accept in range(1, n):
if moves[first_accept].accept:
break
else:
raise StandardError
assert first_accept != -1
cdef int best = first_accept
cdef weight_t score = scores[first_accept-1]
cdef int i
for i in range(first_accept+1, n):
if moves[i].accept and scores[i-1] > score:
best = i
score = scores[i-1]
return &moves[best]
cdef int transition(State *s, Move* move) except -1:
if move.action == BEGIN:
begin_entity(s, move.label)
elif move.action == IN:
pass
elif move.action == LAST:
end_entity(s)
elif move.action == UNIT:
begin_entity(s, move.label)
end_entity(s)
elif move.action == OUT:
pass
s.tags[s.i] = move.clas
s.i += 1
def get_n_moves(n_tags):
return n_tags + n_tags + n_tags + n_tags + 1
cdef int fill_moves(Move* moves, list tag_names) except -1:
cdef Move* m
label_names = {'-': 0}
for i, tag_name in enumerate(tag_names):
m = &moves[i]
if '-' in tag_name:
action_str, label = tag_name.split('-')
elif tag_name == 'O':
action_str = 'O'
label = '-'
elif tag_name == 'NULL' or tag_name == 'EOL':
action_str = '?'
label = '-'
else:
raise StandardError(tag_name)
m.action = ACTION_NAMES.index(action_str)
m.label = label_names.setdefault(label, len(label_names))
m.clas = i

View File

@ -1,151 +0,0 @@
from thinc.typedefs cimport atom_t
from ..typedefs cimport hash_t
from ..tokens cimport Tokens
from ..lexeme cimport Lexeme
from .structs cimport State
cpdef enum:
T_sic
T_cluster
T_norm
T_shape
T_asciied
T_prefix
T_suffix
T_length
T_postype
T_nertype
T_sensetype
T_is_alpha
T_is_ascii
T_is_digit
T_is_lower
T_is_punct
T_is_space
T_is_title
T_is_upper
T_like_url
T_like_number
T_oft_lower
T_oft_title
T_oft_upper
T_in_males
T_in_females
T_in_surnames
T_in_places
T_in_celebs
T_in_names
T_pos
T_sense
T_ner
cpdef enum:
P2_sic
P2_cluster
P2_norm
P2_shape
P2_prefix
P2_suffix
P2_length
P2_postype
P2_is_alpha
P2_is_digit
P2_is_lower
P2_is_punct
P2_is_title
P2_is_upper
P2_like_number
P2_pos
P1_sic
P1_cluster
P1_norm
P1_shape
P1_prefix
P1_suffix
P1_length
P1_postype
P1_is_alpha
P1_is_digit
P1_is_lower
P1_is_punct
P1_is_title
P1_is_upper
P1_like_number
P1_pos
W_sic
W_cluster
W_norm
W_shape
W_prefix
W_suffix
W_length
W_postype
W_is_alpha
W_is_digit
W_is_lower
W_is_punct
W_is_space
W_is_title
W_is_upper
W_like_number
W_pos
N1_sic
N1_cluster
N1_norm
N1_shape
N1_prefix
N1_suffix
N1_length
N1_postype
N1_is_alpha
N1_is_ascii
N1_is_digit
N1_is_lower
N1_is_punct
N1_is_space
N1_is_title
N1_is_upper
N1_like_number
N1_pos
N2_sic
N2_cluster
N2_norm
N2_shape
N2_asciied
N2_prefix
N2_suffix
N2_length
N2_postype
N2_is_alpha
N2_is_digit
N2_is_lower
N2_is_punct
N2_is_space
N2_is_title
N2_is_upper
N2_like_number
N2_pos
N2_sense
E0_sic
E0_cluster
E0_pos
E1_sic
E1_cluster
E1_pos
E_last_sic
E_last_cluster
E_last_pos
N_FIELDS
cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1

View File

@ -1,76 +0,0 @@
from libc.string cimport memset
from murmurhash.mrmr cimport hash64
from ._state cimport entity_is_open
from ..lexeme cimport *
cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos):
c[T_sic] = lex.sic
c[T_cluster] = lex.cluster
c[T_norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
c[T_shape] = lex.shape
c[T_asciied] = lex.asciied
c[T_prefix] = lex.prefix
c[T_suffix] = lex.suffix
c[T_length] = lex.length
c[T_postype] = lex.postype
c[T_nertype] = 0
c[T_sensetype] = 0
c[T_is_alpha] = lex.flags & (1 << IS_ALPHA)
c[T_is_digit] = lex.flags & (1 << IS_DIGIT)
c[T_is_lower] = lex.flags & (1 << IS_LOWER)
c[T_is_punct] = lex.flags & (1 << IS_PUNCT)
c[T_is_space] = lex.flags & (1 << IS_SPACE)
c[T_is_title] = lex.flags & (1 << IS_TITLE)
c[T_is_upper] = lex.flags & (1 << IS_UPPER)
c[T_like_url] = lex.flags & (1 << LIKE_URL)
c[T_like_number] = lex.flags & (1 << LIKE_NUMBER)
c[T_oft_lower] = lex.flags & (1 << OFT_LOWER)
c[T_oft_title] = lex.flags & (1 << OFT_TITLE)
c[T_oft_upper] = lex.flags & (1 << OFT_UPPER)
c[T_in_males] = lex.flags & (1 << IN_MALES)
c[T_in_females] = lex.flags & (1 << IN_FEMALES)
c[T_in_surnames] = lex.flags & (1 << IN_SURNAMES)
c[T_in_places] = lex.flags & (1 << IN_PLACES)
c[T_in_celebs] = lex.flags & (1 << IN_CELEBS)
c[T_in_names] = lex.flags & (1 << IN_NAMES)
c[T_pos] = pos
c[T_sense] = 0
cdef int _fill_outer_token(atom_t* c, Lexeme* lex, atom_t pos):
c[0] = lex.sic
c[1] = lex.cluster
c[2] = lex.shape
c[3] = pos
cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1:
cdef int i
for i in range(N_FIELDS):
context[i] = 0
i = s.i
_fill_token(&context[P2_sic], tokens.lex[i-2], tokens.pos[i-2])
_fill_token(&context[P1_sic], tokens.lex[i-1], tokens.pos[i-1])
_fill_token(&context[W_sic], tokens.lex[i], tokens.pos[i])
_fill_token(&context[N1_sic], tokens.lex[i+1], tokens.pos[i+1])
_fill_token(&context[N2_sic], tokens.lex[i+2], tokens.pos[i+2])
cdef atom_t[5] ent_vals
if entity_is_open(s):
context[E0_sic] = tokens.lex[s.curr.start].sic
context[E0_cluster] = tokens.lex[s.curr.start].cluster
context[E0_pos] = tokens.pos[s.curr.start]
context[E_last_sic] = tokens.lex[s.i-1].sic
context[E_last_cluster] = tokens.lex[s.i-1].cluster
context[E_last_pos] = tokens.pos[s.i-1]
if (s.curr.start + 1) < s.i:
context[E1_sic] = tokens.lex[s.curr.start+1].sic
context[E1_cluster] = tokens.lex[s.curr.start+1].cluster
context[E1_pos] = tokens.pos[s.curr.start+1]
return 1


@ -1,99 +0,0 @@
from .context import *
LOCAL = (
(W_sic,),
(P1_sic,),
(N1_sic,),
(P2_sic,),
(N2_sic,),
(P1_sic, W_sic,),
(W_sic, N1_sic),
(W_prefix,),
(W_suffix,),
(P1_shape,),
(W_shape,),
(N1_shape,),
(P1_shape, W_shape,),
(W_shape, P1_shape,),
(P1_shape, W_shape, N1_shape),
(N2_shape,),
(P2_shape,),
(P2_norm, P1_norm, W_norm),
(P1_norm, W_norm, N1_norm),
(W_norm, N1_norm, N2_norm)
)
POS = (
(P2_pos,),
(P1_pos,),
(W_pos,),
(N1_pos,),
(N2_pos,),
(P1_pos, W_pos),
(W_pos, N1_pos),
(P2_pos, P1_pos, W_pos),
(P1_pos, W_pos, N1_pos),
(W_pos, N1_pos, N2_pos)
)
CLUSTERS = (
(P2_cluster,),
(P1_cluster,),
(W_cluster,),
(N1_cluster,),
(N2_cluster,),
(P1_cluster, W_cluster),
(W_cluster, N1_cluster),
)
CLUSTER_POS = (
(P1_cluster, W_pos),
(W_pos, P1_cluster),
(W_cluster, N1_pos),
(W_pos, N1_cluster)
)
STATE = (
(E0_sic,),
(E0_cluster,),
(E0_pos,),
(E_last_sic,),
(E_last_cluster,),
(E_last_pos,),
(E0_sic, W_sic),
(E0_cluster, W_cluster),
(E0_pos, W_pos),
(E_last_sic, W_sic),
(E_last_pos, W_pos),
(E0_pos, E_last_pos, W_pos),
(E0_cluster, E_last_cluster, W_cluster),
(E0_sic, E_last_sic),
(E0_pos, E_last_pos),
(E0_cluster, E_last_cluster),
(E0_pos, E_last_cluster),
(E0_cluster, E_last_pos),
(E1_sic,),
(E1_cluster,),
(E1_pos,),
(E0_sic, E1_sic),
(E0_sic, E1_pos,),
(E0_pos, E1_sic,),
(E0_pos, E1_pos),
)
TEMPLATES = LOCAL + CLUSTERS + POS + CLUSTER_POS + STATE


@ -1,29 +0,0 @@
from cymem.cymem cimport Pool
from thinc.features cimport Extractor
from thinc.learner cimport LinearModel
from thinc.typedefs cimport *
from ..tokens cimport Tokens
from ..typedefs cimport *
from .structs cimport Move
from .annot cimport NERAnnotation
cdef class NERParser:
cdef Pool mem
cdef Extractor extractor
cdef LinearModel model
cdef readonly list tag_names
cdef readonly list entity_types
cdef readonly int n_classes
cdef Move* _moves
cdef atom_t* _context
cdef feat_t* _feats
cdef weight_t* _values
cdef weight_t* _scores
cpdef list train(self, Tokens tokens, NERAnnotation annot)
cpdef list set_tags(self, Tokens tokens)


@ -1,139 +0,0 @@
from __future__ import division
from __future__ import unicode_literals
cimport cython
import random
import os
from os import path
import shutil
import json
from thinc.features cimport ConjFeat
from .context cimport fill_context
from .context cimport N_FIELDS
from .structs cimport Move, State
from .io_moves cimport fill_moves, transition, best_accepted
from .io_moves cimport set_accept_if_valid, set_accept_if_oracle
from .io_moves import get_n_moves
from ._state cimport init_state
from ._state cimport entity_is_open
from ._state cimport end_entity
from .annot cimport NERAnnotation
def setup_model_dir(entity_types, templates, model_dir):
if path.exists(model_dir):
shutil.rmtree(model_dir)
os.mkdir(model_dir)
config = {
'templates': templates,
'entity_types': entity_types,
}
with open(path.join(model_dir, 'config.json'), 'w') as file_:
json.dump(config, file_)
def train(train_sents, model_dir, nr_iter=10):
cdef Tokens tokens
cdef NERAnnotation gold_ner
parser = NERParser(model_dir)
for _ in range(nr_iter):
tp = 0
fp = 0
fn = 0
for i, (tokens, gold_ner) in enumerate(train_sents):
#print [tokens[i].string for i in range(tokens.length)]
test_ents = set(parser.train(tokens, gold_ner))
#print 'Test', test_ents
gold_ents = set(gold_ner.entities)
#print 'Gold', set(gold_ner.entities)
tp += len(gold_ents.intersection(test_ents))
fp += len(test_ents - gold_ents)
fn += len(gold_ents - test_ents)
p = tp / (tp + fp)
r = tp / (tp + fn)
f = 2 * ((p * r) / (p + r))
print 'P: %.3f' % p,
print 'R: %.3f' % r,
print 'F: %.3f' % f
random.shuffle(train_sents)
parser.model.end_training()
parser.model.dump(path.join(model_dir, 'model'))
cdef class NERParser:
def __init__(self, model_dir):
self.mem = Pool()
cfg = json.load(open(path.join(model_dir, 'config.json')))
templates = cfg['templates']
self.extractor = Extractor(templates, [ConjFeat] * len(templates))
self.entity_types = cfg['entity_types']
self.n_classes = get_n_moves(len(self.entity_types))
self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
fill_moves(self._moves, self.n_classes, self.entity_types)
self.model = LinearModel(self.n_classes)
if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model'))
self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
cpdef list train(self, Tokens tokens, NERAnnotation annot):
cdef Pool mem = Pool()
cdef State* s = init_state(mem, tokens.length)
cdef Move* guess
cdef Move* oracle_move
n_correct = 0
cdef int f = 0
while s.i < tokens.length:
fill_context(self._context, s, tokens)
self.extractor.extract(self._feats, self._values, self._context, NULL)
self.model.score(self._scores, self._feats, self._values)
set_accept_if_valid(self._moves, self.n_classes, s)
guess = best_accepted(self._moves, self._scores, self.n_classes)
assert guess.clas != 0
set_accept_if_oracle(self._moves, self.n_classes, s,
annot.starts, annot.ends, annot.labels)
oracle_move = best_accepted(self._moves, self._scores, self.n_classes)
assert oracle_move.clas != 0
if guess.clas == oracle_move.clas:
counts = {}
n_correct += 1
else:
counts = {guess.clas: {}, oracle_move.clas: {}}
self.extractor.count(counts[oracle_move.clas], self._feats, 1)
self.extractor.count(counts[guess.clas], self._feats, -1)
self.model.update(counts)
transition(s, guess)
tokens.ner[s.i-1] = s.tags[s.i-1]
if entity_is_open(s):
s.curr.label = annot.labels[s.curr.start]
end_entity(s)
entities = []
for i in range(s.j):
entities.append((s.ents[i].start, s.ents[i].end, s.ents[i].label))
return entities
cpdef list set_tags(self, Tokens tokens):
cdef Pool mem = Pool()
cdef State* s = init_state(mem, tokens.length)
cdef Move* move
while s.i < tokens.length:
fill_context(self._context, s, tokens)
self.extractor.extract(self._feats, self._values, self._context, NULL)
self.model.score(self._scores, self._feats, self._values)
set_accept_if_valid(self._moves, self.n_classes, s)
move = best_accepted(self._moves, self._scores, self.n_classes)
transition(s, move)
tokens.ner[s.i-1] = s.tags[s.i-1]
if entity_is_open(s):
s.curr.label = move.label
end_entity(s)
entities = []
for i in range(s.j):
entities.append((s.ents[i].start, s.ents[i].end, s.ents[i].label))
return entities


@ -1,26 +0,0 @@
from cymem.cymem cimport Pool
from thinc.typedefs cimport class_t
from thinc.typedefs cimport weight_t
from .structs cimport State, Move
cpdef enum ActionType:
MISSING
SHIFT
REDUCE
OUT
N_ACTIONS
cdef int set_accept_if_oracle(Move* moves, int n, State* s,
int* g_starts, int* g_ends, int* g_labels) except 0
cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
cdef int transition(State *s, Move* m) except -1
cdef int fill_moves(Move* moves, int n, list entity_types) except -1


@ -1,152 +0,0 @@
from __future__ import unicode_literals
from cymem.cymem cimport Pool
from thinc.typedefs cimport class_t
from thinc.typedefs cimport weight_t
from ._state cimport begin_entity
from ._state cimport end_entity
from ._state cimport entity_is_open
ACTION_NAMES = ['' for _ in range(N_ACTIONS)]
ACTION_NAMES[<int>MISSING] = '?'
ACTION_NAMES[<int>SHIFT] = 'S'
ACTION_NAMES[<int>REDUCE] = 'R'
ACTION_NAMES[<int>OUT] = 'O'
cdef int set_accept_if_oracle(Move* moves, int n, State* s,
int* g_starts, int* g_ends, int* g_labels) except 0:
# If curr entity: (O invalid)
# if cost is not sunk (start matches, end is i-1 or greater)
# - If i-1 == gold.end --> R=True, S=False
# - Shift if end >= i --> S=True, R=False
# else
# - If i == gold.start --> R=True, S=False
# - Else --> R=True, S=True
# Else (R invalid):
# if start == gold.start: S=True, O=False
# else: O=True, S=False
if entity_is_open(s):
g_start = g_starts[s.curr.start]
g_end = g_ends[s.curr.start]
accept_o = False
if g_start == s.curr.start and g_end == s.i:
accept_r = True
accept_s = False
elif g_start == s.curr.start and g_end > s.i:
accept_s = True
s_label = s.curr.label
accept_r = False
elif g_starts[s.i] == s.i:
accept_r = True
accept_s = False
else:
accept_r = True
accept_s = True
s_label = s.curr.label
else:
accept_r = False
if g_starts[s.i] == s.i:
accept_s = True
s_label = g_labels[s.i]
accept_o = False
else:
accept_o = True
accept_s = False
n_accept = 0
moves[0].accept = False
for i in range(1, n):
m = &moves[i]
if m.action == SHIFT:
m.accept = accept_s and m.label == s_label
elif m.action == REDUCE:
m.accept = accept_r
elif m.action == OUT:
m.accept = accept_o
n_accept += m.accept
assert n_accept != 0
return n_accept
cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0:
cdef int i
cdef bint open_ent = entity_is_open(s)
cdef int n_accept = 0
moves[0].accept = False
for i in range(1, n):
if moves[i].action == SHIFT:
moves[i].accept = moves[i].label == s.curr.label or not entity_is_open(s)
elif moves[i].action == REDUCE:
moves[i].accept = open_ent
elif moves[i].action == OUT:
moves[i].accept = not open_ent
n_accept += moves[i].accept
return n_accept
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
cdef int first_accept = -1
for first_accept in range(1, n):
if moves[first_accept].accept:
break
else:
raise StandardError
assert first_accept != -1
cdef int best = first_accept
cdef weight_t score = scores[first_accept-1]
cdef int i
for i in range(first_accept+1, n):
if moves[i].accept and scores[i-1] > score:
best = i
score = scores[i-1]
return &moves[best]
cdef int transition(State *s, Move* move) except -1:
s.tags[s.i] = move.clas
if move.action == OUT:
s.i += 1
elif move.action == SHIFT:
if not entity_is_open(s):
s.curr.start = s.i
s.curr.label = move.label
s.i += 1
elif move.action == REDUCE:
s.curr.end = s.i
s.ents[s.j] = s.curr
s.j += 1
s.curr.start = 0
s.curr.label = -1
s.curr.end = 0
else:
raise ValueError(move.action)
def get_n_moves(n_tags):
return 1 + 1 + 1 + n_tags
cdef int fill_moves(Move* moves, int n, list entity_types) except -1:
cdef Move* m
label_names = {'-': 0}
# Reserve class 0
cdef int i = 0
moves[i].clas = i
moves[i].action = MISSING
moves[i].label = 0
i += 1
for entity_type in entity_types:
moves[i].action = SHIFT
moves[i].label = label_names.setdefault(entity_type, len(label_names))
moves[i].clas = i
i += 1
moves[i].clas = i
moves[i].action = OUT
moves[i].label = 0
i += 1
moves[i].action = REDUCE
moves[i].clas = i
moves[i].label = 0
i += 1


@ -1,16 +0,0 @@
from cymem.cymem cimport Pool
from .structs cimport Move, State
cdef class PyState:
cdef Pool mem
cdef readonly list tag_names
cdef readonly int n_classes
cdef readonly dict moves_by_name
cdef Move* _moves
cdef Move* _golds
cdef State* _s
cdef Move* _get_move(self, unicode move_name) except NULL


@ -1,60 +0,0 @@
from __future__ import unicode_literals
from ._state cimport init_state
from ._state cimport entity_is_open
from .bilou_moves cimport fill_moves
from .bilou_moves cimport transition
from .bilou_moves cimport set_accept_if_valid, set_accept_if_oracle
from .bilou_moves import get_n_moves
from .bilou_moves import ACTION_NAMES
cdef class PyState:
def __init__(self, tag_names, n_tokens):
self.mem = Pool()
self.tag_names = tag_names
self.n_classes = len(tag_names)
assert self.n_classes != 0
self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
fill_moves(self._moves, tag_names)
self._s = init_state(self.mem, n_tokens)
self._golds = <Move*>self.mem.alloc(n_tokens, sizeof(Move))
cdef Move* _get_move(self, unicode move_name) except NULL:
return &self._moves[self.tag_names.index(move_name)]
def set_golds(self, list gold_names):
cdef Move* m
for i, name in enumerate(gold_names):
m = self._get_move(name)
self._golds[i] = m[0]
def transition(self, unicode move_name):
cdef Move* m = self._get_move(move_name)
transition(self._s, m)
def is_valid(self, unicode move_name):
cdef Move* m = self._get_move(move_name)
set_accept_if_valid(self._moves, self.n_classes, self._s)
return m.accept
def is_gold(self, unicode move_name):
cdef Move* m = self._get_move(move_name)
set_accept_if_oracle(self._moves, self._golds, self.n_classes, self._s)
return m.accept
property ent:
def __get__(self):
return self._s.curr
property n_ents:
def __get__(self):
return self._s.j
property i:
def __get__(self):
return self._s.i
property open_entity:
def __get__(self):
return entity_is_open(self._s)


@ -1,23 +0,0 @@
from thinc.typedefs cimport class_t
cdef struct Entity:
int start
int end
int label
cdef struct State:
Entity curr
Entity* ents
int* tags
int i
int j
int length
cdef struct Move:
class_t clas
int action
int label
bint accept


@ -112,6 +112,8 @@ cpdef bint like_number(unicode string):
cpdef unicode word_shape(unicode string):
if len(string) >= 100:
return 'LONG'
length = len(string)
shape = []
last = ""


@ -1,243 +0,0 @@
from libcpp.vector cimport vector
from libc.stdint cimport uint32_t
from libc.stdint cimport int64_t
from libc.stdint cimport int32_t
from libc.stdint cimport uint64_t
from preshed.maps cimport PreshMap
from murmurhash.mrmr cimport hash64
import numpy
cimport cython
ctypedef unsigned char uchar
# Format
# - Total number of bytes in message (32 bit int)
# - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
# - Spaces ~1 bit per word
# - Parse: Huffman coded head offset / dep label / POS tag / entity IOB tag
# combo. ? bits per word. 40 * 80 * 40 * 12 = 1.5m symbol vocab
# Note that we're setting the most significant bits here first, when in practice
# we actually want the last bit to be most significant (for Huffman coding,
# anyway).
cdef Code bit_append(Code code, bint bit) nogil:
cdef uint64_t one = 1
if bit:
code.bits |= one << code.length
else:
code.bits &= ~(one << code.length)
code.length += 1
return code
cdef class BitArray:
cdef bytes data
cdef unsigned char byte
cdef unsigned char bit_of_byte
cdef uint32_t i
def __init__(self):
self.data = b''
self.byte = 0
self.bit_of_byte = 0
self.i = 0
def __iter__(self):
cdef uchar byte, i
cdef uchar one = 1
start_byte = self.i // 8
if (self.i % 8) != 0:
for i in range(self.i % 8):
yield 1 if (self.data[start_byte] & (one << i)) else 0
start_byte += 1
for byte in self.data[start_byte:]:
for i in range(8):
yield 1 if byte & (one << i) else 0
for i in range(self.bit_of_byte):
yield 1 if self.byte & (one << i) else 0
def as_bytes(self):
if self.bit_of_byte != 0:
return self.data + chr(self.byte)
else:
return self.data
def append(self, bint bit):
cdef uint64_t one = 1
print 'append', bit
if bit:
self.byte |= one << self.bit_of_byte
else:
self.byte &= ~(one << self.bit_of_byte)
self.bit_of_byte += 1
if self.bit_of_byte == 8:
self.data += chr(self.byte)
self.byte = 0
self.bit_of_byte = 0
cdef int extend(self, uint64_t code, char n_bits) except -1:
cdef uint64_t one = 1
cdef unsigned char bit_of_code
for bit_of_code in range(n_bits):
if code & (one << bit_of_code):
self.byte |= one << self.bit_of_byte
else:
self.byte &= ~(one << self.bit_of_byte)
self.bit_of_byte += 1
if self.bit_of_byte == 8:
self.data += chr(self.byte)
self.byte = 0
self.bit_of_byte = 0
cdef class HuffmanCodec:
"""Create a Huffman code table, and use it to pack and unpack sequences into
byte strings. Emphasis is on efficiency, so the API is quite strict:
Messages will be encoded/decoded as indices that refer to the probability sequence.
For instance, the sequence [5, 10, 8] indicates the 5th most frequent item,
the 10th most frequent item, the 8th most frequent item. The codec will add
the EOL symbol to your message. An exception will be raised if you include
the EOL symbol in your message.
Arguments:
probs (float[:]): A descending-sorted sequence of probabilities/weights.
Must include a weight for an EOL symbol.
eol (uint32_t): The index of the weight of the EOL symbol.
"""
def __init__(self, float[:] probs, uint32_t eol):
self.eol = eol
self.codes.resize(len(probs))
for i in range(len(self.codes)):
self.codes[i].bits = 0
self.codes[i].length = 0
populate_nodes(self.nodes, probs)
cdef Code path
path.bits = 0
path.length = 0
assign_codes(self.nodes, self.codes, len(self.nodes) - 1, path)
def encode(self, uint32_t[:] sequence, BitArray bits=None):
if bits is None:
bits = BitArray()
for i in sequence:
bits.extend(self.codes[i].bits, self.codes[i].length)
bits.extend(self.codes[self.eol].bits, self.codes[self.eol].length)
return bits
def decode(self, bits):
node = self.nodes.back()
symbols = []
for bit in bits:
branch = node.right if bit else node.left
if branch >= 0:
node = self.nodes.at(branch)
else:
symbol = -(branch + 1)
if symbol == self.eol:
return symbols
else:
symbols.append(symbol)
node = self.nodes.back()
return symbols
property strings:
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def __get__(self):
output = []
cdef int i, j
cdef bytes string
cdef Code code
for i in range(self.codes.size()):
code = self.codes[i]
string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
string = string[::-1]
output.append(string)
return output
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef int populate_nodes(vector[Node]& nodes, float[:] probs) except -1:
assert len(probs) >= 3
cdef int size = len(probs)
cdef int i = size - 1
cdef int j = 0
while i >= 0 or (j+1) < nodes.size():
if i < 0:
_cover_two_nodes(nodes, j)
j += 2
elif j >= nodes.size():
_cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
i -= 2
elif i >= 1 and (j == nodes.size() or probs[i-1] < nodes[j].prob):
_cover_two_words(nodes, i, i-1, probs[i] + probs[i-1])
i -= 2
elif (j+1) < nodes.size() and nodes[j+1].prob < probs[i]:
_cover_two_nodes(nodes, j)
j += 2
else:
_cover_one_word_one_node(nodes, j, i, probs[i])
i -= 1
j += 1
return 0
cdef int _cover_two_nodes(vector[Node]& nodes, int j) nogil:
cdef Node node
node.left = j
node.right = j+1
node.prob = nodes[j].prob + nodes[j+1].prob
nodes.push_back(node)
cdef int _cover_one_word_one_node(vector[Node]& nodes, int j, int id_, float prob) nogil:
cdef Node node
# Encode leaves as negative integers, where the integer is the index of the
# word in the vocabulary.
cdef int64_t leaf_id = - <int64_t>(id_ + 1)
cdef float new_prob = prob + nodes[j].prob
if prob < nodes[j].prob:
node.left = leaf_id
node.right = j
node.prob = new_prob
else:
node.left = j
node.right = leaf_id
node.prob = new_prob
nodes.push_back(node)
cdef int _cover_two_words(vector[Node]& nodes, int id1, int id2, float prob) nogil:
cdef Node node
node.left = -(id1+1)
node.right = -(id2+1)
node.prob = prob
nodes.push_back(node)
cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1:
cdef Code left_path = bit_append(path, 0)
cdef Code right_path = bit_append(path, 1)
# Assign down left branch
if nodes[i].left >= 0:
assign_codes(nodes, codes, nodes[i].left, left_path)
else:
# Leaf on left
id_ = -(nodes[i].left + 1)
codes[id_] = left_path
# Assign down right branch
if nodes[i].right >= 0:
assign_codes(nodes, codes, nodes[i].right, right_path)
else:
# Leaf on right
id_ = -(nodes[i].right + 1)
codes[id_] = right_path

23
spacy/serialize/bits.pxd Normal file

@ -0,0 +1,23 @@
from libc.stdint cimport uint64_t
from libc.stdint cimport uint32_t
ctypedef unsigned char uchar
cdef struct Code:
uint64_t bits
char length
cdef Code bit_append(Code code, bint bit) nogil
cdef class BitArray:
cdef bytes data
cdef uchar byte
cdef uchar bit_of_byte
cdef uint32_t i
cdef int extend(self, uint64_t code, char n_bits) except -1
cdef uint32_t read32(self) except 0

112
spacy/serialize/bits.pyx Normal file

@ -0,0 +1,112 @@
from libc.string cimport memcpy
# Note that we're setting the most significant bits here first, when in practice
# we actually want the last bit to be most significant (for Huffman coding,
# anyway).
cdef Code bit_append(Code code, bint bit) nogil:
cdef uint64_t one = 1
if bit:
code.bits |= one << code.length
else:
code.bits &= ~(one << code.length)
code.length += 1
return code
cdef class BitArray:
def __init__(self, data=b''):
self.data = data
self.byte = 0
self.bit_of_byte = 0
self.i = 0
def __len__(self):
return 8 * len(self.data) + self.bit_of_byte
def __str__(self):
cdef uchar byte, i
cdef uchar one = 1
string = b''
for i in range(len(self.data)):
byte = ord(self.data[i])
for j in range(8):
string += b'1' if (byte & (one << j)) else b'0'
for i in range(self.bit_of_byte):
string += b'1' if (byte & (one << i)) else b'0'
return string
def seek(self, i):
self.i = i
def __iter__(self):
cdef uchar byte, i
cdef uchar one = 1
start_byte = self.i // 8
start_bit = self.i % 8
if start_bit != 0 and start_byte < len(self.data):
byte = ord(self.data[start_byte])
for i in range(start_bit, 8):
self.i += 1
yield 1 if (byte & (one << i)) else 0
start_byte += 1
start_bit = 0
for byte in self.data[start_byte:]:
for i in range(8):
self.i += 1
yield 1 if byte & (one << i) else 0
if self.bit_of_byte != 0:
byte = self.byte
for i in range(start_bit, self.bit_of_byte):
self.i += 1
yield 1 if self.byte & (one << i) else 0
cdef uint32_t read32(self) except 0:
cdef int start_byte = self.i // 8
# TODO portability
cdef uchar[4] chars
chars[0] = <uchar>ord(self.data[start_byte])
chars[1] = <uchar>ord(self.data[start_byte+1])
chars[2] = <uchar>ord(self.data[start_byte+2])
chars[3] = <uchar>ord(self.data[start_byte+3])
cdef uint32_t output
memcpy(&output, chars, 4)
self.i += 32
return output
def as_bytes(self):
if self.bit_of_byte != 0:
return self.data + chr(self.byte)
else:
return self.data
def append(self, bint bit):
cdef uint64_t one = 1
if bit:
self.byte |= one << self.bit_of_byte
else:
self.byte &= ~(one << self.bit_of_byte)
self.bit_of_byte += 1
self.i += 1
if self.bit_of_byte == 8:
self.data += chr(self.byte)
self.byte = 0
self.bit_of_byte = 0
cdef int extend(self, uint64_t code, char n_bits) except -1:
cdef uint64_t one = 1
cdef unsigned char bit_of_code
for bit_of_code in range(n_bits):
if code & (one << bit_of_code):
self.byte |= one << self.bit_of_byte
else:
self.byte &= ~(one << self.bit_of_byte)
self.bit_of_byte += 1
if self.bit_of_byte == 8:
self.data += chr(self.byte)
self.byte = 0
self.bit_of_byte = 0
self.i += 1
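
A minimal usage sketch of the BitArray API added above (illustrative only, not part of this commit; it assumes the extension builds and imports as spacy.serialize.bits):

# Append bits one at a time (extend() and read32() are cdef-only, so from
# Python we stick to append), then re-wrap the byte string and iterate over
# it to read the bits back, least-significant bit of each byte first.
from spacy.serialize.bits import BitArray

bits = BitArray()
for bit in (1, 1, 0, 1):
    bits.append(bit)
byte_string = bits.as_bytes()

reader = BitArray(byte_string)
reader.seek(0)
first_bits = []
for bit in reader:
    first_bits.append(bit)
    if len(first_bits) == 4:
        break
assert first_bits == [1, 1, 0, 1]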


@ -4,20 +4,21 @@ from libc.stdint cimport int64_t
from libc.stdint cimport int32_t
from libc.stdint cimport uint64_t
from .bits cimport BitArray, Code
cdef struct Node:
float prob
int32_t left
int32_t right
cdef struct Code:
uint64_t bits
char length
cdef class HuffmanCodec:
cdef vector[Node] nodes
cdef vector[Code] codes
cdef uint32_t eol
cdef Node root
cdef readonly list leaves
cdef readonly dict _map
cpdef int encode_int32(self, int32_t[:] msg, BitArray bits) except -1
cpdef int decode_int32(self, BitArray bits, int32_t[:] msg) except -1

173
spacy/serialize/huffman.pyx Normal file

@ -0,0 +1,173 @@
# cython: profile=True
cimport cython
from libcpp.queue cimport priority_queue
from libcpp.pair cimport pair
import numpy
from ..typedefs cimport attr_t
from .bits cimport bit_append
from .bits cimport BitArray
cdef class HuffmanCodec:
def __init__(self, freqs):
cdef float count
cdef Code code
cdef pair[float, int] item
cdef pair[float, int] item1
cdef pair[float, int] item2
cdef priority_queue[pair[float, int]] queue
cdef int i = 0
self._map = {}
self.leaves = []
for word, weight in freqs:
item.first = -weight
item.second = -(i+1)
queue.push(item)
self.leaves.append(word)
code.bits = 0
code.length = 0
self.codes.push_back(code)
self._map[word] = i
i += 1
cdef Node node
while queue.size() >= 2:
item1 = queue.top(); queue.pop()
item2 = queue.top(); queue.pop()
node = Node(left=item1.second, right=item2.second)
self.nodes.push_back(node)
item.first = item1.first + item2.first
item.second = self.nodes.size()-1
queue.push(item)
item = queue.top()
self.root = self.nodes[item.second]
cdef Code path
path.bits = 0
path.length = 0
assign_codes(self.nodes, self.codes, item.second, path)
def encode(self, msg, BitArray bits=None):
if bits is None:
bits = BitArray()
cdef int i
for word in msg:
i = self._map[word]
bits.extend(self.codes[i].bits, self.codes[i].length)
return bits
cpdef int encode_int32(self, int32_t[:] msg, BitArray bits) except -1:
cdef int msg_i
cdef int leaf_i
cdef int length = 0
for msg_i in range(msg.shape[0]):
leaf_i = self._map.get(msg[msg_i], -1)
if leaf_i is -1:
return 0
code = self.codes[leaf_i]
bits.extend(code.bits, code.length)
length += code.length
return length
def n_bits(self, msg, overhead=0):
cdef int i
length = 0
for word in msg:
if word not in self._map:
return numpy.nan
i = self._map[word]
length += self.codes[i].length
return length + overhead * len(msg)
def decode(self, bits, msg):
node = self.root
cdef int i = 0
cdef int n = len(msg)
cdef int branch
cdef bint bit
for bit in bits:
branch = node.right if bit else node.left
if branch >= 0:
node = self.nodes.at(branch)
else:
msg[i] = self.leaves[-(branch + 1)]
node = self.nodes.back()
i += 1
if i == n:
break
else:
raise Exception("Buffer exhausted at %d/%d symbols read." % (i, len(msg)))
@cython.boundscheck(False)
cpdef int decode_int32(self, BitArray bits, int32_t[:] msg) except -1:
assert bits.i % 8 == 0
cdef Node node = self.root
cdef int branch
cdef int n_msg = msg.shape[0]
cdef bytes bytes_ = bits.as_bytes()
cdef unsigned char byte
cdef int i_msg = 0
cdef int i_byte = bits.i // 8
cdef unsigned char i_bit = 0
cdef unsigned char one = 1
while i_msg < n_msg:
byte = ord(bytes_[i_byte])
i_byte += 1
for i_bit in range(8):
branch = node.right if (byte & (one << i_bit)) else node.left
bits.i += 1
if branch >= 0:
node = self.nodes.at(branch)
else:
msg[i_msg] = self.leaves[-(branch + 1)]
i_msg += 1
if i_msg == n_msg:
break
node = self.root
property strings:
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def __get__(self):
output = []
cdef int i, j
cdef bytes string
cdef Code code
for i in range(self.codes.size()):
code = self.codes[i]
string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
string = string[::-1]
output.append(string)
return output
cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1:
"""Recursively assign paths, from the top down. At the end, the entry codes[i]
knows the bit-address of the node[j] that points to entry i in the vocabulary.
So, to encode i, we go to codes[i] and read its bit-string. To decode, we
navigate nodes recursively.
"""
cdef Code left_path = bit_append(path, 0)
cdef Code right_path = bit_append(path, 1)
# Assign down left branch
if nodes[i].left >= 0:
assign_codes(nodes, codes, nodes[i].left, left_path)
else:
# Leaf on left
id_ = -(nodes[i].left + 1)
codes[id_] = left_path
# Assign down right branch
if nodes[i].right >= 0:
assign_codes(nodes, codes, nodes[i].right, right_path)
else:
# Leaf on right
id_ = -(nodes[i].right + 1)
codes[id_] = right_path
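
A small round-trip sketch for the HuffmanCodec above (illustrative only; the toy frequency table is made up, and the module path spacy.serialize.huffman is assumed):

# Build a codec from (symbol, weight) pairs, encode a message, then decode it
# into a pre-sized buffer. Note the seek(0): encoding advances the BitArray
# cursor, and decoding iterates from the current position.
from spacy.serialize.huffman import HuffmanCodec

freqs = [('the', 10.0), ('cat', 5.0), ('sat', 4.0), ('on', 3.0), ('mat', 2.0)]
codec = HuffmanCodec(freqs)

msg = ['the', 'cat', 'sat']
bits = codec.encode(msg)
bits.seek(0)

decoded = [None] * len(msg)
codec.decode(bits, decoded)
assert decoded == msg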


@ -0,0 +1,9 @@
from ..vocab cimport Vocab
cdef class Packer:
cdef readonly tuple attrs
cdef readonly tuple _codecs
cdef readonly object orth_codec
cdef readonly object char_codec
cdef readonly Vocab vocab

195
spacy/serialize/packer.pyx Normal file

@ -0,0 +1,195 @@
# cython: profile=True
from __future__ import unicode_literals
from libc.stdint cimport uint32_t, int32_t
from libc.stdint cimport uint64_t
from libc.math cimport exp as c_exp
from libcpp.queue cimport priority_queue
from libcpp.pair cimport pair
from cymem.cymem cimport Address, Pool
from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter
import json
from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..structs cimport LexemeC
from ..typedefs cimport attr_t
from .bits cimport BitArray
from .huffman cimport HuffmanCodec
from os import path
import numpy
from .. import util
cimport cython
# Format
# - Total number of bytes in message (32 bit int) --- handled outside this
# - Number of words (32 bit int)
# - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
# - Spaces 1 bit per word
# - Attributes:
# POS tag
# Head offset
# Dep label
# Entity IOB
# Entity tag
cdef class _BinaryCodec:
def encode(self, attr_t[:] msg, BitArray bits):
cdef int i
for i in range(len(msg)):
bits.append(msg[i])
def decode(self, BitArray bits, attr_t[:] msg):
cdef int i = 0
for bit in bits:
msg[i] = bit
i += 1
if i == len(msg):
break
def _gen_orths(Vocab vocab):
cdef attr_t orth
cdef size_t addr
for orth, addr in vocab._by_orth.items():
lex = <LexemeC*>addr
yield orth, c_exp(lex.prob)
def _gen_chars(Vocab vocab):
cdef attr_t orth
cdef size_t addr
char_weights = {chr(i): 1e-20 for i in range(256)}
cdef unicode string
cdef bytes char
cdef bytes utf8_str
for orth, addr in vocab._by_orth.items():
lex = <LexemeC*>addr
string = vocab.strings[lex.orth]
utf8_str = string.encode('utf8')
for char in utf8_str:
char_weights.setdefault(char, 0.0)
char_weights[char] += c_exp(lex.prob)
char_weights[b' '] += c_exp(lex.prob)
return char_weights.items()
cdef class Packer:
def __init__(self, Vocab vocab, attr_freqs, char_freqs=None):
if char_freqs is None:
char_freqs = _gen_chars(vocab)
self.vocab = vocab
self.orth_codec = HuffmanCodec(_gen_orths(vocab))
self.char_codec = HuffmanCodec(char_freqs)
codecs = []
attrs = []
for attr, freqs in sorted(attr_freqs):
if attr in (ORTH, ID, SPACY):
continue
codecs.append(HuffmanCodec(freqs))
attrs.append(attr)
self._codecs = tuple(codecs)
self.attrs = tuple(attrs)
def pack(self, Doc doc):
bits = self._orth_encode(doc)
if bits is None:
bits = self._char_encode(doc)
cdef int i
if self.attrs:
array = doc.to_array(self.attrs)
for i, codec in enumerate(self._codecs):
codec.encode(array[:, i], bits)
return bits.as_bytes()
def unpack(self, bytes data):
doc = Doc(self.vocab)
self.unpack_into(data, doc)
return doc
def unpack_into(self, bytes byte_string, Doc doc):
bits = BitArray(byte_string)
bits.seek(0)
cdef int32_t length = bits.read32()
if length >= 0:
self._orth_decode(bits, length, doc)
else:
self._char_decode(bits, -length, doc)
array = numpy.zeros(shape=(len(doc), len(self._codecs)), dtype=numpy.int32)
for i, codec in enumerate(self._codecs):
codec.decode(bits, array[:, i])
doc.from_array(self.attrs, array)
return doc
def _orth_encode(self, Doc doc):
cdef BitArray bits = BitArray()
cdef int32_t length = len(doc)
bits.extend(length, 32)
orths = doc.to_array([ORTH])
n_bits = self.orth_codec.encode_int32(orths[:, 0], bits)
if n_bits == 0:
return None
for token in doc:
bits.append(bool(token.whitespace_))
return bits
def _char_encode(self, Doc doc):
cdef bytes utf8_str = doc.string.encode('utf8')
cdef BitArray bits = BitArray()
cdef int32_t length = len(utf8_str)
# Signal chars with negative length
bits.extend(-length, 32)
self.char_codec.encode(utf8_str, bits)
cdef int i, j
for i in range(doc.length):
for j in range(doc.data[i].lex.length-1):
bits.append(False)
bits.append(True)
if doc.data[i].spacy:
bits.append(False)
return bits
def _orth_decode(self, BitArray bits, int32_t n, Doc doc):
cdef attr_t[:] orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
self.orth_codec.decode_int32(bits, orths)
cdef int i
cdef bint space
spaces = iter(bits)
for i in range(n):
orth = orths[i]
space = spaces.next()
lex = self.vocab.get_by_orth(doc.mem, orth)
doc.push_back(lex, space)
return doc
def _char_decode(self, BitArray bits, int32_t n, Doc doc):
cdef bytearray utf8_str = bytearray(n)
self.char_codec.decode(bits, utf8_str)
cdef unicode string = utf8_str.decode('utf8')
cdef int start = 0
cdef bint is_spacy
cdef int length = len(string)
cdef int i = 0
cdef bint is_end_token
for is_end_token in bits:
if is_end_token:
span = string[start:i+1]
lex = self.vocab.get(doc.mem, span)
is_spacy = (i+1) < length and string[i+1] == u' '
doc.push_back(lex, is_spacy)
start = i + 1 + is_spacy
i += 1
if i >= n:
break
return doc
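
A hedged end-to-end sketch of the Packer (not part of this commit; it assumes the English model data is installed, and it passes an empty attr_freqs list so only word identities and whitespace bits are coded, with no tag/head/dep/NER codecs):

# Round-trip a Doc through pack()/unpack(). With no attribute codecs, pack()
# uses the orth Huffman code (or the character code as a fallback) plus one
# whitespace bit per token.
from spacy.en import English
from spacy.serialize.packer import Packer

nlp = English()
doc = nlp(u'This is a short test sentence.')

packer = Packer(nlp.vocab, [])
byte_string = packer.pack(doc)
doc2 = packer.unpack(byte_string)

assert [t.orth_ for t in doc2] == [t.orth_ for t in doc]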


@ -1,14 +0,0 @@
from .tokens cimport Doc
from .typedefs cimport flags_t, attr_id_t, attr_t
from .parts_of_speech cimport univ_pos_t
from .structs cimport Morphology, TokenC, LexemeC
from .vocab cimport Vocab
from .strings cimport StringStore
cdef class Span:
cdef readonly Doc _seq
cdef public int i
cdef public int start
cdef public int end
cdef readonly int label


@ -1,25 +1,26 @@
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from murmurhash.mrmr cimport hash64
from .typedefs cimport attr_t
from libc.stdint cimport int64_t
from .structs cimport Utf8Str, UniStr
from .typedefs cimport hash_t
cpdef hash_t hash_string(unicode string) except 0
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
s.chars = &chars[start]
s.n = end - start
s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)
ctypedef union Utf8Str:
unsigned char[8] s
unsigned char* p
cdef class StringStore:
cdef Pool mem
cdef Utf8Str* strings
cdef size_t size
cdef Utf8Str* c
cdef int64_t size
cdef PreshMap _map
cdef size_t _resize_at
cdef const Utf8Str* intern(self, char* chars, int length, int* id_) except NULL
cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL


@ -3,49 +3,63 @@ import codecs
from libc.string cimport memcpy
from murmurhash.mrmr cimport hash64
from libc.stdint cimport int64_t
from .typedefs cimport hash_t
from .typedefs cimport hash_t, attr_t
SEPARATOR = '\n|-SEP-|\n'
cpdef hash_t hash_string(unicode string) except 0:
# This should probably use the Py_UCS4 API, but I can't in Python 2.7
chars = <Py_UNICODE*>string
return hash64(chars, len(string) * sizeof(Py_UNICODE), 0)
"""
cdef class SymbolMap:
def __init__(self):
self._string_to_id = {'': 0}
self._id_to_string = ['']
cdef unicode _decode(const Utf8Str* string):
cdef int i, length
if string.s[0] < sizeof(string.s) and string.s[0] != 0:
return string.s[1:string.s[0]+1].decode('utf8')
elif string.p[0] < 255:
return string.p[1:string.p[0]+1].decode('utf8')
else:
i = 0
length = 0
while string.p[i] == 255:
i += 1
length += 255
length += string.p[i]
i += 1
return string.p[i:length + i].decode('utf8')
def __iter__(self):
for id_, string in enumerate(self._id_to_string[1:]):
yield string, id_
def __len__(self):
return len(self._id_to_string)
def __getitem__(self, object string_or_id):
cdef bytes byte_string
if isinstance(string_or_id, int) or isinstance(string_or_id, long):
if string_or_id < 1 or string_or_id >= self.size:
raise IndexError(string_or_id)
return self._int_to_string[string_or_id]
else:
string = string_or_id
if isinstance(string, unicode):
string = string.encode('utf8')
if string in self._string_to_id:
id_ = self._string_to_id[string]
else:
id_ = len(self._string_to_id)
self._string_to_id[string] = id_
self._id_to_string.append(string)
return id_
"""
cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except *:
cdef int n_length_bytes
cdef int i
cdef Utf8Str string
assert length != 0
if length < sizeof(string.s):
string.s[0] = <unsigned char>length
memcpy(&string.s[1], chars, length)
return string
elif length < 255:
string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
string.p[0] = length
memcpy(&string.p[1], chars, length)
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
return string
else:
i = 0
n_length_bytes = (length // 255) + 1
string.p = <unsigned char*>mem.alloc(length + n_length_bytes, sizeof(unsigned char))
for i in range(n_length_bytes-1):
string.p[i] = 255
string.p[n_length_bytes-1] = length % 255
memcpy(&string.p[n_length_bytes], chars, length)
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
return string
cdef class StringStore:
@ -54,15 +68,15 @@ cdef class StringStore:
self.mem = Pool()
self._map = PreshMap()
self._resize_at = 10000
self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
self.size = 1
property size:
def __get__(self):
return self.size-1
return self.size -1
def __len__(self):
return self.size
return self.size-1
def __getitem__(self, object string_or_id):
cdef bytes byte_string
@ -73,57 +87,76 @@ cdef class StringStore:
return u''
elif string_or_id < 1 or string_or_id >= self.size:
raise IndexError(string_or_id)
utf8str = &self.strings[<int>string_or_id]
return utf8str.chars[:utf8str.length].decode('utf8')
utf8str = &self.c[<int>string_or_id]
return _decode(utf8str)
elif isinstance(string_or_id, bytes):
utf8str = self.intern(<char*>string_or_id, len(string_or_id), &id_)
return id_
if len(string_or_id) == 0:
return 0
utf8str = self.intern(<unsigned char*>string_or_id, len(string_or_id))
return utf8str - self.c
elif isinstance(string_or_id, unicode):
if len(string_or_id) == 0:
return 0
byte_string = string_or_id.encode('utf8')
utf8str = self.intern(<char*>byte_string, len(byte_string), &id_)
return id_
utf8str = self.intern(<unsigned char*>byte_string, len(byte_string))
return utf8str - self.c
else:
raise TypeError(type(string_or_id))
cdef const Utf8Str* intern(self, char* chars, int length, int* id_) except NULL:
# 0 means missing, but we don't bother offsetting the index. We waste
# slot 0 to simplify the code, because it doesn't matter.
assert length != 0
cdef hash_t key = hash64(chars, length * sizeof(char), 0)
cdef void* value = self._map.get(key)
cdef size_t i
if value == NULL:
if self.size == self._resize_at:
self._resize_at *= 2
self.strings = <Utf8Str*>self.mem.realloc(self.strings, self._resize_at * sizeof(Utf8Str))
i = self.size
self.strings[i].i = self.size
self.strings[i].chars = <unsigned char*>self.mem.alloc(length, sizeof(char))
memcpy(self.strings[i].chars, chars, length)
self.strings[i].length = length
self._map.set(key, <void*>self.size)
self.size += 1
else:
i = <size_t>value
return &self.strings[i]
cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
# 0 means missing, but we don't bother offsetting the index.
key = hash64(chars, length * sizeof(char), 0)
value = <Utf8Str*>self._map.get(key)
if value != NULL:
return value
if self.size == self._resize_at:
self._realloc()
self.c[self.size] = _allocate(self.mem, chars, length)
self._map.set(key, <void*>&self.c[self.size])
self.size += 1
return &self.c[self.size-1]
def dump(self, loc):
strings = []
cdef Utf8Str* string
cdef bytes py_string
for i in range(self.size):
string = &self.strings[i]
py_string = string.chars[:string.length]
strings.append(py_string.decode('utf8'))
cdef unicode py_string
cdef int i
with codecs.open(loc, 'w', 'utf8') as file_:
file_.write(SEPARATOR.join(strings))
for i in range(1, self.size):
string = &self.c[i]
py_string = _decode(string)
file_.write(py_string)
if (i+1) != self.size:
file_.write(SEPARATOR)
def load(self, loc):
with codecs.open(loc, 'r', 'utf8') as file_:
strings = file_.read().split(SEPARATOR)
cdef unicode string
cdef bytes byte_string
cdef int id_
for string in strings[1:]:
for string in strings:
byte_string = string.encode('utf8')
self.intern(byte_string, len(byte_string), &id_)
self.intern(byte_string, len(byte_string))
def _realloc(self):
# We want to map straight to pointers, but they'll be invalidated if
# we resize our array. So, first we remap to indices, then we resize,
# then we can acquire the new pointers.
cdef Pool tmp_mem = Pool()
keys = <hash_t*>tmp_mem.alloc(self.size, sizeof(hash_t))
cdef hash_t key
cdef size_t addr
cdef const Utf8Str ptr
cdef size_t i
for key, addr in self._map.items():
# Find array index with pointer arithmetic
i = (<Utf8Str*>addr) - self.c
keys[i] = key
self._resize_at *= 2
cdef size_t new_size = self._resize_at * sizeof(Utf8Str)
self.c = <Utf8Str*>self.mem.realloc(self.c, new_size)
self._map = PreshMap(self.size)
for i in range(self.size):
self._map.set(keys[i], &self.c[i])
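
A pure-Python sketch of the variable-length size prefix that _allocate and _decode above use for strings of 255 bytes or more (illustrative only; the real code writes the header straight into Pool-allocated memory):

# The length is stored as a run of 255-valued bytes plus one remainder byte,
# mirroring the n_length_bytes arithmetic in _allocate().
def encode_length(length):
    n_length_bytes = (length // 255) + 1
    return [255] * (n_length_bytes - 1) + [length % 255]

def decode_length(header):
    i = 0
    length = 0
    while header[i] == 255:
        length += 255
        i += 1
    return length + header[i], i + 1   # (decoded length, header bytes used)

assert encode_length(600) == [255, 255, 90]
assert decode_length([255, 255, 90]) == (600, 3)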


@ -1,6 +1,6 @@
from libc.stdint cimport uint8_t, uint32_t, int32_t
from .typedefs cimport flags_t, attr_t, id_t, hash_t
from .typedefs cimport flags_t, attr_t, hash_t
from .parts_of_speech cimport univ_pos_t
@ -62,6 +62,7 @@ cdef struct TokenC:
Morphology morph
const Constituent* ctnt
univ_pos_t pos
bint spacy
int tag
int idx
int lemma
@ -77,14 +78,3 @@ cdef struct TokenC:
int ent_iob
int ent_type
cdef struct Utf8Str:
unsigned char* chars
int length
cdef struct UniStr:
Py_UNICODE* chars
size_t n
hash_t key


@ -12,7 +12,7 @@ from libc.string cimport memset
from itertools import combinations
from ..tokens cimport TokenC
from ..structs cimport TokenC
from .stateclass cimport StateClass


@ -10,6 +10,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
@ -309,6 +310,9 @@ cdef class ArcEager(TransitionSystem):
label = 'ROOT'
gold.c.heads[i] = gold.heads[i]
gold.c.labels[i] = self.strings[label]
# Count frequencies, for use in encoder
self.freqs[HEAD][gold.c.heads[i] - i] += 1
self.freqs[DEP][gold.c.labels[i]] += 1
for end, brackets in gold.brackets.items():
for start, label_strs in brackets.items():
gold.c.brackets[start][end] = 1
@ -374,17 +378,16 @@ cdef class ArcEager(TransitionSystem):
st._sent[i].r_edge = i
st.fast_forward()
cdef int finalize_state(self, StateClass st) except -1:
cdef int root_label = self.strings['ROOT']
cdef int finalize_state(self, StateClass st) nogil:
for i in range(st.length):
if st._sent[i].head == 0 and st._sent[i].dep == 0:
st._sent[i].dep = root_label
st._sent[i].dep = self.root_label
# If we're not using the Break transition, we segment via root-labelled
# arcs between the root words.
elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == root_label:
elif USE_ROOT_ARC_SEGMENT and st._sent[i].dep == self.root_label:
st._sent[i].head = 0
cdef int set_valid(self, bint* output, StateClass stcls) except -1:
cdef int set_valid(self, int* output, StateClass stcls) nogil:
cdef bint[N_MOVES] is_valid
is_valid[SHIFT] = Shift.is_valid(stcls, -1)
is_valid[REDUCE] = Reduce.is_valid(stcls, -1)
@ -392,13 +395,11 @@ cdef class ArcEager(TransitionSystem):
is_valid[RIGHT] = RightArc.is_valid(stcls, -1)
is_valid[BREAK] = Break.is_valid(stcls, -1)
cdef int i
n_valid = 0
for i in range(self.n_moves):
output[i] = is_valid[self.c[i].move]
n_valid += output[i]
assert n_valid >= 1
cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1:
cdef int set_costs(self, int* is_valid, int* costs,
StateClass stcls, GoldParse gold) except -1:
cdef int i, move, label
cdef label_cost_func_t[N_MOVES] label_cost_funcs
cdef move_cost_func_t[N_MOVES] move_cost_funcs
@ -423,30 +424,14 @@ cdef class ArcEager(TransitionSystem):
n_gold = 0
for i in range(self.n_moves):
if self.c[i].is_valid(stcls, self.c[i].label):
is_valid[i] = True
move = self.c[i].move
label = self.c[i].label
if move_costs[move] == -1:
move_costs[move] = move_cost_funcs[move](stcls, &gold.c)
output[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label)
n_gold += output[i] == 0
costs[i] = move_costs[move] + label_cost_funcs[move](stcls, &gold.c, label)
n_gold += costs[i] == 0
else:
output[i] = 9000
is_valid[i] = False
costs[i] = 9000
assert n_gold >= 1
cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *:
cdef bint[N_MOVES] is_valid
is_valid[SHIFT] = Shift.is_valid(stcls, -1)
is_valid[REDUCE] = Reduce.is_valid(stcls, -1)
is_valid[LEFT] = LeftArc.is_valid(stcls, -1)
is_valid[RIGHT] = RightArc.is_valid(stcls, -1)
is_valid[BREAK] = Break.is_valid(stcls, -1)
cdef Transition best
cdef weight_t score = MIN_SCORE
cdef int i
for i in range(self.n_moves):
if scores[i] > score and is_valid[self.c[i].move]:
best = self.c[i]
score = scores[i]
assert best.clas < self.n_moves
assert score > MIN_SCORE, (stcls.stack_depth(), stcls.buffer_length(), stcls.is_final(), stcls._b_i, stcls.length)
return best


@ -8,6 +8,7 @@ from ..structs cimport TokenC, Entity
from thinc.typedefs cimport weight_t
from ..gold cimport GoldParseC
from ..gold cimport GoldParse
from ..attrs cimport ENT_TYPE, ENT_IOB
from .stateclass cimport StateClass
@ -74,6 +75,19 @@ cdef class BiluoPushDown(TransitionSystem):
cdef int preprocess_gold(self, GoldParse gold) except -1:
for i in range(gold.length):
gold.c.ner[i] = self.lookup_transition(gold.ner[i])
# Count frequencies, for use in encoder
if gold.c.ner[i].move in (BEGIN, UNIT):
self.freqs[ENT_IOB][3] += 1
self.freqs[ENT_TYPE][gold.c.ner[i].label] += 1
elif gold.c.ner[i].move in (IN, LAST):
self.freqs[ENT_IOB][2] += 1
self.freqs[ENT_TYPE][0] += 1
elif gold.c.ner[i].move == OUT:
self.freqs[ENT_IOB][1] += 1
self.freqs[ENT_TYPE][0] += 1
else:
self.freqs[ENT_IOB][1] += 1
self.freqs[ENT_TYPE][0] += 1
cdef Transition lookup_transition(self, object name) except *:
if name == '-':
@ -128,27 +142,6 @@ cdef class BiluoPushDown(TransitionSystem):
raise Exception(move)
return t
cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *:
cdef int best = -1
cdef weight_t score = -90000
cdef const Transition* m
cdef int i
for i in range(self.n_moves):
m = &self.c[i]
if m.is_valid(stcls, m.label) and scores[i] > score:
best = i
score = scores[i]
assert best >= 0
cdef Transition t = self.c[best]
t.score = score
return t
cdef int set_valid(self, bint* output, StateClass stcls) except -1:
cdef int i
for i in range(self.n_moves):
m = &self.c[i]
output[i] = m.is_valid(stcls, m.label)
cdef class Missing:
@staticmethod


@ -4,7 +4,10 @@ from .._ml cimport Model
from .arc_eager cimport TransitionSystem
from ..tokens cimport Doc, TokenC
from ..tokens.doc cimport Doc
from ..structs cimport TokenC
from thinc.api cimport Example, ExampleC
from .stateclass cimport StateClass
cdef class Parser:
@ -12,5 +15,4 @@ cdef class Parser:
cdef readonly Model model
cdef readonly TransitionSystem moves
cdef int _greedy_parse(self, Doc tokens) except -1
cdef int _beam_parse(self, Doc tokens) except -1
cdef void parse(self, StateClass stcls, ExampleC eg) nogil


@ -20,19 +20,14 @@ from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
from util import Config
from thinc.features cimport Extractor
from thinc.features cimport Feature
from thinc.features cimport count_feats
from thinc.api cimport Example, ExampleC
from thinc.learner cimport LinearModel
from thinc.search cimport Beam
from thinc.search cimport MaxViolation
from ..structs cimport TokenC
from ..tokens cimport Doc, TokenC
from ..tokens.doc cimport Doc
from ..strings cimport StringStore
@ -46,6 +41,8 @@ from ._parse_features cimport CONTEXT_SIZE
from ._parse_features cimport fill_context
from .stateclass cimport StateClass
from .._ml cimport arg_max_if_true
DEBUG = False
def set_debug(val):
@ -59,6 +56,8 @@ def get_templates(name):
return pf.ner
elif name == 'debug':
return pf.unigrams
elif name.startswith('embed'):
return (pf.words, pf.tags, pf.labels)
else:
return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \
pf.tree_shape + pf.trigrams)
@ -81,179 +80,46 @@ cdef class Parser:
self.model = Model(self.moves.n_moves, templates, model_dir)
def __call__(self, Doc tokens):
if self.model is not None:
if self.cfg.get('beam_width', 0) < 1:
self._greedy_parse(tokens)
else:
self._beam_parse(tokens)
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
self.moves.initialize_state(stcls)
cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
self.model.n_feats, self.model.n_feats)
self.parse(stcls, eg.c)
tokens.set_parse(stcls._sent)
cdef void parse(self, StateClass stcls, ExampleC eg) nogil:
while not stcls.is_final():
memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
self.moves.set_valid(eg.is_valid, stcls)
fill_context(eg.atoms, stcls)
self.model.set_scores(eg.scores, eg.atoms)
eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes)
self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
self.moves.finalize_state(stcls)
def train(self, Doc tokens, GoldParse gold):
self.moves.preprocess_gold(gold)
if self.cfg.get('beam_width', 0) < 1:
return self._greedy_train(tokens, gold)
else:
return self._beam_train(tokens, gold)
cdef int _greedy_parse(self, Doc tokens) except -1:
cdef atom_t[CONTEXT_SIZE] context
cdef int n_feats
cdef Pool mem = Pool()
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
self.moves.initialize_state(stcls)
cdef Transition guess
cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
self.model.n_feats, self.model.n_feats)
cdef weight_t loss = 0
words = [w.orth_ for w in tokens]
cdef Transition G
while not stcls.is_final():
fill_context(context, stcls)
scores = self.model.score(context)
guess = self.moves.best_valid(scores, stcls)
#print self.moves.move_name(guess.move, guess.label), stcls.print_state(words)
guess.do(stcls, guess.label)
assert stcls._s_i >= 0
self.moves.finalize_state(stcls)
tokens.set_parse(stcls._sent)
memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t))
self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
fill_context(eg.c.atoms, stcls)
cdef int _beam_parse(self, Doc tokens) except -1:
cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width)
words = [w.orth_ for w in tokens]
beam.initialize(_init_state, tokens.length, tokens.data)
beam.check_done(_check_final_state, NULL)
while not beam.is_done:
self._advance_beam(beam, None, False, words)
state = <StateClass>beam.at(0)
self.moves.finalize_state(state)
tokens.set_parse(state._sent)
_cleanup(beam)
self.model.train(eg)
def _greedy_train(self, Doc tokens, GoldParse gold):
cdef Pool mem = Pool()
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
self.moves.initialize_state(stcls)
G = self.moves.c[eg.c.guess]
cdef int cost
cdef const Feature* feats
cdef const weight_t* scores
cdef Transition guess
cdef Transition best
cdef atom_t[CONTEXT_SIZE] context
loss = 0
words = [w.orth_ for w in tokens]
history = []
while not stcls.is_final():
fill_context(context, stcls)
scores = self.model.score(context)
guess = self.moves.best_valid(scores, stcls)
best = self.moves.best_gold(scores, stcls, gold)
cost = guess.get_cost(stcls, &gold.c, guess.label)
self.model.update(context, guess.clas, best.clas, cost)
guess.do(stcls, guess.label)
loss += cost
self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label)
loss += eg.c.loss
return loss
def _beam_train(self, Doc tokens, GoldParse gold_parse):
cdef Beam pred = Beam(self.moves.n_moves, self.cfg.beam_width)
pred.initialize(_init_state, tokens.length, tokens.data)
pred.check_done(_check_final_state, NULL)
cdef Beam gold = Beam(self.moves.n_moves, self.cfg.beam_width)
gold.initialize(_init_state, tokens.length, tokens.data)
gold.check_done(_check_final_state, NULL)
violn = MaxViolation()
words = [w.orth_ for w in tokens]
while not pred.is_done and not gold.is_done:
self._advance_beam(pred, gold_parse, False, words)
self._advance_beam(gold, gold_parse, True, words)
violn.check(pred, gold)
if pred.loss >= 1:
counts = {clas: {} for clas in range(self.model.n_classes)}
self._count_feats(counts, tokens, violn.g_hist, 1)
self._count_feats(counts, tokens, violn.p_hist, -1)
else:
counts = {}
self.model._model.update(counts)
_cleanup(pred)
_cleanup(gold)
return pred.loss
def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold, words):
cdef atom_t[CONTEXT_SIZE] context
cdef int i, j, cost
cdef bint is_valid
cdef const Transition* move
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
if not stcls.is_final():
fill_context(context, stcls)
self.model.set_scores(beam.scores[i], context)
self.moves.set_valid(beam.is_valid[i], stcls)
if gold is not None:
for i in range(beam.size):
stcls = <StateClass>beam.at(i)
if not stcls.is_final():
self.moves.set_costs(beam.costs[i], stcls, gold)
if follow_gold:
for j in range(self.moves.n_moves):
beam.is_valid[i][j] *= beam.costs[i][j] == 0
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
def _count_feats(self, dict counts, Doc tokens, list hist, int inc):
cdef atom_t[CONTEXT_SIZE] context
cdef Pool mem = Pool()
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
self.moves.initialize_state(stcls)
cdef class_t clas
cdef int n_feats
for clas in hist:
fill_context(context, stcls)
feats = self.model._extractor.get_feats(context, &n_feats)
count_feats(counts[clas], feats, n_feats, inc)
self.moves.c[clas].do(stcls, self.moves.c[clas].label)
# These are passed as callbacks to thinc.search.Beam
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
dest = <StateClass>_dest
src = <StateClass>_src
moves = <const Transition*>_moves
dest.clone(src)
moves[clas].do(dest, moves[clas].label)
cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
st.fast_forward()
Py_INCREF(st)
return <void*>st
cdef int _check_final_state(void* _state, void* extra_args) except -1:
return (<StateClass>_state).is_final()
def _cleanup(Beam beam):
for i in range(beam.width):
Py_XDECREF(<PyObject*>beam._states[i].content)
Py_XDECREF(<PyObject*>beam._parents[i].content)
cdef hash_t _hash_state(void* _state, void* _) except 0:
return <hash_t>_state
#state = <const State*>_state
#cdef atom_t[10] rep
#rep[0] = state.stack[0] if state.stack_len >= 1 else 0
#rep[1] = state.stack[-1] if state.stack_len >= 2 else 0
#rep[2] = state.stack[-2] if state.stack_len >= 3 else 0
#rep[3] = state.i
#rep[4] = state.sent[state.stack[0]].l_kids if state.stack_len >= 1 else 0
#rep[5] = state.sent[state.stack[0]].r_kids if state.stack_len >= 1 else 0
#rep[6] = state.sent[state.stack[0]].dep if state.stack_len >= 1 else 0
#rep[7] = state.sent[state.stack[-1]].dep if state.stack_len >= 2 else 0
#if get_left(state, get_n0(state), 1) != NULL:
# rep[8] = get_left(state, get_n0(state), 1).dep
#else:
# rep[8] = 0
#rep[9] = state.sent[state.i].l_kids
#return hash64(rep, sizeof(atom_t) * 10, 0)


@ -34,9 +34,11 @@ cdef class TransitionSystem:
cdef const Transition* c
cdef bint* _is_valid
cdef readonly int n_moves
cdef public int root_label
cdef public freqs
cdef int initialize_state(self, StateClass state) except -1
cdef int finalize_state(self, StateClass state) except -1
cdef int finalize_state(self, StateClass state) nogil
cdef int preprocess_gold(self, GoldParse gold) except -1
@ -44,11 +46,7 @@ cdef class TransitionSystem:
cdef Transition init_transition(self, int clas, int move, int label) except *
cdef int set_valid(self, bint* output, StateClass state) except -1
cdef int set_valid(self, int* output, StateClass state) nogil
cdef int set_costs(self, int* output, StateClass state, GoldParse gold) except -1
cdef Transition best_valid(self, const weight_t* scores, StateClass stcls) except *
cdef Transition best_gold(self, const weight_t* scores, StateClass state,
GoldParse gold) except *
cdef int set_costs(self, int* is_valid, int* costs,
StateClass state, GoldParse gold) except -1

View File

@ -1,8 +1,10 @@
from cymem.cymem cimport Pool
from ..structs cimport TokenC
from thinc.typedefs cimport weight_t
from collections import defaultdict
from ..structs cimport TokenC
from .stateclass cimport StateClass
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
cdef weight_t MIN_SCORE = -90000
@ -27,11 +29,20 @@ cdef class TransitionSystem:
moves[i] = self.init_transition(i, int(action), label_id)
i += 1
self.c = moves
self.root_label = self.strings['ROOT']
self.freqs = {}
for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
self.freqs[attr] = defaultdict(int)
self.freqs[attr][0] = 1
# Ensure we've seen heads. Need an official dependency length limit...
for i in range(512):
self.freqs[HEAD][i] = 1
self.freqs[HEAD][-i] = 1
cdef int initialize_state(self, StateClass state) except -1:
pass
cdef int finalize_state(self, StateClass state) except -1:
cdef int finalize_state(self, StateClass state) nogil:
pass
cdef int preprocess_gold(self, GoldParse gold) except -1:
@ -43,30 +54,17 @@ cdef class TransitionSystem:
cdef Transition init_transition(self, int clas, int move, int label) except *:
raise NotImplementedError
cdef Transition best_valid(self, const weight_t* scores, StateClass s) except *:
raise NotImplementedError
cdef int set_valid(self, bint* output, StateClass state) except -1:
raise NotImplementedError
cdef int set_costs(self, int* output, StateClass stcls, GoldParse gold) except -1:
cdef int set_valid(self, int* is_valid, StateClass stcls) nogil:
cdef int i
for i in range(self.n_moves):
if self.c[i].is_valid(stcls, self.c[i].label):
output[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
is_valid[i] = self.c[i].is_valid(stcls, self.c[i].label)
cdef int set_costs(self, int* is_valid, int* costs,
StateClass stcls, GoldParse gold) except -1:
cdef int i
self.set_valid(is_valid, stcls)
for i in range(self.n_moves):
if is_valid[i]:
costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
else:
output[i] = 9000
cdef Transition best_gold(self, const weight_t* scores, StateClass stcls,
GoldParse gold) except *:
cdef Transition best
cdef weight_t score = MIN_SCORE
cdef int i
for i in range(self.n_moves):
if self.c[i].is_valid(stcls, self.c[i].label):
cost = self.c[i].get_cost(stcls, &gold.c, self.c[i].label)
if scores[i] > score and cost == 0:
best = self.c[i]
score = scores[i]
assert score > MIN_SCORE
return best
costs[i] = 9000
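Editor's note (not part of the diff): the hunks above replace best_valid/best_gold with plain int arrays of validity flags and costs, writing 9000 as the cost of invalid moves. A rough pure-Python sketch of how a caller can combine scores with those two arrays to recover the old best_gold behaviour; the function name is illustrative only.
def best_gold_action(scores, is_valid, costs):
    # Pick the highest-scoring action that is both valid and zero-cost.
    best, best_score = None, float('-inf')
    for i, score in enumerate(scores):
        if is_valid[i] and costs[i] == 0 and score > best_score:
            best, best_score = i, score
    return best

# e.g. best_gold_action([0.2, 1.5, 0.7], [1, 1, 0], [0, 1, 9000]) returns 0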

View File

@ -1,14 +1,12 @@
from libcpp.vector cimport vector
from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER
from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .structs cimport LexemeC, TokenC, Morphology, UniStr
from .structs cimport LexemeC, TokenC, Morphology
from .strings cimport StringStore
from .tokens cimport Doc
from .tokens.doc cimport Doc
from .vocab cimport Vocab, _Cached
@ -29,13 +27,11 @@ cdef class Tokenizer:
cpdef Doc tokens_from_list(self, list strings)
cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1
cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1
cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL
cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
cdef int _try_cache(self, hash_t key, Doc tokens) except -1
cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1
cdef unicode _split_affixes(self, unicode string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes)
cdef int _attach_tokens(self, Doc tokens, unicode string,
vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1

View File

@ -6,17 +6,19 @@ import re
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from cpython cimport Py_UNICODE_ISSPACE
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from murmurhash.mrmr cimport hash64
from .structs cimport UniStr
from .strings cimport slice_unicode
from .morphology cimport set_morph_from_dict
from .strings cimport hash_string
cimport cython
from . import util
from .util import read_lang_data
from .tokens import Doc
from .tokens.doc cimport Doc
cdef class Tokenizer:
@ -39,19 +41,19 @@ cdef class Tokenizer:
return cls(vocab, rules, prefix_re, suffix_re, infix_re)
cpdef Doc tokens_from_list(self, list strings):
cdef int length = sum([len(s) for s in strings])
cdef Doc tokens = Doc(self.vocab, ' '.join(strings))
if length == 0:
cdef Doc tokens = Doc(self.vocab)
if sum([len(s) for s in strings]) == 0:
return tokens
cdef UniStr string_struct
cdef unicode py_string
cdef int idx = 0
for i, py_string in enumerate(strings):
slice_unicode(&string_struct, py_string, 0, len(py_string))
tokens.push_back(idx, <const LexemeC*>self.vocab.get(tokens.mem, &string_struct))
# Note that we pass tokens.mem here --- the Doc object has ownership
tokens.push_back(
<const LexemeC*>self.vocab.get(tokens.mem, py_string), True)
idx += len(py_string) + 1
return tokens
@cython.boundscheck(False)
def __call__(self, unicode string):
"""Tokenize a string.
@ -73,139 +75,152 @@ cdef class Tokenizer:
tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
"""
cdef int length = len(string)
cdef Doc tokens = Doc(self.vocab, string)
cdef Doc tokens = Doc(self.vocab)
if length == 0:
return tokens
cdef int i = 0
cdef int start = 0
cdef bint cache_hit
cdef Py_UNICODE* chars = string
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
cdef UniStr span
cdef bint in_ws = Py_UNICODE_ISSPACE(string[0])
cdef unicode span
# Use of Py_UNICODE is deprecated, and I should be using Py_UCS4.
# But this is hard --- I need to acquire a pointer, but there's no
# Py_UCS4 API in Python 2.
cdef Py_UNICODE uc
cdef Py_UNICODE* chars_ptr = <Py_UNICODE*>string
# The task here is much like string.split, but not quite
# We find spans of whitespace and non-space characters, and ignore
# spans that are exactly ' '. So, our sequences will all be separated
# by either ' ' or nothing.
for i in range(1, length):
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
uc = chars_ptr[i]
if Py_UNICODE_ISSPACE(uc) != in_ws:
if start < i:
slice_unicode(&span, chars, start, i)
cache_hit = self._try_cache(start, span.key, tokens)
key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0)
cache_hit = self._try_cache(key, tokens)
if not cache_hit:
self._tokenize(tokens, &span, start, i)
self._tokenize(tokens, string[start:i], key)
in_ws = not in_ws
start = i
if chars[i] == ' ':
start += 1
if uc == ' ':
tokens.data[tokens.length - 1].spacy = True
start = i + 1
else:
start = i
i += 1
if start < i:
slice_unicode(&span, chars, start, i)
cache_hit = self._try_cache(start, span.key, tokens)
key = hash64(&chars_ptr[start], (i - start) * sizeof(Py_UNICODE), 0)
cache_hit = self._try_cache(key, tokens)
if not cache_hit:
self._tokenize(tokens, &span, start, i)
self._tokenize(tokens, string[start:], key)
tokens.data[tokens.length - 1].spacy = string[-1] == ' '
return tokens
cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1:
cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
cached = <_Cached*>self._cache.get(key)
if cached == NULL:
return False
cdef int i
if cached.is_lex:
for i in range(cached.length):
idx = tokens.push_back(idx, cached.data.lexemes[i])
tokens.push_back(cached.data.lexemes[i], False)
else:
for i in range(cached.length):
idx = tokens.push_back(idx, &cached.data.tokens[i])
tokens.push_back(&cached.data.tokens[i], False)
return True
cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key) except -1:
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef hash_t orig_key
cdef int orig_size
orig_key = span.key
orig_size = tokens.length
self._split_affixes(span, &prefixes, &suffixes)
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
span = self._split_affixes(span, &prefixes, &suffixes)
self._attach_tokens(tokens, span, &prefixes, &suffixes)
self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
cdef UniStr* _split_affixes(self, UniStr* string, vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes) except NULL:
cdef unicode _split_affixes(self, unicode string, vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes):
cdef size_t i
cdef UniStr prefix
cdef UniStr suffix
cdef UniStr minus_pre
cdef UniStr minus_suf
cdef unicode prefix
cdef unicode suffix
cdef unicode minus_pre
cdef unicode minus_suf
cdef size_t last_size = 0
while string.n != 0 and string.n != last_size:
last_size = string.n
pre_len = self._find_prefix(string.chars, string.n)
while string and len(string) != last_size:
last_size = len(string)
pre_len = self.find_prefix(string)
if pre_len != 0:
slice_unicode(&prefix, string.chars, 0, pre_len)
slice_unicode(&minus_pre, string.chars, pre_len, string.n)
prefix = string[:pre_len]
minus_pre = string[pre_len:]
# Check whether we've hit a special-case
if minus_pre.n >= 1 and self._specials.get(minus_pre.key) != NULL:
string[0] = minus_pre
prefixes.push_back(self.vocab.get(self.vocab.mem, &prefix))
if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL:
string = minus_pre
prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
break
suf_len = self._find_suffix(string.chars, string.n)
suf_len = self.find_suffix(string)
if suf_len != 0:
slice_unicode(&suffix, string.chars, string.n - suf_len, string.n)
slice_unicode(&minus_suf, string.chars, 0, string.n - suf_len)
suffix = string[-suf_len:]
minus_suf = string[:-suf_len]
# Check whether we've hit a special-case
if minus_suf.n >= 1 and self._specials.get(minus_suf.key) != NULL:
string[0] = minus_suf
suffixes.push_back(self.vocab.get(self.vocab.mem, &suffix))
if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL):
string = minus_suf
suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
break
if pre_len and suf_len and (pre_len + suf_len) <= string.n:
slice_unicode(string, string.chars, pre_len, string.n - suf_len)
prefixes.push_back(self.vocab.get(self.vocab.mem, &prefix))
suffixes.push_back(self.vocab.get(self.vocab.mem, &suffix))
if pre_len and suf_len and (pre_len + suf_len) <= len(string):
string = string[pre_len:-suf_len]
prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
elif pre_len:
string[0] = minus_pre
prefixes.push_back(self.vocab.get(self.vocab.mem, &prefix))
string = minus_pre
prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
elif suf_len:
string[0] = minus_suf
suffixes.push_back(self.vocab.get(self.vocab.mem, &suffix))
if self._specials.get(string.key):
string = minus_suf
suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
if string and (self._specials.get(hash_string(string)) != NULL):
break
return string
cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
cdef int _attach_tokens(self, Doc tokens, unicode string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes) except -1:
cdef bint cache_hit
cdef int split
cdef int split, end
cdef const LexemeC* const* lexemes
cdef LexemeC* lexeme
cdef UniStr span
cdef const LexemeC* lexeme
cdef unicode span
cdef int i
if prefixes.size():
for i in range(prefixes.size()):
idx = tokens.push_back(idx, prefixes[0][i])
if string.n != 0:
cache_hit = self._try_cache(idx, string.key, tokens)
tokens.push_back(prefixes[0][i], False)
if string:
cache_hit = self._try_cache(hash_string(string), tokens)
if cache_hit:
# Get last idx
idx = tokens.data[tokens.length - 1].idx
# Increment by last length
idx += tokens.data[tokens.length - 1].lex.length
pass
else:
split = self._find_infix(string.chars, string.n)
if split == 0 or split == -1:
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, string))
match = self.find_infix(string)
if match is None:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
slice_unicode(&span, string.chars, 0, split)
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
slice_unicode(&span, string.chars, split, split+1)
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
slice_unicode(&span, string.chars, split + 1, string.n)
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
split = match.start()
end = match.end()
# Append the span before the infix, the infix itself, then the span after it
span = string[:split]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
span = string[split:end]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
span = string[end:]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
idx = tokens.push_back(idx, deref(it))
lexeme = deref(it)
preinc(it)
tokens.push_back(lexeme, False)
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
cdef int i
for i in range(n):
if tokens[i].lex.id == 1:
if tokens[i].lex.id == 0:
return 0
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = n
@ -216,18 +231,14 @@ cdef class Tokenizer:
cached.data.lexemes = <const LexemeC* const*>lexemes
self._cache.set(key, cached)
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
cdef unicode string = chars[:length]
match = self._infix_re.search(string)
return match.start() if match is not None else 0
def find_infix(self, unicode string):
return self._infix_re.search(string)
cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
cdef unicode string = chars[:length]
def find_prefix(self, unicode string):
match = self._prefix_re.search(string)
return (match.end() - match.start()) if match is not None else 0
cdef int _find_suffix(self, Py_UNICODE* chars, size_t length) except -1:
cdef unicode string = chars[:length]
def find_suffix(self, unicode string):
match = self._suffix_re.search(string)
return (match.end() - match.start()) if match is not None else 0
@ -235,21 +246,19 @@ cdef class Tokenizer:
'''Add a special-case tokenization rule.
'''
cdef int i
cdef unicode chunk
cdef list substrings
cdef unicode chunk
cdef unicode form
cdef unicode lemma
cdef dict props
cdef LexemeC** lexemes
cdef hash_t hashed
cdef UniStr string
for chunk, substrings in sorted(rules.items()):
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
form = props['F']
lemma = props.get("L", None)
slice_unicode(&string, form, 0, len(form))
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, &string)
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
if lemma is not None:
tokens[i].lemma = self.vocab.strings[lemma]
else:
@ -267,6 +276,6 @@ cdef class Tokenizer:
cached.length = len(substrings)
cached.is_lex = False
cached.data.tokens = tokens
slice_unicode(&string, chunk, 0, len(chunk))
self._specials.set(string.key, cached)
self._cache.set(string.key, cached)
hashed = hash_string(chunk)
self._specials.set(hashed, cached)
self._cache.set(hashed, cached)
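Editor's note (not part of the diff): the rewritten Tokenizer.__call__ above splits the input into whitespace-delimited spans and records a per-token trailing-space ("spacy") flag instead of tracking character offsets. A minimal pure-Python sketch of that splitting, under the same convention as the loop above: a single ' ' separator is folded into the previous span's flag, while any other whitespace run is kept as a span of its own.
def split_with_spaces(text):
    spans = []
    if not text:
        return spans
    start = 0
    in_ws = text[0].isspace()
    for i in range(1, len(text)):
        if text[i].isspace() != in_ws:
            if start < i:
                spans.append([text[start:i], False])
            in_ws = not in_ws
            if text[i] == ' ':
                if spans:
                    spans[-1][1] = True   # fold the single space into the previous span
                start = i + 1
            else:
                start = i
    if start < len(text):
        spans.append([text[start:], text[-1] == ' '])
    return [(s, flag) for s, flag in spans]

# split_with_spaces(u'Hello  world!') -> [(u'Hello', True), (u' ', False), (u'world!', False)]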

View File

@ -1,89 +0,0 @@
from libc.stdint cimport uint32_t
from numpy cimport ndarray
cimport numpy as np
from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t
from .typedefs cimport flags_t, attr_id_t, attr_t
from .parts_of_speech cimport univ_pos_t
from .structs cimport Morphology, TokenC, LexemeC
from .vocab cimport Vocab
from .strings cimport StringStore
ctypedef const LexemeC* const_Lexeme_ptr
ctypedef TokenC* TokenC_ptr
ctypedef fused LexemeOrToken:
const_Lexeme_ptr
TokenC_ptr
cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil
cdef attr_t get_token_attr(const TokenC* lex, attr_id_t feat_name) nogil
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
cdef class Doc:
cdef Pool mem
cdef Vocab vocab
cdef TokenC* data
cdef list _py_tokens
cdef unicode _string
cdef tuple _tag_strings
cdef public bint is_tagged
cdef public bint is_parsed
cdef int length
cdef int max_length
cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
cpdef np.ndarray to_array(self, object features)
cdef int set_parse(self, const TokenC* parsed) except -1
cdef class Token:
cdef Vocab vocab
cdef unicode _string
cdef const TokenC* c
cdef readonly int i
cdef int array_len
cdef bint _owns_c_data
cdef Doc _seq
@staticmethod
cdef inline Token cinit(Vocab vocab, unicode string,
const TokenC* token, int offset, int array_len,
Doc parent_seq):
if offset < 0 or offset >= array_len:
msg = "Attempt to access token at %d, max length %d"
raise IndexError(msg % (offset, array_len))
if parent_seq._py_tokens[offset] is not None:
return parent_seq._py_tokens[offset]
cdef Token self = Token.__new__(Token, vocab, string)
self.c = token
self.i = offset
self.array_len = array_len
self._seq = parent_seq
self._seq._py_tokens[offset] = self
return self
cdef int take_ownership_of_c_data(self) except -1
cpdef bint check_flag(self, attr_id_t flag_id) except -1

View File

@ -1,716 +0,0 @@
# cython: embedsignature=True
from libc.string cimport memset
from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter
from .strings cimport slice_unicode
from .vocab cimport EMPTY_LEXEME
from .typedefs cimport attr_id_t, attr_t
from .typedefs cimport LEMMA
from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .typedefs cimport POS, LEMMA, TAG, DEP
from .parts_of_speech import UNIV_POS_NAMES
from .parts_of_speech cimport CONJ, PUNCT
from .lexeme cimport check_flag
from .spans import Span
from .structs cimport UniStr
from .serialize import BitArray
from unidecode import unidecode
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
cimport numpy as np
np.import_array()
import numpy
cimport cython
from cpython.mem cimport PyMem_Malloc, PyMem_Free
from libc.string cimport memcpy
DEF PADDING = 5
cdef int bounds_check(int i, int length, int padding) except -1:
if (i + padding) < 0:
raise IndexError
if (i - padding) >= length:
raise IndexError
cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
if feat_name == LEMMA:
return token.lemma
elif feat_name == POS:
return token.pos
elif feat_name == TAG:
return token.tag
elif feat_name == DEP:
return token.dep
else:
return get_lex_attr(token.lex, feat_name)
cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
if feat_name < (sizeof(flags_t) * 8):
return check_flag(lex, feat_name)
elif feat_name == ID:
return lex.id
elif feat_name == ORTH:
return lex.orth
elif feat_name == LOWER:
return lex.lower
elif feat_name == NORM:
return lex.norm
elif feat_name == SHAPE:
return lex.shape
elif feat_name == PREFIX:
return lex.prefix
elif feat_name == SUFFIX:
return lex.suffix
elif feat_name == LENGTH:
return lex.length
elif feat_name == CLUSTER:
return lex.cluster
else:
return 0
cdef class Doc:
"""
Container class for annotated text. Constructed via English.__call__ or
Tokenizer.__call__.
"""
def __cinit__(self, Vocab vocab, unicode string):
self.vocab = vocab
self._string = string
string_length = len(string)
if string_length >= 3:
size = int(string_length / 3.0)
else:
size = 5
self.mem = Pool()
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
# However, we need to remember the true starting places, so that we can
# realloc.
data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
cdef int i
for i in range(size + (PADDING*2)):
data_start[i].lex = &EMPTY_LEXEME
self.data = data_start + PADDING
self.max_length = size
self.length = 0
self.is_tagged = False
self.is_parsed = False
self._py_tokens = []
def __getitem__(self, object i):
"""Retrieve a token.
The Python Token objects are created lazily from internal C data, and
cached in _py_tokens
Returns:
token (Token):
"""
if isinstance(i, slice):
if i.step is not None:
raise ValueError("Stepped slices not supported in Span objects."
"Try: list(doc)[start:stop:step] instead.")
return Span(self, i.start, i.stop, label=0)
if i < 0:
i = self.length + i
bounds_check(i, self.length, PADDING)
return Token.cinit(self.vocab, self._string,
&self.data[i], i, self.length,
self)
def __iter__(self):
"""Iterate over the tokens.
Yields:
token (Token):
"""
for i in range(self.length):
yield Token.cinit(self.vocab, self._string,
&self.data[i], i, self.length,
self)
def __len__(self):
return self.length
def __unicode__(self):
cdef const TokenC* last = &self.data[self.length - 1]
return self._string[:last.idx + last.lex.length]
@property
def string(self):
return unicode(self)
@property
def ents(self):
"""Yields named-entity Span objects.
Iterate over the span to get individual Token objects, or access the label:
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
>>> ents = list(tokens.ents)
>>> ents[0].label, ents[0].label_, ''.join(t.orth_ for t in ents[0])
(112504, u'PERSON', u'Best ')
"""
cdef int i
cdef const TokenC* token
cdef int start = -1
cdef int label = 0
for i in range(self.length):
token = &self.data[i]
if token.ent_iob == 1:
assert start != -1
pass
elif token.ent_iob == 2:
if start != -1:
yield Span(self, start, i, label=label)
start = -1
label = 0
elif token.ent_iob == 3:
if start != -1:
yield Span(self, start, i, label=label)
start = i
label = token.ent_type
if start != -1:
yield Span(self, start, self.length, label=label)
@property
def sents(self):
"""
Yield a list of sentence Span objects, calculated from the dependency parse.
"""
cdef int i
cdef Doc sent = Doc(self.vocab, self._string[self.data[0].idx:])
start = 0
for i in range(1, self.length):
if self.data[i].sent_start:
yield Span(self, start, i)
start = i
yield Span(self, start, self.length)
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
if self.length == self.max_length:
self._realloc(self.length * 2)
cdef TokenC* t = &self.data[self.length]
if LexemeOrToken is TokenC_ptr:
t[0] = lex_or_tok[0]
else:
t.lex = lex_or_tok
t.idx = idx
self.length += 1
self._py_tokens.append(None)
return idx + t.lex.length
@cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy ndarray
of shape N*M, where N is the length of the sentence.
Arguments:
attr_ids (list[int]): A list of attribute ID ints.
Returns:
feat_array (numpy.ndarray[long, ndim=2]):
A feature matrix, with one row per word, and one column per attribute
indicated in the input attr_ids.
"""
cdef int i, j
cdef attr_id_t feature
cdef np.ndarray[long, ndim=2] output
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
cdef np.ndarray[long, ndim=1] attr_ids = numpy.asarray(py_attr_ids)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int)
for i in range(self.length):
for j, feature in enumerate(attr_ids):
output[i, j] = get_token_attr(&self.data[i], feature)
return output
def count_by(self, attr_id_t attr_id, exclude=None):
"""Produce a dict of {attribute (int): count (ints)} frequencies, keyed
by the values of the given attribute ID.
>>> from spacy.en import English, attrs
>>> nlp = English()
>>> tokens = nlp(u'apple apple orange banana')
>>> tokens.count_by(attrs.ORTH)
{12800L: 1, 11880L: 2, 7561L: 1}
>>> tokens.to_array([attrs.ORTH])
array([[11880],
[11880],
[ 7561],
[12800]])
"""
cdef int i
cdef attr_t attr
cdef size_t count
cdef PreshCounter counts = PreshCounter(2 ** 8)
for i in range(self.length):
if exclude is not None and exclude(self[i]):
continue
attr = get_token_attr(&self.data[i], attr_id)
counts.inc(attr, 1)
return dict(counts)
def _realloc(self, new_size):
self.max_length = new_size
n = new_size + (PADDING * 2)
# What we're storing is a "padded" array. We've jumped forward PADDING
# places, and are storing the pointer to that. This way, we can access
# words out-of-bounds, and get out-of-bounds markers.
# Now that we want to realloc, we need the address of the true start,
# so we jump the pointer back PADDING places.
cdef TokenC* data_start = self.data - PADDING
data_start = <TokenC*>self.mem.realloc(data_start, n * sizeof(TokenC))
self.data = data_start + PADDING
cdef int i
for i in range(self.length, self.max_length + PADDING):
self.data[i].lex = &EMPTY_LEXEME
cdef int set_parse(self, const TokenC* parsed) except -1:
# TODO: This method is fairly misleading atm. It's used by GreedyParser
# to actually apply the parse calculated. Need to rethink this.
self._py_tokens = [None] * self.length
self.is_parsed = True
for i in range(self.length):
self.data[i] = parsed[i]
def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
unicode ent_type):
"""Merge a multi-word expression into a single token. Currently
experimental; API is likely to change."""
cdef int i
cdef int start = -1
cdef int end = -1
for i in range(self.length):
if self.data[i].idx == start_idx:
start = i
if (self.data[i].idx + self.data[i].lex.length) == end_idx:
if start == -1:
return None
end = i + 1
break
else:
return None
# Get LexemeC for newly merged token
cdef UniStr new_orth_c
slice_unicode(&new_orth_c, self._string, start_idx, end_idx)
cdef const LexemeC* lex = self.vocab.get(self.mem, &new_orth_c)
# House the new merged token where it starts
cdef TokenC* token = &self.data[start]
# Update fields
token.lex = lex
# What to do about morphology??
# TODO: token.morph = ???
token.tag = self.vocab.strings[tag]
token.lemma = self.vocab.strings[lemma]
if ent_type == 'O':
token.ent_iob = 2
token.ent_type = 0
else:
token.ent_iob = 3
token.ent_type = self.vocab.strings[ent_type]
# Fix dependencies
# Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets
for i in range(self.length):
self.data[i].head += i
# Find the head of the merged token, and its dep relation
outer_heads = {}
for i in range(start, end):
head_idx = self.data[i].head
if head_idx == i or head_idx < start or head_idx >= end:
# Don't consider "heads" which are actually dominated by a word
# in the region we're merging
gp = head_idx
while self.data[gp].head != gp:
if start <= gp < end:
break
gp = self.data[gp].head
else:
# If we have multiple words attaching to the same head,
# but with different dep labels, we're preferring the last
# occurring dep label. Shrug. What else could we do, I guess?
outer_heads[head_idx] = self.data[i].dep
token.head, token.dep = max(outer_heads.items())
# Adjust deps before shrinking tokens
# Tokens which point into the merged token should now point to it
# Subtract the offset from all tokens which point to >= end
offset = (end - start) - 1
for i in range(self.length):
head_idx = self.data[i].head
if start <= head_idx < end:
self.data[i].head = start
elif head_idx >= end:
self.data[i].head -= offset
# TODO: Fix left and right deps
# Now compress the token array
for i in range(end, self.length):
self.data[i - offset] = self.data[i]
for i in range(self.length - offset, self.length):
memset(&self.data[i], 0, sizeof(TokenC))
self.data[i].lex = &EMPTY_LEXEME
self.length -= offset
for i in range(self.length):
# ...And, set heads back to a relative position
self.data[i].head -= i
# Clear cached Python objects
self._py_tokens = [None] * self.length
# Return the merged Python object
return self[start]
def _has_trailing_space(self, int i):
cdef int end_idx = self.data[i].idx + self.data[i].lex.length
if end_idx >= len(self._string):
return False
else:
return self._string[end_idx] == u' '
def serialize(self, bits=None):
if bits is None:
bits = BitArray()
codec = self.vocab.codec
ids = numpy.zeros(shape=(len(self),), dtype=numpy.uint32)
cdef int i
for i in range(self.length):
ids[i] = self.data[i].lex.id
bits = codec.encode(ids, bits=bits)
for i in range(self.length):
bits.append(self._has_trailing_space(i))
return bits
@staticmethod
def deserialize(Vocab vocab, bits):
biterator = iter(bits)
ids = vocab.codec.decode(biterator)
spaces = []
for bit in biterator:
spaces.append(bit)
if len(spaces) == len(ids):
break
string = u''
cdef const LexemeC* lex
for id_, space in zip(ids, spaces):
lex = vocab.lexemes[id_]
string += vocab.strings[lex.orth]
if space:
string += u' '
cdef Doc doc = Doc(vocab, string)
cdef int idx = 0
for i, id_ in enumerate(ids):
doc.push_back(idx, vocab.lexemes[id_])
idx += vocab.lexemes[id_].length
if spaces[i]:
idx += 1
return doc
# Enhance backwards compatibility by aliasing Doc to Tokens, for now
Tokens = Doc
cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
via Doc.__getitem__ and Doc.__iter__.
"""
def __cinit__(self, Vocab vocab, unicode string):
self.vocab = vocab
self._string = string
def __dealloc__(self):
if self._owns_c_data:
# Cast through const, if we own the data
PyMem_Free(<void*>self.c)
def __len__(self):
return self.c.lex.length
def __unicode__(self):
return self.string
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
return check_flag(self.c.lex, flag_id)
cdef int take_ownership_of_c_data(self) except -1:
owned_data = <TokenC*>PyMem_Malloc(sizeof(TokenC) * self.array_len)
memcpy(owned_data, self.c, sizeof(TokenC) * self.array_len)
self.c = owned_data
self._owns_c_data = True
def nbor(self, int i=1):
return Token.cinit(self.vocab, self._string,
self.c, self.i, self.array_len,
self._seq)
property lex_id:
def __get__(self):
return self.c.lex.id
property string:
def __get__(self):
if (self.i+1) == self._seq.length:
return self._string[self.c.idx:]
cdef int next_idx = (self.c + 1).idx
if next_idx < self.c.idx:
next_idx = self.c.idx + self.c.lex.length
return self._string[self.c.idx:next_idx]
property prob:
def __get__(self):
return self.c.lex.prob
property idx:
def __get__(self):
return self.c.idx
property cluster:
def __get__(self):
return self.c.lex.cluster
property orth:
def __get__(self):
return self.c.lex.orth
property lower:
def __get__(self):
return self.c.lex.lower
property norm:
def __get__(self):
return self.c.lex.norm
property shape:
def __get__(self):
return self.c.lex.shape
property prefix:
def __get__(self):
return self.c.lex.prefix
property suffix:
def __get__(self):
return self.c.lex.suffix
property lemma:
def __get__(self):
return self.c.lemma
property pos:
def __get__(self):
return self.c.pos
property tag:
def __get__(self):
return self.c.tag
property dep:
def __get__(self):
return self.c.dep
property repvec:
def __get__(self):
cdef int length = self.vocab.repvec_length
repvec_view = <float[:length,]>self.c.lex.repvec
return numpy.asarray(repvec_view)
property n_lefts:
def __get__(self):
cdef int n = 0
cdef const TokenC* ptr = self.c - self.i
while ptr != self.c:
if ptr + ptr.head == self.c:
n += 1
ptr += 1
return n
property n_rights:
def __get__(self):
cdef int n = 0
cdef const TokenC* ptr = self.c + (self.array_len - self.i)
while ptr != self.c:
if ptr + ptr.head == self.c:
n += 1
ptr -= 1
return n
property lefts:
def __get__(self):
"""The leftward immediate children of the word, in the syntactic
dependency parse.
"""
cdef const TokenC* ptr = self.c - self.i
while ptr < self.c:
# If this head is still to the right of us, we can skip to it
# No token that's between this token and this head could be our
# child.
if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
ptr += ptr.head
elif ptr + ptr.head == self.c:
yield Token.cinit(self.vocab, self._string,
ptr, ptr - (self.c - self.i), self.array_len,
self._seq)
ptr += 1
else:
ptr += 1
property rights:
def __get__(self):
"""The rightward immediate children of the word, in the syntactic
dependency parse."""
cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
tokens = []
while ptr > self.c:
# If this head is still to the right of us, we can skip to it
# No token that's between this token and this head could be our
# child.
if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
ptr += ptr.head
elif ptr + ptr.head == self.c:
tokens.append(Token.cinit(self.vocab, self._string,
ptr, ptr - (self.c - self.i), self.array_len,
self._seq))
ptr -= 1
else:
ptr -= 1
tokens.reverse()
for t in tokens:
yield t
property children:
def __get__(self):
yield from self.lefts
yield from self.rights
property subtree:
def __get__(self):
for word in self.lefts:
yield from word.subtree
yield self
for word in self.rights:
yield from word.subtree
property left_edge:
def __get__(self):
return Token.cinit(self.vocab, self._string,
(self.c - self.i) + self.c.l_edge, self.c.l_edge,
self.array_len, self._seq)
property right_edge:
def __get__(self):
return Token.cinit(self.vocab, self._string,
(self.c - self.i) + self.c.r_edge, self.c.r_edge,
self.array_len, self._seq)
property head:
def __get__(self):
"""The token predicted by the parser to be the head of the current token."""
return Token.cinit(self.vocab, self._string,
self.c + self.c.head, self.i + self.c.head, self.array_len,
self._seq)
property conjuncts:
def __get__(self):
"""Get a list of conjoined words"""
cdef Token word
conjs = []
if self.c.pos != CONJ and self.c.pos != PUNCT:
seen_conj = False
for word in reversed(list(self.lefts)):
if word.c.pos == CONJ:
seen_conj = True
elif seen_conj and word.c.pos == self.c.pos:
conjs.append(word)
conjs.reverse()
conjs.append(self)
if seen_conj:
return conjs
elif self is not self.head and self in self.head.conjuncts:
return self.head.conjuncts
else:
return []
property ent_type:
def __get__(self):
return self.c.ent_type
property ent_iob:
def __get__(self):
return self.c.ent_iob
property ent_type_:
def __get__(self):
return self.vocab.strings[self.c.ent_type]
property ent_iob_:
def __get__(self):
iob_strings = ('', 'I', 'O', 'B')
return iob_strings[self.c.ent_iob]
property whitespace_:
def __get__(self):
return self.string[self.c.lex.length:]
property orth_:
def __get__(self):
return self.vocab.strings[self.c.lex.orth]
property lower_:
def __get__(self):
return self.vocab.strings[self.c.lex.lower]
property norm_:
def __get__(self):
return self.vocab.strings[self.c.lex.norm]
property shape_:
def __get__(self):
return self.vocab.strings[self.c.lex.shape]
property prefix_:
def __get__(self):
return self.vocab.strings[self.c.lex.prefix]
property suffix_:
def __get__(self):
return self.vocab.strings[self.c.lex.suffix]
property lemma_:
def __get__(self):
return self.vocab.strings[self.c.lemma]
property pos_:
def __get__(self):
return _pos_id_to_string[self.c.pos]
property tag_:
def __get__(self):
return self.vocab.strings[self.c.tag]
property dep_:
def __get__(self):
return self.vocab.strings[self.c.dep]
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
_parse_unset_error = """Text has not been parsed, so cannot be accessed.
Check that the parser data is installed. Run "python -m spacy.en.download" if not.
Check whether parse=False in the call to English.__call__
"""

5
spacy/tokens/__init__.py Normal file
View File

@ -0,0 +1,5 @@
from .doc import Doc
from .token import Token
from .spans import Span
__all__ = ['Doc', 'Token', 'Span']

35
spacy/tokens/doc.pxd Normal file
View File

@ -0,0 +1,35 @@
from cymem.cymem cimport Pool
cimport numpy as np
from preshed.counter cimport PreshCounter
from ..vocab cimport Vocab
from ..structs cimport TokenC, LexemeC
ctypedef const LexemeC* const_Lexeme_ptr
ctypedef TokenC* TokenC_ptr
ctypedef fused LexemeOrToken:
const_Lexeme_ptr
TokenC_ptr
cdef class Doc:
cdef Pool mem
cdef Vocab vocab
cdef TokenC* data
cdef public bint is_tagged
cdef public bint is_parsed
cdef public list _py_tokens
cdef int length
cdef int max_length
cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1
cpdef np.ndarray to_array(self, object features)
cdef int set_parse(self, const TokenC* parsed) except -1

399
spacy/tokens/doc.pyx Normal file
View File

@ -0,0 +1,399 @@
cimport cython
from libc.string cimport memcpy, memset
import numpy
import struct
from ..lexeme cimport EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech import UNIV_POS_NAMES
from ..parts_of_speech cimport CONJ, PUNCT
from ..lexeme cimport check_flag
from ..lexeme cimport get_attr as get_lex_attr
from .spans import Span
from .token cimport Token
from ..serialize.bits cimport BitArray
DEF PADDING = 5
cdef int bounds_check(int i, int length, int padding) except -1:
if (i + padding) < 0:
raise IndexError
if (i - padding) >= length:
raise IndexError
cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
if feat_name == LEMMA:
return token.lemma
elif feat_name == POS:
return token.pos
elif feat_name == TAG:
return token.tag
elif feat_name == DEP:
return token.dep
elif feat_name == HEAD:
return token.head
elif feat_name == SPACY:
return token.spacy
elif feat_name == ENT_IOB:
return token.ent_iob
elif feat_name == ENT_TYPE:
return token.ent_type
else:
return get_lex_attr(token.lex, feat_name)
cdef class Doc:
"""
Container class for annotated text. Constructed via English.__call__ or
Tokenizer.__call__.
"""
def __init__(self, Vocab vocab, orths_and_spaces=None):
self.vocab = vocab
size = 20
self.mem = Pool()
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
# However, we need to remember the true starting places, so that we can
# realloc.
data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
cdef int i
for i in range(size + (PADDING*2)):
data_start[i].lex = &EMPTY_LEXEME
self.data = data_start + PADDING
self.max_length = size
self.length = 0
self.is_tagged = False
self.is_parsed = False
self._py_tokens = []
def __getitem__(self, object i):
"""Get a token.
Returns:
token (Token):
"""
if isinstance(i, slice):
if i.step is not None:
raise ValueError("Stepped slices not supported in Span objects."
"Try: list(doc)[start:stop:step] instead.")
return Span(self, i.start, i.stop, label=0)
if i < 0:
i = self.length + i
bounds_check(i, self.length, PADDING)
if self._py_tokens[i] is not None:
return self._py_tokens[i]
else:
return Token.cinit(self.vocab, &self.data[i], i, self)
def __iter__(self):
"""Iterate over the tokens.
Yields:
token (Token):
"""
cdef int i
for i in range(self.length):
if self._py_tokens[i] is not None:
yield self._py_tokens[i]
else:
yield Token.cinit(self.vocab, &self.data[i], i, self)
def __len__(self):
return self.length
def __unicode__(self):
return u''.join([t.string for t in self])
@property
def string(self):
return unicode(self)
@property
def ents(self):
"""Yields named-entity Span objects.
Iterate over the span to get individual Token objects, or access the label:
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
>>> ents = list(tokens.ents)
>>> ents[0].label, ents[0].label_, ''.join(t.orth_ for t in ents[0])
(112504, u'PERSON', u'Best ')
"""
cdef int i
cdef const TokenC* token
cdef int start = -1
cdef int label = 0
for i in range(self.length):
token = &self.data[i]
if token.ent_iob == 1:
assert start != -1
pass
elif token.ent_iob == 2:
if start != -1:
yield Span(self, start, i, label=label)
start = -1
label = 0
elif token.ent_iob == 3:
if start != -1:
yield Span(self, start, i, label=label)
start = i
label = token.ent_type
if start != -1:
yield Span(self, start, self.length, label=label)
@property
def sents(self):
"""
Yield a list of sentence Span objects, calculated from the dependency parse.
"""
cdef int i
start = 0
for i in range(1, self.length):
if self.data[i].sent_start:
yield Span(self, start, i)
start = i
yield Span(self, start, self.length)
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
if self.length == self.max_length:
self._realloc(self.length * 2)
cdef TokenC* t = &self.data[self.length]
if LexemeOrToken is TokenC_ptr:
t[0] = lex_or_tok[0]
else:
t.lex = lex_or_tok
if self.length == 0:
t.idx = 0
else:
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
t.spacy = has_space
self.length += 1
self._py_tokens.append(None)
return t.idx + t.lex.length + t.spacy
@cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy ndarray
of shape N*M, where N is the length of the sentence.
Arguments:
attr_ids (list[int]): A list of attribute ID ints.
Returns:
feat_array (numpy.ndarray[long, ndim=2]):
A feature matrix, with one row per word, and one column per attribute
indicated in the input attr_ids.
"""
cdef int i, j
cdef attr_id_t feature
cdef np.ndarray[attr_t, ndim=2] output
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32)
for i in range(self.length):
for j, feature in enumerate(attr_ids):
output[i, j] = get_token_attr(&self.data[i], feature)
return output
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
"""Produce a dict of {attribute (int): count (ints)} frequencies, keyed
by the values of the given attribute ID.
>>> from spacy.en import English, attrs
>>> nlp = English()
>>> tokens = nlp(u'apple apple orange banana')
>>> tokens.count_by(attrs.ORTH)
{12800L: 1, 11880L: 2, 7561L: 1}
>>> tokens.to_array([attrs.ORTH])
array([[11880],
[11880],
[ 7561],
[12800]])
"""
cdef int i
cdef attr_t attr
cdef size_t count
if counts is None:
counts = PreshCounter(self.length)
output_dict = True
else:
output_dict = False
# Take this check out of the loop, for a bit of extra speed
if exclude is None:
for i in range(self.length):
attr = get_token_attr(&self.data[i], attr_id)
counts.inc(attr, 1)
else:
for i in range(self.length):
if not exclude(self[i]):
attr = get_token_attr(&self.data[i], attr_id)
counts.inc(attr, 1)
if output_dict:
return dict(counts)
def _realloc(self, new_size):
self.max_length = new_size
n = new_size + (PADDING * 2)
# What we're storing is a "padded" array. We've jumped forward PADDING
# places, and are storing the pointer to that. This way, we can access
# words out-of-bounds, and get out-of-bounds markers.
# Now that we want to realloc, we need the address of the true start,
# so we jump the pointer back PADDING places.
cdef TokenC* data_start = self.data - PADDING
data_start = <TokenC*>self.mem.realloc(data_start, n * sizeof(TokenC))
self.data = data_start + PADDING
cdef int i
for i in range(self.length, self.max_length + PADDING):
self.data[i].lex = &EMPTY_LEXEME
cdef int set_parse(self, const TokenC* parsed) except -1:
# TODO: This method is fairly misleading atm. It's used by Parser
# to actually apply the parse calculated. Need to rethink this.
# Probably we should use from_array?
self.is_parsed = True
for i in range(self.length):
self.data[i] = parsed[i]
def from_array(self, attrs, array):
cdef int i, col
cdef attr_id_t attr_id
cdef TokenC* tokens = self.data
cdef int length = len(array)
for col, attr_id in enumerate(attrs):
values = array[:, col]
if attr_id == HEAD:
# TODO: Set left and right children
for i in range(length):
tokens[i].head = values[i]
elif attr_id == TAG:
for i in range(length):
tokens[i].tag = values[i]
elif attr_id == DEP:
for i in range(length):
tokens[i].dep = values[i]
elif attr_id == ENT_IOB:
for i in range(length):
tokens[i].ent_iob = values[i]
elif attr_id == ENT_TYPE:
for i in range(length):
tokens[i].ent_type = values[i]
return self
def to_bytes(self):
byte_string = self.vocab.serializer.pack(self)
return struct.pack('I', len(byte_string)) + byte_string
def from_bytes(self, bytes data):
self.vocab.serializer.unpack_into(data[4:], self)
return self
@staticmethod
def read_bytes(file_):
keep_reading = True
while keep_reading:
try:
n_bytes_str = file_.read(4)
if len(n_bytes_str) < 4:
break
n_bytes = struct.unpack('I', n_bytes_str)[0]
data = file_.read(n_bytes)
except StopIteration:
keep_reading = False
yield n_bytes_str + data
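Editor's note (not part of the diff): a usage sketch of the to_bytes / read_bytes / from_bytes API defined above. to_bytes prefixes the packed body with a 4-byte length, so several documents can be concatenated in one file and streamed back with Doc.read_bytes. Assumes an installed English model and a writable 'docs.bin' path, both illustrative.
from spacy.en import English
from spacy.tokens import Doc

nlp = English()
texts = [u'Hello world.', u'Another document.']

with open('docs.bin', 'wb') as file_:
    for text in texts:
        doc = nlp(text)
        file_.write(doc.to_bytes())   # 4-byte length prefix + packed body

with open('docs.bin', 'rb') as file_:
    for byte_string in Doc.read_bytes(file_):
        doc = Doc(nlp.vocab).from_bytes(byte_string)
        print(doc.string)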
# This function is terrible --- need to fix this.
def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
unicode ent_type):
"""Merge a multi-word expression into a single token. Currently
experimental; API is likely to change."""
cdef int i
cdef int start = -1
cdef int end = -1
for i in range(self.length):
if self.data[i].idx == start_idx:
start = i
if (self.data[i].idx + self.data[i].lex.length) == end_idx:
if start == -1:
return None
end = i + 1
break
else:
return None
cdef unicode string = self.string
# Get LexemeC for newly merged token
new_orth = string[start_idx:end_idx]
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
# House the new merged token where it starts
cdef TokenC* token = &self.data[start]
# Update fields
token.lex = lex
# What to do about morphology??
# TODO: token.morph = ???
token.tag = self.vocab.strings[tag]
token.lemma = self.vocab.strings[lemma]
if ent_type == 'O':
token.ent_iob = 2
token.ent_type = 0
else:
token.ent_iob = 3
token.ent_type = self.vocab.strings[ent_type]
# Fix dependencies
# Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets
for i in range(self.length):
self.data[i].head += i
# Find the head of the merged token, and its dep relation
outer_heads = {}
for i in range(start, end):
head_idx = self.data[i].head
if head_idx == i or head_idx < start or head_idx >= end:
# Don't consider "heads" which are actually dominated by a word
# in the region we're merging
gp = head_idx
while self.data[gp].head != gp:
if start <= gp < end:
break
gp = self.data[gp].head
else:
# If we have multiple words attaching to the same head,
# but with different dep labels, we're preferring the last
# occurring dep label. Shrug. What else could we do, I guess?
outer_heads[head_idx] = self.data[i].dep
token.head, token.dep = max(outer_heads.items())
# Adjust deps before shrinking tokens
# Tokens which point into the merged token should now point to it
# Subtract the offset from all tokens which point to >= end
offset = (end - start) - 1
for i in range(self.length):
head_idx = self.data[i].head
if start <= head_idx < end:
self.data[i].head = start
elif head_idx >= end:
self.data[i].head -= offset
# TODO: Fix left and right deps
# Now compress the token array
for i in range(end, self.length):
self.data[i - offset] = self.data[i]
for i in range(self.length - offset, self.length):
memset(&self.data[i], 0, sizeof(TokenC))
self.data[i].lex = &EMPTY_LEXEME
self.length -= offset
for i in range(self.length):
# ...And, set heads back to a relative position
self.data[i].head -= i
# Return the merged Python object
return self[start]
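Editor's note (not part of the diff): a hypothetical example of calling the experimental merge method above. It takes character offsets into the document text (the idx of the first token through the end of the last token), plus the tag, lemma and entity-type strings to assign to the merged token; the u'NNP'/u'GPE' labels here are illustrative and assume the parser and entity data are installed.
from spacy.en import English

nlp = English()
doc = nlp(u'Mr. Best flew to New York on Saturday morning.')

start = doc.string.index(u'New York')
end = start + len(u'New York')
merged = doc.merge(start, end, u'NNP', u'New York', u'GPE')
if merged is not None:
    print(merged.orth_)   # u'New York'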

9
spacy/tokens/spans.pxd Normal file
View File

@ -0,0 +1,9 @@
from .doc cimport Doc
cdef class Span:
cdef readonly Doc _seq
cdef public int i
cdef public int start
cdef public int end
cdef readonly int label

View File

@ -1,6 +1,11 @@
from __future__ import unicode_literals
from collections import defaultdict
from ..structs cimport Morphology, TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t
from ..attrs cimport attr_id_t
from ..parts_of_speech cimport univ_pos_t
cdef class Span:
"""A slice from a Doc object."""

25
spacy/tokens/token.pxd Normal file
View File

@ -0,0 +1,25 @@
from ..vocab cimport Vocab
from ..structs cimport TokenC
from ..attrs cimport attr_id_t
from .doc cimport Doc
cdef class Token:
cdef Vocab vocab
cdef const TokenC* c
cdef readonly int i
cdef int array_len
cdef readonly Doc doc
@staticmethod
cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, Doc doc):
if offset < 0 or offset >= doc.length:
msg = "Attempt to access token at %d, max length %d"
raise IndexError(msg % (offset, doc.length))
if doc._py_tokens[offset] is not None:
return doc._py_tokens[offset]
cdef Token self = Token.__new__(Token, vocab, doc, offset)
doc._py_tokens[offset] = self
return self
cpdef bint check_flag(self, attr_id_t flag_id) except -1

282
spacy/tokens/token.pyx Normal file
View File

@ -0,0 +1,282 @@
from libc.string cimport memcpy
from cpython.mem cimport PyMem_Malloc, PyMem_Free
from ..lexeme cimport check_flag
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
cimport numpy as np
np.import_array()
import numpy
from ..parts_of_speech import UNIV_POS_NAMES
from ..attrs cimport LEMMA
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP
from ..parts_of_speech cimport CONJ, PUNCT
cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
via Doc.__getitem__ and Doc.__iter__.
"""
def __cinit__(self, Vocab vocab, Doc doc, int offset):
self.vocab = vocab
self.doc = doc
self.c = &self.doc.data[offset]
self.i = offset
self.array_len = doc.length
def __len__(self):
return self.c.lex.length
def __unicode__(self):
return self.string
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
return check_flag(self.c.lex, flag_id)
def nbor(self, int i=1):
return self.doc[self.i+i]
property lex_id:
def __get__(self):
return self.c.lex.id
property string:
def __get__(self):
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
if self.c.spacy:
return orth + u' '
else:
return orth
property prob:
def __get__(self):
return self.c.lex.prob
property idx:
def __get__(self):
return self.c.idx
property cluster:
def __get__(self):
return self.c.lex.cluster
property orth:
def __get__(self):
return self.c.lex.orth
property lower:
def __get__(self):
return self.c.lex.lower
property norm:
def __get__(self):
return self.c.lex.norm
property shape:
def __get__(self):
return self.c.lex.shape
property prefix:
def __get__(self):
return self.c.lex.prefix
property suffix:
def __get__(self):
return self.c.lex.suffix
property lemma:
def __get__(self):
return self.c.lemma
property pos:
def __get__(self):
return self.c.pos
property tag:
def __get__(self):
return self.c.tag
property dep:
def __get__(self):
return self.c.dep
property repvec:
def __get__(self):
cdef int length = self.vocab.repvec_length
repvec_view = <float[:length,]>self.c.lex.repvec
return numpy.asarray(repvec_view)
property n_lefts:
def __get__(self):
cdef int n = 0
cdef const TokenC* ptr = self.c - self.i
while ptr != self.c:
if ptr + ptr.head == self.c:
n += 1
ptr += 1
return n
property n_rights:
def __get__(self):
cdef int n = 0
cdef const TokenC* ptr = self.c + (self.array_len - self.i)
while ptr != self.c:
if ptr + ptr.head == self.c:
n += 1
ptr -= 1
return n
property lefts:
def __get__(self):
"""The leftward immediate children of the word, in the syntactic
dependency parse.
"""
cdef const TokenC* ptr = self.c - self.i
while ptr < self.c:
# If this head is still to the right of us, we can skip to it
# No token that's between this token and this head could be our
# child.
if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
ptr += ptr.head
elif ptr + ptr.head == self.c:
yield self.doc[ptr - (self.c - self.i)]
ptr += 1
else:
ptr += 1
property rights:
def __get__(self):
"""The rightward immediate children of the word, in the syntactic
dependency parse."""
cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
tokens = []
while ptr > self.c:
# If this head is still to the right of us, we can skip to it
# No token that's between this token and this head could be our
# child.
if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
ptr += ptr.head
elif ptr + ptr.head == self.c:
tokens.append(self.doc[ptr - (self.c - self.i)])
ptr -= 1
else:
ptr -= 1
tokens.reverse()
for t in tokens:
yield t
property children:
def __get__(self):
yield from self.lefts
yield from self.rights
property subtree:
def __get__(self):
for word in self.lefts:
yield from word.subtree
yield self
for word in self.rights:
yield from word.subtree
property left_edge:
def __get__(self):
return self.doc[self.c.l_edge]
property right_edge:
def __get__(self):
return self.doc[self.c.r_edge]
property head:
def __get__(self):
"""The token predicted by the parser to be the head of the current token."""
return self.doc[self.i + self.c.head]
property conjuncts:
def __get__(self):
"""Get a list of conjoined words"""
cdef Token word
conjs = []
if self.c.pos != CONJ and self.c.pos != PUNCT:
seen_conj = False
for word in reversed(list(self.lefts)):
if word.c.pos == CONJ:
seen_conj = True
elif seen_conj and word.c.pos == self.c.pos:
conjs.append(word)
conjs.reverse()
conjs.append(self)
if seen_conj:
return conjs
elif self is not self.head and self in self.head.conjuncts:
return self.head.conjuncts
else:
return []
property ent_type:
def __get__(self):
return self.c.ent_type
property ent_iob:
def __get__(self):
return self.c.ent_iob
property ent_type_:
def __get__(self):
return self.vocab.strings[self.c.ent_type]
property ent_iob_:
def __get__(self):
iob_strings = ('', 'I', 'O', 'B')
return iob_strings[self.c.ent_iob]
property whitespace_:
def __get__(self):
return self.string[self.c.lex.length:]
property orth_:
def __get__(self):
return self.vocab.strings[self.c.lex.orth]
property lower_:
def __get__(self):
return self.vocab.strings[self.c.lex.lower]
property norm_:
def __get__(self):
return self.vocab.strings[self.c.lex.norm]
property shape_:
def __get__(self):
return self.vocab.strings[self.c.lex.shape]
property prefix_:
def __get__(self):
return self.vocab.strings[self.c.lex.prefix]
property suffix_:
def __get__(self):
return self.vocab.strings[self.c.lex.suffix]
property lemma_:
def __get__(self):
return self.vocab.strings[self.c.lemma]
property pos_:
def __get__(self):
return _pos_id_to_string[self.c.pos]
property tag_:
def __get__(self):
return self.vocab.strings[self.c.tag]
property dep_:
def __get__(self):
return self.vocab.strings[self.c.dep]
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
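Editor's note (not part of the diff): a short illustrative walk over the dependency properties defined above (head, lefts, rights, subtree); assumes the parser data is installed and the sentence is only an example.
from spacy.en import English

nlp = English()
doc = nlp(u'Mr. Best flew to New York on Saturday morning.')

for token in doc:
    print('%s -> %s (%s)' % (token.orth_, token.head.orth_, token.dep_))

flew = [t for t in doc if t.orth_ == u'flew'][0]
print([t.orth_ for t in flew.lefts])    # leftward children of 'flew'
print([t.orth_ for t in flew.rights])   # rightward children of 'flew'
print([t.orth_ for t in flew.subtree])  # whole subtree, in document order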

View File

@ -1,96 +1,10 @@
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t
from libc.stdint cimport uint8_t
# Reserve 64 values for flag features
cpdef enum attr_id_t:
FLAG0
FLAG1
FLAG2
FLAG3
FLAG4
FLAG5
FLAG6
FLAG7
FLAG8
FLAG9
FLAG10
FLAG11
FLAG12
FLAG13
FLAG14
FLAG15
FLAG16
FLAG17
FLAG18
FLAG19
FLAG20
FLAG21
FLAG22
FLAG23
FLAG24
FLAG25
FLAG26
FLAG27
FLAG28
FLAG29
FLAG30
FLAG31
FLAG32
FLAG33
FLAG34
FLAG35
FLAG36
FLAG37
FLAG38
FLAG39
FLAG40
FLAG41
FLAG42
FLAG43
FLAG44
FLAG45
FLAG46
FLAG47
FLAG48
FLAG49
FLAG50
FLAG51
FLAG52
FLAG53
FLAG54
FLAG55
FLAG56
FLAG57
FLAG58
FLAG59
FLAG60
FLAG61
FLAG62
FLAG63
ID
ORTH
LOWER
NORM
SHAPE
PREFIX
SUFFIX
LENGTH
CLUSTER
LEMMA
POS
TAG
DEP
ENT
ctypedef uint64_t hash_t
ctypedef char* utf8_t
ctypedef uint32_t attr_t
ctypedef int32_t attr_t
ctypedef uint64_t flags_t
ctypedef uint32_t id_t
ctypedef uint16_t len_t
ctypedef uint16_t tag_t

View File

@ -2,6 +2,7 @@ from os import path
import codecs
import json
import re
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
@ -64,7 +65,7 @@ def read_tokenization(lang):
return entries
def read_detoken_rules(lang):
def read_detoken_rules(lang): # Deprecated?
loc = path.join(DATA_DIR, lang, 'detokenize')
entries = []
with utf8open(loc) as file_:
@ -73,7 +74,7 @@ def read_detoken_rules(lang):
return entries
def align_tokens(ref, indices):
def align_tokens(ref, indices): # Deprecated, surely?
start = 0
queue = list(indices)
for token in ref:
@ -86,7 +87,7 @@ def align_tokens(ref, indices):
assert not queue
def detokenize(token_rules, words):
def detokenize(token_rules, words): # Deprecated?
"""To align with treebanks, return a list of "chunks", where a chunk is a
sequence of tokens that are separated by whitespace in actual strings. Each
chunk should be a tuple of token indices, e.g.

View File

@ -4,8 +4,8 @@ from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from .structs cimport LexemeC, TokenC, UniStr
from .typedefs cimport utf8_t, id_t, hash_t
from .structs cimport LexemeC, TokenC
from .typedefs cimport utf8_t, attr_t, hash_t
from .strings cimport StringStore
@ -27,13 +27,16 @@ cdef class Vocab:
cpdef public lexeme_props_getter
cdef Pool mem
cpdef readonly StringStore strings
cdef vector[const LexemeC*] lexemes
cdef readonly object pos_tags
cdef readonly int length
cdef public object _serializer
cdef public object data_dir
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef PreshMap _map
cdef PreshMap _by_hash
cdef PreshMap _by_orth
cdef readonly int repvec_length
cdef public object _codec
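
The replacement of the single _map with _by_hash and _by_orth is the "by-orth mapping of Lexemes" mentioned in the commit message: the same LexemeC entries become reachable both by the hash of their string and by their orth ID. A rough pure-Python sketch of that two-index design (names hypothetical, not the Cython implementation):

# --- illustrative sketch, not part of the commit ---
class ToyVocab(object):
    def __init__(self):
        self._by_hash = {}   # hash of the string -> lexeme record
        self._by_orth = {}   # orth (string id)   -> the same lexeme record

    def add(self, string, orth):
        lexeme = {'orth': orth, 'string': string}
        self._by_hash[hash(string)] = lexeme
        self._by_orth[orth] = lexeme

    def by_string(self, string):
        return self._by_hash.get(hash(string))

    def by_orth(self, orth):
        return self._by_orth.get(orth)

toy = ToyVocab()
toy.add(u'dog', 42)
assert toy.by_string(u'dog') is toy.by_orth(42)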


@ -1,23 +1,24 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memset
from libc.stdint cimport int32_t
from libc.math cimport exp as c_exp
import bz2
from os import path
import codecs
import math
import json
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport set_lex_struct_props
from .lexeme cimport Lexeme
from .strings cimport slice_unicode
from .strings cimport hash_string
from .orth cimport word_shape
from .typedefs cimport attr_t
from .serialize cimport HuffmanCodec
from .cfile cimport CFile
from cymem.cymem cimport Address
from . import util
from .serialize.packer cimport Packer
DEF MAX_VEC_SIZE = 100000
@ -35,12 +36,15 @@ cdef class Vocab:
def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True,
pos_tags=None):
self.mem = Pool()
self._map = PreshMap(2 ** 20)
self._by_hash = PreshMap()
self._by_orth = PreshMap()
self.strings = StringStore()
self.pos_tags = pos_tags if pos_tags is not None else {}
self.lexemes.push_back(&EMPTY_LEXEME)
self.lexeme_props_getter = get_lex_props
self.repvec_length = 0
self.length = 0
self._add_lex_to_vocab(0, &EMPTY_LEXEME)
if data_dir is not None:
if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
@ -51,38 +55,77 @@ cdef class Vocab:
path.join(data_dir, 'lexemes.bin'))
if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
self._codec = None
self._serializer = None
self.data_dir = data_dir
property serializer:
def __get__(self):
if self._serializer is None:
freqs = []
if self.data_dir is not None:
freqs_loc = path.join(self.data_dir, 'serializer.json')
if path.exists(freqs_loc):
freqs = json.load(open(freqs_loc))
self._serializer = Packer(self, freqs)
return self._serializer
def __len__(self):
"""The current number of lexemes stored."""
return self.lexemes.size()
return self.length
cdef const LexemeC* get(self, Pool mem, UniStr* c_str) except NULL:
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
cdef LexemeC* lex
lex = <LexemeC*>self._map.get(c_str.key)
cdef hash_t key = hash_string(string)
lex = <LexemeC*>self._by_hash.get(key)
if lex != NULL:
return lex
if c_str.n < 3:
cdef bint is_oov = mem is not self.mem
if len(string) < 3:
mem = self.mem
cdef unicode py_str = c_str.chars[:c_str.n]
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(py_str)
props = self.lexeme_props_getter(string)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
if mem is self.mem:
lex.id = self.lexemes.size()
self._add_lex_to_vocab(c_str.key, lex)
if is_oov:
lex.id = 0
else:
lex.id = 1
self._add_lex_to_vocab(key, lex)
return lex
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
cdef LexemeC* lex
lex = <LexemeC*>self._by_orth.get(orth)
if lex != NULL:
return lex
cdef unicode string = self.strings[orth]
cdef bint is_oov = mem is not self.mem
if len(string) < 3:
mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(string)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
if is_oov:
lex.id = 0
else:
self._add_lex_to_vocab(hash_string(string), lex)
return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
self._map.set(key, <void*>lex)
while self.lexemes.size() < (lex.id + 1):
self.lexemes.push_back(&EMPTY_LEXEME)
self.lexemes[lex.id] = lex
self._by_hash.set(key, <void*>lex)
self._by_orth.set(lex.orth, <void*>lex)
self.length += 1
def __iter__(self):
cdef attr_t orth
cdef size_t addr
for orth, addr in self._by_orth.items():
yield Lexeme.from_ptr(<LexemeC*>addr, self.strings, self.repvec_length)
def __getitem__(self, id_or_string):
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
@ -99,51 +142,46 @@ cdef class Vocab:
An instance of the Lexeme Python class, with data copied on
instantiation.
'''
cdef UniStr c_str
cdef const LexemeC* lexeme
cdef attr_t orth
if type(id_or_string) == int:
if id_or_string >= self.lexemes.size():
raise IndexError
lexeme = self.lexemes.at(id_or_string)
orth = id_or_string
lexeme = <LexemeC*>self._by_orth.get(orth)
if lexeme == NULL:
raise KeyError(id_or_string)
assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth))
elif type(id_or_string) == unicode:
slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
lexeme = self.get(self.mem, &c_str)
lexeme = self.get(self.mem, id_or_string)
assert lexeme.orth == self.strings[id_or_string]
else:
raise ValueError("Vocab unable to map type: "
"%s. Maps unicode --> Lexeme or "
"int --> Lexeme" % str(type(id_or_string)))
return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)
def __setitem__(self, unicode py_str, dict props):
cdef UniStr c_str
slice_unicode(&c_str, py_str, 0, len(py_str))
def __setitem__(self, unicode string, dict props):
cdef hash_t key = hash_string(string)
cdef LexemeC* lex
lex = <LexemeC*>self._map.get(c_str.key)
lex = <LexemeC*>self._by_hash.get(key)
if lex == NULL:
lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
lex.id = self.lexemes.size()
self._add_lex_to_vocab(c_str.key, lex)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
self._add_lex_to_vocab(key, lex)
def dump(self, loc):
if path.exists(loc):
assert not path.isdir(loc)
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
assert fp != NULL
cdef CFile fp = CFile(bytes_loc, 'wb')
cdef size_t st
cdef size_t addr
cdef hash_t key
for i in range(self._map.length):
key = self._map.c_map.cells[i].key
if key == 0:
continue
lexeme = <LexemeC*>self._map.c_map.cells[i].value
st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp)
assert st == 1
st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
assert st == 1
st = fclose(fp)
assert st == 0
for key, addr in self._by_hash.items():
lexeme = <LexemeC*>addr
fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
fp.write_from(lexeme, sizeof(LexemeC), 1)
fp.close()
def load_lexemes(self, strings_loc, loc):
self.strings.load(strings_loc)
@ -174,40 +212,37 @@ cdef class Vocab:
raise IOError('Error reading from lexemes.bin. Integrity check fails.')
py_str = self.strings[orth]
key = hash_string(py_str)
self._map.set(key, lexeme)
while self.lexemes.size() < (lexeme.id + 1):
self.lexemes.push_back(&EMPTY_LEXEME)
self.lexemes[lexeme.id] = lexeme
self._by_hash.set(key, lexeme)
self._by_orth.set(lexeme.orth, lexeme)
self.length += 1
i += 1
fclose(fp)
def load_rep_vectors(self, loc):
file_ = _CFile(loc, b'rb')
cdef CFile file_ = CFile(loc, b'rb')
cdef int32_t word_len
cdef int32_t vec_len
cdef int32_t prev_vec_len = 0
cdef float* vec
cdef Address mem
cdef id_t string_id
cdef attr_t string_id
cdef bytes py_word
cdef vector[float*] vectors
cdef int i
cdef Pool tmp_mem = Pool()
while True:
try:
file_.read(&word_len, sizeof(word_len), 1)
file_.read_into(&word_len, sizeof(word_len), 1)
except IOError:
break
file_.read(&vec_len, sizeof(vec_len), 1)
file_.read_into(&vec_len, sizeof(vec_len), 1)
if prev_vec_len != 0 and vec_len != prev_vec_len:
raise VectorReadError.mismatched_sizes(loc, vec_len, prev_vec_len)
if vec_len <= 0 or vec_len >= MAX_VEC_SIZE:
raise VectorReadError.bad_size(loc, vec_len)
mem = Address(word_len, sizeof(char))
chars = <char*>mem.ptr
vec = <float*>self.mem.alloc(vec_len, sizeof(float))
file_.read(chars, sizeof(char), word_len)
file_.read(vec, sizeof(float), vec_len)
chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
string_id = self.strings[chars[:word_len]]
while string_id >= vectors.size():
@ -215,9 +250,9 @@ cdef class Vocab:
assert vec != NULL
vectors[string_id] = vec
cdef LexemeC* lex
for i in range(self.lexemes.size()):
# Cast away the const, cos we can modify our lexemes
lex = <LexemeC*>self.lexemes[i]
cdef size_t lex_addr
for orth, lex_addr in self._by_orth.items():
lex = <LexemeC*>lex_addr
if lex.lower < vectors.size():
lex.repvec = vectors[lex.lower]
for i in range(vec_len):
@ -227,25 +262,9 @@ cdef class Vocab:
lex.repvec = EMPTY_VEC
return vec_len
property codec:
def __get__(self):
cdef Address mem
cdef int i
cdef float[:] cv_probs
if self._codec is not None:
return self._codec
else:
mem = Address(len(self), sizeof(float))
probs = <float*>mem.ptr
for i in range(len(self)):
probs[i] = <float>c_exp(self.lexemes[i].prob)
cv_probs = <float[:len(self)]>probs
self._codec = HuffmanCodec(cv_probs, 0)
return self._codec
def write_binary_vectors(in_loc, out_loc):
cdef _CFile out_file = _CFile(out_loc, 'wb')
cdef CFile out_file = CFile(out_loc, 'wb')
cdef Address mem
cdef int32_t word_len
cdef int32_t vec_len
@ -262,42 +281,12 @@ def write_binary_vectors(in_loc, out_loc):
word_len = len(word)
vec_len = len(pieces)
out_file.write(sizeof(word_len), 1, &word_len)
out_file.write(sizeof(vec_len), 1, &vec_len)
out_file.write_from(&word_len, 1, sizeof(word_len))
out_file.write_from(&vec_len, 1, sizeof(vec_len))
chars = <char*>word
out_file.write(sizeof(char), len(word), chars)
out_file.write(sizeof(float), vec_len, vec)
cdef class _CFile:
cdef FILE* fp
def __init__(self, loc, bytes mode):
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self.fp = fopen(<char*>bytes_loc, mode)
if self.fp == NULL:
raise IOError
def __dealloc__(self):
fclose(self.fp)
def close(self):
fclose(self.fp)
cdef int read(self, void* dest, size_t elem_size, size_t n) except -1:
st = fread(dest, elem_size, n, self.fp)
if st != n:
raise IOError
cdef int write(self, size_t elem_size, size_t n, void* data) except -1:
st = fwrite(data, elem_size, n, self.fp)
if st != n:
raise IOError
cdef int write_unicode(self, unicode value):
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)
out_file.write_from(chars, len(word), sizeof(char))
out_file.write_from(vec, vec_len, sizeof(float))
class VectorReadError(Exception):
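
For reference, load_rep_vectors and write_binary_vectors above agree on a simple per-entry layout in vec.bin: a 32-bit word length, a 32-bit vector length, the raw word bytes, then vec_len 32-bit floats. A stand-alone reader sketch for that layout, assuming native byte order (the real code goes through CFile, not Python's struct):

# --- illustrative sketch, not part of the commit ---
# Per-entry layout: int32 word_len | int32 vec_len | word_len bytes | vec_len float32s
import struct

def iter_binary_vectors(loc):
    with open(loc, 'rb') as f:
        while True:
            header = f.read(8)
            if len(header) < 8:
                break
            word_len, vec_len = struct.unpack('ii', header)
            word = f.read(word_len)
            vec = struct.unpack('%df' % vec_len, f.read(vec_len * 4))
            yield word, vec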


@ -7,3 +7,19 @@ import os
def EN():
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
return English(data_dir=data_dir)
def pytest_addoption(parser):
parser.addoption("--models", action="store_true",
help="include tests that require full models")
parser.addoption("--vectors", action="store_true",
help="include word vectors tests")
parser.addoption("--slow", action="store_true",
help="include slow tests")
def pytest_runtest_setup(item):
for opt in ['models', 'vectors', 'slow']:
if opt in item.keywords and not item.config.getoption("--%s" % opt):
pytest.skip("need --%s option to run" % opt)
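
The two hooks above make the models, vectors and slow markers opt-in: any test carrying one of those markers is skipped unless the matching flag is passed on the command line. Usage looks roughly like this (test body hypothetical):

# --- illustrative sketch, not part of the commit ---
# Skipped by default; included when run as: py.test --models
import pytest

@pytest.mark.models
def test_needs_trained_models(EN):
    doc = EN(u'A sentence that needs the trained models.')
    assert len(doc) > 0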


@ -1,4 +1,6 @@
import pytest
@pytest.mark.models
def test_simple_types(EN):
tokens = EN(u'Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents)


@ -0,0 +1,75 @@
from __future__ import unicode_literals
import pytest
import numpy
from spacy.vocab import Vocab
from spacy.serialize.packer import _BinaryCodec
from spacy.serialize.huffman import HuffmanCodec
from spacy.serialize.bits import BitArray
def test_binary():
codec = _BinaryCodec()
bits = BitArray()
msg = numpy.array([0, 1, 0, 1, 1], numpy.int32)
codec.encode(msg, bits)
result = numpy.array([0, 0, 0, 0, 0], numpy.int32)
bits.seek(0)
codec.decode(bits, result)
assert list(msg) == list(result)
def test_attribute():
freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5, 'over': 8,
'lazy': 1, 'dog': 2, '.': 9}
int_map = {'the': 0, 'quick': 1, 'brown': 2, 'fox': 3, 'jumped': 4, 'over': 5,
'lazy': 6, 'dog': 7, '.': 8}
codec = HuffmanCodec([(int_map[string], freq) for string, freq in freqs.items()])
bits = BitArray()
msg = numpy.array([1, 7], dtype=numpy.int32)
msg_list = list(msg)
codec.encode(msg, bits)
result = numpy.array([0, 0], dtype=numpy.int32)
bits.seek(0)
codec.decode(bits, result)
assert msg_list == list(result)
def test_vocab_codec():
def get_lex_props(string, prob):
return {
'flags': 0,
'length': len(string),
'orth': string,
'lower': string,
'norm': string,
'shape': string,
'prefix': string[0],
'suffix': string[-3:],
'cluster': 0,
'prob': prob,
'sentiment': 0
}
vocab = Vocab()
vocab['dog'] = get_lex_props('dog', 0.001)
vocab['the'] = get_lex_props('the', 0.05)
vocab['jumped'] = get_lex_props('jumped', 0.005)
codec = HuffmanCodec([(lex.orth, lex.prob) for lex in vocab])
bits = BitArray()
ids = [vocab[s].orth for s in ('the', 'dog', 'jumped')]
msg = numpy.array(ids, dtype=numpy.int32)
msg_list = list(msg)
codec.encode(msg, bits)
result = numpy.array(range(len(msg)), dtype=numpy.int32)
bits.seek(0)
codec.decode(bits, result)
assert msg_list == list(result)


@ -3,33 +3,15 @@ from __future__ import division
import pytest
from spacy.serialize import HuffmanCodec
from spacy.serialize.huffman import HuffmanCodec
from spacy.serialize.bits import BitArray
import numpy
import math
from heapq import heappush, heappop, heapify
from collections import defaultdict
class Vocab(object):
def __init__(self, freqs):
freqs['-eol-'] = 5
total = sum(freqs.values())
by_freq = freqs.items()
by_freq.sort(key=lambda item: item[1], reverse=True)
self.symbols = [sym for sym, freq in by_freq]
self.probs = numpy.array([item[1] / total for item in by_freq], dtype=numpy.float32)
self.table = {sym: i for i, sym in enumerate(self.symbols)}
self.codec = HuffmanCodec(self.probs, self.table['-eol-'])
def pack(self, message):
seq = [self.table[sym] for sym in message]
return self.codec.encode(numpy.array(seq, dtype=numpy.uint32))
def unpack(self, packed):
ids = self.codec.decode(packed)
return [self.symbols[i] for i in ids]
def py_encode(symb2freq):
"""Huffman encode the given dict mapping symbols to weights
From Rosetta Code
@ -60,7 +42,7 @@ def test1():
probs[8] = 0.0001
probs[9] = 0.000001
codec = HuffmanCodec(probs, 9)
codec = HuffmanCodec(list(enumerate(probs)))
py_codes = py_encode(dict(enumerate(probs)))
py_codes = py_codes.items()
@ -71,19 +53,21 @@ def test1():
def test_round_trip():
freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5, 'over': 8,
'lazy': 1, 'dog': 2, '.': 9}
vocab = Vocab(freqs)
codec = HuffmanCodec(freqs.items())
message = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the',
'the', 'lazy', 'dog', '.']
strings = list(vocab.codec.strings)
codes = {vocab.symbols[i]: strings[i] for i in range(len(vocab.symbols))}
packed = vocab.pack(message)
string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in packed.as_bytes())
strings = list(codec.strings)
codes = {codec.leaves[i]: strings[i] for i in range(len(codec.leaves))}
bits = codec.encode(message)
string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in bits.as_bytes())
for word in message:
code = codes[word]
assert string[:len(code)] == code
string = string[len(code):]
unpacked = vocab.unpack(packed)
unpacked = [0] * len(message)
bits.seek(0)
codec.decode(bits, unpacked)
assert message == unpacked
@ -92,34 +76,37 @@ def test_rosetta():
symb2freq = defaultdict(int)
for ch in txt:
symb2freq[ch] += 1
symb2freq['-eol-'] = 1
by_freq = symb2freq.items()
by_freq.sort(reverse=True, key=lambda item: item[1])
symbols = [sym for sym, prob in by_freq]
probs = numpy.array([prob for sym, prob in by_freq], dtype=numpy.float32)
codec = HuffmanCodec(probs, symbols.index('-eol-'))
codec = HuffmanCodec(symb2freq.items())
py_codec = py_encode(symb2freq)
codes = {codec.leaves[i]: codec.strings[i] for i in range(len(codec.leaves))}
my_lengths = defaultdict(int)
py_lengths = defaultdict(int)
for i, my in enumerate(codec.strings):
symb = by_freq[i][0]
my_lengths[len(my)] += by_freq[i][1]
py_lengths[len(py_codec[symb])] += by_freq[i][1]
for symb, freq in symb2freq.items():
my = codes[symb]
my_lengths[len(my)] += freq
py_lengths[len(py_codec[symb])] += freq
my_exp_len = sum(length * weight for length, weight in my_lengths.items())
py_exp_len = sum(length * weight for length, weight in py_lengths.items())
assert my_exp_len == py_exp_len
@pytest.mark.slow
def test_vocab(EN):
codec = EN.vocab.codec
codec = HuffmanCodec([(w.orth, numpy.exp(w.prob)) for w in EN.vocab])
expected_length = 0
for i, code in enumerate(codec.strings):
expected_length += len(code) * numpy.exp(EN.vocab[i].prob)
leaf = codec.leaves[i]
expected_length += len(code) * numpy.exp(EN.vocab[leaf].prob)
assert 8 < expected_length < 15
@pytest.mark.slow
def test_freqs():
freqs = []
words = []
@ -129,11 +116,10 @@ def test_freqs():
continue
freq, word = pieces
freqs.append(int(freq))
freqs.append(1)
total = sum(freqs)
freqs = [(float(f) / total) for f in freqs]
codec = HuffmanCodec(numpy.array(freqs, dtype=numpy.float32), len(freqs)-1)
words.append(word)
total = float(sum(freqs))
codec = HuffmanCodec(zip(words, freqs))
expected_length = 0
for i, code in enumerate(codec.strings):
expected_length += len(code) * freqs[i]
expected_length += len(code) * (freqs[i] / total)
assert 8 < expected_length < 14
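
The expected_length computed in the two tests above is the probability-weighted average code length, the sum over symbols of p(symbol) * len(code), and Huffman coding keeps that average within one bit of the Shannon entropy H = -sum(p * log2 p); the asserted 8 to 15 bit windows are loose sanity bounds around that. A small worked check on a toy distribution (hypothetical frequencies, not part of the commit):

# --- illustrative sketch, not part of the commit ---
# Huffman's expected code length always lies in [H, H + 1).
import math

freqs = {'the': 10, 'dog': 2, 'jumped': 5, '.': 9}
total = float(sum(freqs.values()))
probs = {w: f / total for w, f in freqs.items()}
entropy = -sum(p * math.log(p, 2) for p in probs.values())

# One valid Huffman assignment for these frequencies:
code_lengths = {'the': 1, '.': 2, 'jumped': 3, 'dog': 3}
expected_length = sum(probs[w] * code_lengths[w] for w in probs)
assert entropy <= expected_length < entropy + 1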


@ -0,0 +1,23 @@
import pytest
from spacy.serialize.packer import Packer
from spacy.attrs import ORTH, SPACY
from spacy.tokens import Doc
import math
def test_read_write(EN):
doc1 = EN(u'This is a simple test. With a couple of sentences.')
doc2 = EN(u'This is another test document.')
with open('/tmp/spacy_docs.bin', 'wb') as file_:
file_.write(doc1.to_bytes())
file_.write(doc2.to_bytes())
with open('/tmp/spacy_docs.bin', 'rb') as file_:
bytes1, bytes2 = Doc.read_bytes(file_)
r1 = Doc(EN.vocab).from_bytes(bytes1)
r2 = Doc(EN.vocab).from_bytes(bytes2)
assert r1.string == doc1.string
assert r2.string == doc2.string


@ -0,0 +1,122 @@
from __future__ import unicode_literals
import re
import pytest
import numpy
from spacy.vocab import Vocab
from spacy.tokens.doc import Doc
from spacy.tokenizer import Tokenizer
from spacy.en import LOCAL_DATA_DIR
from os import path
from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
from spacy.serialize.packer import Packer
from spacy.serialize.bits import BitArray
def get_lex_props(string, prob=-22):
return {
'flags': 0,
'length': len(string),
'orth': string,
'lower': string,
'norm': string,
'shape': string,
'prefix': string[0],
'suffix': string[-3:],
'cluster': 0,
'prob': prob,
'sentiment': 0
}
@pytest.fixture
def vocab():
vocab = Vocab(get_lex_props=get_lex_props)
vocab['dog'] = get_lex_props('dog', 0.001)
assert vocab[vocab.strings['dog']].orth_ == 'dog'
vocab['the'] = get_lex_props('the', 0.01)
vocab['quick'] = get_lex_props('quick', 0.005)
vocab['jumped'] = get_lex_props('jumped', 0.007)
return vocab
@pytest.fixture
def tokenizer(vocab):
null_re = re.compile(r'!!!!!!!!!')
tokenizer = Tokenizer(vocab, {}, null_re, null_re, null_re)
return tokenizer
def test_char_packer(vocab):
packer = Packer(vocab, [])
bits = BitArray()
bits.seek(0)
byte_str = b'the dog jumped'
packer.char_codec.encode(byte_str, bits)
bits.seek(0)
result = [b''] * len(byte_str)
packer.char_codec.decode(bits, result)
assert b''.join(result) == byte_str
def test_packer_unannotated(tokenizer):
packer = Packer(tokenizer.vocab, [])
msg = tokenizer(u'the dog jumped')
assert msg.string == 'the dog jumped'
bits = packer.pack(msg)
result = packer.unpack(bits)
assert result.string == 'the dog jumped'
def test_packer_annotated(tokenizer):
vocab = tokenizer.vocab
nn = vocab.strings['NN']
dt = vocab.strings['DT']
vbd = vocab.strings['VBD']
jj = vocab.strings['JJ']
det = vocab.strings['det']
nsubj = vocab.strings['nsubj']
adj = vocab.strings['adj']
root = vocab.strings['ROOT']
attr_freqs = [
(TAG, [(nn, 0.1), (dt, 0.2), (jj, 0.01), (vbd, 0.05)]),
(DEP, {det: 0.2, nsubj: 0.1, adj: 0.05, root: 0.1}.items()),
(HEAD, {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}.items())
]
packer = Packer(vocab, attr_freqs)
msg = tokenizer(u'the dog jumped')
msg.from_array(
[TAG, DEP, HEAD],
numpy.array([
[dt, det, 1],
[nn, nsubj, 1],
[vbd, root, 0]
], dtype=numpy.int32))
assert msg.string == 'the dog jumped'
assert [t.tag_ for t in msg] == ['DT', 'NN', 'VBD']
assert [t.dep_ for t in msg] == ['det', 'nsubj', 'ROOT']
assert [(t.head.i - t.i) for t in msg] == [1, 1, 0]
bits = packer.pack(msg)
result = packer.unpack(bits)
assert result.string == 'the dog jumped'
assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD']
assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT']
assert [(t.head.i - t.i) for t in result] == [1, 1, 0]


@ -1,7 +1,9 @@
import pytest
from spacy.en import English
from spacy.en import English, LOCAL_DATA_DIR
import os
@pytest.fixture(scope="session")
def en_nlp():
return English(load_vectors=False)
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
return English(load_vectors=False, data_dir=data_dir)


@ -1,6 +1,8 @@
from __future__ import unicode_literals
import pytest
@pytest.mark.models
def test_merge_tokens(EN):
tokens = EN(u'Los Angeles start.')
assert len(tokens) == 4
@ -12,6 +14,7 @@ def test_merge_tokens(EN):
assert tokens[0].head.orth_ == 'start'
@pytest.mark.models
def test_merge_heads(EN):
tokens = EN(u'I found a pilates class near work.')
assert len(tokens) == 8


@ -22,4 +22,4 @@ def test_root(doc):
assert len(np) == 2
assert np.orth_ == 'a sentence'
assert np.root.orth_ == 'sentence'
assert nlp.root.head.orth_ == 'is'
assert np.root.head.orth_ == 'is'


@ -1,8 +1,9 @@
# -*- coding: utf-8 -*-
"""Sphinx doctest is just too hard. Manually paste doctest examples here"""
from spacy.en.attrs import IS_LOWER
import pytest
@pytest.mark.models
def test_1():
import spacy.en
from spacy.parts_of_speech import ADV
@ -21,6 +22,7 @@ def test_1():
assert o == -11.07155704498291
@pytest.mark.models
def test2():
import spacy.en
from spacy.parts_of_speech import ADV
@ -41,6 +43,7 @@ def test2():
-11.07155704498291
@pytest.mark.models
def test3():
import spacy.en
from spacy.parts_of_speech import ADV


@ -32,7 +32,6 @@ def test_aint(en_tokenizer):
assert tokens[1].orth_ == "n't"
assert tokens[1].lemma_ == "not"
def test_capitalized(en_tokenizer):
tokens = en_tokenizer("can't")
assert len(tokens) == 2


@ -1,16 +1,10 @@
from __future__ import unicode_literals
import pytest
from spacy.en import English
from spacy.parts_of_speech import ADV
@pytest.fixture
def nlp():
return English()
def test_prob(nlp):
tokens = nlp(u'Give it back')
def test_prob(EN):
tokens = EN(u'Give it back', parse=False)
give = tokens[0]
assert give.prob != 0


@ -7,6 +7,7 @@ from spacy.en.attrs import IS_STOP
import pytest
@pytest.mark.models
def test_strings(EN):
tokens = EN(u'Give it back! He pleaded.')
token = tokens[0]


@ -2,13 +2,15 @@ from __future__ import unicode_literals
import pytest
import gc
from spacy.en import English
from spacy.en import English, LOCAL_DATA_DIR
import os
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
# Let this have its own instances, as we have to be careful about memory here
# that's the point, after all
def get_orphan_token(text, i):
nlp = English(load_vectors=False)
nlp = English(load_vectors=False, data_dir=data_dir)
tokens = nlp(text)
gc.collect()
token = tokens[i]
@ -22,7 +24,7 @@ def test_orphan():
dummy = get_orphan_token('Load and flush the memory', 0)
dummy = get_orphan_token('Load again...', 0)
assert orphan.orth_ == 'orphan'
assert orphan.pos_ == 'NOUN'
assert orphan.pos_ in ('ADJ', 'NOUN')
assert orphan.head.orth_ == 'token'
@ -36,7 +38,7 @@ def _orphan_from_list(toks):
def test_list_orphans():
# Test case from NSchrading
nlp = English(load_vectors=False)
nlp = English(load_vectors=False, data_dir=data_dir)
samples = ["a", "test blah wat okay"]
lst = []
for sample in samples:


@ -5,7 +5,7 @@ from spacy.tokens import Doc
import pytest
def test_getitem(EN):
def mest_getitem(EN):
tokens = EN(u'Give it back! He pleaded.')
assert tokens[0].orth_ == 'Give'
assert tokens[-1].orth_ == '.'
@ -13,24 +13,19 @@ def test_getitem(EN):
tokens[len(tokens)]
def test_trailing_spaces(EN):
tokens = EN(u' Give it back! He pleaded. ')
assert tokens[0].orth_ == ' '
assert not tokens._has_trailing_space(0)
assert tokens._has_trailing_space(1)
assert tokens._has_trailing_space(2)
assert not tokens._has_trailing_space(3)
assert tokens._has_trailing_space(4)
assert tokens._has_trailing_space(5)
assert not tokens._has_trailing_space(6)
assert tokens._has_trailing_space(7)
def test_serialize(EN):
tokens = EN(u' Give it back! He pleaded. ')
packed = tokens.serialize()
new_tokens = Doc.deserialize(EN.vocab, packed)
def mest_serialize(EN):
tokens = EN(u'Give it back! He pleaded.')
packed = tokens.to_bytes()
new_tokens = Doc(EN.vocab).from_bytes(packed)
assert tokens.string == new_tokens.string
assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
def test_serialize_whitespace(EN):
tokens = EN(u' Give it back! He pleaded. ')
packed = tokens.to_bytes()
new_tokens = Doc(EN.vocab).from_bytes(packed)
assert tokens.string == new_tokens.string
assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
assert [tokens._has_trailing_space(t.i) for t in tokens] == [new_tokens._has_trailing_space(t.i) for t in new_tokens]


@ -4,13 +4,14 @@ from spacy.en import English
import pytest
@pytest.mark.vectors
def test_vec(EN):
hype = EN.vocab['hype']
assert hype.orth_ == 'hype'
assert 0.08 >= hype.repvec[0] > 0.07
@pytest.mark.vectors
def test_capitalized(EN):
hype = EN.vocab['Hype']
assert hype.orth_ == 'Hype'


@ -35,3 +35,44 @@ def test_retrieve_id(sstore):
assert sstore[1] == 'A'
with pytest.raises(IndexError):
sstore[2]
def test_med_string(sstore):
ten_char_string = sstore[b'0123456789']
assert sstore[ten_char_string] == b'0123456789'
dummy = sstore[b'A']
assert sstore[b'0123456789'] == ten_char_string
def test_long_string(sstore):
url = u'INFORMATIVE](http://www.google.com/search?as_q=RedditMonkey&amp;hl=en&amp;num=50&amp;btnG=Google+Search&amp;as_epq=&amp;as_oq=&amp;as_eq=&amp;lr=&amp;as_ft=i&amp;as_filetype=&amp;as_qdr=all&amp;as_nlo=&amp;as_nhi=&amp;as_occt=any&amp;as_dt=i&amp;as_sitesearch=&amp;as_rights=&amp;safe=off'
orth = sstore[url]
assert sstore[orth] == url
def test_254_string(sstore):
s254 = 'a' * 254
orth = sstore[s254]
assert sstore[orth] == s254
def test_255_string(sstore):
s255 = 'b' * 255
orth = sstore[s255]
assert sstore[orth] == s255
def test_256_string(sstore):
s256 = 'c' * 256
orth = sstore[s256]
assert sstore[orth] == s256
def test_massive_strings(sstore):
s511 = 'd' * 511
orth = sstore[s511]
assert sstore[orth] == s511
s512 = 'e' * 512
orth = sstore[s512]
assert sstore[orth] == s512
s513 = '1' * 513
orth = sstore[s513]
assert sstore[orth] == s513
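
The 254/255/256 and 511/512/513 cases above read like boundary probes for the store's internal length encoding; presumably short strings get a compact one-byte length header and longer ones take an escape path (an assumption, since the struct layout is not shown in this diff). A toy length-prefixed scheme of that shape, purely as an illustration:

# --- illustrative sketch, not part of the commit; an assumed scheme, not
# spaCy's actual StringStore layout ---
import struct

def encode(utf8_bytes):
    n = len(utf8_bytes)
    if n < 255:
        return bytes(bytearray([n])) + utf8_bytes          # short form: 1-byte length
    return b'\xff' + struct.pack('I', n) + utf8_bytes      # escape: 0xff + 4-byte length

def decode(buf):
    n = buf[0] if isinstance(buf[0], int) else ord(buf[0])
    if n < 255:
        return buf[1:1 + n]
    n = struct.unpack('I', buf[1:5])[0]
    return buf[5:5 + n]

for size in (254, 255, 256, 511, 512, 513):
    payload = b'a' * size
    assert decode(encode(payload)) == payload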


@ -1,12 +0,0 @@
import pytest
def test_range_iter(en_vocab):
for i in range(len(en_vocab)):
lex = en_vocab[i]
def test_iter(en_vocab):
i = 0
for lex in en_vocab:
i += 1