mirror of
synced 2025-02-11 09:00:36 +03:00
Merge pull request #1468 from explosion/feature/tidy-up
💫 Tidy up v2.0 code base
This commit is contained in:
@ -1,93 +0,0 @@
#!/usr/bin/env python
from __future__ import unicode_literals, print_function
import plac
import joblib
from os import path
import os
import bz2
import ujson
from preshed.counter import PreshCounter
from joblib import Parallel, delayed
import io
from spacy.en import English
from spacy.strings import StringStore
from spacy.attrs import ORTH
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab
def iter_comments(loc):
    """Yield one decoded JSON object per line of the bz2 file at `loc`.

    The archive is read lazily, line by line, and is closed once the
    generator is exhausted or garbage-collected.
    """
    with bz2.BZ2File(loc) as archive:
        for raw_line in archive:
            comment = ujson.loads(raw_line)
            yield comment
def count_freqs(input_loc, output_loc):
    """Count token frequencies for one comment dump and write them out.

    Each JSON comment yielded by iter_comments(input_loc) has its 'body'
    field tokenized, and the tokens are tallied by their ORTH attribute.
    The tallies are written to output_loc as one tab-separated
    "frequency, string" line per entry; entries whose string consists only
    of whitespace are skipped.
    """
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))
    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)
    # io.open's third positional parameter is `buffering`, not the encoding
    # (unlike codecs.open), so the encoding must be passed by keyword.
    with io.open(output_loc, 'w', encoding='utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
def parallelize(func, iterator, n_jobs):
    """Run func(*item) for every item of `iterator` using n_jobs joblib workers.

    Results are discarded; the function returns None.
    """
    pool = Parallel(n_jobs=n_jobs)
    jobs = (delayed(func)(*args) for args in iterator)
    pool(jobs)
def merge_counts(locs, out_loc):
    """Sum the frequency tables found in `locs` and write the totals to out_loc.

    Every input line is expected to be "frequency<TAB>word". Words are
    interned in a StringStore so counts from different files accumulate
    under the same key. The output uses the same line format.
    """
    strings = StringStore()
    totals = PreshCounter()
    for freq_loc in locs:
        with io.open(freq_loc, 'r', encoding='utf8') as in_file:
            for row in in_file:
                # Split only on the first tab so the word may contain tabs.
                freq_str, word = row.strip().split('\t', 1)
                totals.inc(strings[word], int(freq_str))
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        for key, total in totals:
            out_file.write('%d\t%s\n' % (total, strings[key]))
input_loc=("Location of input file list"),
freqs_dir=("Directory for frequency files"),
output_loc=("Location for output file"),
n_jobs=("Number of workers", "option", "n", int),
skip_existing=("Skip inputs where an output file exists", "flag", "s", bool),
def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
tasks = []
outputs = []
for input_path in open(input_loc):
input_path = input_path.strip()
if not input_path:
filename = input_path.split('/')[-1]
output_path = path.join(freqs_dir, filename.replace('bz2', 'freq'))
if not path.exists(output_path) or not skip_existing:
tasks.append((input_path, output_path))
if tasks:
parallelize(count_freqs, tasks, n_jobs)
merge_counts(outputs, output_loc)
if __name__ == '__main__':
@ -1,89 +0,0 @@
#!/usr/bin/env python
from __future__ import unicode_literals
from xml.etree import cElementTree as ElementTree
import json
import re
import plac
from pathlib import Path
from os import path
escaped_tokens = {
'-LRB-': '(',
'-RRB-': ')',
'-LSB-': '[',
'-RSB-': ']',
'-LCB-': '{',
'-RCB-': '}',
def read_parses(parse_loc):
offset = 0
doc = []
for parse in open(str(parse_loc) + '.dep').read().strip().split('\n\n'):
parse = _adjust_token_ids(parse, offset)
offset += len(parse.split('\n'))
return doc
def _adjust_token_ids(parse, offset):
output = []
for line in parse.split('\n'):
pieces = line.split()
pieces[0] = str(int(pieces[0]) + offset)
pieces[5] = str(int(pieces[5]) + offset) if pieces[5] != '0' else '0'
return '\n'.join(output)
def _fmt_doc(filename, paras):
    """Return the document dict for `filename`.

    Each element of `paras` is unpacked into _fmt_para's positional
    arguments; the results are collected under the 'paragraphs' key.
    """
    paragraphs = []
    for para in paras:
        paragraphs.append(_fmt_para(*para))
    return {'id': filename, 'paragraphs': paragraphs}
def _fmt_para(raw, sents):
    """Return the paragraph dict: the raw text plus one formatted entry per sentence."""
    sentences = []
    for sent in sents:
        sentences.append(_fmt_sent(sent))
    return {'raw': raw, 'sentences': sentences}
def _fmt_sent(sent):
    """Return the sentence dict for a block of one-token-per-line text.

    Each line is split on whitespace and its fields are passed
    positionally to _fmt_token. 'brackets' is always an empty list.
    """
    rows = sent.strip().split('\n')
    tokens = [_fmt_token(*row.split()) for row in rows]
    return {'tokens': tokens, 'brackets': []}
def _fmt_token(id_, word, hyph, pos, ner, head, dep, blank1, blank2, blank3):
    """Return the token dict for one ten-field row.

    `id_` and `head` arrive as 1-based numeric strings. The returned 'id'
    is 0-based, and 'head' is the offset from the token to its head, with
    0 used when the input head is '0' (no head). The hyph, ner and blank
    fields are accepted but not used.
    """
    head_index = int(head) - 1
    index = int(id_) - 1
    # A head index of -1 means the input head was 0.
    offset = 0 if head_index == -1 else head_index - index
    return {'id': index, 'orth': word, 'tag': pos, 'dep': dep, 'head': offset}
tags_re = re.compile(r'<[\w\?/][^>]+>')
def main(out_dir, ewtb_dir='/usr/local/data/eng_web_tbk'):
ewtb_dir = Path(ewtb_dir)
out_dir = Path(out_dir)
if not out_dir.exists():
for genre_dir in ewtb_dir.joinpath('data').iterdir():
#if 'answers' in str(genre_dir): continue
parse_dir = genre_dir.joinpath('penntree')
docs = []
for source_loc in genre_dir.joinpath('source').joinpath('source_original').iterdir():
filename = source_loc.parts[-1].replace('.sgm.sgm', '')
filename = filename.replace('.xml', '')
filename = filename.replace('.txt', '')
parse_loc = parse_dir.joinpath(filename + '.xml.tree')
parses = read_parses(parse_loc)
source = source_loc.open().read().strip()
if 'answers' in str(genre_dir):
source = tags_re.sub('', source).strip()
docs.append(_fmt_doc(filename, [[source, parses]]))
out_loc = out_dir.joinpath(genre_dir.parts[-1] + '.json')
with open(str(out_loc), 'w') as out_file:
out_file.write(json.dumps(docs, indent=4))
if __name__ == '__main__':
@ -1,32 +0,0 @@
import io
import plac
from spacy.en import English
def main(text_loc):
with io.open(text_loc, 'r', encoding='utf8') as file_:
text = file_.read()
NLU = English()
for paragraph in text.split('\n\n'):
tokens = NLU(paragraph)
ent_starts = {}
ent_ends = {}
for span in tokens.ents:
ent_starts[span.start] = span.label_
ent_ends[span.end] = span.label_
output = []
for token in tokens:
if token.i in ent_starts:
output.append('<%s>' % ent_starts[token.i])
if (token.i+1) in ent_ends:
output.append('</%s>' % ent_ends[token.i+1])
print ' '.join(output)
if __name__ == '__main__':
@ -1,157 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
import os
from os import path
import shutil
import io
import random
import time
import gzip
import plac
import cProfile
import pstats
import spacy.util
from spacy.en import English
from spacy.gold import GoldParse
from spacy.syntax.util import Config
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import Parser
from spacy.scorer import Scorer
from spacy.tagger import Tagger
# Last updated for spaCy v0.97
def read_conll(file_):
"""Read a standard CoNLL/MALT-style format"""
sents = []
for sent_str in file_.read().strip().split('\n\n'):
ids = []
words = []
heads = []
labels = []
tags = []
for i, line in enumerate(sent_str.split('\n')):
word, pos_string, head_idx, label = _parse_line(line)
if head_idx < 0:
head_idx = i
text = ' '.join(words)
annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
sents.append((None, [(annot, [])]))
return sents
def _parse_line(line):
pieces = line.split()
if len(pieces) == 4:
word, pos, head_idx, label = pieces
head_idx = int(head_idx)
elif len(pieces) == 15:
id_ = int(pieces[0].split('_')[-1])
word = pieces[1]
pos = pieces[4]
head_idx = int(pieces[8])-1
label = pieces[10]
id_ = int(pieces[0].split('_')[-1])
word = pieces[1]
pos = pieces[4]
head_idx = int(pieces[6])-1
label = pieces[7]
if head_idx == 0:
label = 'ROOT'
return word, pos, head_idx, label
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
gold = GoldParse(tokens, annot_tuples, make_projective=False)
scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
gold_preproc=False, force_gold=False):
dep_model_dir = path.join(model_dir, 'deps')
pos_model_dir = path.join(model_dir, 'pos')
if path.exists(dep_model_dir):
if path.exists(pos_model_dir):
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
for itn in range(n_iter):
scorer = Scorer()
loss = 0
for _, sents in gold_tuples:
for annot_tuples, _ in sents:
if len(annot_tuples[1]) == 1:
score_model(scorer, nlp, None, annot_tuples, verbose=False)
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
gold = GoldParse(tokens, annot_tuples, make_projective=True)
if not gold.is_projective:
raise Exception(
"Non-projective sentence in training, after we should "
"have enforced projectivity: %s" % annot_tuples
loss += nlp.parser.train(tokens, gold)
nlp.tagger.train(tokens, gold.tags)
print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
scorer.tags_acc, scorer.token_acc))
print('end training')
train_loc=("Location of CoNLL 09 formatted training file"),
dev_loc=("Location of CoNLL 09 formatted development file"),
model_dir=("Location of output model directory"),
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
n_iter=("Number of training iterations", "option", "i", int),
def main(train_loc, dev_loc, model_dir, n_iter=15):
with io.open(train_loc, 'r', encoding='utf8') as file_:
train_sents = read_conll(file_)
if not eval_only:
train(English, train_sents, model_dir, n_iter=n_iter)
nlp = English(data_dir=model_dir)
dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
scorer = Scorer()
for _, sents in dev_sents:
for annot_tuples, _ in sents:
score_model(scorer, nlp, None, annot_tuples)
print('TOK', 100-scorer.token_acc)
print('POS', scorer.tags_acc)
print('UAS', scorer.uas)
print('LAS', scorer.las)
if __name__ == '__main__':
@ -1,187 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function
import os
from os import path
import shutil
import io
import random
import plac
import re
import spacy.util
from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.gold import merge_sents
from spacy.scorer import Scorer
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.ner import BiluoPushDown
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser
from spacy.syntax.nonproj import PseudoProjectivity
def _corrupt(c, noise_level):
    """Return `c`, or a corrupted version of it with probability noise_level.

    When corruption applies: a space becomes a newline, a newline becomes
    a space, the punctuation marks . ' ! ? are dropped (empty string), and
    anything else is lower-cased.
    """
    if random.random() >= noise_level:
        return c
    swaps = {' ': '\n', '\n': ' ', '.': '', "'": '', '!': '', '?': ''}
    if c in swaps:
        return swaps[c]
    return c.lower()
def add_noise(orig, noise_level):
    """Return `orig`, or a noisy copy of it with probability noise_level.

    A list is corrupted element by element via _corrupt, and elements that
    come back empty are dropped. Any other input is treated as a string:
    it is corrupted character by character and re-joined.
    """
    if random.random() >= noise_level:
        return orig
    if type(orig) is list:
        kept = []
        for word in orig:
            noisy = _corrupt(word, noise_level)
            if noisy:
                kept.append(noisy)
        return kept
    return ''.join(_corrupt(char, noise_level) for char in orig)
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
tokens = nlp.tokenizer(raw_text)
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=verbose)
def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
with Language.train(model_dir, train_data,
tagger_cfg, parser_cfg, entity_cfg) as trainer:
loss = 0
for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=gold_preproc,
for doc, gold in epoch:
trainer.update(doc, gold)
dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
print(format_str.format(itn, trainer.nlp.parser.model.nr_weight,
trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores))
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
beam_width=None, cand_preproc=None):
print("Load parser", model_dir)
nlp = Language(path=model_dir)
if nlp.lang == 'de':
nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
if beam_width is not None:
nlp.parser.cfg.beam_width = beam_width
scorer = Scorer()
for raw_text, sents in gold_tuples:
if gold_preproc:
raw_text = None
sents = merge_sents(sents)
for annot_tuples, brackets in sents:
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
tokens = nlp(raw_text)
gold = GoldParse.from_annot_tuples(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=verbose)
return scorer
def write_parses(Language, dev_loc, model_dir, out_loc):
nlp = Language(data_dir=model_dir)
gold_tuples = read_json_file(dev_loc)
scorer = Scorer()
out_file = io.open(out_loc, 'w', 'utf8')
for raw_text, sents in gold_tuples:
sents = _merge_sents(sents)
for annot_tuples, brackets in sents:
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
tokens = nlp(raw_text)
#gold = GoldParse(tokens, annot_tuples)
#scorer.score(tokens, gold, verbose=False)
for sent in tokens.sents:
for t in sent:
if not t.is_space:
'%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
language=("The language to train", "positional", None, str, ['en','de', 'zh']),
train_loc=("Location of training file or directory"),
dev_loc=("Location of development file or directory"),
model_dir=("Location of output model directory",),
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
corruption_level=("Amount of noise to add to training data", "option", "c", float),
gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
out_loc=("Out location", "option", "o", str),
n_sents=("Number of training sentences", "option", "n", int),
n_iter=("Number of training iterations", "option", "i", int),
verbose=("Verbose error reporting", "flag", "v", bool),
debug=("Debug mode", "flag", "d", bool),
pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
L1=("L1 regularization penalty", "option", "L", float),
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False,
parser_cfg = dict(locals())
tagger_cfg = dict(locals())
entity_cfg = dict(locals())
lang = spacy.util.get_lang_class(language)
parser_cfg['features'] = lang.Defaults.parser_features
entity_cfg['features'] = lang.Defaults.entity_features
if not eval_only:
gold_train = list(read_json_file(train_loc))
gold_dev = list(read_json_file(dev_loc))
if n_sents > 0:
gold_train = gold_train[:n_sents]
train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
if out_loc:
write_parses(lang, dev_loc, model_dir, out_loc)
scorer = evaluate(lang, list(read_json_file(dev_loc)),
model_dir, gold_preproc=gold_preproc, verbose=verbose)
print('TOK', scorer.token_acc)
print('POS', scorer.tags_acc)
print('UAS', scorer.uas)
print('LAS', scorer.las)
print('NER P', scorer.ents_p)
print('NER R', scorer.ents_r)
print('NER F', scorer.ents_f)
if __name__ == '__main__':
@ -1,201 +0,0 @@
from __future__ import unicode_literals, print_function
import plac
import json
import random
import pathlib
from spacy.tokens import Doc
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.language import Language
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.pipeline import DependencyParser, TokenVectorEncoder
from spacy.syntax.parser import get_templates
from spacy.syntax.arc_eager import ArcEager
from spacy.scorer import Scorer
from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
import spacy.attrs
import io
from thinc.neural.ops import CupyOps
from thinc.neural import Model
from spacy.es import Spanish
from spacy.attrs import POS
from thinc.neural import Model
import cupy
from thinc.neural.ops import CupyOps
cupy = None
def read_conllx(loc, n=0):
with io.open(loc, 'r', encoding='utf8') as file_:
text = file_.read()
i = 0
for sent in text.strip().split('\n\n'):
lines = sent.strip().split('\n')
if lines:
while lines[0].startswith('#'):
tokens = []
for line in lines:
id_, word, lemma, pos, tag, morph, head, dep, _1, \
_2 = line.split('\t')
if '-' in id_ or '.' in id_:
id_ = int(id_) - 1
head = (int(head) - 1) if head != '0' else id_
dep = 'ROOT' if dep == 'root' else dep #'unlabelled'
tag = pos+'__'+dep+'__'+morph
Spanish.Defaults.tag_map[tag] = {POS: pos}
tokens.append((id_, word, tag, head, dep, 'O'))
tuples = [list(t) for t in zip(*tokens)]
yield (None, [[tuples, []]])
i += 1
if n >= 1 and i >= n:
def score_model(vocab, encoder, parser, Xs, ys, verbose=False):
scorer = Scorer()
correct = 0.
total = 0.
for doc, gold in zip(Xs, ys):
doc = Doc(vocab, words=[w.text for w in doc])
scorer.score(doc, gold, verbose=verbose)
for token, tag in zip(doc, gold.tags):
if '_' in token.tag_:
univ_guess, _ = token.tag_.split('_', 1)
univ_guess = ''
univ_truth, _ = tag.split('_', 1)
correct += univ_guess == univ_truth
total += 1
return scorer
def organize_data(vocab, train_sents):
Xs = []
ys = []
for _, doc_sents in train_sents:
for (ids, words, tags, heads, deps, ner), _ in doc_sents:
doc = Doc(vocab, words=words)
gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
return Xs, ys
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
LangClass = spacy.util.get_lang_class(lang_name)
train_sents = list(read_conllx(train_loc))
dev_sents = list(read_conllx(dev_loc))
train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
actions = ArcEager.get_actions(gold_parses=train_sents)
features = get_templates('basic')
model_dir = pathlib.Path(model_dir)
if not model_dir.exists():
if not (model_dir / 'deps').exists():
(model_dir / 'deps').mkdir()
if not (model_dir / 'pos').exists():
(model_dir / 'pos').mkdir()
with (model_dir / 'deps' / 'config.json').open('wb') as file_:
{'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))
vocab = LangClass.Defaults.create_vocab()
if not (model_dir / 'vocab').exists():
(model_dir / 'vocab').mkdir()
if (model_dir / 'vocab' / 'strings.json').exists():
with (model_dir / 'vocab' / 'strings.json').open() as file_:
if (model_dir / 'vocab' / 'lexemes.bin').exists():
vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
if clusters_loc is not None:
clusters_loc = pathlib.Path(clusters_loc)
with clusters_loc.open() as file_:
for line in file_:
cluster, word, freq = line.split()
except ValueError:
lex = vocab[word]
lex.cluster = int(cluster[::-1], 2)
# Populate vocab
for _, doc_sents in train_sents:
for (ids, words, tags, heads, deps, ner), _ in doc_sents:
for word in words:
_ = vocab[word]
for dep in deps:
_ = vocab[dep]
for tag in tags:
_ = vocab[tag]
if vocab.morphology.tag_map:
for tag in tags:
vocab.morphology.tag_map[tag] = {POS: tag.split('__', 1)[0]}
tagger = Tagger(vocab)
encoder = TokenVectorEncoder(vocab, width=64)
parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)
Xs, ys = organize_data(vocab, train_sents)
dev_Xs, dev_ys = organize_data(vocab, dev_sents)
with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
docs = list(Xs)
for doc in docs:
nn_loss = [0.]
def track_progress():
with encoder.tagger.use_params(optimizer.averages):
with parser.model.use_params(optimizer.averages):
scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
itn = len(nn_loss)
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
trainer.batch_size = 24
trainer.nb_epoch = 40
for docs, golds in trainer.iterate(Xs, ys, progress_bar=True):
docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
tokvecs, upd_tokvecs = encoder.begin_update(docs)
for doc, tokvec in zip(docs, tokvecs):
doc.tensor = tokvec
d_tokvecs = parser.update(docs, golds, sgd=optimizer)
upd_tokvecs(d_tokvecs, sgd=optimizer)
encoder.update(docs, golds, sgd=optimizer)
nlp = LangClass(vocab=vocab, parser=parser)
scorer = score_model(vocab, encoder, parser, read_conllx(dev_loc))
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
#scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
#print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
if __name__ == '__main__':
import cProfile
import pstats
if 1:
cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
@ -1,194 +0,0 @@
"""Convert OntoNotes into a json format.
doc: {
id: string,
paragraphs: [{
raw: string,
sents: [int],
tokens: [{
start: int,
tag: string,
head: int,
dep: string}],
ner: [{
start: int,
end: int,
label: string}],
brackets: [{
start: int,
end: int,
label: string}]}]}
Consumes output of spacy/munge/align_raw.py
from __future__ import unicode_literals
import plac
import json
from os import path
import os
import re
import io
from collections import defaultdict
from spacy.munge import read_ptb
from spacy.munge import read_conll
from spacy.munge import read_ner
def _iter_raw_files(raw_loc):
    """Yield each top-level entry of the JSON document stored at raw_loc.

    The file is parsed in full and closed before the first entry is
    yielded, so no file handle stays open while the caller iterates.
    """
    with open(raw_loc) as file_:
        files = json.load(file_)
    for f in files:
        yield f
def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
ptb_sents = read_ptb.split(ptb_text)
dep_sents = read_conll.split(dep_text)
if len(ptb_sents) != len(dep_sents):
return None
if ner_text is not None:
ner_sents = read_ner.split(ner_text)
ner_sents = [None] * len(ptb_sents)
i = 0
doc = {'id': file_id}
if raw_paras is None:
doc['paragraphs'] = [format_para(None, ptb_sents, dep_sents, ner_sents)]
#for ptb_sent, dep_sent, ner_sent in zip(ptb_sents, dep_sents, ner_sents):
# doc['paragraphs'].append(format_para(None, [ptb_sent], [dep_sent], [ner_sent]))
doc['paragraphs'] = []
for raw_sents in raw_paras:
para = format_para(
' '.join(raw_sents).replace('<SEP>', ''),
if para['sentences']:
i += len(raw_sents)
return doc
def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
para = {'raw': raw_text, 'sentences': []}
offset = 0
assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
_, deps = read_conll.parse(dep_text, strip_bad_periods=True)
if deps and 'VERB' in [t['tag'] for t in deps]:
if ner_text is not None:
_, ner = read_ner.parse(ner_text, strip_bad_periods=True)
ner = ['-' for _ in deps]
_, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
# Necessary because the ClearNLP converter deletes EDITED words.
if len(ner) != len(deps):
ner = ['-' for _ in deps]
para['sentences'].append(format_sentence(deps, ner, brackets))
return para
def format_sentence(deps, ner, brackets):
sent = {'tokens': [], 'brackets': []}
for token_id, (token, token_ent) in enumerate(zip(deps, ner)):
sent['tokens'].append(format_token(token_id, token, token_ent))
for label, start, end in brackets:
if start != end:
'label': label,
'first': start,
'last': (end-1)})
return sent
def format_token(token_id, token, ner):
    """Return the output dict for one token.

    `token` is a dict with 'id', 'word', 'tag', 'head' and 'dep' keys, and
    its 'id' must equal token_id. The returned 'head' is the offset from
    this token to its head, or 0 when token['head'] is -1.
    """
    assert token_id == token['id']
    if token['head'] == -1:
        head = 0
    else:
        head = token['head'] - token_id
    formatted = {
        'id': token_id,
        'orth': token['word'],
        'tag': token['tag'],
        'head': head,
        'dep': token['dep'],
        'ner': ner,
    }
    return formatted
def read_file(*pieces):
    """Return the stripped UTF-8 text of the file at path.join(*pieces).

    Returns None when no file exists at that path. The file is opened in
    a `with` block so the handle is closed as soon as it has been read.
    """
    loc = path.join(*pieces)
    if not path.exists(loc):
        return None
    with io.open(loc, 'r', encoding='utf8') as file_:
        return file_.read().strip()
def get_file_names(section_dir, subsection):
    """Return the sorted, de-duplicated file stems found in section_dir/subsection.

    A stem is the file name with its last dot-separated extension removed,
    so 'x.parse' and 'x.name' both contribute 'x'.
    """
    directory = path.join(section_dir, subsection)
    stems = set(fn.rsplit('.', 1)[0] for fn in os.listdir(directory))
    return sorted(stems)
def read_wsj_with_source(onto_dir, raw_dir):
# Now do WSJ, with source alignment
onto_dir = path.join(onto_dir, 'data', 'english', 'annotations', 'nw', 'wsj')
docs = {}
for i in range(25):
section = str(i) if i >= 10 else ('0' + str(i))
raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
for j, (filename, raw_paras) in enumerate(_iter_raw_files(raw_loc)):
if section == '00':
j += 1
if section == '04' and filename == '55':
ptb = read_file(onto_dir, section, '%s.parse' % filename)
dep = read_file(onto_dir, section, '%s.parse.dep' % filename)
ner = read_file(onto_dir, section, '%s.name' % filename)
if ptb is not None and dep is not None:
docs[filename] = format_doc(filename, raw_paras, ptb, dep, ner)
return docs
def get_doc(onto_dir, file_path, wsj_docs):
    """Return the formatted document for file_path, or None if it cannot be built.

    The last '/'-separated component of file_path is the document name. If
    wsj_docs already holds an entry under that name it is returned as is.
    Otherwise the '.parse', '.parse.dep' and '.name' files are read from
    under onto_dir, and a document is built with no raw paragraphs. The
    '.parse' and '.parse.dep' files are both required; '.name' is optional.
    """
    filename = file_path.rsplit('/', 1)[1]
    if filename in wsj_docs:
        return wsj_docs[filename]
    extensions = ('.parse', '.parse.dep', '.name')
    ptb, dep, ner = [read_file(onto_dir, file_path + ext) for ext in extensions]
    if ptb is None or dep is None:
        return None
    return format_doc(filename, None, ptb, dep, ner)
def read_ids(loc):
    """Return the lines of the file at `loc` as a list of strings.

    Leading and trailing whitespace of the whole file is stripped before
    splitting on newlines, so an empty file yields ['']. The file is read
    inside a `with` block so the handle is closed promptly.
    """
    with open(loc) as file_:
        return file_.read().strip().split('\n')
def main(onto_dir, raw_dir, out_dir):
wsj_docs = read_wsj_with_source(onto_dir, raw_dir)
for partition in ('train', 'test', 'development'):
ids = read_ids(path.join(onto_dir, '%s.id' % partition))
docs_by_genre = defaultdict(list)
for file_path in ids:
doc = get_doc(onto_dir, file_path, wsj_docs)
if doc is not None:
genre = file_path.split('/')[3]
part_dir = path.join(out_dir, partition)
if not path.exists(part_dir):
for genre, docs in sorted(docs_by_genre.items()):
out_loc = path.join(part_dir, genre + '.json')
with open(out_loc, 'w') as file_:
json.dump(docs, file_, indent=4)
if __name__ == '__main__':
@ -1,13 +0,0 @@
"""Read a vector file, and prepare it as binary data, for easy consumption"""
import plac
from spacy.vocab import write_binary_vectors
def main(in_loc, out_loc):
write_binary_vectors(in_loc, out_loc)
if __name__ == '__main__':
@ -1,175 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function
import os
from os import path
import shutil
import codecs
import random
import plac
import re
import spacy.util
from spacy.en import English
from spacy.tagger import Tagger
from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.scorer import Scorer
def score_model(scorer, nlp, raw_text, annot_tuples):
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
tokens = nlp.tokenizer(raw_text)
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold)
def _merge_sents(sents):
m_deps = [[], [], [], [], [], []]
m_brackets = []
i = 0
for (ids, words, tags, heads, labels, ner), brackets in sents:
m_deps[0].extend(id_ + i for id_ in ids)
m_deps[3].extend(head + i for head in heads)
m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
i += len(ids)
return [(m_deps, m_brackets)]
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
beam_width=1, verbose=False,
if n_sents > 0:
gold_tuples = gold_tuples[:n_sents]
templates = Tagger.default_templates()
nlp = Language(data_dir=model_dir, tagger=False)
nlp.tagger = Tagger.blank(nlp.vocab, templates)
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
for itn in range(n_iter):
scorer = Scorer()
loss = 0
for raw_text, sents in gold_tuples:
if gold_preproc:
raw_text = None
sents = _merge_sents(sents)
for annot_tuples, ctnt in sents:
words = annot_tuples[1]
gold_tags = annot_tuples[2]
score_model(scorer, nlp, raw_text, annot_tuples)
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(words)
tokens = nlp.tokenizer(raw_text)
loss += nlp.tagger.train(tokens, gold_tags)
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
nlp = Language(data_dir=model_dir)
if beam_width is not None:
nlp.parser.cfg.beam_width = beam_width
scorer = Scorer()
for raw_text, sents in gold_tuples:
if gold_preproc:
raw_text = None
sents = _merge_sents(sents)
for annot_tuples, brackets in sents:
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
tokens = nlp(raw_text, merge_mwes=False)
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=verbose)
return scorer
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
nlp = Language(data_dir=model_dir)
if beam_width is not None:
nlp.parser.cfg.beam_width = beam_width
gold_tuples = read_json_file(dev_loc)
scorer = Scorer()
out_file = codecs.open(out_loc, 'w', 'utf8')
for raw_text, sents in gold_tuples:
sents = _merge_sents(sents)
for annot_tuples, brackets in sents:
if raw_text is None:
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
tokens = nlp(raw_text, merge_mwes=False)
gold = GoldParse(tokens, annot_tuples)
scorer.score(tokens, gold, verbose=False)
for t in tokens:
'%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
return scorer
train_loc=("Location of training file or directory"),
dev_loc=("Location of development file or directory"),
model_dir=("Location of output model directory",),
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
corruption_level=("Amount of noise to add to training data", "option", "c", float),
gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
out_loc=("Out location", "option", "o", str),
n_sents=("Number of training sentences", "option", "n", int),
n_iter=("Number of training iterations", "option", "i", int),
verbose=("Verbose error reporting", "flag", "v", bool),
debug=("Debug mode", "flag", "d", bool),
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
if not eval_only:
gold_train = list(read_json_file(train_loc))
train(English, gold_train, model_dir,
feat_set='basic' if not debug else 'debug',
gold_preproc=gold_preproc, n_sents=n_sents,
corruption_level=corruption_level, n_iter=n_iter,
#if out_loc:
# write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
scorer = evaluate(English, list(read_json_file(dev_loc)),
model_dir, gold_preproc=gold_preproc, verbose=verbose)
print('TOK', scorer.token_acc)
print('POS', scorer.tags_acc)
print('UAS', scorer.uas)
print('LAS', scorer.las)
print('NER P', scorer.ents_p)
print('NER R', scorer.ents_r)
print('NER F', scorer.ents_f)
if __name__ == '__main__':
@ -1,160 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
import os
from os import path
import shutil
import io
import random
import time
import gzip
import ujson
import plac
import cProfile
import pstats
import spacy.util
from spacy.de import German
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.scorer import PRFScore
from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags
from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags
from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags
from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags
from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS
def default_templates():
return spacy.tagger.Tagger.default_templates()
def default_templates_without_clusters():
return (
(P1_lemma, P1_pos),
(P2_lemma, P2_pos),
(P1_pos, P2_pos),
(P1_pos, W_orth),
def make_tagger(vocab, templates):
model = spacy.tagger.TaggerModel(templates)
return spacy.tagger.Tagger(vocab,model)
def read_conll(file_):
def sentences():
words, tags = [], []
for line in file_:
line = line.strip()
if line:
word, tag = line.split('\t')[1::3][:2] # get column 1 and 4 (CoNLL09)
elif words:
yield words, tags
words, tags = [], []
if words:
yield words, tags
return [ s for s in sentences() ]
def score_model(score, nlp, words, gold_tags):
tokens = nlp.tokenizer.tokens_from_list(words)
assert(len(tokens) == len(gold_tags))
for token, gold_tag in zip(tokens,gold_tags):
def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21):
# make shuffling deterministic
# set up directory for model
pos_model_dir = path.join(model_dir, 'pos')
if path.exists(pos_model_dir):
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
nlp.tagger = make_tagger(nlp.vocab,default_templates())
print("Itn.\ttrain acc %\tdev acc %")
for itn in range(n_iter):
# train on train set
#train_acc = PRFScore()
correct, total = 0., 0.
for words, gold_tags in train_sents:
tokens = nlp.tokenizer.tokens_from_list(words)
correct += nlp.tagger.train(tokens, gold_tags)
total += len(words)
train_acc = correct/total
# test on dev set
dev_acc = PRFScore()
for words, gold_tags in dev_sents:
score_model(dev_acc, nlp, words, gold_tags)
print('%d:\t%6.2f\t%6.2f' % (itn, 100*train_acc, 100*dev_acc.precision))
print('end training')
train_loc=("Location of CoNLL 09 formatted training file"),
dev_loc=("Location of CoNLL 09 formatted development file"),
model_dir=("Location of output model directory"),
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
n_iter=("Number of training iterations", "option", "i", int),
def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15):
# training
if not eval_only:
with io.open(train_loc, 'r', encoding='utf8') as trainfile_, \
io.open(dev_loc, 'r', encoding='utf8') as devfile_:
train_sents = read_conll(trainfile_)
dev_sents = read_conll(devfile_)
train(German, train_sents, dev_sents, model_dir, n_iter=n_iter)
# testing
with io.open(dev_loc, 'r', encoding='utf8') as file_:
dev_sents = read_conll(file_)
nlp = German(data_dir=model_dir)
dev_acc = PRFScore()
for words, gold_tags in dev_sents:
score_model(dev_acc, nlp, words, gold_tags)
print('POS: %6.2f %%' % (100*dev_acc.precision))
if __name__ == '__main__':
@ -24,7 +24,6 @@ MOD_NAMES = [
@ -3,8 +3,6 @@ from __future__ import unicode_literals
from .cli.info import info as cli_info
from .glossary import explain
from .deprecated import resolve_load_name
#from .about import __version__
from .about import __version__
from . import util
@ -1,7 +1,7 @@
# coding: utf8
from __future__ import print_function
# NB! This breaks in plac on Python 2!!
#from __future__ import unicode_literals
# from __future__ import unicode_literals
if __name__ == '__main__':
import plac
@ -1,47 +1,40 @@
import ujson
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
# coding: utf8
from __future__ import unicode_literals
import numpy
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
from thinc.i2v import HashEmbed, StaticVectors
from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
from thinc.t2v import Pooling, sum_pool
from thinc.misc import Residual
from thinc.misc import BatchNorm as BN
from thinc.misc import LayerNorm as LN
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.api import FeatureExtracter, with_getitem
from thinc.api import uniqued, wrap, flatten_add_lengths, noop
from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
from thinc.api import uniqued, wrap, noop
from thinc.linear.linear import LinearModel
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
import random
import cytoolz
from thinc import describe
from thinc.describe import Dimension, Synapses, Biases, Gradient
from thinc.neural._classes.affine import _set_dimensions_if_needed
import thinc.extra.load_nlp
from .tokens.doc import Doc
from . import util
import numpy
import io
# TODO: Unset this once we no longer want to support previous models.
import thinc.neural._classes.layernorm
VECTORS_KEY = 'spacy_pretrained_vectors'
def _flatten_add_lengths(seqs, pad=0, drop=0.):
    """Flatten a batch of sequences and return it together with the lengths.

    seqs: Batch of sequences; each must support ``len()``.
    pad: Padding value forwarded to ``ops.flatten`` / ``ops.unflatten``.
    drop: Unused here; accepted so the function matches the layer signature.
    RETURNS: ``((X, lengths), finish_update)`` where `finish_update`
        unflattens a gradient using the same lengths and padding.
    """
    backend = Model.ops
    seq_lengths = backend.asarray([len(item) for item in seqs], dtype='i')
    flat = backend.flatten(seqs, pad=pad)

    def finish_update(d_X, sgd=None):
        return backend.unflatten(d_X, seq_lengths, pad=pad)

    return (flat, seq_lengths), finish_update
@ -55,33 +48,14 @@ def _logistic(X, drop=0.):
X = xp.minimum(X, 10., X)
X = xp.maximum(X, -10., X)
Y = 1. / (1. + xp.exp(-X))
def logistic_bwd(dY, sgd=None):
dX = dY * (Y * (1-Y))
return dX
return Y, logistic_bwd
def add_tuples(X, drop=0.):
    """Given a pair of sequences, where each sequence is (vals, length),
    sum the values, returning a single sequence.

    If input is:
        ((vals1, length), (vals2, length))
    Output is:
        (vals1+vals2, length)

    vals are a single tensor for the whole batch.

    X: Pair of ``(vals, length)`` tuples; both lengths must compare equal.
    drop: Unused here; accepted so the function matches the layer signature.
    RETURNS: ``((vals1+vals2, length), add_tuples_bwd)``.
    """
    (vals1, length1), (vals2, length2) = X
    assert length1 == length2

    def add_tuples_bwd(dY, sgd=None):
        # The same gradient is handed back for both inputs.
        return (dY, dY)

    # `length1` equals `length2` (asserted above); the previous code returned
    # an undefined name `length` here.
    return (vals1 + vals2, length1), add_tuples_bwd
def _zero_init(model):
def _zero_init_impl(self, X, y):
@ -115,13 +89,12 @@ def _init_for_precomputed(W, ops):
nF=Dimension("Number of features"),
nO=Dimension("Output size"),
W=Synapses("Weights matrix",
lambda obj: (obj.nF, obj.nO, obj.nI),
lambda W, ops: _init_for_precomputed(W, ops)),
lambda obj: (obj.nF, obj.nO, obj.nI),
lambda W, ops: _init_for_precomputed(W, ops)),
b=Biases("Bias vector",
lambda obj: (obj.nO,)),
lambda obj: (obj.nO,)),
class PrecomputableAffine(Model):
def __init__(self, nO=None, nI=None, nF=None, **kwargs):
Model.__init__(self, **kwargs)
@ -134,18 +107,19 @@ class PrecomputableAffine(Model):
# Yf: (b, f, i)
# dY: (b, o)
# dYf: (b, f, o)
#Yf = numpy.einsum('bi,foi->bfo', X, self.W)
# Yf = numpy.einsum('bi,foi->bfo', X, self.W)
Yf = self.ops.xp.tensordot(
X, self.W, axes=[[1], [2]])
Yf += self.b
def backward(dY_ids, sgd=None):
tensordot = self.ops.xp.tensordot
dY, ids = dY_ids
Xf = X[ids]
#dXf = numpy.einsum('bo,foi->bfi', dY, self.W)
# dXf = numpy.einsum('bo,foi->bfi', dY, self.W)
dXf = tensordot(dY, self.W, axes=[[1], [1]])
#dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
# dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
dW = tensordot(dY, Xf, axes=[[0], [0]])
# ofi -> foi
self.d_W += dW.transpose((1, 0, 2))
@ -154,6 +128,7 @@ class PrecomputableAffine(Model):
if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id)
return dXf
return Yf, backward
@ -164,13 +139,12 @@ class PrecomputableAffine(Model):
nP=Dimension("Number of pieces"),
nO=Dimension("Output size"),
W=Synapses("Weights matrix",
lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI),
lambda W, ops: ops.xavier_uniform_init(W)),
lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI),
lambda W, ops: ops.xavier_uniform_init(W)),
b=Biases("Bias vector",
lambda obj: (obj.nO, obj.nP)),
lambda obj: (obj.nO, obj.nP)),
class PrecomputableMaxouts(Model):
def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs):
Model.__init__(self, **kwargs)
@ -186,114 +160,26 @@ class PrecomputableMaxouts(Model):
# dYp: (b, o, p)
# W: (f, o, p, i)
# b: (o, p)
# bi,opfi->bfop
# bop,fopi->bfi
# bop,fbi->opfi : fopi
tensordot = self.ops.xp.tensordot
ascontiguous = self.ops.xp.ascontiguousarray
Yfp = tensordot(X, self.W, axes=[[1], [3]])
Yfp += self.b
def backward(dYp_ids, sgd=None):
dYp, ids = dYp_ids
Xf = X[ids]
dXf = tensordot(dYp, self.W, axes=[[1, 2], [1,2]])
dXf = tensordot(dYp, self.W, axes=[[1, 2], [1, 2]])
dW = tensordot(dYp, Xf, axes=[[0], [0]])
self.d_W += dW.transpose((2, 0, 1, 3))
self.d_b += dYp.sum(axis=0)
if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id)
return dXf
return Yfp, backward
# Thinc's Embed class is a bit broken atm, so drop this here.
from thinc import describe
from thinc.neural._classes.embed import _uniform_init
nV=describe.Dimension("Number of vectors"),
nO=describe.Dimension("Size of output"),
vectors=describe.Weights("Embedding table",
lambda obj: (obj.nV, obj.nO),
_uniform_init(-0.1, 0.1)
class Embed(Model):
    """Embedding layer that looks up rows of `self.vectors` by integer ID.

    When the input IDs are 2-dimensional, only column `self.column` is used.
    """
    name = 'embed'

    def __init__(self, nO, nV=None, **kwargs):
        """Store the layer dimensions and optional settings.

        nO: Stored as `self.nO`.
        nV: Stored as `self.nV`, incremented by one when not None.
        **kwargs: Passed to ``Model.__init__``. `name` overrides the default
            layer name; `column` selects the ID column (default 0).
        """
        if nV is not None:
            # NOTE(review): presumably reserves one extra row (e.g. for an
            # out-of-range / padding ID) -- confirm against callers.
            nV += 1
        Model.__init__(self, **kwargs)
        if 'name' in kwargs:
            self.name = kwargs['name']
        self.column = kwargs.get('column', 0)
        self.nO = nO
        self.nV = nV

    def predict(self, ids):
        """Return the rows of the table for `ids` as a contiguous 'f' array."""
        if ids.ndim == 2:
            ids = ids[:, self.column]
        return self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f')

    def begin_update(self, ids, drop=0.):
        """Look up `ids` and return ``(vectors, backprop_embed)``.

        `drop` is accepted for interface compatibility and is not used.
        """
        if ids.ndim == 2:
            ids = ids[:, self.column]
        vectors = self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f')

        def backprop_embed(d_vectors, sgd=None):
            # Accumulate the gradient into the rows that were looked up.
            self.ops.scatter_add(self.d_vectors, ids, d_vectors)
            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
            # No gradient is returned for the input IDs.
            return None

        return vectors, backprop_embed
def HistoryFeatures(nr_class, hist_size=8, nr_dim=8):
    '''Wrap a model, adding features representing action history.

    nr_class: Passed to each `Embed` table as its `nV` argument.
    hist_size: Number of history columns; one `Embed` table is created per
        column. If 0, a no-op layer is returned instead.
    nr_dim: Passed to each `Embed` table as its `nO` argument.
    RETURNS: A layer taking ``(vectors, hist_ids)`` and returning `vectors`
        horizontally stacked with the embedded history features.
    '''
    if hist_size == 0:
        return layerize(noop())
    # Give each table a distinct name; previously the '%d' placeholder was
    # never filled in, so every table was literally named 'embed%d'.
    embed_tables = [Embed(nr_dim, nr_class, column=i, name='embed%d' % i)
                    for i in range(hist_size)]
    embed = chain(concatenate(*embed_tables),
                  LN(Maxout(hist_size*nr_dim, hist_size*nr_dim)))
    ops = embed.ops

    def add_history_fwd(vectors_hists, drop=0.):
        vectors, hist_ids = vectors_hists
        hist_feats, bp_hists = embed.begin_update(hist_ids, drop=drop)
        outputs = ops.xp.hstack((vectors, hist_feats))

        def add_history_bwd(d_outputs, sgd=None):
            # Split the gradient back into the part belonging to the input
            # vectors and the part belonging to the history features.
            d_vectors = d_outputs[:, :vectors.shape[1]]
            d_hists = d_outputs[:, vectors.shape[1]:]
            bp_hists(d_hists, sgd=sgd)
            return embed.ops.xp.ascontiguousarray(d_vectors)

        return outputs, add_history_bwd

    return wrap(add_history_fwd, embed)
def drop_layer(layer, factor=2.):
    """Wrap `layer` so it may be bypassed entirely when dropout is active.

    With ``drop <= 0`` the layer always runs. Otherwise a random number is
    drawn; if it, divided by `factor`, is below `drop`, the input is passed
    through unchanged with an identity backward pass. The returned model's
    `predict` is set to `layer`.
    """
    def _passthrough_bwd(dX, sgd=None):
        return dX

    def drop_layer_fwd(X, drop=0.):
        if drop <= 0.:
            bypass = False
        else:
            sample = layer.ops.xp.random.random()
            bypass = not ((sample / factor) >= drop)
        if bypass:
            return X, _passthrough_bwd
        return layer.begin_update(X, drop=drop)

    wrapped = wrap(drop_layer_fwd, layer)
    wrapped.predict = layer
    return wrapped
def link_vectors_to_models(vocab):
vectors = vocab.vectors
@ -308,16 +194,21 @@ def link_vectors_to_models(vocab):
# (unideal, I know)
thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
def Tok2Vec(width, embed_size, **kwargs):
pretrained_dims = kwargs.get('pretrained_dims', 0)
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
'*': reapply}):
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone,
'+': add, '*': reapply}):
norm = HashEmbed(width, embed_size, column=cols.index(NORM),
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX),
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX),
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
if pretrained_dims is not None and pretrained_dims >= 1:
glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
@ -329,7 +220,6 @@ def Tok2Vec(width, embed_size, **kwargs):
(norm | prefix | suffix | shape)
>> LN(Maxout(width, width*4, pieces=3)), column=5)
convolution = Residual(
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
@ -354,6 +244,7 @@ def reapply(layer, n_times):
Y, backprop = layer.begin_update(X, drop=drop)
X = Y
def reapply_bwd(dY, sgd=None):
dX = None
for backprop in reversed(backprops):
@ -363,39 +254,20 @@ def reapply(layer, n_times):
dX += dY
return dX
return Y, reapply_bwd
return wrap(reapply_fwd, layer)
def asarray(ops, dtype):
    """Create a layer that converts its input with ``ops.asarray``.

    The layer has no backward pass (the callback is None).
    """
    def forward(X, drop=0.):
        converted = ops.asarray(X, dtype=dtype)
        return converted, None

    as_layer = layerize(forward)
    return as_layer
def foreach(layer):
    """Create a layer that applies `layer` to each element of a list.

    layer: Model whose ``begin_update`` is called once per input element.
    RETURNS: A layer mapping a list of inputs to a list of outputs. Its
        backward pass maps a list of output gradients to a list of input
        gradients, in the same order.
    """
    def forward(Xs, drop=0.):
        results = []
        backprops = []
        for X in Xs:
            result, bp = layer.begin_update(X, drop=drop)
            # Keep each output and its callback; without this the layer
            # would always return an empty list.
            results.append(result)
            backprops.append(bp)

        def backward(d_results, sgd=None):
            dXs = []
            for d_result, backprop in zip(d_results, backprops):
                dXs.append(backprop(d_result, sgd))
            return dXs

        return results, backward

    model = layerize(forward)
    return model
def rebatch(size, layer):
ops = layer.ops
def forward(X, drop=0.):
if X.shape[0] < size:
return layer.begin_update(X)
@ -403,6 +275,7 @@ def rebatch(size, layer):
results, bp_results = zip(*[layer.begin_update(p, drop=drop)
for p in parts])
y = ops.flatten(results)
def backward(dy, sgd=None):
d_parts = [bp(y, sgd=sgd) for bp, y in
zip(bp_results, _divide_array(dy, size))]
@ -413,6 +286,7 @@ def rebatch(size, layer):
except ValueError:
dX = None
return dX
return y, backward
model = layerize(forward)
@ -423,13 +297,14 @@ def _divide_array(X, size):
parts = []
index = 0
while index < len(X):
parts.append(X[index : index + size])
parts.append(X[index:index + size])
index += size
return parts
def get_col(idx):
assert idx >= 0, idx
def forward(X, drop=0.):
assert idx >= 0, idx
if isinstance(X, numpy.ndarray):
@ -437,30 +312,28 @@ def get_col(idx):
ops = CupyOps()
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
def backward(y, sgd=None):
assert idx >= 0, idx
dX = ops.allocate(X.shape)
dX[:, idx] += y
return dX
return output, backward
return layerize(forward)
def zero_init(model):
def _hook(self, X, y=None):
return model
def doc2feats(cols=None):
if cols is None:
def forward(docs, drop=0.):
feats = []
for doc in docs:
return feats, None
model = layerize(forward)
model.cols = cols
return model
@ -474,28 +347,14 @@ def print_shape(prefix):
def get_token_vectors(tokens_attrs_vectors, drop=0.):
    """Select the vectors from a ``(tokens, attrs, vectors)`` triple.

    tokens_attrs_vectors: 3-tuple; only `tokens` and `vectors` are used.
    drop: Unused here; accepted so the function matches the layer signature.
    RETURNS: ``(vectors, backward)`` where ``backward(d_output)`` returns
        ``(tokens, d_output)``.
    """
    tokens, attrs, vectors = tokens_attrs_vectors

    def backward(d_output, sgd=None):
        # Pair the gradient with the tokens it belongs to.
        return (tokens, d_output)

    return vectors, backward
def flatten(seqs, drop=0.):
    """Stack a list of arrays into one array.

    seqs: Non-empty list of numpy or cupy arrays; the backend is chosen from
        the type of the first element.
    drop: Unused here; accepted so the function matches the layer signature.
    RETURNS: ``(X, finish_update)`` where `X` is ``vstack(seqs)`` and
        `finish_update` unflattens a gradient using the original lengths.
    RAISES (ValueError): If the first element is neither a numpy nor a cupy
        array.
    """
    first = seqs[0]
    if isinstance(first, numpy.ndarray):
        ops = NumpyOps()
    elif hasattr(CupyOps.xp, 'ndarray') and isinstance(first, CupyOps.xp.ndarray):
        ops = CupyOps()
    else:
        # Only reached for unsupported input types; without the `else` this
        # error would be raised for every input.
        raise ValueError("Unable to flatten sequence of type %s" % type(first))
    lengths = [len(seq) for seq in seqs]

    def finish_update(d_X, sgd=None):
        return ops.unflatten(d_X, lengths)

    X = ops.xp.vstack(seqs)
    return X, finish_update
def logistic(X, drop=0.):
xp = get_array_module(X)
@ -505,9 +364,11 @@ def logistic(X, drop=0.):
X = xp.minimum(X, 10., X)
X = xp.maximum(X, -10., X)
Y = 1. / (1. + xp.exp(-X))
def logistic_bwd(dY, sgd=None):
dX = dY * (Y * (1-Y))
return dX
return Y, logistic_bwd
@ -517,6 +378,7 @@ def zero_init(model):
return model
def preprocess_doc(docs, drop=0.):
keys = [doc.to_array([LOWER]) for doc in docs]
@ -526,11 +388,13 @@ def preprocess_doc(docs, drop=0.):
vals = ops.allocate(keys.shape[0]) + 1
return (keys, vals, lengths), None
def getitem(i):
    """Create a layer that returns element `i` of its input.

    The layer has no backward pass (the callback is None).
    """
    def getitem_fwd(X, drop=0.):
        selected = X[i]
        return selected, None

    item_layer = layerize(getitem_fwd)
    return item_layer
def build_tagger_model(nr_class, **cfg):
embed_size = util.env_opt('embed_size', 7000)
if 'token_vector_width' in cfg:
@ -555,8 +419,6 @@ def build_tagger_model(nr_class, **cfg):
def SpacyVectors(docs, drop=0.):
xp = get_array_module(docs[0].vocab.vectors.data)
width = docs[0].vocab.vectors.data.shape[1]
batch = []
for doc in docs:
indices = numpy.zeros((len(doc),), dtype='i')
@ -570,29 +432,6 @@ def SpacyVectors(docs, drop=0.):
return batch, None
def foreach(layer, drop_factor=1.0):
    '''Map a layer across elements in a list.

    layer: Model whose ``begin_update`` is called once per input element.
    drop_factor: Multiplier applied to the dropout rate before it is passed
        to `layer`.
    RETURNS: A model mapping a list of inputs to a list of outputs. Its
        backward pass returns one input gradient per element, or None where
        either the output gradient or the element's callback is None.
    '''
    def foreach_fwd(Xs, drop=0.):
        drop *= drop_factor
        ys = []
        backprops = []
        for X in Xs:
            y, bp_y = layer.begin_update(X, drop=drop)
            # Keep each output and its callback for the backward pass.
            ys.append(y)
            backprops.append(bp_y)

        def foreach_bwd(d_ys, sgd=None):
            d_Xs = []
            for d_y, bp_y in zip(d_ys, backprops):
                if d_y is not None and bp_y is not None:
                    # Call the element's callback; the previous code passed
                    # `sgd=` to list.append (a TypeError) and never ran it.
                    d_Xs.append(bp_y(d_y, sgd=sgd))
                else:
                    # Keep the output aligned with the inputs.
                    d_Xs.append(None)
            return d_Xs

        return ys, foreach_bwd

    model = wrap(foreach_fwd, layer)
    return model
def build_text_classifier(nr_class, width=64, **cfg):
nr_vector = cfg.get('nr_vector', 5000)
pretrained_dims = cfg.get('pretrained_dims', 0)
@ -602,9 +441,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
model = (
>> flatten_add_lengths
>> with_getitem(0,
Affine(width, pretrained_dims)
>> with_getitem(0, Affine(width, pretrained_dims))
>> ParametricAttention(width)
>> Pooling(sum_pool)
>> Residual(ReLu(width, width)) ** 2
@ -613,7 +450,6 @@ def build_text_classifier(nr_class, width=64, **cfg):
return model
lower = HashEmbed(width, nr_vector, column=1)
prefix = HashEmbed(width//2, nr_vector, column=2)
suffix = HashEmbed(width//2, nr_vector, column=3)
@ -671,33 +507,40 @@ def build_text_classifier(nr_class, width=64, **cfg):
model.lsuv = False
return model
def flatten(seqs, drop=0.):
    """Flatten a batch of sequences into one array using ``Model.ops``.

    seqs: Batch of sequences; each must support ``len()``.
    drop: Unused here; accepted so the function matches the layer signature.
    RETURNS: ``(X, finish_update)`` where `finish_update` unflattens a
        gradient using the original sequence lengths (no padding).
    """
    backend = Model.ops
    seq_lengths = backend.asarray([len(item) for item in seqs], dtype='i')
    flat = backend.flatten(seqs, pad=0)

    def finish_update(d_X, sgd=None):
        return backend.unflatten(d_X, seq_lengths, pad=0)

    return flat, finish_update
def concatenate_lists(*layers, **kwargs): # pragma: no cover
'''Compose two or more models `f`, `g`, etc, such that their outputs are
def concatenate_lists(*layers, **kwargs): # pragma: no cover
"""Compose two or more models `f`, `g`, etc, such that their outputs are
concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
if not layers:
return noop()
drop_factor = kwargs.get('drop_factor', 1.0)
ops = layers[0].ops
layers = [chain(layer, flatten) for layer in layers]
concat = concatenate(*layers)
def concatenate_lists_fwd(Xs, drop=0.):
drop *= drop_factor
lengths = ops.asarray([len(X) for X in Xs], dtype='i')
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
ys = ops.unflatten(flat_y, lengths)
def concatenate_lists_bwd(d_ys, sgd=None):
return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
return ys, concatenate_lists_bwd
model = wrap(concatenate_lists_fwd, concat)
return model
@ -101,17 +101,12 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
Normalize a dictionary of attributes, converting them to ints.
stringy_attrs (dict):
Dictionary keyed by attribute string names. Values can be ints or strings.
strings_map (StringStore):
Defaults to None. If provided, encodes string values into ints.
inty_attrs (dict):
Attributes dictionary with keys and optionally values converted to
stringy_attrs (dict): Dictionary keyed by attribute string names. Values
can be ints or strings.
strings_map (StringStore): Defaults to None. If provided, encodes string
values into ints.
RETURNS (dict): Attributes dictionary with keys and optionally values
converted to ints.
inty_attrs = {}
if _do_deprecated:
@ -7,10 +7,9 @@ from pathlib import Path
from .converters import conllu2json, iob2json, conll_ner2json
from ..util import prints
# Converters are matched by file extension. To add a converter, add a new entry
# to this dict with the file extension mapped to the converter function imported
# from /converters.
# Converters are matched by file extension. To add a converter, add a new
# entry to this dict with the file extension mapped to the converter function
# imported from /converters.
'conllu': conllu2json,
'conll': conllu2json,
@ -24,8 +23,7 @@ CONVERTERS = {
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", int),
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
morphology=("Enable appending morphology to tags", "flag", "m", bool))
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
@ -40,7 +38,7 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
prints(output_path, title="Output directory not found", exits=1)
if converter == 'auto':
converter = input_path.suffix[1:]
if not converter in CONVERTERS:
if converter not in CONVERTERS:
prints("Can't find converter for %s" % converter,
title="Unknown format", exits=1)
func = CONVERTERS[converter]
@ -8,7 +8,8 @@ from ...gold import iob_to_biluo
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
Convert files in the CoNLL-2003 NER format into JSON format for use with train cli.
Convert files in the CoNLL-2003 NER format into JSON format for use with
train cli.
docs = read_conll_ner(input_path)
@ -13,10 +13,9 @@ from .. import about
model=("model to download (shortcut or model name)", "positional", None, str),
model=("model to download, shortcut or name)", "positional", None, str),
direct=("force direct download. Needs model name with version and won't "
"perform compatibility check", "flag", "d", bool)
"perform compatibility check", "flag", "d", bool))
def download(cmd, model, direct=False):
Download compatible model from default download path using pip. Model
@ -30,21 +29,25 @@ def download(cmd, model, direct=False):
model_name = shortcuts.get(model, model)
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name,
if dl == 0:
# Get package path here because link uses
# pip.get_installed_distributions() to check if model is a package,
# which fails if model was just installed via subprocess
# pip.get_installed_distributions() to check if model is a
# package, which fails if model was just installed via
# subprocess
package_path = get_package_path(model_name)
link(None, model_name, model, force=True, model_path=package_path)
link(None, model_name, model, force=True,
# Dirty, but since spacy.download and the auto-linking is mostly
# a convenience wrapper, it's best to show a success message and
# loading instructions, even if linking fails.
prints("Creating a shortcut link for 'en' didn't work (maybe you "
"don't have admin permissions?), but you can still load "
"the model via its full package name:",
# Dirty, but since spacy.download and the auto-linking is
# mostly a convenience wrapper, it's best to show a success
# message and loading instructions, even if linking fails.
"Creating a shortcut link for 'en' didn't work (maybe "
"you don't have admin permissions?), but you can still "
"load the model via its full package name:",
"nlp = spacy.load('%s')" % model_name,
title="Download successful")
@ -52,9 +55,10 @@ def download(cmd, model, direct=False):
def get_json(url, desc):
r = requests.get(url)
if r.status_code != 200:
prints("Couldn't fetch %s. Please find a model for your spaCy installation "
"(v%s), and download it manually." % (desc, about.__version__),
about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1)
msg = ("Couldn't fetch %s. Please find a model for your spaCy "
"installation (v%s), and download it manually.")
prints(msg % (desc, about.__version__), about.__docs_models__,
title="Server error (%d)" % r.status_code, exits=1)
return r.json()
@ -71,13 +75,13 @@ def get_compatibility():
def get_version(model, comp):
if model not in comp:
version = about.__version__
prints("No compatible model found for '%s' (spaCy v%s)." % (model, version),
title="Compatibility error", exits=1)
msg = "No compatible model found for '%s' (spaCy v%s)."
prints(msg % (model, version), title="Compatibility error", exits=1)
return comp[model][0]
def download_model(filename):
download_url = about.__download_url__ + '/' + filename
return subprocess.call([sys.executable, '-m',
'pip', 'install', '--no-cache-dir', download_url],
return subprocess.call(
[sys.executable, '-m', 'pip', 'install', '--no-cache-dir',
download_url], env=os.environ.copy())
@ -2,27 +2,15 @@
from __future__ import unicode_literals, division, print_function
import plac
import json
from collections import defaultdict
import cytoolz
from pathlib import Path
import dill
import tqdm
from thinc.neural._classes.model import Model
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer
import random
import numpy.random
from ..tokens.doc import Doc
from ..scorer import Scorer
from ..gold import GoldParse, merge_sents
from ..gold import GoldCorpus, minibatch
from ..gold import GoldCorpus
from ..util import prints
from .. import util
from .. import about
from .. import displacy
from ..compat import json_dumps
@ -30,17 +18,18 @@ numpy.random.seed(0)
model=("Model name or path", "positional", None, str),
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
data_path=("Location of JSON-formatted evaluation data", "positional",
None, str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
gpu_id=("Use GPU", "option", "g", int),
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)
displacy_path=("Directory to output rendered parses as HTML", "option",
"dp", str),
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int))
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
displacy_path=None, displacy_limit=25):
Evaluate a model. To render a sample of parses in a HTML file, set an output
directory as the displacy_path argument.
Evaluate a model. To render a sample of parses in a HTML file, set an
output directory as the displacy_path argument.
if gpu_id >= 0:
@ -50,7 +39,8 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
if not data_path.exists():
prints(data_path, title="Evaluation data not found", exits=1)
if displacy_path and not displacy_path.exists():
prints(displacy_path, title="Visualization output directory not found", exits=1)
prints(displacy_path, title="Visualization output directory not found",
corpus = GoldCorpus(data_path, data_path)
nlp = util.load_model(model)
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
@ -64,12 +54,14 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
docs, golds = zip(*dev_docs)
render_deps = 'parser' in nlp.meta.get('pipeline', [])
render_ents = 'ner' in nlp.meta.get('pipeline', [])
render_parses(docs, displacy_path, model_name=model, limit=displacy_limit,
deps=render_deps, ents=render_ents)
prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit)
render_parses(docs, displacy_path, model_name=model,
limit=displacy_limit, deps=render_deps, ents=render_ents)
msg = "Generated %s parses as HTML" % displacy_limit
prints(displacy_path, title=msg)
def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True):
def render_parses(docs, output_path, model_name='', limit=250, deps=True,
docs[0].user_data['title'] = model_name
if ents:
with (output_path / 'entities.html').open('w') as file_:
@ -77,7 +69,8 @@ def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=T
if deps:
with (output_path / 'parses.html').open('w') as file_:
html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True})
html = displacy.render(docs[:limit], style='dep', page=True,
options={'compact': True})
@ -12,8 +12,7 @@ from .. import util
model=("optional: shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
markdown=("generate Markdown for GitHub issues", "flag", "md", str))
def info(cmd, model=None, markdown=False):
"""Print info about spaCy installation. If a model shortcut link is
specified as an argument, print model information. Flag --markdown
@ -12,8 +12,7 @@ from .. import util
origin=("package name or local path to model", "positional", None, str),
link_name=("name of shortuct link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
force=("force overwriting of existing link", "flag", "f", bool))
def link(cmd, origin, link_name, force=False, model_path=None):
Create a symlink for models within the spacy/data directory. Accepts
@ -46,8 +45,9 @@ def link(cmd, origin, link_name, force=False, model_path=None):
# This is quite dirty, but just making sure other errors are caught.
prints("Creating a symlink in spacy/data failed. Make sure you have "
"the required permissions and try re-running the command as "
"admin, or use a virtualenv. You can still import the model as a "
"module and call its load() method, or create the symlink manually.",
"admin, or use a virtualenv. You can still import the model as "
"a module and call its load() method, or create the symlink "
"%s --> %s" % (path2str(model_path), path2str(link_path)),
title="Error: Couldn't link model to '%s'" % link_name)
@ -16,10 +16,12 @@ from .. import about
input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str),
meta_path=("path to meta.json", "option", "m", str),
create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
create_meta=("create meta.json, even if one exists in directory", "flag",
"c", bool),
force=("force overwriting of existing folder in output directory", "flag",
"f", bool))
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
@ -52,13 +54,15 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force
package_path = main_path / model_name
create_dirs(package_path, force)
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
path2str(package_path / model_name_v))
create_file(main_path / 'meta.json', json_dumps(meta))
create_file(main_path / 'setup.py', template_setup)
create_file(main_path / 'MANIFEST.in', template_manifest)
create_file(package_path / '__init__.py', template_init)
prints(main_path, "To build the package, run `python setup.py sdist` in this "
"directory.", title="Successfully created package '%s'" % model_name_v)
prints(main_path, "To build the package, run `python setup.py sdist` in "
"this directory.",
title="Successfully created package '%s'" % model_name_v)
def create_dirs(package_path, force):
@ -66,9 +70,10 @@ def create_dirs(package_path, force):
if force:
prints(package_path, "Please delete the directory and try again, or "
"use the --force flag to overwrite existing directories.",
title="Package directory already exists", exits=1)
prints(package_path, "Please delete the directory and try again, "
"or use the --force flag to overwrite existing "
"directories.", title="Package directory already exists",
Path.mkdir(package_path, parents=True)
@ -82,7 +87,8 @@ def generate_meta(model_path):
settings = [('lang', 'Model language', 'en'),
('name', 'Model name', 'model'),
('version', 'Model version', '0.0.0'),
('spacy_version', 'Required spaCy version', '>=%s,<3.0.0' % about.__version__),
('spacy_version', 'Required spaCy version',
'>=%s,<3.0.0' % about.__version__),
('description', 'Model description', False),
('author', 'Author', False),
('email', 'Author email', False),
@ -27,15 +27,15 @@ def read_inputs(loc):
lang=("model/language", "positional", None, str),
inputs=("Location of input file", "positional", None, read_inputs)
inputs=("Location of input file", "positional", None, read_inputs))
def profile(cmd, lang, inputs=None):
Profile a spaCy pipeline, to find out which functions take the most time.
nlp = spacy.load(lang)
nlp = spacy.load(lang)
texts = list(cytoolz.take(10000, inputs))
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
s = pstats.Stats("Profile.prof")
@ -2,21 +2,14 @@
from __future__ import unicode_literals, division, print_function
import plac
import json
from collections import defaultdict
import cytoolz
from pathlib import Path
import dill
import tqdm
from thinc.neural._classes.model import Model
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer
import random
import numpy.random
from ..tokens.doc import Doc
from ..scorer import Scorer
from ..gold import GoldParse, merge_sents
from ..gold import GoldCorpus, minibatch
from ..util import prints
from .. import util
@ -31,8 +24,10 @@ numpy.random.seed(0)
lang=("model language", "positional", None, str),
output_dir=("output directory to store model in", "positional", None, str),
train_data=("location of JSON-formatted training data", "positional", None, str),
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
train_data=("location of JSON-formatted training data", "positional",
None, str),
dev_data=("location of JSON-formatted development data (optional)",
"positional", None, str),
n_iter=("number of iterations", "option", "n", int),
n_sents=("number of sentences", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int),
@ -42,11 +37,12 @@ numpy.random.seed(0)
no_entities=("Don't train NER", "flag", "N", bool),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
version=("Model version", "option", "V", str),
meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
meta_path=("Optional path to meta.json. All relevant properties will be "
"overwritten.", "option", "m", Path))
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
gold_preproc=False, version="0.0.0", meta_path=None):
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False,
no_entities=False, gold_preproc=False, version="0.0.0",
Train a model. Expects data in spaCy's JSON format.
@ -72,9 +68,12 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
meta.setdefault('name', 'unnamed')
pipeline = ['tagger', 'parser', 'ner']
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
if no_parser and 'parser' in pipeline: pipeline.remove('parser')
if no_entities and 'ner' in pipeline: pipeline.remove('ner')
if no_tagger and 'tagger' in pipeline:
if no_parser and 'parser' in pipeline:
if no_entities and 'ner' in pipeline:
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
@ -139,7 +138,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
scorer = nlp_loaded.evaluate(dev_docs)
end_time = timer()
cpu_wps = nwords/(end_time-start_time)
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_:
meta_loc = output_path / ('model%d' % i) / 'meta.json'
@ -157,7 +156,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
with meta_loc.open('w') as file_:
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
print("Saving model...")
@ -1,5 +1,5 @@
# coding: utf8
from __future__ import unicode_literals
from __future__ import unicode_literals, print_function
import requests
import pkg_resources
@ -29,8 +29,10 @@ def validate(cmd):
model_links = get_model_links(current_compat)
model_pkgs = get_model_pkgs(current_compat, all_models)
incompat_links = {l for l, d in model_links.items() if not d['compat']}
incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']}
incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']])
incompat_models = {d['name'] for _, d in model_pkgs.items()
if not d['compat']}
incompat_models.update([d['name'] for _, d in model_links.items()
if not d['compat']])
na_models = [m for m in incompat_models if m not in current_compat]
update_models = [m for m in incompat_models if m in current_compat]
@ -90,7 +92,6 @@ def get_model_pkgs(compat, all_models):
def get_model_row(compat, name, data, type='package'):
tpl_row = ' {:<10}' + (' {:<20}' * 4)
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
if data['compat']:
@ -110,7 +111,8 @@ def get_row(*args):
def is_model_path(model_path):
    """Check whether a path looks like a model directory.

    model_path (Path): Candidate path to check.
    RETURNS (bool): True if the path is an existing directory whose last
        component is not a cache directory name and does not start with a
        dot.
    """
    exclude = ['cache', 'pycache', '__pycache__']
    name = model_path.parts[-1]
    return (model_path.is_dir() and name not in exclude
            and not name.startswith('.'))
def is_compat(compat, name, version):
@ -118,6 +120,7 @@ def is_compat(compat, name, version):
def reformat_version(version):
    """Hack to reformat old versions ending on '-alpha' to match pip format.

    version (unicode): Version string to reformat.
    RETURNS (unicode): The reformatted version. If the input ends with
        '-alpha', each '-alpha' is replaced by 'a0'; otherwise each
        '-alpha' is replaced by 'a'. Strings without '-alpha' are returned
        unchanged.
    """
    if version.endswith('-alpha'):
        return version.replace('-alpha', 'a0')
    return version.replace('-alpha', 'a')
@ -87,15 +87,15 @@ def symlink_to(orig, dest):
def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
    """Check whether the current environment matches the given settings.

    Every argument left as None is ignored. Each argument that is set must
    equal the corresponding flag this function reads from the enclosing
    scope (is_python2, is_python3, is_windows, is_linux, is_osx).

    python2 (bool): Required value of is_python2, or None to skip.
    python3 (bool): Required value of is_python3, or None to skip.
    windows (bool): Required value of is_windows, or None to skip.
    linux (bool): Required value of is_linux, or None to skip.
    osx (bool): Required value of is_osx, or None to skip.
    RETURNS (bool): True if all settings that were given match.
    """
    # Identity checks against None: `== None` can be overridden by __eq__.
    return ((python2 is None or python2 == is_python2) and
            (python3 is None or python3 == is_python3) and
            (windows is None or windows == is_windows) and
            (linux is None or linux == is_linux) and
            (osx is None or osx == is_osx))
def normalize_string_keys(old):
'''Given a dictionary, make sure keys are unicode strings, not bytes.'''
"""Given a dictionary, make sure keys are unicode strings, not bytes."""
new = {}
for key, value in old.items():
if isinstance(key, bytes_):
@ -24,7 +24,7 @@ def depr_model_download(lang):
def resolve_load_name(name, **overrides):
"""Resolve model loading if deprecated path kwarg is specified in overrides.
"""Resolve model loading if deprecated path kwarg in overrides.
name (unicode): Name of model to load.
**overrides: Overrides specified in spacy.load().
@ -32,8 +32,9 @@ def resolve_load_name(name, **overrides):
if overrides.get('path') not in (None, False, True):
name = overrides.get('path')
prints("To load a model from a path, you can now use the first argument. "
"The model meta is used to load the required Language class.",
"OLD: spacy.load('en', path='/some/path')", "NEW: spacy.load('/some/path')",
prints("To load a model from a path, you can now use the first "
"argument. The model meta is used to load the Language class.",
"OLD: spacy.load('en', path='/some/path')",
"NEW: spacy.load('/some/path')",
title="Warning: deprecated argument 'path'")
return name
@ -12,7 +12,7 @@ IS_JUPYTER = is_in_jupyter()
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
options={}, manual=False):
options={}, manual=False):
"""Render displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
@ -21,7 +21,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
minify (bool): Minify HTML markup.
jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
RETURNS (unicode): Rendered HTML markup.
factories = {'dep': (DependencyRenderer, parse_deps),
@ -35,7 +35,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
parsed = [converter(doc, options) for doc in docs] if not manual else docs
_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
html = _html['parsed']
if jupyter: # return HTML rendered by IPython display()
if jupyter: # return HTML rendered by IPython display()
from IPython.core.display import display, HTML
return display(HTML(html))
return html
@ -50,13 +50,15 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
port (int): Port to serve visualisation.
from wsgiref import simple_server
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
render(docs, style=style, page=page, minify=minify, options=options,
httpd = simple_server.make_server('', port, app)
prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
prints("Using the '%s' visualizer" % style,
title="Serving on port %d..." % port)
except KeyboardInterrupt:
@ -67,7 +69,8 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
def app(environ, start_response):
    """WSGI application that responds with the markup in `_html['parsed']`.

    environ: WSGI environment; not read by this function.
    start_response (callable): WSGI callback, invoked with the status
        string and the list of header tuples.
    RETURNS (list): One-element list holding the markup encoded as UTF-8.
    """
    # headers and status need to be bytes in Python 2, see #1227
    headers = [(b_to_str(b'Content-type'),
                b_to_str(b'text/html; charset=utf-8'))]
    start_response(b_to_str(b'200 OK'), headers)
    res = _html['parsed'].encode(encoding='utf-8')
    return [res]
@ -89,9 +92,9 @@ def parse_deps(orig_doc, options={}):
end = word.i + 1
while end < len(doc) and doc[end].is_punct:
end += 1
span = doc[start : end]
span = doc[start:end]
spans.append((span.start_char, span.end_char, word.tag_,
word.lemma_, word.ent_type_))
word.lemma_, word.ent_type_))
for span_props in spans:
words = [{'text': w.text, 'tag': w.tag_} for w in doc]
@ -113,6 +116,7 @@ def parse_ents(doc, options={}):
RETURNS (dict): Generated entities keyed by text (original text) and ents.
ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
for ent in doc.ents]
title = doc.user_data.get('title', None) if hasattr(doc, 'user_data') else None
for ent in doc.ents]
title = (doc.user_data.get('title', None)
if hasattr(doc, 'user_data') else None)
return {'text': doc.text, 'ents': ents, 'title': title}
@ -14,13 +14,15 @@ class DependencyRenderer(object):
"""Initialise dependency renderer.
options (dict): Visualiser-specific options (compact, word_spacing,
arrow_spacing, arrow_width, arrow_stroke, distance,
offset_x, color, bg, font)
arrow_spacing, arrow_width, arrow_stroke, distance, offset_x,
color, bg, font)
self.compact = options.get('compact', False)
self.word_spacing = options.get('word_spacing', 45)
self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
self.arrow_spacing = options.get('arrow_spacing',
12 if self.compact else 20)
self.arrow_width = options.get('arrow_width',
6 if self.compact else 10)
self.arrow_stroke = options.get('arrow_stroke', 2)
self.distance = options.get('distance', 150 if self.compact else 175)
self.offset_x = options.get('offset_x', 50)
@ -39,7 +41,8 @@ class DependencyRenderer(object):
rendered = [self.render_svg(i, p['words'], p['arcs'])
for i, p in enumerate(parsed)]
if page:
content = ''.join([TPL_FIGURE.format(content=svg) for svg in rendered])
content = ''.join([TPL_FIGURE.format(content=svg)
for svg in rendered])
markup = TPL_PAGE.format(content=content)
markup = ''.join(rendered)
@ -63,12 +66,13 @@ class DependencyRenderer(object):
self.id = render_id
words = [self.render_word(w['text'], w['tag'], i)
for i, w in enumerate(words)]
arcs = [self.render_arrow(a['label'], a['start'], a['end'], a['dir'], i)
arcs = [self.render_arrow(a['label'], a['start'],
a['end'], a['dir'], i)
for i, a in enumerate(arcs)]
content = ''.join(words) + ''.join(arcs)
return TPL_DEP_SVG.format(id=self.id, width=self.width, height=self.height,
color=self.color, bg=self.bg, font=self.font,
return TPL_DEP_SVG.format(id=self.id, width=self.width,
height=self.height, color=self.color,
bg=self.bg, font=self.font, content=content)
def render_word(self, text, tag, i):
"""Render individual word.
@ -96,7 +100,7 @@ class DependencyRenderer(object):
x_start = self.offset_x+start*self.distance+self.arrow_spacing
y = self.offset_y
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
- self.arrow_spacing*(self.highest_level-level)/4)
y_curve = self.offset_y-level*self.distance/2
if self.compact:
y_curve = self.offset_y-level*self.distance/6
@ -133,8 +137,10 @@ class DependencyRenderer(object):
if direction is 'left':
pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2)
pos1, pos2, pos3 = (end, end+self.arrow_width-2, end-self.arrow_width+2)
arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3, y-self.arrow_width)
pos1, pos2, pos3 = (end, end+self.arrow_width-2,
arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3,
return "M{},{} L{},{} {},{}".format(*arrowhead)
def get_levels(self, arcs):
@ -159,9 +165,10 @@ class EntityRenderer(object):
colors = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74',
'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb',
'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LANGUAGE': '#ff8197',
'WORK_OF_ART': '#f0d0ff', 'DATE': '#bfe1d9', 'TIME': '#bfe1d9',
'MONEY': '#e4e7d2', 'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2',
'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LAW': '#ff8197',
'LANGUAGE': '#ff8197', 'WORK_OF_ART': '#f0d0ff',
'DATE': '#bfe1d9', 'TIME': '#bfe1d9', 'MONEY': '#e4e7d2',
'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2',
'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'}
colors.update(options.get('colors', {}))
self.default_color = '#ddd'
@ -176,9 +183,11 @@ class EntityRenderer(object):
minify (bool): Minify HTML markup.
RETURNS (unicode): Rendered HTML markup.
rendered = [self.render_ents(p['text'], p['ents'], p.get('title', None)) for p in parsed]
rendered = [self.render_ents(p['text'], p['ents'],
p.get('title', None)) for p in parsed]
if page:
docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered])
docs = ''.join([TPL_FIGURE.format(content=doc)
for doc in rendered])
markup = TPL_PAGE.format(content=docs)
markup = ''.join(rendered)
@ -264,7 +264,6 @@ GLOSSARY = {
'nk': 'noun kernel element',
'nmc': 'numerical component',
'oa': 'accusative object',
'oa': 'second accusative object',
'oc': 'clausal object',
'og': 'genitive object',
'op': 'prepositional object',
@ -2,7 +2,6 @@
# coding: utf8
from __future__ import unicode_literals, print_function
import io
import re
import ujson
import random
@ -10,9 +9,8 @@ import cytoolz
import itertools
from .syntax import nonproj
from .util import ensure_path
from . import util
from .tokens import Doc
from . import util
def tags_to_entities(tags):
@ -54,7 +52,8 @@ def merge_sents(sents):
m_deps[3].extend(head + i for head in heads)
m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
m_brackets.extend((b['first'] + i, b['last'] + i, b['label'])
for b in brackets)
i += len(ids)
return [(m_deps, m_brackets)]
@ -80,6 +79,8 @@ def align(cand_words, gold_words):
punct_re = re.compile(r'\W')
def _min_edit_path(cand_words, gold_words):
Pool mem
@ -98,9 +99,9 @@ def _min_edit_path(cand_words, gold_words):
mem = Pool()
n_cand = len(cand_words)
n_gold = len(gold_words)
# Levenshtein distance, except we need the history, and we may want different
# costs.
# Mark operations with a string, and score the history using _edit_cost.
# Levenshtein distance, except we need the history, and we may want
# different costs. Mark operations with a string, and score the history
# using _edit_cost.
previous_row = []
prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
@ -144,9 +145,9 @@ def _min_edit_path(cand_words, gold_words):
def minibatch(items, size=8):
'''Iterate over batches of items. `size` may be an iterator,
"""Iterate over batches of items. `size` may be an iterator,
so that batch-size can vary on each step.
if isinstance(size, int):
size_ = itertools.repeat(8)
@ -168,6 +169,7 @@ class GoldCorpus(object):
train_path (unicode or Path): File or directory of training data.
dev_path (unicode or Path): File or directory of development data.
RETURNS (GoldCorpus): The newly created object.
self.train_path = util.ensure_path(train_path)
self.dev_path = util.ensure_path(dev_path)
@ -213,7 +215,7 @@ class GoldCorpus(object):
train_tuples = self.train_tuples
if projectivize:
train_tuples = nonproj.preprocess_training_data(
self.train_tuples, label_freq_cutoff=100)
self.train_tuples, label_freq_cutoff=100)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
@ -222,7 +224,6 @@ class GoldCorpus(object):
def dev_docs(self, nlp, gold_preproc=False):
    """Yield the items built from the development data.

    nlp: Passed through unchanged to `self.iter_gold_docs`.
    gold_preproc (bool): Passed through unchanged to `self.iter_gold_docs`.
    YIELDS: Whatever `self.iter_gold_docs` produces for `self.dev_tuples`.
    """
    gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
    yield from gold_docs
@ -233,7 +234,6 @@ class GoldCorpus(object):
raw_text = None
paragraph_tuples = merge_sents(paragraph_tuples)
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
gold_preproc, noise_level=noise_level)
golds = cls._make_golds(docs, paragraph_tuples)
@ -248,17 +248,20 @@ class GoldCorpus(object):
raw_text = add_noise(raw_text, noise_level)
return [nlp.make_doc(raw_text)]
return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples]
return [Doc(nlp.vocab,
words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples]
def _make_golds(cls, docs, paragraph_tuples):
assert len(docs) == len(paragraph_tuples)
if len(docs) == 1:
return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
return [GoldParse.from_annot_tuples(docs[0],
return [GoldParse.from_annot_tuples(doc, sent_tuples)
for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]
for doc, (sent_tuples, brackets)
in zip(docs, paragraph_tuples)]
def walk_corpus(path):
@ -305,7 +308,7 @@ def _corrupt(c, noise_level):
def read_json_file(loc, docs_filter=None, limit=None):
loc = ensure_path(loc)
loc = util.ensure_path(loc)
if loc.is_dir():
for filename in loc.iterdir():
yield from read_json_file(loc / filename, limit=limit)
@ -330,16 +333,16 @@ def read_json_file(loc, docs_filter=None, limit=None):
for i, token in enumerate(sent['tokens']):
heads.append(token.get('head',0) + i)
tags.append(token.get('tag', '-'))
heads.append(token.get('head', 0) + i)
labels.append(token.get('dep', ''))
# Ensure ROOT label is case-insensitive
if labels[-1].lower() == 'root':
labels[-1] = 'ROOT'
ner.append(token.get('ner', '-'))
[ids, words, tags, heads, labels, ner],
sent.get('brackets', [])])
sent.get('brackets', [])])
if sents:
yield [paragraph.get('raw', None), sents]
@ -382,19 +385,21 @@ cdef class GoldParse:
def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
    """Build an instance of `cls` from a six-item annotation tuple.

    doc: The document the annotations refer to; passed to `cls` as-is.
    annot_tuples (tuple): Exactly six items. The first is discarded; the
        other five are forwarded as the `words`, `tags`, `heads`, `deps`
        and `entities` keyword arguments, in that order.
    make_projective (bool): Forwarded to `cls` unchanged.
    RETURNS: The object returned by calling `cls`.
    """
    _, words, tags, heads, deps, entities = annot_tuples
    return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
               entities=entities, make_projective=make_projective)
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
deps=None, entities=None, make_projective=False,
def __init__(self, doc, annot_tuples=None, words=None, tags=None,
heads=None, deps=None, entities=None, make_projective=False,
"""Create a GoldParse.
doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings.
tags (iterable): A sequence of strings, representing tag annotations.
heads (iterable): A sequence of integers, representing syntactic head offsets.
deps (iterable): A sequence of strings, representing the syntactic relation types.
heads (iterable): A sequence of integers, representing syntactic
head offsets.
deps (iterable): A sequence of strings, representing the syntactic
relation types.
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
@ -404,9 +409,10 @@ cdef class GoldParse:
document (usually a sentence). Unlike entity annotations, label
annotations can overlap, i.e. a single word can be covered by
multiple labelled spans. The TextCategorizer component expects
true examples of a label to have the value 1.0, and negative examples
of a label to have the value 0.0. Labels not in the dictionary are
treated as missing -- the gradient for those labels will be zero.
true examples of a label to have the value 1.0, and negative
examples of a label to have the value 0.0. Labels not in the
dictionary are treated as missing - the gradient for those labels
will be zero.
RETURNS (GoldParse): The newly constructed object.
if words is None:
@ -470,11 +476,11 @@ cdef class GoldParse:
self.ner[i] = entities[gold_i]
cycle = nonproj.contains_cycle(self.heads)
if cycle != None:
if cycle is not None:
raise Exception("Cycle found: %s" % cycle)
if make_projective:
proj_heads,_ = nonproj.projectivize(self.heads, self.labels)
proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
self.heads = proj_heads
def __len__(self):
@ -497,20 +503,19 @@ cdef class GoldParse:
def biluo_tags_from_offsets(doc, entities, missing='O'):
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
scheme (BILUO).
"""Encode labelled spans into per-token tags, using the
Begin/In/Last/Unit/Out scheme (BILUO).
doc (Doc): The document that the entity offsets refer to. The output tags
will refer to the token boundaries within the document.
entities (iterable): A sequence of `(start, end, label)` triples. `start` and
`end` should be character-offset integers denoting the slice into the
original string.
entities (iterable): A sequence of `(start, end, label)` triples. `start`
and `end` should be character-offset integers denoting the slice into
the original string.
RETURNS (list): A list of unicode strings, describing the tags. Each tag
string will be of the form either "", "O" or "{action}-{label}", where
action is one of "B", "I", "L", "U". The string "-" is used where the
entity offsets don't align with the tokenization in the `Doc` object. The
training algorithm will view these as missing values. "O" denotes a
entity offsets don't align with the tokenization in the `Doc` object.
The training algorithm will view these as missing values. "O" denotes a
non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
@ -1,31 +1,28 @@
# coding: utf8
from __future__ import absolute_import, unicode_literals
from contextlib import contextmanager
import copy
from thinc.neural import Model
from thinc.neural.optimizers import Adam
import random
import ujson
from collections import OrderedDict
import itertools
import weakref
import functools
import tqdm
from collections import OrderedDict
from contextlib import contextmanager
from copy import copy
from thinc.neural import Model
from thinc.neural.optimizers import Adam
from .tokenizer import Tokenizer
from .vocab import Vocab
from .tagger import Tagger
from .lemmatizer import Lemmatizer
from .pipeline import DependencyParser, Tensorizer, Tagger
from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
from .compat import json_dumps, izip, copy_reg
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
from .pipeline import SimilarityHook, TextCategorizer
from .compat import json_dumps, izip
from .scorer import Scorer
from ._ml import link_vectors_to_models
from .attrs import IS_STOP
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
from .lang.lex_attrs import LEX_ATTRS, is_stop
@ -57,16 +54,18 @@ class BaseDefaults(object):
def create_tokenizer(cls, nlp=None):
rules = cls.tokenizer_exceptions
token_match = cls.token_match
prefix_search = util.compile_prefix_regex(cls.prefixes).search \
if cls.prefixes else None
suffix_search = util.compile_suffix_regex(cls.suffixes).search \
if cls.suffixes else None
infix_finditer = util.compile_infix_regex(cls.infixes).finditer \
if cls.infixes else None
prefix_search = (util.compile_prefix_regex(cls.prefixes).search
if cls.prefixes else None)
suffix_search = (util.compile_suffix_regex(cls.suffixes).search
if cls.suffixes else None)
infix_finditer = (util.compile_infix_regex(cls.infixes).finditer
if cls.infixes else None)
vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
return Tokenizer(vocab, rules=rules,
prefix_search=prefix_search, suffix_search=suffix_search,
infix_finditer=infix_finditer, token_match=token_match)
pipe_names = ['tensorizer', 'tagger', 'parser', 'ner']
token_match = TOKEN_MATCH
@ -98,7 +97,7 @@ class Language(object):
factories = {
'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
'tensorizer': lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
@ -218,14 +217,14 @@ class Language(object):
def add_pipe(self, component, name=None, before=None, after=None,
first=None, last=None):
"""Add a component to the processing pipeline. Valid components are
callables that take a `Doc` object, modify it and return it. Only one of
before, after, first or last can be set. Default behaviour is "last".
callables that take a `Doc` object, modify it and return it. Only one
of before/after/first/last can be set. Default behaviour is "last".
component (callable): The pipeline component.
name (unicode): Name of pipeline component. Overwrites existing
component.name attribute if available. If no name is set and
the component exposes no name attribute, component.__name__ is
used. An error is raised if the name already exists in the pipeline.
used. An error is raised if a name already exists in the pipeline.
before (unicode): Component name to insert component directly before.
after (unicode): Component name to insert component directly after.
first (bool): Insert component first / not first in the pipeline.
@ -240,7 +239,8 @@ class Language(object):
name = component.name
elif hasattr(component, '__name__'):
name = component.__name__
elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'):
elif (hasattr(component, '__class__') and
hasattr(component.__class__, '__name__')):
name = component.__class__.__name__
name = repr(component)
@ -269,7 +269,7 @@ class Language(object):
`name in nlp.pipe_names`.
name (unicode): Name of the component.
RETURNS (bool): Whether a component of that name exists in the pipeline.
RETURNS (bool): Whether a component of the name exists in the pipeline.
return name in self.pipe_names
@ -332,15 +332,12 @@ class Language(object):
return doc
def disable_pipes(self, *names):
'''Disable one or more pipeline components.
If used as a context manager, the pipeline will be restored to the initial
state at the end of the block. Otherwise, a DisabledPipes object is
returned, that has a `.restore()` method you can use to undo your
"""Disable one or more pipeline components. If used as a context
manager, the pipeline will be restored to the initial state at the end
of the block. Otherwise, a DisabledPipes object is returned, that has
a `.restore()` method you can use to undo your changes.
>>> nlp.add_pipe('parser')
>>> nlp.add_pipe('tagger')
>>> with nlp.disable_pipes('parser', 'tagger'):
@ -351,7 +348,7 @@ class Language(object):
>>> assert not nlp.has_pipe('parser')
>>> disabled.restore()
>>> assert nlp.has_pipe('parser')
return DisabledPipes(self, *names)
def make_doc(self, text):
@ -367,14 +364,14 @@ class Language(object):
RETURNS (dict): Results from the update.
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
>>> with nlp.begin_training(gold) as (trainer, optimizer):
>>> for epoch in trainer.epochs(gold):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
if len(docs) != len(golds):
raise IndexError("Update expects same number of docs and golds "
"Got: %d, %d" % (len(docs), len(golds)))
"Got: %d, %d" % (len(docs), len(golds)))
if len(docs) == 0:
if sgd is None:
@ -382,8 +379,10 @@ class Language(object):
self._optimizer = Adam(Model.ops, 0.001)
sgd = self._optimizer
grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
pipes = list(self.pipeline)
for name, proc in pipes:
@ -421,7 +420,7 @@ class Language(object):
L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
beta2=beta2, eps=eps)
self._optimizer.max_grad_norm = max_grad_norm
self._optimizer.device = device
return self._optimizer
@ -461,7 +460,7 @@ class Language(object):
L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
beta2=beta2, eps=eps)
self._optimizer.max_grad_norm = max_grad_norm
self._optimizer.device = device
return self._optimizer
@ -512,17 +511,17 @@ class Language(object):
def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
"""Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading.
"""Process texts as a stream, and yield `Doc` objects in order.
Supports GIL-free multi-threading.
texts (iterator): A sequence of texts to process.
as_tuples (bool):
If set to True, inputs should be a sequence of
(text, context) tuples. Output will then be a sequence of
(doc, context) tuples. Defaults to False.
n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2.
n_threads (int): The number of worker threads to use. If -1, OpenMP
will decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer.
disable (list): Names of the pipeline components to disable.
YIELDS (Doc): Documents in the order of the original text.
@ -546,7 +545,8 @@ class Language(object):
if name in disable:
if hasattr(proc, 'pipe'):
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
docs = proc.pipe(docs, n_threads=n_threads,
# Apply the function, but yield the doc
docs = _pipe(proc, docs)
@ -583,7 +583,7 @@ class Language(object):
will include the model.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
it doesn't exist. Paths may be strings or `Path`-like objects.
disable (list): Names of pipeline components to disable and prevent
from being saved.
@ -649,7 +649,7 @@ class Language(object):
serializers = OrderedDict((
('vocab', lambda: self.vocab.to_bytes()),
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
('meta', lambda: ujson.dumps(self.meta))
('meta', lambda: json_dumps(self.meta))
for i, (name, proc) in enumerate(self.pipeline):
if name in disable:
@ -682,14 +682,14 @@ class Language(object):
class DisabledPipes(list):
'''Manager for temporary pipeline disabling.'''
"""Manager for temporary pipeline disabling."""
def __init__(self, nlp, *names):
self.nlp = nlp
self.names = names
# Important! Not deep copy -- we just want the container (but we also
# want to support people providing arbitrarily typed nlp.pipeline
# objects.)
self.original_pipeline = copy.copy(nlp.pipeline)
self.original_pipeline = copy(nlp.pipeline)
self.extend(nlp.remove_pipe(name) for name in names)
@ -702,7 +702,8 @@ class DisabledPipes(list):
def restore(self):
'''Restore the pipeline to its state when DisabledPipes was created.'''
current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline
unexpected = [name for name, pipe in current if not self.nlp.has_pipe(name)]
unexpected = [name for name, pipe in current
if not self.nlp.has_pipe(name)]
if unexpected:
# Don't change the pipeline if we're raising an error.
self.nlp.pipeline = current
@ -43,16 +43,15 @@ class Lemmatizer(object):
morphology = {} if morphology is None else morphology
others = [key for key in morphology
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
true_morph_key = morphology.get('morph', 0)
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
return True
elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
morphology.get('Tense') == 'pres' and \
morphology.get('Number') is None and \
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and
morphology.get('Tense') == 'pres' and
morphology.get('Number') is None and
not others):
return True
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
@ -89,9 +88,6 @@ class Lemmatizer(object):
def lemmatize(string, index, exceptions, rules):
string = string.lower()
forms = []
# TODO: Is this correct? See discussion in Issue #435.
#if string in index:
# forms.append(string)
forms.extend(exceptions.get(string, []))
oov_forms = []
if not forms:
@ -2,27 +2,17 @@
# coding: utf8
from __future__ import unicode_literals, print_function
from libc.math cimport sqrt
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
# Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray
cimport numpy as np
from libc.string cimport memset
import numpy
from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_BRACKET
from .attrs cimport IS_QUOTE
from .attrs cimport IS_LEFT_PUNCT
from .attrs cimport IS_RIGHT_PUNCT
from .attrs cimport IS_OOV
from . import about
@ -32,8 +22,8 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef class Lexeme:
"""An entry in the vocabulary. A `Lexeme` has no string context – it's a
word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the
part-of-speech tag).
def __init__(self, Vocab vocab, attr_t orth):
"""Create a Lexeme object.
@ -60,17 +50,17 @@ cdef class Lexeme:
a = 0
b = 1
if op == 2: # ==
if op == 2: # ==
return a == b
elif op == 3: # !=
elif op == 3: # !=
return a != b
elif op == 0: # <
elif op == 0: # <
return a < b
elif op == 1: # <=
elif op == 1: # <=
return a <= b
elif op == 4: # >
elif op == 4: # >
return a > b
elif op == 5: # >=
elif op == 5: # >=
return a >= b
raise NotImplementedError(op)
@ -104,7 +94,8 @@ cdef class Lexeme:
if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
return (numpy.dot(self.vector, other.vector) /
(self.vector_norm * other.vector_norm))
def to_bytes(self):
lex_data = Lexeme.c_to_bytes(self.c)
@ -130,19 +121,13 @@ cdef class Lexeme:
self.orth = self.c.orth
property has_vector:
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""RETURNS (bool): Whether a word vector is associated with the object.
def __get__(self):
return self.vocab.has_vector(self.c.orth)
property vector_norm:
"""The L2 norm of the lexeme's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""RETURNS (float): The L2 norm of the vector representation."""
def __get__(self):
vector = self.vector
return numpy.sqrt((vector**2).sum())
@ -169,149 +154,320 @@ cdef class Lexeme:
self.vocab.set_vector(self.c.orth, vector)
property rank:
"""RETURNS (int): Sequential ID of the lexeme's lexical type, used
to index into tables, e.g. for word vectors."""
def __get__(self):
return self.c.id
def __set__(self, value):
self.c.id = value
property sentiment:
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the lexeme."""
def __get__(self):
return self.c.sentiment
def __set__(self, float sentiment):
self.c.sentiment = sentiment
property orth_:
"""RETURNS (unicode): The original verbatim text of the lexeme
(identical to `Lexeme.text`). Exists mostly for consistency with
the other attributes."""
def __get__(self):
return self.vocab.strings[self.c.orth]
property text:
"""A unicode representation of the token text.
RETURNS (unicode): The original verbatim text of the token.
"""RETURNS (unicode): The original verbatim text of the lexeme."""
def __get__(self):
return self.orth_
property lower:
def __get__(self): return self.c.lower
def __set__(self, attr_t x): self.c.lower = x
"""RETURNS (uint64): Lowercase form of the lexeme."""
def __get__(self):
return self.c.lower
def __set__(self, attr_t x):
self.c.lower = x
property norm:
def __get__(self): return self.c.norm
def __set__(self, attr_t x): self.c.norm = x
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):
return self.c.norm
def __set__(self, attr_t x):
self.c.norm = x
property shape:
def __get__(self): return self.c.shape
def __set__(self, attr_t x): self.c.shape = x
"""RETURNS (uint64): Transform of the word's string, to show
orthographic features.
def __get__(self):
return self.c.shape
def __set__(self, attr_t x):
self.c.shape = x
property prefix:
def __get__(self): return self.c.prefix
def __set__(self, attr_t x): self.c.prefix = x
"""RETURNS (uint64): Length-N substring from the start of the word.
Defaults to `N=1`.
def __get__(self):
return self.c.prefix
def __set__(self, attr_t x):
self.c.prefix = x
property suffix:
def __get__(self): return self.c.suffix
def __set__(self, attr_t x): self.c.suffix = x
"""RETURNS (uint64): Length-N substring from the end of the word.
Defaults to `N=3`.
def __get__(self):
return self.c.suffix
def __set__(self, attr_t x):
self.c.suffix = x
property cluster:
def __get__(self): return self.c.cluster
def __set__(self, attr_t x): self.c.cluster = x
"""RETURNS (int): Brown cluster ID."""
def __get__(self):
return self.c.cluster
def __set__(self, attr_t x):
self.c.cluster = x
property lang:
def __get__(self): return self.c.lang
def __set__(self, attr_t x): self.c.lang = x
"""RETURNS (uint64): Language of the parent vocabulary."""
def __get__(self):
return self.c.lang
def __set__(self, attr_t x):
self.c.lang = x
property prob:
def __get__(self): return self.c.prob
def __set__(self, float x): self.c.prob = x
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
type.
"""
def __get__(self):
return self.c.prob
def __set__(self, float x):
self.c.prob = x
property lower_:
def __get__(self): return self.vocab.strings[self.c.lower]
def __set__(self, unicode x): self.c.lower = self.vocab.strings.add(x)
"""RETURNS (unicode): Lowercase form of the word."""
def __get__(self):
return self.vocab.strings[self.c.lower]
def __set__(self, unicode x):
self.c.lower = self.vocab.strings.add(x)
property norm_:
def __get__(self): return self.vocab.strings[self.c.norm]
def __set__(self, unicode x): self.c.norm = self.vocab.strings.add(x)
"""RETURNS (unicode): The lexeme's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):
return self.vocab.strings[self.c.norm]
def __set__(self, unicode x):
self.c.norm = self.vocab.strings.add(x)
property shape_:
def __get__(self): return self.vocab.strings[self.c.shape]
def __set__(self, unicode x): self.c.shape = self.vocab.strings.add(x)
"""RETURNS (unicode): Transform of the word's string, to show
orthographic features.
def __get__(self):
return self.vocab.strings[self.c.shape]
def __set__(self, unicode x):
self.c.shape = self.vocab.strings.add(x)
property prefix_:
def __get__(self): return self.vocab.strings[self.c.prefix]
def __set__(self, unicode x): self.c.prefix = self.vocab.strings.add(x)
"""RETURNS (unicode): Length-N substring from the start of the word.
Defaults to `N=1`.
def __get__(self):
return self.vocab.strings[self.c.prefix]
def __set__(self, unicode x):
self.c.prefix = self.vocab.strings.add(x)
property suffix_:
def __get__(self): return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x): self.c.suffix = self.vocab.strings.add(x)
"""RETURNS (unicode): Length-N substring from the end of the word.
Defaults to `N=3`.
def __get__(self):
return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x):
self.c.suffix = self.vocab.strings.add(x)
property lang_:
def __get__(self): return self.vocab.strings[self.c.lang]
def __set__(self, unicode x): self.c.lang = self.vocab.strings.add(x)
"""RETURNS (unicode): Language of the parent vocabulary."""
def __get__(self):
return self.vocab.strings[self.c.lang]
def __set__(self, unicode x):
self.c.lang = self.vocab.strings.add(x)
property flags:
def __get__(self): return self.c.flags
def __set__(self, flags_t x): self.c.flags = x
"""RETURNS (uint64): Container of the lexeme's binary flags."""
def __get__(self):
return self.c.flags
def __set__(self, flags_t x):
self.c.flags = x
property is_oov:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)
def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x)
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_OOV)
def __set__(self, attr_t x):
Lexeme.c_set_flag(self.c, IS_OOV, x)
property is_stop:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_STOP, x)
"""RETURNS (bool): Whether the lexeme is a stop word."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_STOP)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_STOP, x)
property is_alpha:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_ALPHA)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ALPHA, x)
"""RETURNS (bool): Whether the lexeme consists of alphabetic
characters. Equivalent to `lexeme.text.isalpha()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ALPHA)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
property is_ascii:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_ASCII)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ASCII, x)
"""RETURNS (bool): Whether the lexeme consists of ASCII characters.
Equivalent to `not any(ord(c) >= 128 for c in lexeme.text)`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ASCII)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_ASCII, x)
property is_digit:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_DIGIT)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_DIGIT, x)
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent
to `lexeme.text.isdigit()`.
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_DIGIT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
property is_lower:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_LOWER)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LOWER, x)
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
`lexeme.text.islower()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LOWER)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_LOWER, x)
property is_upper:
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
`lexeme.text.isupper()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_UPPER)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_UPPER, x)
property is_title:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_TITLE)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_TITLE, x)
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
`lexeme.text.istitle()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_TITLE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_TITLE, x)
property is_punct:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_PUNCT)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_PUNCT, x)
"""RETURNS (bool): Whether the lexeme is punctuation."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
property is_space:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)
"""RETURNS (bool): Whether the lexeme consists of whitespace characters.
Equivalent to `lexeme.text.isspace()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_SPACE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_SPACE, x)
property is_bracket:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x)
"""RETURNS (bool): Whether the lexeme is a bracket."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_BRACKET)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
property is_quote:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x)
"""RETURNS (bool): Whether the lexeme is a quotation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_QUOTE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
property is_left_punct:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
property is_right_punct:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
property like_url:
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
"""RETURNS (bool): Whether the lexeme resembles a URL."""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_URL)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_URL, x)
property like_num:
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_NUM)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_NUM, x)
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
"10", "ten", etc.
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_NUM)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
property like_email:
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
"""RETURNS (bool): Whether the lexeme resembles an email address."""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
@ -4,12 +4,6 @@
from __future__ import unicode_literals
import ujson
from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .attrs cimport attr_id_t
from .structs cimport TokenC
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
@ -17,14 +11,15 @@ from libcpp.pair cimport pair
from murmurhash.mrmr cimport hash64
from libc.stdint cimport int32_t
from .attrs cimport ID, NULL_ATTR, ENT_TYPE
from . import attrs
from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .structs cimport TokenC
from .tokens.doc cimport Doc, get_token_attr
from .vocab cimport Vocab
from .attrs import IDS
from .attrs cimport attr_id_t, ID, NULL_ATTR
from .attrs import FLAG61 as U_ENT
from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT
from .attrs import FLAG58 as B4_ENT
@ -34,7 +29,6 @@ from .attrs import FLAG55 as B7_ENT
from .attrs import FLAG54 as B8_ENT
from .attrs import FLAG53 as B9_ENT
from .attrs import FLAG52 as B10_ENT
from .attrs import FLAG51 as I3_ENT
from .attrs import FLAG50 as I4_ENT
from .attrs import FLAG49 as I5_ENT
@ -43,7 +37,6 @@ from .attrs import FLAG47 as I7_ENT
from .attrs import FLAG46 as I8_ENT
from .attrs import FLAG45 as I9_ENT
from .attrs import FLAG44 as I10_ENT
from .attrs import FLAG43 as L2_ENT
from .attrs import FLAG42 as L3_ENT
from .attrs import FLAG41 as L4_ENT
@ -153,7 +146,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
def _convert_strings(token_specs, string_store):
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
'?': (ZERO_ONE,), '1': (ONE,)}
'?': (ZERO_ONE,), '1': (ONE,)}
tokens = []
op = ONE
for spec in token_specs:
@ -168,10 +161,10 @@ def _convert_strings(token_specs, string_store):
if value in operators:
ops = operators[value]
raise KeyError(
"Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys())))
msg = "Unknown operator '%s'. Options: %s"
raise KeyError(msg % (value, ', '.join(operators.keys())))
if isinstance(attr, basestring):
attr = attrs.IDS.get(attr.upper())
attr = IDS.get(attr.upper())
if isinstance(value, basestring):
value = string_store.add(value)
if isinstance(value, bool):
@ -186,7 +179,7 @@ def _convert_strings(token_specs, string_store):
def merge_phrase(matcher, doc, i, matches):
"""Callback to merge a phrase on match."""
ent_id, label, start, end = matches[i]
span = doc[start : end]
span = doc[start:end]
span.merge(ent_type=label, ent_id=ent_id)
@ -233,13 +226,13 @@ cdef class Matcher:
return self._normalize_key(key) in self._patterns
def add(self, key, on_match, *patterns):
"""Add a match-rule to the matcher. A match-rule consists of: an ID key,
an on_match callback, and one or more patterns.
"""Add a match-rule to the matcher. A match-rule consists of: an ID
key, an on_match callback, and one or more patterns.
If the key exists, the patterns are appended to the previous ones, and
the previous on_match callback is replaced. The `on_match` callback will
receive the arguments `(matcher, doc, i, matches)`. You can also set
`on_match` to `None` to not perform any actions.
the previous on_match callback is replaced. The `on_match` callback
will receive the arguments `(matcher, doc, i, matches)`. You can also
set `on_match` to `None` to not perform any actions.
A pattern consists of one or more `token_specs`, where a `token_spec`
is a dictionary mapping attribute IDs to values, and optionally a
@ -253,8 +246,8 @@ cdef class Matcher:
The + and * operators are usually interpreted "greedily", i.e. longer
matches are returned where possible. However, if you specify two '+'
and '*' patterns in a row and their matches overlap, the first
operator will behave non-greedily. This quirk in the semantics
makes the matcher more efficient, by avoiding the need for back-tracking.
operator will behave non-greedily. This quirk in the semantics makes
the matcher more efficient, by avoiding the need for back-tracking.
key (unicode): The match ID.
on_match (callable): Callback executed on match.
@ -268,7 +261,6 @@ cdef class Matcher:
key = self._normalize_key(key)
self._patterns.setdefault(key, [])
self._callbacks[key] = on_match
for pattern in patterns:
specs = _convert_strings(pattern, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, key, specs))
@ -315,9 +307,9 @@ cdef class Matcher:
"""Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents.
batch_size (int): The number of documents to accumulate into a working set.
batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the `Matcher` implementation supports multi-threading.
in parallel, if the implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
for doc in docs:
@ -325,7 +317,7 @@ cdef class Matcher:
yield doc
def __call__(self, Doc doc):
"""Find all token sequences matching the supplied patterns on the `Doc`.
"""Find all token sequences matching the supplied pattern.
doc (Doc): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples,
@ -342,8 +334,8 @@ cdef class Matcher:
for token_i in range(doc.length):
token = &doc.c[token_i]
q = 0
# Go over the open matches, extending or finalizing if able. Otherwise,
# we over-write them (q doesn't advance)
# Go over the open matches, extending or finalizing if able.
# Otherwise, we over-write them (q doesn't advance)
for state in partials:
action = get_action(state.second, token)
if action == PANIC:
@ -356,8 +348,8 @@ cdef class Matcher:
if action == REPEAT:
# Leave the state in the queue, and advance to next slot
# (i.e. we don't overwrite -- we want to greedily match more
# pattern.
# (i.e. we don't overwrite -- we want to greedily match
# more pattern.
q += 1
elif action == REJECT:
@ -366,8 +358,8 @@ cdef class Matcher:
partials[q].second += 1
q += 1
elif action in (ACCEPT, ACCEPT_PREV):
# TODO: What to do about patterns starting with ZERO? Need to
# adjust the start position.
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
start = state.first
end = token_i+1 if action == ACCEPT else token_i
ent_id = state.second[1].attrs[0].value
@ -388,8 +380,8 @@ cdef class Matcher:
state.second = pattern
elif action == ADVANCE:
# TODO: What to do about patterns starting with ZERO? Need to
# adjust the start position.
# TODO: What to do about patterns starting with ZERO? Need
# to adjust the start position.
state.first = token_i
state.second = pattern + 1
@ -413,7 +405,6 @@ cdef class Matcher:
on_match = self._callbacks.get(ent_id)
if on_match is not None:
on_match(self, doc, i, matches)
# TODO: only return (match_id, start, end)
return matches
def _normalize_key(self, key):
@ -441,7 +432,8 @@ def get_bilou(length):
elif length == 8:
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
elif length == 9:
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT]
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
elif length == 10:
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
I10_ENT, I10_ENT, L10_ENT]
@ -454,10 +446,8 @@ cdef class PhraseMatcher:
cdef Vocab vocab
cdef Matcher matcher
cdef PreshMap phrase_ids
cdef int max_length
cdef attr_t* _phrase_key
cdef public object _callbacks
cdef public object _patterns
@ -470,7 +460,8 @@ cdef class PhraseMatcher:
self.phrase_ids = PreshMap()
abstract_patterns = []
for length in range(1, max_length):
abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
abstract_patterns.append([{tag: True}
for tag in get_bilou(length)])
self.matcher.add('Candidate', None, *abstract_patterns)
self._callbacks = {}
@ -496,8 +487,8 @@ cdef class PhraseMatcher:
return (self.__class__, (self.vocab,), None, None)
def add(self, key, on_match, *docs):
"""Add a match-rule to the matcher. A match-rule consists of: an ID key,
an on_match callback, and one or more patterns.
"""Add a match-rule to the matcher. A match-rule consists of: an ID
key, an on_match callback, and one or more patterns.
key (unicode): The match ID.
on_match (callable): Callback executed on match.
@ -513,7 +504,6 @@ cdef class PhraseMatcher:
raise ValueError(msg % (len(doc), self.max_length))
cdef hash_t ent_id = self.matcher._normalize_key(key)
self._callbacks[ent_id] = on_match
cdef int length
cdef int i
cdef hash_t phrase_hash
@ -553,9 +543,9 @@ cdef class PhraseMatcher:
"""Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents.
batch_size (int): The number of documents to accumulate into a working set.
batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the `Matcher` implementation supports multi-threading.
in parallel, if the implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
for doc in stream:
@ -569,7 +559,8 @@ cdef class PhraseMatcher:
self._phrase_key[i] = 0
for i, j in enumerate(range(start, end)):
self._phrase_key[i] = doc.c[j].lex.orth
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
cdef hash_t key = hash64(self._phrase_key,
self.max_length * sizeof(attr_t), 0)
ent_id = <hash_t>self.phrase_ids.get(key)
if ent_id == 0:
return None
@ -4,17 +4,15 @@ from __future__ import unicode_literals
from libc.string cimport memset
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
from .attrs cimport POS, IS_SPACE
from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
from .attrs import LEMMA, intify_attrs
def _normalize_props(props):
Transform deprecated string keys to correct names.
"""Transform deprecated string keys to correct names."""
out = {}
for key, value in props.items():
if key == POS:
@ -77,7 +75,8 @@ cdef class Morphology:
cdef int assign_untagged(self, TokenC* token) except -1:
"""Set morphological attributes on a token without a POS tag. Uses
the lemmatizer's lookup() method, which looks up the string in the
table provided by the language data as lemma_lookup (if available)."""
table provided by the language data as lemma_lookup (if available).
if token.lemma == 0:
orth_str = self.strings[token.lex.orth]
lemma = self.lemmatizer.lookup(orth_str)
@ -95,11 +94,10 @@ cdef class Morphology:
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id > self.n_tags:
raise ValueError("Unknown tag ID: %s" % tag_id)
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
# is that this is where the specific word and the tag interact. Still,
# we should have a better way to enforce this rule, or figure out why
# the statistical model fails.
# Related to Issue #220
# TODO: It's pretty arbitrary to put this logic here. I guess the
# justification is that this is where the specific word and the tag
# interact. Still, we should have a better way to enforce this rule, or
# figure out why the statistical model fails. Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings.add('_SP')]
rich_tag = self.rich_tags[tag_id]
@ -123,14 +121,13 @@ cdef class Morphology:
flags[0] &= ~(one << flag_id)
def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
Add a special-case rule to the morphological analyser. Tokens whose
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
"""Add a special-case rule to the morphological analyser. Tokens whose
tag and orth match the rule will receive the specified properties.
tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception.
tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception.
self.exc[(tag_str, orth_str)] = dict(attrs)
tag = self.strings.add(tag_str)
@ -144,10 +141,9 @@ cdef class Morphology:
elif force:
memset(cached, 0, sizeof(cached[0]))
msg = ("Conflicting morphology exception for (%s, %s). Use force=True "
"to overwrite.")
msg = msg % (tag_str, orth_str)
raise ValueError(msg)
raise ValueError(
"Conflicting morphology exception for (%s, %s). Use "
"force=True to overwrite." % (tag_str, orth_str))
cached.tag = rich_tag
# TODO: Refactor this to take arbitrary attributes.
@ -218,7 +214,7 @@ IDS = {
"Definite_two": Definite_two,
"Definite_def": Definite_def,
"Definite_red": Definite_red,
"Definite_cons": Definite_cons, # U20
"Definite_cons": Definite_cons, # U20
"Definite_ind": Definite_ind,
"Degree_cmp": Degree_cmp,
"Degree_comp": Degree_comp,
@ -227,7 +223,7 @@ IDS = {
"Degree_sup": Degree_sup,
"Degree_abs": Degree_abs,
"Degree_com": Degree_com,
"Degree_dim ": Degree_dim, # du
"Degree_dim ": Degree_dim, # du
"Gender_com": Gender_com,
"Gender_fem": Gender_fem,
"Gender_masc": Gender_masc,
@ -242,15 +238,15 @@ IDS = {
"Negative_neg": Negative_neg,
"Negative_pos": Negative_pos,
"Negative_yes": Negative_yes,
"Polarity_neg": Polarity_neg, # U20
"Polarity_pos": Polarity_pos, # U20
"Polarity_neg": Polarity_neg, # U20
"Polarity_pos": Polarity_pos, # U20
"Number_com": Number_com,
"Number_dual": Number_dual,
"Number_none": Number_none,
"Number_plur": Number_plur,
"Number_sing": Number_sing,
"Number_ptan ": Number_ptan, # bg
"Number_count ": Number_count, # bg
"Number_ptan ": Number_ptan, # bg
"Number_count ": Number_count, # bg
"NumType_card": NumType_card,
"NumType_dist": NumType_dist,
"NumType_frac": NumType_frac,
@ -276,7 +272,7 @@ IDS = {
"PronType_rel": PronType_rel,
"PronType_tot": PronType_tot,
"PronType_clit": PronType_clit,
"PronType_exc ": PronType_exc, # es, ca, it, fa,
"PronType_exc ": PronType_exc, # es, ca, it, fa,
"Reflex_yes": Reflex_yes,
"Tense_fut": Tense_fut,
"Tense_imp": Tense_imp,
@ -292,19 +288,19 @@ IDS = {
"VerbForm_partPres": VerbForm_partPres,
"VerbForm_sup": VerbForm_sup,
"VerbForm_trans": VerbForm_trans,
"VerbForm_conv": VerbForm_conv, # U20
"VerbForm_gdv ": VerbForm_gdv, # la,
"VerbForm_conv": VerbForm_conv, # U20
"VerbForm_gdv ": VerbForm_gdv, # la,
"Voice_act": Voice_act,
"Voice_cau": Voice_cau,
"Voice_pass": Voice_pass,
"Voice_mid ": Voice_mid, # gkc,
"Voice_int ": Voice_int, # hb,
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
"AdpType_prep ": AdpType_prep, # cz, U,
"AdpType_post ": AdpType_post, # U,
"AdpType_voc ": AdpType_voc, # cz,
"AdpType_comprep ": AdpType_comprep, # cz,
"AdpType_circ ": AdpType_circ, # U,
"Voice_mid ": Voice_mid, # gkc,
"Voice_int ": Voice_int, # hb,
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
"AdpType_prep ": AdpType_prep, # cz, U,
"AdpType_post ": AdpType_post, # U,
"AdpType_voc ": AdpType_voc, # cz,
"AdpType_comprep ": AdpType_comprep, # cz,
"AdpType_circ ": AdpType_circ, # U,
"AdvType_man": AdvType_man,
"AdvType_loc": AdvType_loc,
"AdvType_tim": AdvType_tim,
@ -314,122 +310,122 @@ IDS = {
"AdvType_sta": AdvType_sta,
"AdvType_ex": AdvType_ex,
"AdvType_adadj": AdvType_adadj,
"ConjType_oper ": ConjType_oper, # cz, U,
"ConjType_comp ": ConjType_comp, # cz, U,
"Connegative_yes ": Connegative_yes, # fi,
"Derivation_minen ": Derivation_minen, # fi,
"Derivation_sti ": Derivation_sti, # fi,
"Derivation_inen ": Derivation_inen, # fi,
"Derivation_lainen ": Derivation_lainen, # fi,
"Derivation_ja ": Derivation_ja, # fi,
"Derivation_ton ": Derivation_ton, # fi,
"Derivation_vs ": Derivation_vs, # fi,
"Derivation_ttain ": Derivation_ttain, # fi,
"Derivation_ttaa ": Derivation_ttaa, # fi,
"Echo_rdp ": Echo_rdp, # U,
"Echo_ech ": Echo_ech, # U,
"Foreign_foreign ": Foreign_foreign, # cz, fi, U,
"Foreign_fscript ": Foreign_fscript, # cz, fi, U,
"Foreign_tscript ": Foreign_tscript, # cz, U,
"Foreign_yes ": Foreign_yes, # sl,
"Gender_dat_masc ": Gender_dat_masc, # bq, U,
"Gender_dat_fem ": Gender_dat_fem, # bq, U,
"Gender_erg_masc ": Gender_erg_masc, # bq,
"Gender_erg_fem ": Gender_erg_fem, # bq,
"Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
"Gender_psor_neut ": Gender_psor_neut, # sl,
"Hyph_yes ": Hyph_yes, # cz, U,
"InfForm_one ": InfForm_one, # fi,
"InfForm_two ": InfForm_two, # fi,
"InfForm_three ": InfForm_three, # fi,
"NameType_geo ": NameType_geo, # U, cz,
"NameType_prs ": NameType_prs, # U, cz,
"NameType_giv ": NameType_giv, # U, cz,
"NameType_sur ": NameType_sur, # U, cz,
"NameType_nat ": NameType_nat, # U, cz,
"NameType_com ": NameType_com, # U, cz,
"NameType_pro ": NameType_pro, # U, cz,
"NameType_oth ": NameType_oth, # U, cz,
"NounType_com ": NounType_com, # U,
"NounType_prop ": NounType_prop, # U,
"NounType_class ": NounType_class, # U,
"Number_abs_sing ": Number_abs_sing, # bq, U,
"Number_abs_plur ": Number_abs_plur, # bq, U,
"Number_dat_sing ": Number_dat_sing, # bq, U,
"Number_dat_plur ": Number_dat_plur, # bq, U,
"Number_erg_sing ": Number_erg_sing, # bq, U,
"Number_erg_plur ": Number_erg_plur, # bq, U,
"Number_psee_sing ": Number_psee_sing, # U,
"Number_psee_plur ": Number_psee_plur, # U,
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
"NumForm_digit ": NumForm_digit, # cz, sl, U,
"NumForm_roman ": NumForm_roman, # cz, sl, U,
"NumForm_word ": NumForm_word, # cz, sl, U,
"NumValue_one ": NumValue_one, # cz, U,
"NumValue_two ": NumValue_two, # cz, U,
"NumValue_three ": NumValue_three, # cz, U,
"PartForm_pres ": PartForm_pres, # fi,
"PartForm_past ": PartForm_past, # fi,
"PartForm_agt ": PartForm_agt, # fi,
"PartForm_neg ": PartForm_neg, # fi,
"PartType_mod ": PartType_mod, # U,
"PartType_emp ": PartType_emp, # U,
"PartType_res ": PartType_res, # U,
"PartType_inf ": PartType_inf, # U,
"PartType_vbp ": PartType_vbp, # U,
"Person_abs_one ": Person_abs_one, # bq, U,
"Person_abs_two ": Person_abs_two, # bq, U,
"Person_abs_three ": Person_abs_three, # bq, U,
"Person_dat_one ": Person_dat_one, # bq, U,
"Person_dat_two ": Person_dat_two, # bq, U,
"Person_dat_three ": Person_dat_three, # bq, U,
"Person_erg_one ": Person_erg_one, # bq, U,
"Person_erg_two ": Person_erg_two, # bq, U,
"Person_erg_three ": Person_erg_three, # bq, U,
"Person_psor_one ": Person_psor_one, # fi, U,
"Person_psor_two ": Person_psor_two, # fi, U,
"Person_psor_three ": Person_psor_three, # fi, U,
"Polite_inf ": Polite_inf, # bq, U,
"Polite_pol ": Polite_pol, # bq, U,
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
"Prefix_yes ": Prefix_yes, # U,
"PrepCase_npr ": PrepCase_npr, # cz,
"PrepCase_pre ": PrepCase_pre, # U,
"PunctSide_ini ": PunctSide_ini, # U,
"PunctSide_fin ": PunctSide_fin, # U,
"PunctType_peri ": PunctType_peri, # U,
"PunctType_qest ": PunctType_qest, # U,
"PunctType_excl ": PunctType_excl, # U,
"PunctType_quot ": PunctType_quot, # U,
"PunctType_brck ": PunctType_brck, # U,
"PunctType_comm ": PunctType_comm, # U,
"PunctType_colo ": PunctType_colo, # U,
"PunctType_semi ": PunctType_semi, # U,
"PunctType_dash ": PunctType_dash, # U,
"Style_arch ": Style_arch, # cz, fi, U,
"Style_rare ": Style_rare, # cz, fi, U,
"Style_poet ": Style_poet, # cz, U,
"Style_norm ": Style_norm, # cz, U,
"Style_coll ": Style_coll, # cz, U,
"Style_vrnc ": Style_vrnc, # cz, U,
"Style_sing ": Style_sing, # cz, U,
"Style_expr ": Style_expr, # cz, U,
"Style_derg ": Style_derg, # cz, U,
"Style_vulg ": Style_vulg, # cz, U,
"Style_yes ": Style_yes, # fi, U,
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
"VerbType_aux ": VerbType_aux, # U,
"VerbType_cop ": VerbType_cop, # U,
"VerbType_mod ": VerbType_mod, # U,
"VerbType_light ": VerbType_light, # U,
"ConjType_oper ": ConjType_oper, # cz, U,
"ConjType_comp ": ConjType_comp, # cz, U,
"Connegative_yes ": Connegative_yes, # fi,
"Derivation_minen ": Derivation_minen, # fi,
"Derivation_sti ": Derivation_sti, # fi,
"Derivation_inen ": Derivation_inen, # fi,
"Derivation_lainen ": Derivation_lainen, # fi,
"Derivation_ja ": Derivation_ja, # fi,
"Derivation_ton ": Derivation_ton, # fi,
"Derivation_vs ": Derivation_vs, # fi,
"Derivation_ttain ": Derivation_ttain, # fi,
"Derivation_ttaa ": Derivation_ttaa, # fi,
"Echo_rdp ": Echo_rdp, # U,
"Echo_ech ": Echo_ech, # U,
"Foreign_foreign ": Foreign_foreign, # cz, fi, U,
"Foreign_fscript ": Foreign_fscript, # cz, fi, U,
"Foreign_tscript ": Foreign_tscript, # cz, U,
"Foreign_yes ": Foreign_yes, # sl,
"Gender_dat_masc ": Gender_dat_masc, # bq, U,
"Gender_dat_fem ": Gender_dat_fem, # bq, U,
"Gender_erg_masc ": Gender_erg_masc, # bq,
"Gender_erg_fem ": Gender_erg_fem, # bq,
"Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
"Gender_psor_neut ": Gender_psor_neut, # sl,
"Hyph_yes ": Hyph_yes, # cz, U,
"InfForm_one ": InfForm_one, # fi,
"InfForm_two ": InfForm_two, # fi,
"InfForm_three ": InfForm_three, # fi,
"NameType_geo ": NameType_geo, # U, cz,
"NameType_prs ": NameType_prs, # U, cz,
"NameType_giv ": NameType_giv, # U, cz,
"NameType_sur ": NameType_sur, # U, cz,
"NameType_nat ": NameType_nat, # U, cz,
"NameType_com ": NameType_com, # U, cz,
"NameType_pro ": NameType_pro, # U, cz,
"NameType_oth ": NameType_oth, # U, cz,
"NounType_com ": NounType_com, # U,
"NounType_prop ": NounType_prop, # U,
"NounType_class ": NounType_class, # U,
"Number_abs_sing ": Number_abs_sing, # bq, U,
"Number_abs_plur ": Number_abs_plur, # bq, U,
"Number_dat_sing ": Number_dat_sing, # bq, U,
"Number_dat_plur ": Number_dat_plur, # bq, U,
"Number_erg_sing ": Number_erg_sing, # bq, U,
"Number_erg_plur ": Number_erg_plur, # bq, U,
"Number_psee_sing ": Number_psee_sing, # U,
"Number_psee_plur ": Number_psee_plur, # U,
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
"NumForm_digit ": NumForm_digit, # cz, sl, U,
"NumForm_roman ": NumForm_roman, # cz, sl, U,
"NumForm_word ": NumForm_word, # cz, sl, U,
"NumValue_one ": NumValue_one, # cz, U,
"NumValue_two ": NumValue_two, # cz, U,
"NumValue_three ": NumValue_three, # cz, U,
"PartForm_pres ": PartForm_pres, # fi,
"PartForm_past ": PartForm_past, # fi,
"PartForm_agt ": PartForm_agt, # fi,
"PartForm_neg ": PartForm_neg, # fi,
"PartType_mod ": PartType_mod, # U,
"PartType_emp ": PartType_emp, # U,
"PartType_res ": PartType_res, # U,
"PartType_inf ": PartType_inf, # U,
"PartType_vbp ": PartType_vbp, # U,
"Person_abs_one ": Person_abs_one, # bq, U,
"Person_abs_two ": Person_abs_two, # bq, U,
"Person_abs_three ": Person_abs_three, # bq, U,
"Person_dat_one ": Person_dat_one, # bq, U,
"Person_dat_two ": Person_dat_two, # bq, U,
"Person_dat_three ": Person_dat_three, # bq, U,
"Person_erg_one ": Person_erg_one, # bq, U,
"Person_erg_two ": Person_erg_two, # bq, U,
"Person_erg_three ": Person_erg_three, # bq, U,
"Person_psor_one ": Person_psor_one, # fi, U,
"Person_psor_two ": Person_psor_two, # fi, U,
"Person_psor_three ": Person_psor_three, # fi, U,
"Polite_inf ": Polite_inf, # bq, U,
"Polite_pol ": Polite_pol, # bq, U,
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
"Prefix_yes ": Prefix_yes, # U,
"PrepCase_npr ": PrepCase_npr, # cz,
"PrepCase_pre ": PrepCase_pre, # U,
"PunctSide_ini ": PunctSide_ini, # U,
"PunctSide_fin ": PunctSide_fin, # U,
"PunctType_peri ": PunctType_peri, # U,
"PunctType_qest ": PunctType_qest, # U,
"PunctType_excl ": PunctType_excl, # U,
"PunctType_quot ": PunctType_quot, # U,
"PunctType_brck ": PunctType_brck, # U,
"PunctType_comm ": PunctType_comm, # U,
"PunctType_colo ": PunctType_colo, # U,
"PunctType_semi ": PunctType_semi, # U,
"PunctType_dash ": PunctType_dash, # U,
"Style_arch ": Style_arch, # cz, fi, U,
"Style_rare ": Style_rare, # cz, fi, U,
"Style_poet ": Style_poet, # cz, U,
"Style_norm ": Style_norm, # cz, U,
"Style_coll ": Style_coll, # cz, U,
"Style_vrnc ": Style_vrnc, # cz, U,
"Style_sing ": Style_sing, # cz, U,
"Style_expr ": Style_expr, # cz, U,
"Style_derg ": Style_derg, # cz, U,
"Style_vulg ": Style_vulg, # cz, U,
"Style_yes ": Style_yes, # fi, U,
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
"VerbType_aux ": VerbType_aux, # U,
"VerbType_cop ": VerbType_cop, # U,
"VerbType_mod ": VerbType_mod, # U,
"VerbType_light ": VerbType_light, # U,
@ -8,7 +8,7 @@ IDS = {
"CONJ": CONJ, # U20
"CONJ": CONJ, # U20
@ -3,26 +3,17 @@
# coding: utf8
from __future__ import unicode_literals
from thinc.api import chain, layerize, with_getitem
import numpy
cimport numpy as np
import cytoolz
import util
from collections import OrderedDict
import ujson
import msgpack
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
from thinc.i2v import HashEmbed
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.misc import Residual
from thinc.misc import BatchNorm as BN
from thinc.misc import LayerNorm as LN
from thinc.api import chain
from thinc.v2v import Softmax
from thinc.t2v import Pooling, max_pool, mean_pool
from thinc.neural.util import to_categorical
from thinc.neural._classes.difference import Siamese, CauchySimilarity
from .tokens.doc cimport Doc
@ -30,29 +21,23 @@ from .syntax.nn_parser cimport Parser
from .syntax import nonproj
from .syntax.ner cimport BiluoPushDown
from .syntax.arc_eager cimport ArcEager
from .tagger import Tagger
from .syntax.stateclass cimport StateClass
from .gold cimport GoldParse
from .morphology cimport Morphology
from .vocab cimport Vocab
from .syntax import nonproj
from .compat import json_dumps
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import rebatch, Tok2Vec, flatten
from ._ml import build_text_classifier, build_tagger_model
from ._ml import link_vectors_to_models
from .attrs import POS
from .parts_of_speech import X
from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
from ._ml import link_vectors_to_models
from . import util
class SentenceSegmenter(object):
"""A simple spaCy hook, to allow custom sentence boundary detection logic
(that doesn't require the dependency parse).
To change the sentence boundary detection strategy, pass a generator
function `strategy` on initialization, or assign a new strategy to
the .strategy attribute.
(that doesn't require the dependency parse). To change the sentence
boundary detection strategy, pass a generator function `strategy` on
initialization, or assign a new strategy to the .strategy attribute.
Sentence detection strategies should be generators that take `Doc` objects
and yield `Span` objects for each sentence.
@ -74,16 +59,20 @@ class SentenceSegmenter(object):
seen_period = False
for i, word in enumerate(doc):
if seen_period and not word.is_punct:
yield doc[start : word.i]
yield doc[start:word.i]
start = word.i
seen_period = False
elif word.text in ['.', '!', '?']:
seen_period = True
if start < len(doc):
yield doc[start : len(doc)]
yield doc[start:len(doc)]
class Pipe(object):
"""This class is not instantiated directly. Components inherit from it, and
it defines the interface that components should follow to function as
components in a spaCy analysis pipeline.
name = None
@ -149,8 +138,7 @@ class Pipe(object):
def use_params(self, params):
"""Modify the pipe's model, to use the given parameter values.
"""Modify the pipe's model, to use the given parameter values."""
with self.model.use_params(params):
@ -235,8 +223,8 @@ class Tensorizer(Pipe):
"""Construct a new statistical model. Weights are not allocated on
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
instance with the `Doc` objects it will process.
vocab (Vocab): A `Vocab` instance. The model must share the same
`Vocab` instance with the `Doc` objects it will process.
model (Model): A `Model` instance or `True` to allocate one later.
**cfg: Config parameters.
@ -280,7 +268,7 @@ class Tensorizer(Pipe):
"""Return a single tensor for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the documents.
RETURNS (object): Vector representations for each token in the docs.
tokvecs = self.model(docs)
return tokvecs
@ -289,7 +277,7 @@ class Tensorizer(Pipe):
"""Set the tensor attribute for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
tokvecs (object): Vector representation for each token in the documents.
tokvecs (object): Vector representation for each token in the docs.
for doc, tokvecs in zip(docs, tokvecses):
assert tokvecs.shape[0] == len(doc)
@ -328,12 +316,14 @@ class Tensorizer(Pipe):
class Tagger(Pipe):
name = 'tagger'
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
def __call__(self, doc):
tags = self.predict([doc])
@ -353,8 +343,7 @@ class Tagger(Pipe):
guesses = scores.argmax(axis=1)
if not isinstance(guesses, numpy.ndarray):
guesses = guesses.get()
guesses = self.model.ops.unflatten(guesses,
[len(d) for d in docs])
guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs])
return guesses
def set_annotations(self, docs, batch_tag_ids):
@ -387,8 +376,8 @@ class Tagger(Pipe):
def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores)
tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)}
tag_index = {tag: i
for i, tag in enumerate(self.vocab.morphology.tag_names)}
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype='i')
guesses = scores.argmax(axis=1)
@ -443,17 +432,18 @@ class Tagger(Pipe):
serialize['model'] = self.model.to_bytes
serialize['vocab'] = self.vocab.to_bytes
serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map,
serialize['tag_map'] = lambda: msgpack.dumps(
self.vocab.morphology.tag_map, use_bin_type=True, encoding='utf8')
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
def load_model(b):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width',
self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
token_vector_width = util.env_opt(
self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags,
def load_tag_map(b):
@ -509,11 +499,11 @@ class Tagger(Pipe):
class MultitaskObjective(Tagger):
'''Assist training of a parser or tagger, by training a side-objective.
"""Experimental: Assist training of a parser or tagger, by training a
name = 'nn_labeller'
def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
self.vocab = vocab
self.model = model
@ -530,12 +520,12 @@ class MultitaskObjective(Tagger):
elif hasattr(target, '__call__'):
self.make_label = target
raise ValueError(
"MultitaskObjective target should be function or one of "
"['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
raise ValueError("MultitaskObjective target should be function or "
"one of: dep, tag, ent, dep_tag_offset, ent_tag.")
self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
def labels(self):
@ -623,20 +613,19 @@ class MultitaskObjective(Tagger):
class SimilarityHook(Pipe):
Experimental: A pipeline component to install a hook for supervised
similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
documents. The similarity model can be any object obeying the Thinc `Model`
interface. By default, the model concatenates the elementwise mean and
elementwise max of the two tensors, and compares them using the
Cauchy-like similarity function from Chen (2013):
A pipeline component to install a hook for supervised similarity into
Doc objects. Requires a Tensorizer to pre-process documents. The similarity
model can be any object obeying the Thinc Model interface. By default,
the model concatenates the elementwise mean and elementwise max of the two
tensors, and compares them using the Cauchy-like similarity function
from Chen (2013):
similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
>>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
Where W is a vector of dimension weights, initialized to 1.
name = 'similarity'
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
@ -662,8 +651,7 @@ class SimilarityHook(Pipe):
sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
def begin_training(self, _=tuple(), pipeline=None):
Allocate model, using width from tensorizer in pipeline.
"""Allocate model, using width from tensorizer in pipeline.
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
@ -763,12 +751,14 @@ cdef class DependencyParser(Parser):
for target in []:
labeller = MultitaskObjective(self.vocab, target=target)
tok2vec = self.model[0]
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
labeller.begin_training(gold_tuples, pipeline=pipeline,
def __reduce__(self):
return (DependencyParser, (self.vocab, self.moves, self.model), None, None)
return (DependencyParser, (self.vocab, self.moves, self.model),
None, None)
cdef class EntityRecognizer(Parser):
@ -781,12 +771,14 @@ cdef class EntityRecognizer(Parser):
for target in []:
labeller = MultitaskObjective(self.vocab, target=target)
tok2vec = self.model[0]
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
labeller.begin_training(gold_tuples, pipeline=pipeline,
def __reduce__(self):
return (EntityRecognizer, (self.vocab, self.moves, self.model), None, None)
return (EntityRecognizer, (self.vocab, self.moves, self.model),
None, None)
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']
@ -74,18 +74,21 @@ class Scorer(object):
def scores(self):
return {
'uas': self.uas, 'las': self.las,
'ents_p': self.ents_p, 'ents_r': self.ents_r, 'ents_f': self.ents_f,
'uas': self.uas,
'las': self.las,
'ents_p': self.ents_p,
'ents_r': self.ents_r,
'ents_f': self.ents_f,
'tags_acc': self.tags_acc,
'token_acc': self.token_acc
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
assert len(tokens) == len(gold)
gold_deps = set()
gold_tags = set()
gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
gold_ents = set(tags_to_entities([annot[-1]
for annot in gold.orig_annot]))
for id_, word, tag, head, dep, ner in gold.orig_annot:
gold_tags.add((id_, tag))
if dep not in (None, "") and dep.lower() not in punct_labels:
@ -4,19 +4,15 @@ from __future__ import unicode_literals, absolute_import
cimport cython
from libc.string cimport memcpy
from libc.stdint cimport uint64_t, uint32_t
from murmurhash.mrmr cimport hash64, hash32
from preshed.maps cimport map_iter, key_t
from libc.stdint cimport uint32_t
from murmurhash.mrmr cimport hash64, hash32
import ujson
import dill
from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT
from .typedefs cimport hash_t
from . import util
from .compat import json_dumps
from . import util
cpdef hash_t hash_string(unicode string) except 0:
@ -195,7 +191,7 @@ cdef class StringStore:
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
it doesn't exist. Paths may be either strings or Path-like objects.
path = util.ensure_path(path)
strings = list(self)
@ -225,7 +221,7 @@ cdef class StringStore:
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `StringStore` object.
return ujson.dumps(list(self))
return json_dumps(list(self))
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.
@ -1,8 +1,8 @@
# coding: utf8
#cython: optimize.unpack_method_calls=False
from __future__ import unicode_literals
IDS = {
"": NIL,
@ -464,9 +464,11 @@ IDS = {
# Sort key: returns the element at index 1 of `x`. It is passed as `key=` to
# sorted(IDS.items(), ...) when building NAMES, so entries are ordered by the
# dict value rather than by the string key.
def sort_nums(x):
return x[1]
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)
@ -2,7 +2,7 @@
# cython: profile=True
cimport numpy as np
import numpy
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.ref cimport PyObject, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.typedefs cimport hash_t, class_t
@ -11,7 +11,6 @@ from thinc.extra.search cimport MaxViolation
from .transition_system cimport TransitionSystem, Transition
from .stateclass cimport StateClass
from ..gold cimport GoldParse
from ..tokens.doc cimport Doc
# These are passed as callbacks to thinc.search.Beam
@ -50,7 +49,7 @@ cdef class ParserBeam(object):
cdef public object dones
def __init__(self, TransitionSystem moves, states, golds,
int width, float density):
int width, float density):
self.moves = moves
self.states = states
self.golds = golds
@ -59,7 +58,8 @@ cdef class ParserBeam(object):
cdef StateClass state, st
for state in states:
beam = Beam(self.moves.n_moves, width, density)
beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
beam.initialize(self.moves.init_beam_state, state.c.length,
for i in range(beam.width):
st = <StateClass>beam.at(i)
st.c.offset = state.c.offset
@ -74,7 +74,8 @@ cdef class ParserBeam(object):
def is_done(self):
return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
return all(b.is_done or self.dones[i]
for i, b in enumerate(self.beams))
def __getitem__(self, i):
return self.beams[i]
@ -126,7 +127,8 @@ cdef class ParserBeam(object):
for i in range(beam.size):
state = <StateClass>beam.at(i)
if not state.c.is_final():
self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
self.moves.set_costs(beam.is_valid[i], beam.costs[i],
state, gold)
if follow_gold:
for j in range(beam.nr_class):
if beam.costs[i][j] >= 1:
@ -146,7 +148,10 @@ def get_token_ids(states, int n_tokens):
c_ids += ids.shape[1]
return ids
nr_update = 0
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
states, golds,
state2vec, vec2scores,
@ -167,23 +172,27 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
if pbeam.is_done and gbeam.is_done:
# The beam maps let us find the right row in the flattened scores
# arrays for each state. States are identified by (example id, history).
# We keep a different beam map for each step (since we'll have a flat
# scores array for each step). The beam map will let us take the per-state
# losses, and compute the gradient for each (step, state, class).
# arrays for each state. States are identified by (example id,
# history). We keep a different beam map for each step (since we'll
# have a flat scores array for each step). The beam map will let us
# take the per-state losses, and compute the gradient for each (step,
# state, class).
# Gather all states from the two beams in a list. Some states may occur
# in both beams. To figure out which beam each state belonged to,
# we keep two lists of indices, p_indices and g_indices
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1],
if not states:
# Now that we have our flat list of states, feed them through the model
token_ids = get_token_ids(states, nr_feature)
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
if hist_feats:
hists = numpy.asarray([st.history[:hist_feats] for st in states], dtype='i')
scores, bp_scores = vec2scores.begin_update((vectors, hists), drop=drop)
hists = numpy.asarray([st.history[:hist_feats] for st in states],
scores, bp_scores = vec2scores.begin_update((vectors, hists),
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
@ -192,8 +201,10 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
# Unpack the flat scores into lists for the two beams. The indices arrays
# tell us which example and state the scores-row refers to.
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
for indices in p_indices]
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
for indices in g_indices]
# Now advance the states in the beams. The gold beam is constrained to
# follow only gold analyses.
@ -249,8 +260,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
def get_gradient(nr_class, beam_maps, histories, losses):
The global model assigns a loss to each parse. The beam scores
"""The global model assigns a loss to each parse. The beam scores
are additive, so the same gradient is applied to each action
in the history. This gives the gradient of a single *action*
for a beam state -- so we have "the gradient of loss for taking
@ -270,7 +280,8 @@ def get_gradient(nr_class, beam_maps, histories, losses):
if loss != 0.0 and not numpy.isnan(loss):
nr_step = max(nr_step, len(hist))
for i in range(nr_step):
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
assert len(histories) == len(losses)
for eg_id, hists in enumerate(histories):
for loss, hist in zip(losses[eg_id], hists):
@ -287,5 +298,3 @@ def get_gradient(nr_class, beam_maps, histories, losses):
grads[j][i, clas] += loss
key = key + tuple([clas])
return grads
@ -1 +0,0 @@
# test
@ -4,24 +4,16 @@
# coding: utf-8
from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
import ctypes
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from collections import OrderedDict
from thinc.extra.search cimport Beam
import numpy
from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token
from ._state cimport StateC
from .nonproj import is_nonproj_tree
from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
from ..lexeme cimport Lexeme
from ..gold cimport GoldParse, GoldParseC
from ..structs cimport TokenC
@ -316,14 +308,13 @@ cdef class ArcEager(TransitionSystem):
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
(SHIFT, ['']),
(REDUCE, ['']),
(RIGHT, []),
(LEFT, []),
actions = kwargs.get('actions', OrderedDict((
(SHIFT, ['']),
(REDUCE, ['']),
(RIGHT, []),
(LEFT, []),
(BREAK, ['ROOT']))
seen_actions = set()
for label in kwargs.get('left_labels', []):
if label.upper() != 'ROOT':
@ -363,7 +354,8 @@ cdef class ArcEager(TransitionSystem):
if gold.cand_to_gold[i] is None:
if state.safe_get(i).dep:
predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
predicted.add((i, state.H(i),
predicted.add((i, state.H(i), 'ROOT'))
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
@ -381,7 +373,8 @@ cdef class ArcEager(TransitionSystem):
if not self.has_gold(gold):
return None
for i in range(gold.length):
if gold.heads[i] is None or gold.labels[i] is None: # Missing values
# Missing values
if gold.heads[i] is None or gold.labels[i] is None:
gold.c.heads[i] = i
gold.c.has_dep[i] = False
@ -517,14 +510,15 @@ cdef class ArcEager(TransitionSystem):
# Check projectivity --- leading cause
if is_nonproj_tree(gold.heads):
raise ValueError(
"Could not find a gold-standard action to supervise the dependency "
"Likely cause: the tree is non-projective (i.e. it has crossing "
"arcs -- see spacy/syntax/nonproj.pyx for definitions)\n"
"The ArcEager transition system only supports projective trees.\n"
"To learn non-projective representations, transform the data "
"before training and after parsing. Either pass make_projective=True "
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
"Could not find a gold-standard action to supervise the "
"dependency parser. Likely cause: the tree is "
"non-projective (i.e. it has crossing arcs -- see "
"spacy/syntax/nonproj.pyx for definitions). The ArcEager "
"transition system only supports projective trees. To "
"learn non-projective representations, transform the data "
"before training and after parsing. Either pass "
"make_projective=True to the GoldParse class, or use "
@ -532,12 +526,10 @@ cdef class ArcEager(TransitionSystem):
raise ValueError(
"Could not find a gold-standard action to supervise the dependency "
"The GoldParse was projective.\n"
"The transition system has %d actions.\n"
"State at failure:\n"
"%s" % (self.n_moves, stcls.print_state(gold.words)))
# Adjacent string literals are concatenated verbatim, so every piece must
# carry its own trailing space; without it the message reads
# "...supervise thedependency parser".
"Could not find a gold-standard action to supervise the "
"dependency parser. The GoldParse was projective. The "
"transition system has %d actions. State at failure: %s"
% (self.n_moves, stcls.print_state(gold.words)))
assert n_gold >= 1
def get_beam_annot(self, Beam beam):
@ -558,4 +550,3 @@ cdef class ArcEager(TransitionSystem):
deps[j].setdefault(dep, 0.0)
deps[j][dep] += prob
return heads, deps
@ -1,144 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX
# Generator over the words of `obj` that yields (start, end, label) triples
# for base noun phrases: start is word.left_edge.i, end is word.i+1 and label
# is the value registered for 'NP'.
def english_noun_chunks(obj):
Detect base noun phrases from a dependency parse.
Works on both Doc and Span.
# Dependency labels that mark a word as a possible noun-chunk head.
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
'attr', 'ROOT']
doc = obj.doc # Ensure works on both Doc and Span.
# strings.add() results are what word.dep / head.dep are compared against.
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
# Token indices already covered by a yielded chunk.
seen = set()
for i, word in enumerate(obj):
# NOTE(review): the statements guarded by the checks on `word.pos`,
# `word.i in seen` and `any(... in seen ...)` are not visible in this view
# (presumably `continue`) -- confirm against the full file.
if word.pos not in (NOUN, PROPN, PRON):
# Prevent nested chunks from being produced
if word.i in seen:
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
# Mark the span from the left edge up to and including the word itself.
seen.update(j for j in range(word.left_edge.i, word.i+1))
yield word.left_edge.i, word.i+1, np_label
elif word.dep == conj:
# Walk up the chain of 'conj' heads for as long as the next head lies
# to the left of the current one.
head = word.head
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
seen.update(j for j in range(word.left_edge.i, word.i+1))
yield word.left_edge.i, word.i+1, np_label
# this iterator extracts spans headed by NOUNs starting from the left-most
# syntactic dependent until the NOUN itself
# for close apposition and measurement construction, the span is sometimes
# extended to the right of the NOUN
# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
# just "eine Tasse", same for "das Thema Familie"
# Generator over the tokens of `obj`: yields (start, end, label) triples with
# start = word.left_edge.i, end = rbracket and label = the id of 'NP'.
# NOTE(review): indentation is stripped in this listing and the body of the
# `if i < rbracket:` guard is not visible.
def german_noun_chunks(obj):
# Dependency labels that qualify a nominal token as a chunk head.
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
doc = obj.doc # Ensure works on both Doc and Span.
np_label = doc.vocab.strings.add('NP')
np_deps = set(doc.vocab.strings.add(label) for label in labels)
# Id of the 'nk' label, used below to spot right-hand dependents that
# extend the chunk.
close_app = doc.vocab.strings.add('nk')
# Exclusive right boundary of the most recently produced chunk.
rbracket = 0
for i, word in enumerate(obj):
# NOTE(review): `i` counts positions within `obj`, while rbracket is
# derived from word.i; presumably these only coincide when `obj` starts at
# token 0 -- confirm behaviour for Span input.
if i < rbracket:
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
rbracket = word.i+1
# try to extend the span to the right
# to capture close apposition/measurement constructions
for rdep in doc[word.i].rights:
if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:
rbracket = rdep.i+1
yield word.left_edge.i, rbracket, np_label
# Generator that scans `doc` from its first token and yields
# (left.i, right.i+1, np_label) for each NOUN / PROPN / PRON token, with the
# bounds computed by the nested noun_bounds() helper.
# NOTE(review): indentation is stripped in this listing and some lines appear
# to be missing (e.g. next_token shows two consecutive returns, and the
# filter() test in noun_bounds has no visible body); verify against the real
# file before changing anything.
def es_noun_chunks(obj):
doc = obj.doc
# Labels are looked up with strings[...] here, not strings.add(...).
np_label = doc.vocab.strings['NP']
left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
right_labels = ['flat', 'fixed', 'compound', 'neg']
stop_labels = ['punct']
np_left_deps = [doc.vocab.strings[label] for label in left_labels]
np_right_deps = [doc.vocab.strings[label] for label in right_labels]
stop_deps = [doc.vocab.strings[label] for label in stop_labels]
# Returns token.nbor(); the trailing `return None` is presumably a fallback
# for when nbor() fails -- the surrounding try/except is not visible here.
def next_token(token):
return token.nbor()
return None
# Computes the (left_bound, right_bound) tokens of the chunk rooted at `root`.
def noun_bounds(root):
def is_verb_token(token):
return token.pos in [VERB, AUX]
left_bound = root
# Walk the left children from nearest to farthest; a child whose label is
# in np_left_deps becomes the new left bound.
for token in reversed(list(root.lefts)):
if token.dep in np_left_deps:
left_bound = token
right_bound = root
# For right children with a label in np_right_deps, recurse to find how far
# that child's own chunk reaches.
for token in root.rights:
if (token.dep in np_right_deps):
left, right = noun_bounds(token)
# Tests whether any token between the current left bound and the candidate
# right edge is a verb/aux or carries a stop label.
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
doc[left_bound.i: right.i])):
right_bound = right
return left_bound, right_bound
# NOTE(review): iteration starts at doc[0] rather than at the first token of
# `obj` -- confirm this is intended when `obj` is not the whole document.
token = doc[0]
while token and token.i < len(doc):
if token.pos in [PROPN, NOUN, PRON]:
left, right = noun_bounds(token)
yield left.i, right.i+1, np_label
# Jump to the chunk's right edge so the next step resumes after it.
token = right
token = next_token(token)
# Generator over the tokens of `obj`: yields one (start, end, label) triple
# per noun phrase, spanning word.left_edge.i up to word.right_edge.i+1, with
# label = the string-store id of 'NP'.
# NOTE(review): indentation is stripped in this listing and several guard
# `if` lines have no visible body; what runs under them cannot be confirmed
# from here.
def french_noun_chunks(obj):
# Dependency labels that qualify a nominal token as a chunk head.
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
doc = obj.doc # Ensure works on both Doc and Span.
# These labels are looked up with strings[...]; 'conj' and 'NP' below are
# registered with strings.add(...).
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add('conj')
np_label = doc.vocab.strings.add('NP')
# Token indices already covered by a chunk that has been yielded.
seen = set()
for i, word in enumerate(obj):
# Only NOUN / PROPN / PRON tokens are considered as chunk heads.
if word.pos not in (NOUN, PROPN, PRON):
# Prevent nested chunks from being produced
if word.i in seen:
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
# Record the full subtree span as covered, then emit it.
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
elif word.dep == conj:
head = word.head
# Climb the chain of conjuncts for as long as each head's own head lies
# to its left.
while head.dep == conj and head.head.i < head.i:
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
yield word.left_edge.i, word.right_edge.i+1, np_label
# Registry mapping a short key ('en', 'de', 'es', 'fr' -- presumably language
# codes) to the noun-chunk generator function defined above for it.
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
'es': es_noun_chunks, 'fr': french_noun_chunks}
@ -4,17 +4,12 @@ from __future__ import unicode_literals
from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam
from collections import OrderedDict
import numpy
from thinc.neural.ops import NumpyOps
from .stateclass cimport StateClass
from ._state cimport StateC
from .transition_system cimport Transition
from .transition_system cimport do_func_t
from ..structs cimport TokenC, Entity
from ..gold cimport GoldParseC
from ..gold cimport GoldParse
from ..attrs cimport ENT_TYPE, ENT_IOB
from ..gold cimport GoldParseC, GoldParse
cdef enum:
@ -69,15 +64,14 @@ cdef class BiluoPushDown(TransitionSystem):
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
(MISSING, ['']),
(BEGIN, []),
(IN, []),
(LAST, []),
(UNIT, []),
(OUT, [''])
actions = kwargs.get('actions', OrderedDict((
(MISSING, ['']),
(BEGIN, []),
(IN, []),
(LAST, []),
(UNIT, []),
(OUT, [''])
seen_entities = set()
for entity_type in kwargs.get('entity_types', []):
if entity_type in seen_entities:
@ -160,7 +154,7 @@ cdef class BiluoPushDown(TransitionSystem):
cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name == None:
if name == '-' or name is None:
return Transition(clas=0, move=MISSING, label=0, score=0)
elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0)
@ -328,8 +322,8 @@ cdef class In:
return False
elif preset_ent_iob == 3:
return False
# TODO: Is this quite right?
# I think it's supposed to be ensuring the gazetteer matches are maintained
# TODO: Is this quite right? I think it's supposed to be ensuring the
# gazetteer matches are maintained
elif st.B_(1).ent_iob != preset_ent_iob:
return False
# Don't allow entities to extend across sentence boundaries
@ -354,10 +348,12 @@ cdef class In:
if g_act == MISSING:
return 0
elif g_act == BEGIN:
# I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
# I, Gold B --> True
# (P of bad open entity sunk, R of this entity sunk)
return 0
elif g_act == IN:
# I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
# I, Gold I --> True
# (label forced by prev, if mismatch, P and R both sunk)
return 0
elif g_act == LAST:
# I, Gold L --> True iff this entity sunk and next tag == O
@ -505,11 +501,3 @@ cdef class Out:
return 1
return 1
class OracleError(Exception):
class UnknownMove(Exception):
@ -5,79 +5,55 @@
# coding: utf-8
from __future__ import unicode_literals, print_function
from collections import Counter, OrderedDict
from collections import OrderedDict
import ujson
import json
import contextlib
import numpy
from libc.math cimport exp
cimport cython
cimport cython.parallel
import cytoolz
import dill
import numpy.random
cimport numpy as np
from libcpp.vector cimport vector
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.ref cimport PyObject, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from libc.stdint cimport uint32_t, uint64_t
from libc.string cimport memset, memcpy
from libc.stdlib cimport malloc, calloc, free
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport Vec, VecVec
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
from thinc.extra.eg cimport Example
from libc.math cimport exp
from libcpp.vector cimport vector
from libc.string cimport memset
from libc.stdlib cimport calloc, free
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t, class_t, hash_t
from thinc.extra.search cimport Beam
from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
from preshed.maps cimport MapStruct
from preshed.maps cimport map_get
from thinc.api import layerize, chain, clone, with_flatten
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
from thinc.api import chain, clone
from thinc.v2v import Model, Maxout, Affine
from thinc.misc import LayerNorm
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.ops import CupyOps
from thinc.neural.util import get_array_module
from .. import util
from ..util import get_async, get_cuda_stream
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
from .._ml import Tok2Vec, doc2feats, rebatch
from .._ml import Residual, drop_layer, flatten
from .._ml import zero_init, PrecomputableMaxouts, Tok2Vec, flatten
from .._ml import link_vectors_to_models
from .._ml import HistoryFeatures
from ..compat import json_dumps, copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
from .. import util
from .stateclass cimport StateClass
from ._state cimport StateC
from . import nonproj
from .transition_system import OracleError
from .transition_system cimport TransitionSystem, Transition
from ..structs cimport TokenC
from ..tokens.doc cimport Doc
from ..strings cimport StringStore
from ..gold cimport GoldParse
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
from . import _beam_utils
from .transition_system cimport Transition
from . import _beam_utils, nonproj
def get_templates(*args, **kwargs):
return []
DEBUG = False
def set_debug(val):
global DEBUG
DEBUG = val
cdef class precompute_hiddens:
'''Allow a model to be "primed" by pre-computing input features in bulk.
"""Allow a model to be "primed" by pre-computing input features in bulk.
This is used for the parser, where we want to take a batch of documents,
and compute vectors for each (token, position) pair. These vectors can then
@ -92,7 +68,7 @@ cdef class precompute_hiddens:
so we can save the factor k. This also gives a nice CPU/GPU division:
we can do all our hard maths up front, packed into large multiplications,
and do the hard-to-program parsing on the CPU.
cdef int nF, nO, nP
cdef bint _is_synchronized
cdef public object ops
@ -101,7 +77,8 @@ cdef class precompute_hiddens:
cdef object _cuda_stream
cdef object _bp_hiddens
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, drop=0.):
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
cdef np.ndarray cached
if not isinstance(gpu_cached, numpy.ndarray):
@ -121,8 +98,7 @@ cdef class precompute_hiddens:
self._bp_hiddens = bp_features
cdef const float* get_feat_weights(self) except NULL:
if not self._is_synchronized \
and self._cuda_stream is not None:
if not self._is_synchronized and self._cuda_stream is not None:
self._is_synchronized = True
return <float*>self._cached.data
@ -131,7 +107,8 @@ cdef class precompute_hiddens:
return self.begin_update(X)[0]
def begin_update(self, token_ids, drop=0.):
cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
cdef np.ndarray state_vector = numpy.zeros(
(token_ids.shape[0], self.nO*self.nP), dtype='f')
# This is tricky, but (assuming GPU available);
# - Input to forward on CPU
# - Output from forward on CPU
@ -142,8 +119,8 @@ cdef class precompute_hiddens:
feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP)
feat_weights, &ids[0, 0],
token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
def backward(d_state_vector, sgd=None):
@ -162,10 +139,11 @@ cdef class precompute_hiddens:
state_vector = state_vector.reshape(
(state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
best, which = self.ops.maxout(state_vector)
def backprop(d_best, sgd=None):
return self.ops.backprop_maxout(d_best, which, self.nP)
return best, backprop
return best, backprop
cdef void sum_state_features(float* output,
@ -240,11 +218,15 @@ cdef class Parser:
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
if depth != 1:
raise ValueError("Currently parser depth is hard-coded to 1.")
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
cfg.get('maxout_pieces', 2))
if parser_maxout_pieces != 2:
raise ValueError("Currently parser_maxout_pieces is hard-coded to 2")
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
raise ValueError("Currently parser_maxout_pieces is hard-coded "
"to 2")
token_vector_width = util.env_opt('token_vector_width',
cfg.get('token_vector_width', 128))
hidden_width = util.env_opt('hidden_width',
cfg.get('hidden_width', 200))
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
@ -280,23 +262,19 @@ cdef class Parser:
return (tok2vec, lower, upper), cfg
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
Create a Parser.
"""Create a Parser.
vocab (Vocab):
The vocabulary object. Must be shared with documents to be processed.
The value is set to the .vocab attribute.
moves (TransitionSystem):
Defines how the parse-state is created, updated and evaluated.
The value is set to the .moves attribute unless True (default),
in which case a new instance is created with Parser.Moves().
model (object):
Defines how the parse-state is created, updated and evaluated.
The value is set to the .model attribute unless True (default),
in which case a new instance is created with Parser.Model().
Arbitrary configuration parameters. Set to the .cfg attribute
vocab (Vocab): The vocabulary object. Must be shared with documents
to be processed. The value is set to the `.vocab` attribute.
moves (TransitionSystem): Defines how the parse-state is created,
updated and evaluated. The value is set to the .moves attribute
unless True (default), in which case a new instance is created with
model (object): Defines how the parse-state is created, updated and
evaluated. The value is set to the .model attribute unless True
(default), in which case a new instance is created with
**cfg: Arbitrary configuration parameters. Set to the `.cfg` attribute
self.vocab = vocab
if moves is True:
@ -322,13 +300,10 @@ cdef class Parser:
return (Parser, (self.vocab, self.moves, self.model), None, None)
def __call__(self, Doc doc, beam_width=None, beam_density=None):
Apply the parser or entity recognizer, setting the annotations onto the Doc object.
"""Apply the parser or entity recognizer, setting the annotations onto
the `Doc` object.
doc (Doc): The document to be processed.
doc (Doc): The document to be processed.
if beam_width is None:
beam_width = self.cfg.get('beam_width', 1)
@ -350,16 +325,13 @@ cdef class Parser:
def pipe(self, docs, int batch_size=256, int n_threads=2,
beam_width=None, beam_density=None):
Process a stream of documents.
"""Process a stream of documents.
stream: The sequence of documents to process.
batch_size (int):
The number of documents to accumulate into a working set.
n_threads (int):
The number of threads with which to work on the buffer in parallel.
Yields (Doc): Documents, in order.
stream: The sequence of documents to process.
batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel.
YIELDS (Doc): Documents, in order.
if beam_width is None:
beam_width = self.cfg.get('beam_width', 1)
@ -376,8 +348,8 @@ cdef class Parser:
parse_states = self.parse_batch(subbatch)
beams = []
beams = self.beam_parse(subbatch,
beam_width=beam_width, beam_density=beam_density)
beams = self.beam_parse(subbatch, beam_width=beam_width,
parse_states = []
for beam in beams:
@ -397,9 +369,9 @@ cdef class Parser:
if isinstance(docs, Doc):
docs = [docs]
cuda_stream = get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
docs, cuda_stream, 0.0)
nr_state = len(docs)
nr_class = self.moves.n_moves
nr_dim = tokvecs.shape[1]
@ -413,7 +385,8 @@ cdef class Parser:
feat_weights = state2vec.get_feat_weights()
cdef int i
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(vec2scores._layers[-1].W.T)
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(
cdef np.ndarray hidden_bias = vec2scores._layers[-1].b
hW = <float*>hidden_weights.data
@ -473,9 +446,9 @@ cdef class Parser:
cdef Doc doc
cdef int nr_class = self.moves.n_moves
cdef StateClass stcls, output
cuda_stream = get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
docs, cuda_stream, 0.0)
beams = []
cdef int offset = 0
cdef int j = 0
@ -530,9 +503,7 @@ cdef class Parser:
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs]
golds = [golds]
cuda_stream = get_cuda_stream()
cuda_stream = util.get_cuda_stream()
states, golds, max_steps = self._init_gold_batch(docs, golds)
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
@ -547,7 +518,6 @@ cdef class Parser:
n_steps = 0
while todo:
states, golds = zip(*todo)
token_ids = self.get_token_ids(states)
vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
if drop != 0:
@ -569,8 +539,8 @@ cdef class Parser:
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
get_async(cuda_stream, token_ids),
get_async(cuda_stream, d_vector),
util.get_async(cuda_stream, token_ids),
util.get_async(cuda_stream, d_vector),
@ -603,15 +573,13 @@ cdef class Parser:
states = self.moves.init_batch(docs)
for gold in golds:
cuda_stream = get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
states, golds,
state2vec, vec2scores,
width, density, self.cfg.get('hist_size', 0),
drop=drop, losses=losses)
cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
docs, cuda_stream, drop)
states_d_scores, backprops = _beam_utils.update_beam(
self.moves, self.nr_feature, 500, states, golds, state2vec,
vec2scores, width, density, self.cfg.get('hist_size', 0),
drop=drop, losses=losses)
backprop_lower = []
cdef float batch_size = len(docs)
for i, d_scores in enumerate(states_d_scores):
@ -623,13 +591,14 @@ cdef class Parser:
if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(ids, state2vec.ops.xp.ndarray):
get_async(cuda_stream, ids),
get_async(cuda_stream, d_vector),
util.get_async(cuda_stream, ids),
util.get_async(cuda_stream, d_vector),
backprop_lower.append((ids, d_vector, bp_vectors))
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd,
def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long
@ -779,7 +748,8 @@ cdef class Parser:
def begin_training(self, gold_tuples, pipeline=None, **cfg):
if 'model' in cfg:
self.model = cfg['model']
gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
gold_tuples = nonproj.preprocess_training_data(gold_tuples,
actions = self.moves.get_actions(gold_parses=gold_tuples)
for action, labels in actions.items():
for label in labels:
@ -1,39 +1,37 @@
# coding: utf-8
Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
for doing pseudo-projective parsing implementation uses the HEAD decoration
from __future__ import unicode_literals
from copy import copy
from ..tokens.doc cimport Doc
from ..attrs import DEP, HEAD
def ancestors(tokenid, heads):
# returns all words going from the word up the path to the root
# the path to root cannot be longer than the number of words in the sentence
# this function ends after at most len(heads) steps
# because it would otherwise loop indefinitely on cycles
# Returns all words going from the word up the path to the root. The path
# to root cannot be longer than the number of words in the sentence. This
# function ends after at most len(heads) steps, because it would otherwise
# loop indefinitely on cycles.
head = tokenid
cnt = 0
while heads[head] != head and cnt < len(heads):
head = heads[head]
cnt += 1
yield head
if head == None:
if head is None:
def contains_cycle(heads):
# in an acyclic tree, the path from each word following
# the head relation upwards always ends at the root node
# in an acyclic tree, the path from each word following the head relation
# upwards always ends at the root node
for tokenid in range(len(heads)):
seen = set([tokenid])
for ancestor in ancestors(tokenid,heads):
for ancestor in ancestors(tokenid, heads):
if ancestor in seen:
return seen
@ -45,26 +43,26 @@ def is_nonproj_arc(tokenid, heads):
# if there is a token k, h < k < d such that h is not
# an ancestor of k. Same for h -> d, h > d
head = heads[tokenid]
if head == tokenid: # root arcs cannot be non-projective
if head == tokenid: # root arcs cannot be non-projective
return False
elif head == None: # unattached tokens cannot be non-projective
elif head is None: # unattached tokens cannot be non-projective
return False
start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
for k in range(start,end):
for ancestor in ancestors(k,heads):
if ancestor == None: # for unattached tokens/subtrees
for k in range(start, end):
for ancestor in ancestors(k, heads):
if ancestor is None: # for unattached tokens/subtrees
elif ancestor == head: # normal case: k dominated by h
elif ancestor == head: # normal case: k dominated by h
else: # head not in ancestors: d -> h is non-projective
else: # head not in ancestors: d -> h is non-projective
return True
return False
def is_nonproj_tree(heads):
# a tree is non-projective if at least one arc is non-projective
return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
def decompose(label):
@ -81,32 +79,32 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
for raw_text, sents in gold_tuples:
prepro_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
proj_heads,deco_labels = projectivize(heads,labels)
proj_heads, deco_labels = projectivize(heads, labels)
# set the label to ROOT for each root dependent
deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
deco_labels = ['ROOT' if head == i else deco_labels[i]
for i, head in enumerate(proj_heads)]
# count label frequencies
if label_freq_cutoff > 0:
for label in deco_labels:
if is_decorated(label):
freqs[label] = freqs.get(label,0) + 1
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
freqs[label] = freqs.get(label, 0) + 1
((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
preprocessed.append((raw_text, prepro_sents))
if label_freq_cutoff > 0:
return _filter_labels(preprocessed,label_freq_cutoff,freqs)
return _filter_labels(preprocessed, label_freq_cutoff, freqs)
return preprocessed
def projectivize(heads, labels):
# use the algorithm by Nivre & Nilsson 2005
# assumes heads to be a proper tree, i.e. connected and cycle-free
# returns a new pair (heads,labels) which encode
# a projective and decorated tree
# Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
# which encode a projective and decorated tree.
proj_heads = copy(heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
if smallest_np_arc == None: # this sentence is already projective
if smallest_np_arc is None: # this sentence is already projective
return proj_heads, copy(labels)
while smallest_np_arc != None:
while smallest_np_arc is not None:
_lift(smallest_np_arc, proj_heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
deco_labels = _decorate(heads, proj_heads, labels)
@ -114,24 +112,26 @@ def projectivize(heads, labels):
def deprojectivize(tokens):
# reattach arcs with decorated labels (following HEAD scheme)
# for each decorated arc X||Y, search top-down, left-to-right,
# breadth-first until hitting a Y then make this the new head
# Reattach arcs with decorated labels (following HEAD scheme). For each
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
# hitting a Y then make this the new head.
for token in tokens:
if is_decorated(token.dep_):
newlabel,headlabel = decompose(token.dep_)
newhead = _find_new_head(token,headlabel)
newlabel, headlabel = decompose(token.dep_)
newhead = _find_new_head(token, headlabel)
token.head = newhead
token.dep_ = newlabel
return tokens
def _decorate(heads, proj_heads, labels):
# uses decoration scheme HEAD from Nivre & Nilsson 2005
assert(len(heads) == len(proj_heads) == len(labels))
deco_labels = []
for tokenid,head in enumerate(heads):
for tokenid, head in enumerate(heads):
if head != proj_heads[tokenid]:
deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
'%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
return deco_labels
@ -143,9 +143,9 @@ def _get_smallest_nonproj_arc(heads):
# and ties are broken left to right
smallest_size = float('inf')
smallest_np_arc = None
for tokenid,head in enumerate(heads):
for tokenid, head in enumerate(heads):
size = abs(tokenid-head)
if size < smallest_size and is_nonproj_arc(tokenid,heads):
if size < smallest_size and is_nonproj_arc(tokenid, heads):
smallest_size = size
smallest_np_arc = tokenid
return smallest_np_arc
@ -168,8 +168,10 @@ def _find_new_head(token, headlabel):
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child.is_space: continue
if child == token: continue
if child.is_space:
if child == token:
if child.dep_ == headlabel:
return child
@ -184,7 +186,10 @@ def _filter_labels(gold_tuples, cutoff, freqs):
for raw_text, sents in gold_tuples:
filtered_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
filtered_labels = [decompose(label)[0]
if freqs.get(label, cutoff) < cutoff
else label for label in labels]
((ids, words, tags, heads, filtered_labels, iob), ctnts))
filtered.append((raw_text, filtered_sents))
return filtered
@ -2,17 +2,8 @@
# cython: infer_types=True
from __future__ import unicode_literals
from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t, uint64_t
import numpy
from ..vocab cimport EMPTY_LEXEME
from ..structs cimport Entity
from ..lexeme cimport Lexeme
from ..symbols cimport punct
from ..attrs cimport IS_SPACE
from ..attrs cimport attr_id_t
from ..tokens.token cimport Token
from ..tokens.doc cimport Doc
@ -2,17 +2,17 @@
# coding: utf-8
from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from collections import defaultdict, OrderedDict
from collections import OrderedDict
import ujson
from .. import util
from ..structs cimport TokenC
from .stateclass cimport StateClass
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
from ..typedefs cimport attr_t
from ..compat import json_dumps
from .. import util
cdef weight_t MIN_SCORE = -90000
@ -136,11 +136,12 @@ cdef class TransitionSystem:
print([gold.c.ner[i].clas for i in range(gold.length)])
print([gold.c.ner[i].move for i in range(gold.length)])
print([gold.c.ner[i].label for i in range(gold.length)])
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
print("Self labels",
[self.c[i].label for i in range(self.n_moves)])
raise ValueError(
"Could not find a gold-standard action to supervise "
"the entity recognizer\n"
"The transition system has %d actions." % (self.n_moves))
"the entity recognizer. The transition system has "
"%d actions." % (self.n_moves))
def get_class_name(self, int clas):
act = self.c[clas]
@ -149,7 +150,7 @@ cdef class TransitionSystem:
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, int) and \
not isinstance(label_name, long):
not isinstance(label_name, long):
label_id = self.strings.add(label_name)
label_id = label_name
@ -186,7 +187,7 @@ cdef class TransitionSystem:
'name': self.move_name(trans.move, trans.label)
serializers = {
'transitions': lambda: ujson.dumps(transitions),
'transitions': lambda: json_dumps(transitions),
'strings': lambda: self.strings.to_bytes()
return util.to_bytes(serializers, exclude)
@ -1,17 +0,0 @@
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.extra.eg cimport Example
from thinc.structs cimport ExampleC
from .structs cimport TokenC
from .vocab cimport Vocab
# Declaration of TaggerModel, an AveragedPerceptron subclass exposing one
# C-level method: set_featuresC(eg, tokens, i), which may raise (`except *`).
cdef class TaggerModel(AveragedPerceptron):
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *
# Declaration of the Tagger extension type and its attributes.
cdef class Tagger:
# Read-only from Python: the vocabulary and the statistical model.
cdef readonly Vocab vocab
cdef readonly TaggerModel model
# Readable and writable from Python: frequency counts and configuration.
cdef public dict freqs
cdef public object cfg
@ -1,253 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from collections import defaultdict
from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t
from thinc.extra.eg cimport Example
from thinc.structs cimport ExampleC
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec
from .tokens.doc cimport Doc
from .attrs cimport TAG
from .gold cimport GoldParse
from .attrs cimport *
cpdef enum:
# Averaged-perceptron model for the tagger: adds a weight-update step and the
# feature extraction for a five-token window.
# NOTE(review): indentation is stripped in this listing.
cdef class TaggerModel(AveragedPerceptron):
# Perceptron update for one example. `best` is chosen by
# VecVec.arg_max_if_zero over the scores and costs; when the guess differs
# from it, each active feature's weight is moved by -value for `best` and
# +value for `guess`.
def update(self, Example eg):
self.time += 1
guess = eg.guess
best = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class)
if guess != best:
for feat in eg.c.features[:eg.c.nr_feat]:
self.update_weight(feat.key, best, -feat.value)
self.update_weight(feat.key, guess, feat.value)
# Fills eg.atoms from tokens i-2 .. i+2 and then runs the feature extracter,
# storing the resulting feature count in eg.nr_feat.
# NOTE(review): tokens[i-2] and tokens[i+2] are read without bounds checks --
# presumably the token array is padded on both sides; confirm.
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
_fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
_fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
_fill_from_token(&eg.atoms[W_orth], &tokens[i])
_fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])
_fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])
eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
# Writes eight context atoms for token `t` into `context`:
# slots 0-6 hold lower, cluster, shape, prefix, suffix, tag and lemma;
# slot 7 holds a small category code derived from the lexeme flags.
# NOTE(review): indentation is stripped in this listing and the final
# `context[7] = 0` is presumably the fallback of the if/elif chain (its
# `else:` line is not visible).
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
context[0] = t.lex.lower
context[1] = t.lex.cluster
context[2] = t.lex.shape
context[3] = t.lex.prefix
context[4] = t.lex.suffix
context[5] = t.tag
context[6] = t.lemma
# First matching flag wins: 1 = alphabetic, 2 = punctuation, 3 = URL-like,
# 4 = number-like.
if t.lex.flags & (1 << IS_ALPHA):
context[7] = 1
elif t.lex.flags & (1 << IS_PUNCT):
context[7] = 2
elif t.lex.flags & (1 << LIKE_URL):
context[7] = 3
elif t.lex.flags & (1 << LIKE_NUM):
context[7] = 4
context[7] = 0
cdef class Tagger:
"""Annotate part-of-speech tags on Doc objects."""
# Constructor: stores vocab/model/cfg and seeds the per-tag frequency table.
# NOTE(review): indentation is stripped in this listing; the docstring's
# closing quotes and the continuation of the TaggerModel(...) call are not
# visible.
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
"""Create a Tagger.
vocab (Vocab): The vocabulary object. Must be shared with documents to
be processed.
model (thinc.linear.AveragedPerceptron): The statistical model.
RETURNS (Tagger): The newly constructed object.
if model is None:
model = TaggerModel(cfg.get('features', self.feature_templates),
self.vocab = vocab
self.model = model
self.model.l1_penalty = 0.0
# TODO: Move this to tag map
# Start every known tag, and the id 0, with a count of 1.
self.freqs = {TAG: defaultdict(int)}
for tag in self.tag_names:
self.freqs[TAG][self.vocab.strings[tag]] = 1
self.freqs[TAG][0] = 1
self.cfg = cfg
# Returns the tag names held by the vocabulary's morphology.
# NOTE(review): other methods read this as an attribute (self.tag_names), so
# a @property decorator is presumably missing from this listing.
def tag_names(self):
return self.vocab.morphology.tag_names
# Pickle support: the object is rebuilt by calling its class with
# (vocab, model); no extra state is passed, so cfg and freqs are not part of
# the reduce tuple.
def __reduce__(self):
return (self.__class__, (self.vocab, self.model), None, None)
# Assigns the given tag strings to the document, one per token:
# tag_strs[i] is applied to token i via the vocabulary's morphology.
def tag_from_strings(self, Doc tokens, object tag_strs):
cdef int i
for i in range(tokens.length):
self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i])
tokens.is_tagged = True
# Reset the document's Python-level token list to one None per token.
tokens._py_tokens = [None] * tokens.length
# Tags a document in place. Returns 0 immediately for an empty document.
# NOTE(review): the docstring calls the parameter `doc`, but the signature
# names it `tokens`.
# NOTE(review): indentation is stripped in this listing and some lines are
# not visible (the rest of the Example(...) constructor and the call that
# ends in `eg.c.features, eg.c.nr_feat)`).
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.
doc (Doc): The tokens to be tagged.
if tokens.length == 0:
return 0
cdef Pool mem = Pool()
cdef int i, tag
cdef Example eg = Example(nr_atom=N_CONTEXT_FIELDS,
for i in range(tokens.length):
# Tests for tokens whose pos field is still 0.
if tokens.c[i].pos == 0:
self.model.set_featuresC(&eg.c, tokens.c, i)
eg.c.features, eg.c.nr_feat)
# Pick the highest-scoring class among those flagged valid and assign it.
guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
# Reset the scores so the Example can be reused.
eg.fill_scores(0, eg.c.nr_class)
tokens.is_tagged = True
# Reset the document's Python-level token list to one None per token.
tokens._py_tokens = [None] * tokens.length
# Generator over `stream` that yields each document in order.
# NOTE(review): batch_size and n_threads are not referenced in the visible
# body, and the docstring's mention of "the Matcher implementation" looks
# copied from another component.
# NOTE(review): indentation is stripped in this listing; any statement
# between the `for` line and the `yield` is not visible.
def pipe(self, stream, batch_size=1000, n_threads=2):
"""Tag a stream of documents.
stream: The sequence of documents to tag.
batch_size (int): The number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the Matcher implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
for doc in stream:
yield doc
def update(self, Doc tokens, GoldParse gold, itn=0):
"""Update the statistical model, with tags supplied for the given document.
doc (Doc): The document to update on.
gold (GoldParse): Manager for the gold-standard tags.
RETURNS (int): Number of tags predicted correctly.
gold_tag_strs = gold.tags
assert len(tokens) == len(gold_tag_strs)
for tag in gold_tag_strs:
if tag != None and tag not in self.tag_names:
msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "
"gold tags, to maintain coarse-grained mapping.")
raise ValueError(msg % tag)
golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
cdef int correct = 0
cdef Pool mem = Pool()
cdef Example eg = Example(
for i in range(tokens.length):
self.model.set_featuresC(&eg.c, tokens.c, i)
eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]
eg.c.features, eg.c.nr_feat)
self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)
correct += eg.cost == 0
self.freqs[TAG][tokens.c[i].tag] += 1
eg.fill_scores(0, eg.c.nr_class)
eg.fill_costs(0, eg.c.nr_class)
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
return correct
feature_templates = (
(P1_lemma, P1_pos),
(P2_lemma, P2_pos),
(P1_pos, P2_pos),
(P1_pos, W_orth),
@ -8,12 +8,11 @@ from cython.operator cimport preincrement as preinc
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
import regex as re
from .strings cimport hash_string
from . import util
cimport cython
from .tokens.doc cimport Doc
from .strings cimport hash_string
from . import util
cdef class Tokenizer:
@ -21,7 +20,7 @@ cdef class Tokenizer:
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
suffix_search=None, infix_finditer=None, token_match=None):
suffix_search=None, infix_finditer=None, token_match=None):
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
vocab (Vocab): A storage container for lexical types.
@ -74,9 +73,8 @@ cdef class Tokenizer:
RETURNS (Doc): A container for linguistic annotations.
if len(string) >= (2 ** 30):
raise ValueError(
"String is too long: %d characters. Max is 2**30." % len(string)
msg = "String is too long: %d characters. Max is 2**30."
raise ValueError(msg % len(string))
cdef int length = len(string)
cdef Doc doc = Doc(self.vocab)
if length == 0:
@ -122,8 +120,8 @@ cdef class Tokenizer:
"""Tokenize a stream of texts.
texts: A sequence of unicode texts.
batch_size (int): The number of texts to accumulate in an internal buffer.
n_threads (int): The number of threads to use, if the implementation
batch_size (int): Number of texts to accumulate in an internal buffer.
n_threads (int): Number of threads to use, if the implementation
supports multi-threading. The default tokenizer is single-threaded.
YIELDS (Doc): A sequence of Doc objects, in order.
@ -232,8 +230,8 @@ cdef class Tokenizer:
if not matches:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
# let's say we have dyn-o-mite-dave
# the regex finds the start and end positions of the hyphens
# let's say we have dyn-o-mite-dave - the regex finds the
# start and end positions of the hyphens
start = 0
for match in matches:
infix_start = match.start()
@ -293,8 +291,8 @@ cdef class Tokenizer:
return list(self.infix_finditer(string))
def find_prefix(self, unicode string):
"""Find the length of a prefix that should be segmented from the string,
or None if no prefix rules match.
"""Find the length of a prefix that should be segmented from the
string, or 0 if no prefix rules match.
string (unicode): The string to segment.
RETURNS (int): The length of the prefix if present, otherwise 0.
@ -305,8 +303,8 @@ cdef class Tokenizer:
return (match.end() - match.start()) if match is not None else 0
def find_suffix(self, unicode string):
"""Find the length of a suffix that should be segmented from the string,
or None if no suffix rules match.
"""Find the length of a suffix that should be segmented from the
string, or None if no suffix rules match.
string (unicode): The string to segment.
RETURNS (int): The length of the suffix if present, otherwise `None`.
@ -326,8 +324,8 @@ cdef class Tokenizer:
string (unicode): The string to specially tokenize.
token_attrs (iterable): A sequence of dicts, where each dict describes
a token and its attributes. The `ORTH` fields of the attributes must
exactly match the string when they are concatenated.
a token and its attributes. The `ORTH` fields of the attributes
must exactly match the string when they are concatenated.
substrings = list(substrings)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
@ -343,7 +341,7 @@ cdef class Tokenizer:
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
it doesn't exist. Paths may be either strings or Path-like objects.
with path.open('wb') as file_:
@ -2,4 +2,4 @@ from .doc import Doc
from .token import Token
from .span import Span
__all__ = [Doc, Token, Span]
__all__ = ['Doc', 'Token', 'Span']
@ -1,21 +0,0 @@
cdef class Binder:
def __init__(self, *docs):
def __iter__(self):
def __reduce__(self):
def to_bytes(self):
def from_bytes(cls, data):
def to_disk(self):
def from_disk(self, path):
@ -23,9 +23,9 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs import intify_attrs, IDS
from ..attrs cimport attr_id_t
from ..attrs cimport SENT_START
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, SENT_START
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..util import normalize_slice
from ..compat import is_config, copy_reg, pickle
@ -78,24 +78,25 @@ def _get_chunker(lang):
cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary strings.
The `Doc` object holds an array of `TokenC` structs. The Python-level
`Token` and `Span` objects are views of this array, i.e. they don't own
the data themselves.
annotations to numpy arrays, losslessly serialize to compressed binary
strings. The `Doc` object holds an array of `TokenC` structs. The
Python-level `Token` and `Span` objects are views of this array, i.e.
they don't own the data themselves.
EXAMPLE: Construction 1
>>> doc = nlp(u'Some text')
Construction 2
>>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
spaces=[True, False, False])
def set_extension(cls, name, default=None, method=None,
getter=None, setter=None):
nr_defined = sum(t is not None for t in (default, getter, setter, method))
assert nr_defined == 1
Underscore.doc_extensions[name] = (default, method, getter, setter)
Underscore.doc_extensions[name] = (default, method, getter, setter)
def get_extension(cls, name):
@ -109,15 +110,14 @@ cdef class Doc:
"""Create a Doc object.
vocab (Vocab): A vocabulary object, which must match any models you want
to use (e.g. tokenizer, parser, entity recognizer).
vocab (Vocab): A vocabulary object, which must match any models you
want to use (e.g. tokenizer, parser, entity recognizer).
words (list or None): A list of unicode strings to add to the document
as words. If `None`, defaults to empty list.
spaces (list or None): A list of boolean values, of the same length as
words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)`
user_data (dict or None): Optional extra data to attach to the Doc.
RETURNS (Doc): The newly constructed object.
self.vocab = vocab
@ -153,10 +153,10 @@ cdef class Doc:
spaces = [True] * len(words)
elif len(spaces) != len(words):
raise ValueError(
"Arguments 'words' and 'spaces' should be sequences of the "
"same length, or 'spaces' should be left default at None. "
"spaces should be a sequence of booleans, with True meaning "
"that the word owns a ' ' character following it.")
"Arguments 'words' and 'spaces' should be sequences of "
"the same length, or 'spaces' should be left default at "
"None. spaces should be a sequence of booleans, with True "
"meaning that the word owns a ' ' character following it.")
orths_and_spaces = zip(words, spaces)
if orths_and_spaces is not None:
for orth_space in orths_and_spaces:
@ -166,7 +166,8 @@ cdef class Doc:
elif isinstance(orth_space, bytes):
raise ValueError(
"orths_and_spaces expects either List(unicode) or "
"List((unicode, bool)). Got bytes instance: %s" % (str(orth_space)))
"List((unicode, bool)). "
"Got bytes instance: %s" % (str(orth_space)))
orth, has_space = orth_space
# Note that we pass self.mem here --- we have ownership, if LexemeC
@ -186,7 +187,8 @@ cdef class Doc:
def __getitem__(self, object i):
"""Get a `Token` or `Span` object.
i (int or tuple) The index of the token, or the slice of the document to get.
i (int or slice): The index of the token, or the slice of the document
to get.
RETURNS (Token or Span): The token at `doc[i]`, or the span at
`doc[start : end]`.
@ -199,11 +201,11 @@ cdef class Doc:
>>> doc[start : end]]
Get a `Span` object, starting at position `start` and ending at
position `end`, where `start` and `end` are token indices. For
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
Stepped slices (e.g. `doc[start : end : step]`) are not supported,
as `Span` objects must be contiguous (cannot have gaps). You can use
negative indices and open-ended ranges, which have their normal
Python semantics.
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and
4. Stepped slices (e.g. `doc[start : end : step]`) are not
supported, as `Span` objects must be contiguous (cannot have gaps).
You can use negative indices and open-ended ranges, which have
their normal Python semantics.
if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
@ -262,8 +264,10 @@ cdef class Doc:
doc (Doc): The parent document.
start (int): The index of the first character of the span.
end (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
label (uint64 or string): A label to attach to the Span, e.g. for
named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span.
RETURNS (Span): The newly constructed object.
if not isinstance(label, int):
@ -322,7 +326,8 @@ cdef class Doc:
if self._vector is not None:
return self._vector
elif not len(self):
self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
self._vector = numpy.zeros((self.vocab.vectors_length,),
return self._vector
elif self.has_vector:
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
@ -334,7 +339,8 @@ cdef class Doc:
self._vector = self.tensor.mean(axis=0)
return self._vector
return numpy.zeros((self.vocab.vectors_length,), dtype='float32')
return numpy.zeros((self.vocab.vectors_length,),
def __set__(self, value):
self._vector = value
@ -377,13 +383,14 @@ cdef class Doc:
return self.text
property ents:
"""Iterate over the entities in the document. Yields named-entity `Span`
objects, if the entity recognizer has been applied to the document.
"""Iterate over the entities in the document. Yields named-entity
`Span` objects, if the entity recognizer has been applied to the
YIELDS (Span): Entities in the document.
EXAMPLE: Iterate over the span to get individual Token objects, or access
the label:
EXAMPLE: Iterate over the span to get individual Token objects,
or access the label:
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
>>> ents = list(tokens.ents)
@ -419,7 +426,8 @@ cdef class Doc:
def __set__(self, ents):
# 1. Allow negative matches
# 2. Ensure pre-set NERs are not over-written during statistical prediction
# 2. Ensure pre-set NERs are not over-written during statistical
# prediction
# 3. Test basic data-driven ORTH gazetteer
# 4. Test more nuanced date and currency regex
cdef int i
@ -428,7 +436,7 @@ cdef class Doc:
# At this point we don't know whether the NER has run over the
# Doc. If the ent_iob is missing, leave it missing.
if self.c[i].ent_iob != 0:
self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
cdef attr_t ent_type
cdef int start, end
for ent_info in ents:
@ -456,10 +464,11 @@ cdef class Doc:
property noun_chunks:
"""Iterate over the base noun phrases in the document. Yields base
noun-phrase #[code Span] objects, if the document has been syntactically
parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
not permit other NPs to be nested within it – so no NP-level
coordination, no prepositional phrases, and no relative clauses.
noun-phrase `Span` objects, if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
phrase that does not permit other NPs to be nested within it – so no
NP-level coordination, no prepositional phrases, and no relative
YIELDS (Span): Noun chunks in the document.
@ -467,12 +476,14 @@ cdef class Doc:
if not self.is_parsed:
raise ValueError(
"noun_chunks requires the dependency parse, which "
"requires data to be installed. For more info, see the "
"requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__)
# Accumulate the result before beginning to iterate over it. This prevents
# the tokenisation from being changed out from under us during the iteration.
# The tricky thing here is that Span accepts its tokenisation changing,
# so it's okay once we have the Span objects. See Issue #375
# Accumulate the result before beginning to iterate over it. This
# prevents the tokenisation from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts
# its tokenisation changing, so it's okay once we have the Span
# objects. See Issue #375.
spans = []
for start, end, label in self.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label))
@ -497,8 +508,9 @@ cdef class Doc:
if not self.is_parsed:
raise ValueError(
"sentence boundary detection requires the dependency parse, which "
"requires data to be installed. For more info, see the "
"Sentence boundary detection requires the dependency "
"parse, which requires a statistical model to be "
"installed and loaded. For more info, see the "
"documentation: \n%s\n" % about.__docs_models__)
cdef int i
start = 0
@ -537,12 +549,11 @@ cdef class Doc:
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Export given token attributes to a numpy `ndarray`.
If `attr_ids` is a sequence of M attributes, the output array will
be of shape `(N, M)`, where N is the length of the `Doc`
(in tokens). If `attr_ids` is a single attribute, the output shape will
be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
or string name (e.g. 'LEMMA' or 'lemma').
If `attr_ids` is a sequence of M attributes, the output array will be
of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If
`attr_ids` is a single attribute, the output shape will be (N,). You
can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
string name (e.g. 'LEMMA' or 'lemma').
attr_ids (list[]): A list of attributes (int IDs or string names).
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
@ -566,18 +577,19 @@ cdef class Doc:
# Allow strings, e.g. 'lemma' or 'LEMMA'
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_)
for id_ in py_attr_ids]
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
# Make an array from the attributes --- otherwise our inner loop is
# Python dict iteration.
attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
output = numpy.ndarray(shape=(self.length, len(attr_ids)),
for i in range(self.length):
for j, feature in enumerate(attr_ids):
output[i, j] = get_token_attr(&self.c[i], feature)
# Handle 1d case
return output if len(attr_ids) >= 2 else output.reshape((self.length,))
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
def count_by(self, attr_id_t attr_id, exclude=None,
PreshCounter counts=None):
"""Count the frequencies of a given attribute. Produces a dict of
`{attribute (int): count (ints)}` frequencies, keyed by the values of
the given attribute ID.
@ -641,13 +653,12 @@ cdef class Doc:
def from_array(self, attrs, array):
if SENT_START in attrs and HEAD in attrs:
raise ValueError(
"Conflicting attributes specified in doc.from_array():\n"
"Conflicting attributes specified in doc.from_array(): "
"The HEAD attribute currently sets sentence boundaries implicitly,\n"
"based on the tree structure. This means the HEAD attribute would "
"potentially override the sentence boundaries set by SENT_START.\n"
"See https://github.com/spacy-io/spaCy/issues/235 for details and "
"workarounds, and to propose solutions.")
"The HEAD attribute currently sets sentence boundaries "
"implicitly, based on the tree structure. This means the HEAD "
"attribute would potentially override the sentence boundaries "
"set by SENT_START.")
cdef int i, col
cdef attr_id_t attr_id
cdef TokenC* tokens = self.c
@ -675,18 +686,14 @@ cdef class Doc:
return self
def get_lca_matrix(self):
Calculates the lowest common ancestor matrix
for a given Spacy doc.
Returns LCA matrix containing the integer index
of the ancestor, or -1 if no common ancestor is
found (ex if span excludes a necessary ancestor).
Apologies about the recursion, but the
impact on performance is negligible given
the natural limitations on the depth of a typical human sentence.
"""Calculates the lowest common ancestor matrix for a given `Doc`.
Returns LCA matrix containing the integer index of the ancestor, or -1
if no common ancestor is found (e.g. if span excludes a necessary
ancestor). Apologies about the recursion, but the impact on
performance is negligible given the natural limitations on the depth
of a typical human sentence.
# Efficiency notes:
# We can easily improve the performance here by iterating in Cython.
# To loop over the tokens in Cython, the easiest way is:
# for token in doc.c[:doc.c.length]:
@ -705,7 +712,8 @@ cdef class Doc:
elif (token_j.head == token_j) and (token_k.head == token_k):
lca_index = -1
lca_index = __pairwise_lca(token_j.head, token_k.head, lca_matrix)
lca_index = __pairwise_lca(token_j.head, token_k.head,
lca_matrix[token_j.i][token_k.i] = lca_index
lca_matrix[token_k.i][token_j.i] = lca_index
@ -719,14 +727,13 @@ cdef class Doc:
token_k = self[k]
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix)
lca_matrix[k][j] = lca_matrix[j][k]
return lca_matrix
def to_disk(self, path, **exclude):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
it doesn't exist. Paths may be either strings or Path-like objects.
with path.open('wb') as file_:
@ -749,7 +756,7 @@ cdef class Doc:
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations.
# Msgpack doesn't distinguish between lists and tuples, which is
# vexing for user data. As a best guess, we *know* that within
# keys, we must have tuples. In values we just have to hope
@ -792,7 +799,8 @@ cdef class Doc:
# keys, we must have tuples. In values we just have to hope
# users don't mind getting a list instead of a tuple.
if 'user_data' not in exclude and 'user_data_keys' in msg:
user_data_keys = msgpack.loads(msg['user_data_keys'], use_list=False)
user_data_keys = msgpack.loads(msg['user_data_keys'],
user_data_values = msgpack.loads(msg['user_data_values'])
for key, value in zip(user_data_keys, user_data_values):
self.user_data[key] = value
@ -819,14 +827,15 @@ cdef class Doc:
return self
def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
is merged into a single token. If `start_idx` and `end_idx `do not mark
start and end token boundaries, the document remains unchanged.
"""Retokenize the document, such that the span at
`doc.text[start_idx : end_idx]` is merged into a single token. If
`start_idx` and `end_idx` do not mark start and end token boundaries,
the document remains unchanged.
start_idx (int): The character index of the start of the slice to merge.
end_idx (int): The character index after the end of the slice to merge.
start_idx (int): Character index of the start of the slice to merge.
end_idx (int): Character index after the end of the slice to merge.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root token of the span.
attributes are inherited from the syntactic root of the span.
RETURNS (Token): The newly merged token, or `None` if the start and end
indices did not fall at token boundaries.
@ -847,10 +856,11 @@ cdef class Doc:
attributes[ENT_TYPE] = attributes['ent_type']
elif args:
raise ValueError(
"Doc.merge received %d non-keyword arguments. "
"Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
"Doc.merge received %d non-keyword arguments. Expected either "
"3 arguments (deprecated), or 0 (use keyword arguments). "
"Arguments supplied:\n%s\n"
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
"Keyword arguments: %s\n" % (len(args), repr(args),
# More deprecated attribute handling =/
if 'label' in attributes:
@ -882,8 +892,9 @@ cdef class Doc:
Token.set_struct_attr(token, attr_name, attr_value)
# Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a dependency
# bridges over the entity. Here the alignment of the tokens changes.
# Before thinking of something simpler, beware the case where a
# dependency bridges over the entity. Here the alignment of the
# tokens changes.
span_root = span.root.i
token.dep = span.root.dep
# We update token.lex after keeping span root and dep, since
@ -932,8 +943,9 @@ cdef class Doc:
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
@ -1008,7 +1020,7 @@ def pickle_doc(doc):
def unpickle_doc(vocab, hooks_and_data, bytes_data):
user_data, doc_hooks, span_hooks, token_hooks = dill.loads(hooks_and_data)
doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data,
@ -1018,4 +1030,3 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data):
copy_reg.pickle(Doc, pickle_doc, unpickle_doc)
@ -43,8 +43,8 @@ def POS_tree(root, light=False, flat=False):
def parse_tree(doc, light=False, flat=False):
"""Makes a copy of the doc, then construct a syntactic parse tree, similar to
the one used in displaCy. Generates the POS tree for all sentences in a doc.
"""Make a copy of the doc and construct a syntactic parse tree similar to
displaCy. Generates the POS tree for all sentences in a doc.
doc (Doc): The doc for parsing.
RETURNS (dict): The parse tree.
@ -66,8 +66,9 @@ def parse_tree(doc, light=False, flat=False):
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]))
merge_ents(doc_clone) # merge the entities into single tokens first
return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents]
return [POS_tree(sent.root, light=light, flat=flat)
for sent in doc_clone.sents]
@ -35,15 +35,16 @@ cdef class Span:
def has_extension(cls, name):
return name in Underscore.span_extensions
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
def __cinit__(self, Doc doc, int start, int end, attr_t label=0,
vector=None, vector_norm=None):
"""Create a `Span` object from the slice `doc[start : end]`.
doc (Doc): The parent document.
start (int): The index of the first token of the span.
end (int): The index of the first token after the span.
label (uint64): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation
of the span.
RETURNS (Span): The newly constructed object.
if not (0 <= start <= end <= len(doc)):
@ -127,14 +128,17 @@ cdef class Span:
def _(self):
"""User space for adding custom attribute extensions."""
return Underscore(Underscore.span_extensions, self,
start=self.start_char, end=self.end_char)
def as_doc(self):
'''Create a Doc object view of the Span's data.
# TODO: fix
"""Create a `Doc` object view of the Span's data. This is mostly
useful for C-typed interfaces.
This is mostly useful for C-typed interfaces.
RETURNS (Doc): The `Doc` view of the span.
cdef Doc doc = Doc(self.doc.vocab)
doc.length = self.end-self.start
doc.c = &self.doc.c[self.start]
@ -162,7 +166,8 @@ cdef class Span:
attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token.
return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
return self.doc.merge(self.start_char, self.end_char, *args,
def similarity(self, other):
"""Make a semantic similarity estimate. The default estimate is cosine
@ -179,24 +184,19 @@ cdef class Span:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
def get_lca_matrix(self):
Calculates the lowest common ancestor matrix
for a given Spacy span.
Returns LCA matrix containing the integer index
of the ancestor, or -1 if no common ancestor is
found (ex if span excludes a necessary ancestor).
Apologies about the recursion, but the
impact on performance is negligible given
the natural limitations on the depth of a typical human sentence.
"""Calculates the lowest common ancestor matrix for a given `Span`.
Returns LCA matrix containing the integer index of the ancestor, or -1
if no common ancestor is found (e.g. if span excludes a necessary
ancestor). Apologies about the recursion, but the impact on
performance is negligible given the natural limitations on the depth
of a typical human sentence.
def __pairwise_lca(token_j, token_k, lca_matrix, margins):
offset = margins[0]
token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k
token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j
token_j_i = token_j.i - offset
token_k_i = token_k.i - offset
if lca_matrix[token_j_i][token_k_i] != -2:
return lca_matrix[token_j_i][token_k_i]
elif token_j == token_k:
@ -209,23 +209,19 @@ cdef class Span:
lca_index = -1
lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins)
lca_matrix[token_j_i][token_k_i] = lca_index
lca_matrix[token_k_i][token_j_i] = lca_index
return lca_index
lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32)
margins = [self.start, self.end]
for j in range(len(self)):
token_j = self[j]
for k in range(len(self)):
token_k = self[k]
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins)
lca_matrix[k][j] = lca_matrix[j][k]
return lca_matrix
cpdef np.ndarray to_array(self, object py_attr_ids):
@ -266,10 +262,7 @@ cdef class Span:
self.end = end + 1
property sent:
"""The sentence span that this span is a part of.
RETURNS (Span): The sentence span that the span is a part of.
"""RETURNS (Span): The sentence span that the span is a part of."""
def __get__(self):
if 'sent' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sent'](self)
@ -282,13 +275,10 @@ cdef class Span:
n += 1
if n >= self.doc.length:
raise RuntimeError
return self.doc[root.l_edge : root.r_edge + 1]
return self.doc[root.l_edge:root.r_edge + 1]
property has_vector:
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""RETURNS (bool): Whether a word vector is associated with the object.
def __get__(self):
if 'has_vector' in self.doc.user_span_hooks:
@ -310,10 +300,7 @@ cdef class Span:
return self._vector
property vector_norm:
"""The L2 norm of the document's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""RETURNS (float): The L2 norm of the vector representation."""
def __get__(self):
if 'vector_norm' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['vector'](self)
@ -327,7 +314,9 @@ cdef class Span:
return self._vector_norm
property sentiment:
# TODO: docstring
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the span.
def __get__(self):
if 'sentiment' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sentiment'](self)
@ -335,10 +324,7 @@ cdef class Span:
return sum([token.sentiment for token in self]) / len(self)
property text:
"""A unicode representation of the span text.
RETURNS (unicode): The original verbatim text of the span.
"""RETURNS (unicode): The original verbatim text of the span."""
def __get__(self):
text = self.text_with_ws
if self[-1].whitespace_:
@ -349,7 +335,8 @@ cdef class Span:
"""The text content of the span with a trailing whitespace character if
the last token has one.
RETURNS (unicode): The text content of the span (with trailing whitespace).
RETURNS (unicode): The text content of the span (with trailing
def __get__(self):
return u''.join([t.text_with_ws for t in self])
@ -358,7 +345,8 @@ cdef class Span:
"""Yields base noun-phrase `Span` objects, if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
phrase that does not permit other NPs to be nested within it – so no
NP-level coordination, no prepositional phrases, and no relative clauses.
NP-level coordination, no prepositional phrases, and no relative
YIELDS (Span): Base noun-phrase `Span` objects
@ -366,12 +354,14 @@ cdef class Span:
if not self.doc.is_parsed:
raise ValueError(
"noun_chunks requires the dependency parse, which "
"requires data to be installed. For more info, see the "
"requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__)
# Accumulate the result before beginning to iterate over it. This prevents
# the tokenisation from being changed out from under us during the iteration.
# The tricky thing here is that Span accepts its tokenisation changing,
# so it's okay once we have the Span objects. See Issue #375
# Accumulate the result before beginning to iterate over it. This
# prevents the tokenisation from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts
# its tokenisation changing, so it's okay once we have the Span
# objects. See Issue #375
spans = []
cdef attr_t label
for start, end, label in self.doc.noun_chunks_iterator(self):
@ -385,9 +375,9 @@ cdef class Span:
RETURNS (Token): The root token.
EXAMPLE: The root token has the shortest path to the root of the sentence
(or is the root itself). If multiple words are equally high in the
tree, the first word is taken. For example:
EXAMPLE: The root token has the shortest path to the root of the
sentence (or is the root itself). If multiple words are equally
high in the tree, the first word is taken. For example:
>>> toks = nlp(u'I like New York in Autumn.')
@ -437,11 +427,11 @@ cdef class Span:
if self.doc.c[i].head == 0:
return self.doc[i]
# If we don't have a sentence root, we do something that's not so
# algorithmically clever, but I think should be quite fast, especially
# for short spans.
# algorithmically clever, but I think should be quite fast,
# especially for short spans.
# For each word, we count the path length, and arg min this measure.
# We could use better tree logic to save steps here...But I think this
# should be okay.
# We could use better tree logic to save steps here...But I
# think this should be okay.
cdef int current_best = self.doc.length
cdef int root = -1
for i in range(self.start, self.end):
@ -463,7 +453,7 @@ cdef class Span:
YIELDS (Token): A left-child of a token of the span.
def __get__(self):
for token in reversed(self): # Reverse, so we get the tokens in order
for token in reversed(self): # Reverse, so we get tokens in order
for left in token.lefts:
if left.i < self.start:
yield left
@ -480,6 +470,22 @@ cdef class Span:
if right.i >= self.end:
yield right
property n_lefts:
    """RETURNS (int): The number of leftward immediate children of the
    span, in the syntactic dependency parse.
    """
    # TODO: implement
    # Placeholder: reading this property always raises until implemented.
    def __get__(self):
        raise NotImplementedError
property n_rights:
    """RETURNS (int): The number of rightward immediate children of the
    span, in the syntactic dependency parse.
    """
    # TODO: implement
    # Placeholder: reading this property always raises until implemented.
    def __get__(self):
        raise NotImplementedError
property subtree:
"""Tokens that descend from tokens in the span, but fall outside it.
@ -493,66 +499,55 @@ cdef class Span:
yield from word.subtree
property ent_id:
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
RETURNS (uint64): The entity ID.
"""RETURNS (uint64): The entity ID."""
def __get__(self):
return self.root.ent_id
def __set__(self, hash_t key):
raise NotImplementedError(
"Can't yet set ent_id from Span. Vote for this feature on the issue "
"tracker: http://github.com/explosion/spaCy/issues")
"Can't yet set ent_id from Span. Vote for this feature on "
"the issue tracker: http://github.com/explosion/spaCy/issues")
property ent_id_:
"""A (string) entity ID. Usually assigned by patterns in the `Matcher`.
RETURNS (unicode): The entity ID.
"""RETURNS (unicode): The (string) entity ID."""
def __get__(self):
return self.root.ent_id_
def __set__(self, hash_t key):
raise NotImplementedError(
"Can't yet set ent_id_ from Span. Vote for this feature on the issue "
"tracker: http://github.com/explosion/spaCy/issues")
"Can't yet set ent_id_ from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
property orth_:
    """Verbatim text content (identical to Span.text). Exists mostly for
    consistency with other attributes.

    RETURNS (unicode): The span's text.
    """
    def __get__(self):
        # Join on text_with_ws rather than orth_: orth_ carries no trailing
        # whitespace, so joining it would glue the tokens together
        # ("NewYork"). Only the outer whitespace is stripped, matching
        # upper_ and lower_.
        return ''.join([t.text_with_ws for t in self]).strip()
property lemma_:
"""The span's lemma.
RETURNS (unicode): The span's lemma.
"""RETURNS (unicode): The span's lemma."""
def __get__(self):
return ' '.join([t.lemma_ for t in self]).strip()
property upper_:
# TODO: docstring
"""Deprecated. Use Span.text.upper() instead."""
def __get__(self):
return ''.join([t.string.upper() for t in self]).strip()
return ''.join([t.text_with_ws.upper() for t in self]).strip()
property lower_:
# TODO: docstring
"""Deprecated. Use Span.text.lower() instead."""
def __get__(self):
return ''.join([t.string.lower() for t in self]).strip()
return ''.join([t.text_with_ws.lower() for t in self]).strip()
property string:
# TODO: docstring
"""Deprecated: Use Span.text_with_ws instead."""
def __get__(self):
return ''.join([t.string for t in self])
return ''.join([t.text_with_ws for t in self])
property label_:
"""The span's label.
RETURNS (unicode): The span's label.
"""RETURNS (unicode): The span's label."""
def __get__(self):
return self.doc.vocab.strings[self.label]
@ -570,7 +565,8 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
n += 1
if n >= sent_length:
raise RuntimeError(
"Array bounds exceeded while searching for root word. This likely "
"means the parse tree is in an invalid state. Please report this "
"issue here: http://github.com/explosion/spaCy/issues")
"Array bounds exceeded while searching for root word. This "
"likely means the parse tree is in an invalid state. Please "
"report this issue here: "
return n
@ -14,17 +14,18 @@ from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme
from .. import parts_of_speech
from ..attrs cimport LEMMA, POS, TAG, DEP
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
from ..compat import is_config
from .. import about
from .underscore import Underscore
cdef class Token:
"""An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
"""An individual token – i.e. a word, punctuation symbol, whitespace,
def set_extension(cls, name, default=None, method=None,
getter=None, setter=None):
@ -144,37 +145,33 @@ cdef class Token:
return self.doc.user_token_hooks['similarity'](self)
if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
return (numpy.dot(self.vector, other.vector) /
(self.vector_norm * other.vector_norm))
property lex_id:
"""ID of the token's lexical type.
RETURNS (int): ID of the token's lexical type."""
"""RETURNS (int): Sequential ID of the token's lexical type."""
def __get__(self):
return self.c.lex.id
property rank:
# TODO: add docstring
"""RETURNS (int): Sequential ID of the token's lexical type, used to
index into tables, e.g. for word vectors."""
def __get__(self):
return self.c.lex.id
property string:
"""Deprecated: Use Token.text_with_ws instead."""
def __get__(self):
return self.text_with_ws
property text:
"""A unicode representation of the token text.
RETURNS (unicode): The original verbatim text of the token.
"""RETURNS (unicode): The original verbatim text of the token."""
def __get__(self):
return self.orth_
property text_with_ws:
"""The text content of the token with a trailing whitespace character if
it has one.
RETURNS (unicode): The text content of the token (with trailing whitespace).
"""RETURNS (unicode): The text content of the token (with trailing
whitespace).
"""
def __get__(self):
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
@ -184,74 +181,104 @@ cdef class Token:
return orth
property prob:
"""RETURNS (float): Smoothed log probability estimate of token type."""
def __get__(self):
return self.c.lex.prob
property sentiment:
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the token."""
def __get__(self):
if 'sentiment' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['sentiment'](self)
return self.c.lex.sentiment
property lang:
"""RETURNS (uint64): ID of the language of the parent document's
def __get__(self):
return self.c.lex.lang
property idx:
"""RETURNS (int): The character offset of the token within the parent
def __get__(self):
return self.c.idx
property cluster:
"""RETURNS (int): Brown cluster ID."""
def __get__(self):
return self.c.lex.cluster
property orth:
"""RETURNS (uint64): ID of the verbatim text content."""
def __get__(self):
return self.c.lex.orth
property lower:
"""RETURNS (uint64): ID of the lowercase token text."""
def __get__(self):
return self.c.lex.lower
property norm:
"""RETURNS (uint64): ID of the token's norm, i.e. a normalised form of
the token text. Usually set in the language's tokenizer exceptions
or norm exceptions.
def __get__(self):
return self.c.lex.norm
property shape:
    """RETURNS (uint64): ID of the token's shape, a transform of the
    token's string, to show orthographic features (e.g. "Xxxx", "dd").
    """
    def __get__(self):
        return self.c.lex.shape
property prefix:
"""RETURNS (uint64): ID of a length-N substring from the start of the
token. Defaults to `N=1`.
def __get__(self):
return self.c.lex.prefix
property suffix:
"""RETURNS (uint64): ID of a length-N substring from the end of the
token. Defaults to `N=3`.
def __get__(self):
return self.c.lex.suffix
property lemma:
"""Base form of the word, with no inflectional suffixes.
RETURNS (uint64): Token lemma.
"""RETURNS (uint64): ID of the base form of the word, with no
inflectional suffixes.
def __get__(self):
return self.c.lemma
def __set__(self, attr_t lemma):
self.c.lemma = lemma
property pos:
"""RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
def __get__(self):
return self.c.pos
property tag:
"""RETURNS (uint64): ID of fine-grained part-of-speech tag."""
def __get__(self):
return self.c.tag
def __set__(self, attr_t tag):
self.vocab.morphology.assign_tag(self.c, tag)
property dep:
"""RETURNS (uint64): ID of syntactic dependency label."""
def __get__(self):
return self.c.dep
def __set__(self, attr_t label):
self.c.dep = label
@ -292,23 +319,29 @@ cdef class Token:
return numpy.sqrt((vector ** 2).sum())
property n_lefts:
"""RETURNS (int): The number of leftward immediate children of the
word, in the syntactic dependency parse.
def __get__(self):
return self.c.l_kids
property n_rights:
"""RETURNS (int): The number of rightward immediate children of the
word, in the syntactic dependency parse.
def __get__(self):
return self.c.r_kids
property sent_start:
# TODO: fix and document
def __get__(self):
return self.c.sent_start
def __set__(self, value):
if self.doc.is_parsed:
raise ValueError(
'Refusing to write to token.sent_start if its document is parsed, '
'because this may cause inconsistent state. '
'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
"Refusing to write to token.sent_start if its document "
"is parsed, because this may cause inconsistent state.")
if value is None:
self.c.sent_start = 0
elif value is True:
@ -316,15 +349,16 @@ cdef class Token:
elif value is False:
self.c.sent_start = -1
raise ValueError("Invalid value for token.sent_start -- must be one of "
"None, True, False")
raise ValueError("Invalid value for token.sent_start. Must be "
"one of: None, True, False")
property lefts:
"""The leftward immediate children of the word, in the syntactic
dependency parse.
YIELDS (Token): A left-child of the token.
def __get__(self):
The leftward immediate children of the word, in the syntactic
dependency parse.
cdef int nr_iter = 0
cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
while ptr < self.c:
@ -334,15 +368,16 @@ cdef class Token:
nr_iter += 1
# This is ugly, but it's a way to guard out infinite loops
if nr_iter >= 10000000:
raise RuntimeError(
"Possibly infinite loop encountered while looking for token.lefts")
raise RuntimeError("Possibly infinite loop encountered "
"while looking for token.lefts")
property rights:
"""The rightward immediate children of the word, in the syntactic
dependency parse.
YIELDS (Token): A right-child of the token.
def __get__(self):
The rightward immediate children of the word, in the syntactic
dependency parse.
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
tokens = []
cdef int nr_iter = 0
@ -352,27 +387,26 @@ cdef class Token:
ptr -= 1
nr_iter += 1
if nr_iter >= 10000000:
raise RuntimeError(
"Possibly infinite loop encountered while looking for token.rights")
raise RuntimeError("Possibly infinite loop encountered "
"while looking for token.rights")
for t in tokens:
yield t
property children:
A sequence of the token's immediate syntactic children.
"""A sequence of the token's immediate syntactic children.
Yields: Token A child token such that child.head==self
YIELDS (Token): A child token such that child.head==self
def __get__(self):
yield from self.lefts
yield from self.rights
property subtree:
A sequence of all the token's syntactic descendents.
"""A sequence of all the token's syntactic descendents.
Yields: Token A descendent token such that self.is_ancestor(descendent)
YIELDS (Token): A descendent token such that
def __get__(self):
for word in self.lefts:
@ -422,18 +456,17 @@ cdef class Token:
if self.doc is not descendant.doc:
return False
return any( ancestor.i == self.i for ancestor in descendant.ancestors )
return any(ancestor.i == self.i for ancestor in descendant.ancestors)
property head:
"""The syntactic parent, or "governor", of this token.
RETURNS (Token): The token head.
RETURNS (Token): The token predicted by the parser to be the head of
the current token.
def __get__(self):
"""The token predicted by the parser to be the head of the current
return self.doc[self.i + self.c.head]
def __set__(self, Token new_head):
# this function sets the head of self to new_head
# and updates the counters for left/right dependents
@ -453,16 +486,18 @@ cdef class Token:
cdef Token anc, child
# update number of deps of old head
if self.c.head > 0: # left dependent
if self.c.head > 0: # left dependent
old_head.c.l_kids -= 1
if self.c.l_edge == old_head.c.l_edge:
# the token dominates the left edge so the left edge of the head
# may change when the token is reattached
# it may not change if the new head is a descendant of the current head
# the token dominates the left edge so the left edge of
# the head may change when the token is reattached, it may
# not change if the new head is a descendant of the current
# head
new_edge = self.c.l_edge
# the new l_edge is the left-most l_edge on any of the other dependents
# where the l_edge is left of the head, otherwise it is the head
# the new l_edge is the left-most l_edge on any of the
# other dependents where the l_edge is left of the head,
# otherwise it is the head
if not is_desc:
new_edge = old_head.i
for child in old_head.children:
@ -472,14 +507,15 @@ cdef class Token:
new_edge = child.c.l_edge
old_head.c.l_edge = new_edge
# walk up the tree from old_head and assign new l_edge to ancestors
# until an ancestor already has an l_edge that's further left
# walk up the tree from old_head and assign new l_edge to
# ancestors until an ancestor already has an l_edge that's
# further left
for anc in old_head.ancestors:
if anc.c.l_edge <= new_edge:
anc.c.l_edge = new_edge
elif self.c.head < 0: # right dependent
elif self.c.head < 0: # right dependent
old_head.c.r_kids -= 1
# do the same thing as for l_edge
if self.c.r_edge == old_head.c.r_edge:
@ -500,7 +536,7 @@ cdef class Token:
anc.c.r_edge = new_edge
# update number of deps of new head
if rel_newhead_i > 0: # left dependent
if rel_newhead_i > 0: # left dependent
new_head.c.l_kids += 1
# walk up the tree from new head and set l_edge to self.l_edge
# until you hit a token with an l_edge further to the left
@ -511,7 +547,7 @@ cdef class Token:
anc.c.l_edge = self.c.l_edge
elif rel_newhead_i < 0: # right dependent
elif rel_newhead_i < 0: # right dependent
new_head.c.r_kids += 1
# do the same as for l_edge
if self.c.r_edge > new_head.c.r_edge:
@ -542,12 +578,10 @@ cdef class Token:
yield from word.conjuncts
property ent_type:
"""Named entity type.
RETURNS (uint64): Named entity type.
"""RETURNS (uint64): Named entity type."""
def __get__(self):
return self.c.ent_type
def __set__(self, ent_type):
self.c.ent_type = ent_type
@ -561,19 +595,17 @@ cdef class Token:
return self.c.ent_iob
property ent_type_:
"""Named entity type.
RETURNS (unicode): Named entity type.
"""RETURNS (unicode): Named entity type."""
def __get__(self):
return self.vocab.strings[self.c.ent_type]
def __set__(self, ent_type):
self.c.ent_type = self.vocab.strings.add(ent_type)
property ent_iob_:
"""IOB code of named entity tag. "B" means the token begins an entity,
"I" means it is inside an entity, "O" means it is outside an entity, and
"" means no entity tag is set.
"I" means it is inside an entity, "O" means it is outside an entity,
and "" means no entity tag is set.
RETURNS (unicode): IOB code of named entity tag.
@ -582,10 +614,8 @@ cdef class Token:
return iob_strings[self.c.ent_iob]
property ent_id:
"""ID of the entity the token is an instance of, if any. Usually
assigned by patterns in the Matcher.
RETURNS (uint64): ID of the entity.
"""RETURNS (uint64): ID of the entity the token is an instance of,
if any.
def __get__(self):
return self.c.ent_id
@ -594,10 +624,8 @@ cdef class Token:
self.c.ent_id = key
property ent_id_:
"""ID of the entity the token is an instance of, if any. Usually
assigned by patterns in the Matcher.
RETURNS (unicode): ID of the entity.
"""RETURNS (unicode): ID of the entity the token is an instance of,
if any.
def __get__(self):
return self.vocab.strings[self.c.ent_id]
@ -606,107 +634,192 @@ cdef class Token:
self.c.ent_id = self.vocab.strings.add(name)
property whitespace_:
"""RETURNS (unicode): The trailing whitespace character, if present.
def __get__(self):
return ' ' if self.c.spacy else ''
property orth_:
    """RETURNS (unicode): Verbatim text content (identical to
    `Token.text`). Exists mostly for consistency with the other
    attributes.
    """
    def __get__(self):
        # Resolve the lexeme's orth ID to its string via the vocab's
        # string table.
        return self.vocab.strings[self.c.lex.orth]
property lower_:
"""RETURNS (unicode): The lowercase token text. Equivalent to
def __get__(self):
return self.vocab.strings[self.c.lex.lower]
property norm_:
"""RETURNS (unicode): The token's norm, i.e. a normalised form of the
token text. Usually set in the language's tokenizer exceptions or
norm exceptions.
def __get__(self):
return self.vocab.strings[self.c.lex.norm]
property shape_:
    """RETURNS (unicode): Transform of the token's string, to show
    orthographic features. For example, "Xxxx" or "dd".
    """
    def __get__(self):
        return self.vocab.strings[self.c.lex.shape]
property prefix_:
"""RETURNS (unicode): A length-N substring from the start of the token.
Defaults to `N=1`.
def __get__(self):
return self.vocab.strings[self.c.lex.prefix]
property suffix_:
"""RETURNS (unicode): A length-N substring from the end of the token.
Defaults to `N=3`.
def __get__(self):
return self.vocab.strings[self.c.lex.suffix]
property lang_:
"""RETURNS (unicode): Language of the parent document's vocabulary,
e.g. 'en'.
def __get__(self):
return self.vocab.strings[self.c.lex.lang]
property lemma_:
"""Base form of the word, with no inflectional suffixes.
RETURNS (unicode): Token lemma.
"""RETURNS (unicode): The token lemma, i.e. the base form of the word,
with no inflectional suffixes.
def __get__(self):
return self.vocab.strings[self.c.lemma]
def __set__(self, unicode lemma_):
self.c.lemma = self.vocab.strings.add(lemma_)
property pos_:
"""RETURNS (unicode): Coarse-grained part-of-speech tag."""
def __get__(self):
return parts_of_speech.NAMES[self.c.pos]
property tag_:
"""RETURNS (unicode): Fine-grained part-of-speech tag."""
def __get__(self):
return self.vocab.strings[self.c.tag]
def __set__(self, tag):
self.tag = self.vocab.strings.add(tag)
property dep_:
"""RETURNS (unicode): The syntactic dependency label."""
def __get__(self):
return self.vocab.strings[self.c.dep]
def __set__(self, unicode label):
self.c.dep = self.vocab.strings.add(label)
property is_oov:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
"""RETURNS (bool): Whether the token is out-of-vocabulary."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_OOV)
property is_stop:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP)
"""RETURNS (bool): Whether the token is a stop word, i.e. part of a
"stop list" defined by the language data.
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_STOP)
property is_alpha:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
"""RETURNS (bool): Whether the token consists of alpha characters.
Equivalent to `token.text.isalpha()`.
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
property is_ascii:
    """RETURNS (bool): Whether the token consists of ASCII characters.
    Equivalent to `all(ord(c) < 128 for c in token.text)`.
    """
    def __get__(self):
        return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
property is_digit:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
"""RETURNS (bool): Whether the token consists of digits. Equivalent to
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
property is_lower:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
"""RETURNS (bool): Whether the token is in lowercase. Equivalent to
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
property is_upper:
"""RETURNS (bool): Whether the token is in uppercase. Equivalent to
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
property is_title:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
"""RETURNS (bool): Whether the token is in titlecase. Equivalent to
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
property is_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
"""RETURNS (bool): Whether the token is punctuation."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
property is_space:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
"""RETURNS (bool): Whether the token consists of whitespace characters.
Equivalent to `token.text.isspace()`.
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
property is_bracket:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
"""RETURNS (bool): Whether the token is a bracket."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
property is_quote:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
"""RETURNS (bool): Whether the token is a quotation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
property is_left_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
"""RETURNS (bool): Whether the token is a left punctuation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
property is_right_punct:
    """RETURNS (bool): Whether the token is a right punctuation mark."""
    def __get__(self):
        return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
property like_url:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
"""RETURNS (bool): Whether the token resembles a URL."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
property like_num:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
"""RETURNS (bool): Whether the token resembles a number, e.g. "10.9",
"10", "ten", etc.
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
property like_email:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
"""RETURNS (bool): Whether the token resembles an email address."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
@ -1,5 +1,9 @@
# coding: utf8
from __future__ import unicode_literals
import functools
class Underscore(object):
doc_extensions = {}
span_extensions = {}
@ -1 +0,0 @@
@ -10,25 +10,27 @@ from pathlib import Path
import sys
import textwrap
import random
import numpy
import io
import dill
from collections import OrderedDict
from thinc.neural._classes.model import Model
import functools
from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
from .compat import import_file
import msgpack
import msgpack_numpy
import ujson
from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
from .compat import copy_array, normalize_string_keys, getattr_, import_file
_data_path = Path(__file__).parent / 'data'
_PRINT_ENV = False
def set_env_log(value):
    """Set the module-level `_PRINT_ENV` flag.

    value: The new value to store in `_PRINT_ENV`.
    """
    global _PRINT_ENV
    _PRINT_ENV = value
def get_lang_class(lang):
@ -38,11 +40,12 @@ def get_lang_class(lang):
RETURNS (Language): Language class.
if not lang in LANGUAGES:
if lang not in LANGUAGES:
module = importlib.import_module('.lang.%s' % lang, 'spacy')
except ImportError:
raise ImportError("Can't import language %s from spacy.lang." %lang)
msg = "Can't import language %s from spacy.lang."
raise ImportError(msg % lang)
LANGUAGES[lang] = getattr(module, module.__all__[0])
return LANGUAGES[lang]
@ -100,14 +103,14 @@ def load_model(name, **overrides):
data_path = get_data_path()
if not data_path or not data_path.exists():
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
if isinstance(name, basestring_):
if name in set([d.name for d in data_path.iterdir()]): # in data dir / shortcut
if isinstance(name, basestring_): # in data dir / shortcut
if name in set([d.name for d in data_path.iterdir()]):
return load_model_from_link(name, **overrides)
if is_package(name): # installed as package
if is_package(name): # installed as package
return load_model_from_package(name, **overrides)
if Path(name).exists(): # path to model data directory
if Path(name).exists(): # path to model data directory
return load_model_from_path(Path(name), **overrides)
elif hasattr(name, 'exists'): # Path or Path-like to model data
elif hasattr(name, 'exists'): # Path or Path-like to model data
return load_model_from_path(name, **overrides)
raise IOError("Can't find model '%s'" % name)
@ -120,7 +123,7 @@ def load_model_from_link(name, **overrides):
except AttributeError:
raise IOError(
"Cant' load '%s'. If you're using a shortcut link, make sure it "
"points to a valid model package (not just a data directory)." % name)
"points to a valid package (not just a data directory)." % name)
return cls.load(**overrides)
@ -164,7 +167,8 @@ def load_model_from_init_py(init_file, **overrides):
data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
data_path = model_path / data_dir
if not model_path.exists():
raise ValueError("Can't find model directory: %s" % path2str(data_path))
msg = "Can't find model directory: %s"
raise ValueError(msg % path2str(data_path))
return load_model_from_path(data_path, meta, **overrides)
@ -176,14 +180,16 @@ def get_model_meta(path):
model_path = ensure_path(path)
if not model_path.exists():
raise ValueError("Can't find model directory: %s" % path2str(model_path))
msg = "Can't find model directory: %s"
raise ValueError(msg % path2str(model_path))
meta_path = model_path / 'meta.json'
if not meta_path.is_file():
raise IOError("Could not read meta.json from %s" % meta_path)
meta = read_json(meta_path)
for setting in ['lang', 'name', 'version']:
if setting not in meta or not meta[setting]:
raise ValueError("No valid '%s' setting found in model meta.json" % setting)
msg = "No valid '%s' setting found in model meta.json"
raise ValueError(msg % setting)
return meta
@ -240,7 +246,7 @@ def get_async(stream, numpy_array):
return numpy_array
array = cupy.ndarray(numpy_array.shape, order='C',
array.set(numpy_array, stream=stream)
return array
@ -274,12 +280,6 @@ def itershuffle(iterable, bufsize=1000):
raise StopIteration
_PRINT_ENV = False
def set_env_log(value):
global _PRINT_ENV
_PRINT_ENV = value
def env_opt(name, default=None):
if type(default) is float:
type_convert = float
@ -305,17 +305,20 @@ def read_regex(path):
path = ensure_path(path)
with path.open() as file_:
entries = file_.read().split('\n')
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
expression = '|'.join(['^' + re.escape(piece)
for piece in entries if piece.strip()])
return re.compile(expression)
def compile_prefix_regex(entries):
    """Compile a regex object that matches any of the prefix entries at the
    start of a string.

    entries (iterable): The prefix patterns. Entries that are empty or
        whitespace-only are skipped.
    RETURNS (regex object): The compiled alternation of '^'-anchored entries.
    """
    # Deprecated data is recognised by a literal '(' entry: such entries are
    # plain strings, not regex fragments, so every piece gets escaped.
    # The membership test runs before the entries are iterated.
    needs_escaping = '(' in entries
    pieces = [piece for piece in entries if piece.strip()]
    if needs_escaping:
        pieces = [re.escape(piece) for piece in pieces]
    anchored = ['^' + piece for piece in pieces]
    return re.compile('|'.join(anchored))
@ -359,16 +362,15 @@ def update_exc(base_exceptions, *addition_dicts):
exc = dict(base_exceptions)
for additions in addition_dicts:
for orth, token_attrs in additions.items():
if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs):
msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
if not all(isinstance(attr[ORTH], unicode_)
for attr in token_attrs):
msg = "Invalid ORTH value in exception: key='%s', orths='%s'"
raise ValueError(msg % (orth, token_attrs))
described_orth = ''.join(attr[ORTH] for attr in token_attrs)
if orth != described_orth:
raise ValueError("Invalid tokenizer exception: ORTH values "
"combined don't match original string. "
"key='%s', orths='%s'" % (orth, described_orth))
# overlap = set(exc.keys()).intersection(set(additions))
# assert not overlap, overlap
msg = ("Invalid tokenizer exception: ORTH values combined "
"don't match original string. key='%s', orths='%s'")
raise ValueError(msg % (orth, described_orth))
exc = expand_exc(exc, "'", "’")
return exc
@ -401,17 +403,15 @@ def normalize_slice(length, start, stop, step=None):
raise ValueError("Stepped slices not supported in Span objects."
"Try: list(tokens)[start:stop:step] instead.")
if start is None:
start = 0
start = 0
elif start < 0:
start += length
start += length
start = min(length, max(0, start))
if stop is None:
stop = length
stop = length
elif stop < 0:
stop += length
stop += length
stop = min(length, max(start, stop))
assert 0 <= start <= stop <= length
return start, stop
@ -428,7 +428,7 @@ def compounding(start, stop, compound):
>>> assert next(sizes) == 1.5 * 1.5
def clip(value):
return max(value, stop) if (start>stop) else min(value, stop)
return max(value, stop) if (start > stop) else min(value, stop)
curr = float(start)
while True:
yield clip(curr)
@ -438,7 +438,7 @@ def compounding(start, stop, compound):
def decaying(start, stop, decay):
"""Yield an infinite series of linearly decaying values."""
def clip(value):
return max(value, stop) if (start>stop) else min(value, stop)
return max(value, stop) if (start > stop) else min(value, stop)
nr_upd = 1.
while True:
yield clip(start * 1./(1. + decay * nr_upd))
@ -530,17 +530,19 @@ def print_markdown(data, title=None):
if isinstance(data, dict):
data = list(data.items())
markdown = ["* **{}:** {}".format(l, unicode_(v)) for l, v in data if not excl_value(v)]
markdown = ["* **{}:** {}".format(l, unicode_(v))
for l, v in data if not excl_value(v)]
if title:
print("\n## {}".format(title))
def prints(*texts, **kwargs):
"""Print formatted message (manual ANSI escape sequences to avoid dependency)
"""Print formatted message (manual ANSI escape sequences to avoid
*texts (unicode): Texts to print. Each argument is rendered as paragraph.
**kwargs: 'title' becomes coloured headline. 'exits'=True performs sys exit.
**kwargs: 'title' becomes coloured headline. exits=True performs sys exit.
exits = kwargs.get('exits', None)
title = kwargs.get('title', None)
@ -570,7 +572,8 @@ def _wrap(text, wrap_max=80, indent=4):
def minify_html(html):
"""Perform a template-specific, rudimentary HTML minification for displaCy.
Disclaimer: NOT a general-purpose solution, only removes indentation/newlines.
Disclaimer: NOT a general-purpose solution, only removes indentation and
html (unicode): Markup to minify.
RETURNS (unicode): "Minified" HTML.
@ -1,5 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
from libc.stdint cimport int32_t, uint64_t
import numpy
from collections import OrderedDict
import msgpack
@ -9,23 +10,20 @@ cimport numpy as np
from thinc.neural.util import get_array_module
from thinc.neural._classes.model import Model
from .typedefs cimport attr_t
from .strings cimport StringStore
from . import util
from .compat import basestring_, path2str
from . import util
cdef class Vectors:
'''Store, save and load word vectors.
"""Store, save and load word vectors.
Vectors data is kept in the vectors.data attribute, which should be an
instance of numpy.ndarray (for CPU vectors)
or cupy.ndarray (for GPU vectors).
vectors.key2row is a dictionary mapping word hashes to rows
in the vectors.data table. The array `vectors.keys` keeps
the keys in order, such that keys[vectors.key2row[key]] == key.
instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
(for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
rows in the vectors.data table. The array `vectors.keys` keeps the keys in
order, such that `keys[vectors.key2row[key]] == key`.
cdef public object data
cdef readonly StringStore strings
cdef public object key2row
@ -33,6 +31,16 @@ cdef class Vectors:
cdef public int i
def __init__(self, strings, width=0, data=None):
"""Create a new vector store. To keep the vector table empty, pass
`width=0`. You can also create the vector table and add vectors one by
one, or set the vector values directly on initialisation.
strings (StringStore or list): List of strings or StringStore that maps
strings to hash values, and vice versa.
width (int): Number of dimensions.
data (numpy.ndarray): The vector data.
RETURNS (Vectors): The newly created object.
if isinstance(strings, StringStore):
self.strings = strings
@ -55,11 +63,13 @@ cdef class Vectors:
return (Vectors, (self.strings, self.data))
def __getitem__(self, key):
'''Get a vector by key. If key is a string, it is hashed
to an integer ID using the vectors.strings table.
"""Get a vector by key. If key is a string, it is hashed to an integer
ID using the vectors.strings table. If the integer key is not found in
the table, a KeyError is raised.
If the integer key is not found in the table, a KeyError is raised.
key (unicode / int): The key to get the vector for.
RETURNS (numpy.ndarray): The vector for the key.
if isinstance(key, basestring):
key = self.strings[key]
i = self.key2row[key]
@ -69,30 +79,47 @@ cdef class Vectors:
return self.data[i]
def __setitem__(self, key, vector):
'''Set a vector for the given key. If key is a string, it is hashed
"""Set a vector for the given key. If key is a string, it is hashed
to an integer ID using the vectors.strings table.
key (unicode / int): The key to set the vector for.
vector (numpy.ndarray): The vector to set.
if isinstance(key, basestring):
key = self.strings.add(key)
i = self.key2row[key]
self.data[i] = vector
def __iter__(self):
'''Yield vectors from the table.'''
"""Yield vectors from the table.
YIELDS (numpy.ndarray): A vector.
yield from self.data
def __len__(self):
'''Return the number of vectors that have been assigned.'''
"""Return the number of vectors that have been assigned.
RETURNS (int): The number of vectors in the data.
return self.i
def __contains__(self, key):
'''Check whether a key has a vector entry in the table.'''
"""Check whether a key has a vector entry in the table.
key (unicode / int): The key to check.
RETURNS (bool): Whether the key has a vector entry.
if isinstance(key, basestring_):
key = self.strings[key]
return key in self.key2row
def add(self, key, vector=None):
'''Add a key to the table, optionally setting a vector value as well.'''
"""Add a key to the table, optionally setting a vector value as well.
key (unicode / int): The key to add.
vector (numpy.ndarray): An optional vector to add.
if isinstance(key, basestring_):
key = self.strings.add(key)
if key not in self.key2row:
@ -110,24 +137,36 @@ cdef class Vectors:
return i
def items(self):
'''Iterate over (string key, vector) pairs, in order.'''
"""Iterate over `(string key, vector)` pairs, in order.
YIELDS (tuple): A key/vector pair.
for i, key in enumerate(self.keys):
string = self.strings[key]
yield string, self.data[i]
def shape(self):
"""Get `(rows, dims)` tuples of number of rows and number of dimensions
in the vector table.
RETURNS (tuple): A `(rows, dims)` pair.
return self.data.shape
def most_similar(self, key):
# TODO: implement
raise NotImplementedError
def from_glove(self, path):
'''Load GloVe vectors from a directory. Assumes binary format,
"""Load GloVe vectors from a directory. Assumes binary format,
that the vocab is in a vocab.txt, and that vectors are named
vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
By default GloVe outputs 64-bit vectors.'''
By default GloVe outputs 64-bit vectors.
path (unicode / Path): The path to load the GloVe vectors from.
path = util.ensure_path(path)
for name in path.iterdir():
if name.parts[-1].startswith('vectors'):
@ -150,9 +189,15 @@ cdef class Vectors:
def to_disk(self, path, **exclude):
"""Save the current state to a directory.
path (unicode / Path): A path to a directory, which will be created if
it doesn't exist. Either a string or a Path-like object.
xp = get_array_module(self.data)
if xp is numpy:
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
save_array = lambda arr, file_: xp.save(file_, arr,
save_array = lambda arr, file_: xp.save(file_, arr)
serializers = OrderedDict((
@ -162,6 +207,12 @@ cdef class Vectors:
return util.to_disk(path, serializers, exclude)
def from_disk(self, path, **exclude):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode / Path): Directory path, string or Path-like object.
RETURNS (Vectors): The modified object.
def load_keys(path):
if path.exists():
self.keys = numpy.load(path2str(path))
@ -182,6 +233,11 @@ cdef class Vectors:
return self
def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Vectors` object.
def serialize_weights():
if hasattr(self.data, 'to_bytes'):
return self.data.to_bytes()
@ -194,6 +250,12 @@ cdef class Vectors:
return util.to_bytes(serializers, exclude)
def from_bytes(self, data, **exclude):
"""Load state from a binary string.
data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (Vectors): The `Vectors` object.
def deserialize_weights(b):
if hasattr(self.data, 'from_bytes'):
@ -1,33 +1,24 @@
# coding: utf8
from __future__ import unicode_literals
import bz2
import ujson
import re
import numpy
import dill
from libc.string cimport memset, memcpy
from libc.stdint cimport int32_t
from libc.math cimport sqrt
from cymem.cymem cimport Address
from collections import OrderedDict
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme
from .strings cimport hash_string
from .typedefs cimport attr_t
from .tokens.token cimport Token
from .attrs cimport PROB, LANG
from .attrs cimport PROB, LANG, ORTH, TAG
from .structs cimport SerializedLexemeC
from .compat import copy_reg, pickle, basestring_
from .compat import copy_reg, basestring_
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .vectors import Vectors
from . import util
from . import attrs
from . import symbols
from ._ml import link_vectors_to_models
from . import util
cdef class Vocab:
@ -36,23 +27,22 @@ cdef class Vocab:
C-data that is shared between `Doc` objects.
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
strings=tuple(), **deprecated_kwargs):
strings=tuple(), **deprecated_kwargs):
"""Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to functions
to compute them. Defaults to `None`.
tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained
lex_attr_getters (dict): A dictionary mapping attribute IDs to
functions to compute them. Defaults to `None`.
tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained
parts-of-speech, and optionally morphological attributes.
lemmatizer (object): A lemmatizer. Defaults to `None`.
strings (StringStore): StringStore that maps strings to integers, and
vice versa.
RETURNS (Vocab): The newly constructed vocab object.
RETURNS (Vocab): The newly constructed object.
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
tag_map = tag_map if tag_map is not None else {}
if lemmatizer in (None, True, False):
lemmatizer = Lemmatizer({}, {}, {})
self.mem = Pool()
self._by_hash = PreshMap()
self._by_orth = PreshMap()
@ -84,19 +74,20 @@ cdef class Vocab:
The flag_getter function will be called over the words currently in the
vocab, and then applied to new words as they occur. You'll then be able
to access the flag value on each token, using token.check_flag(flag_id).
to access the flag value on each token using token.check_flag(flag_id).
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
flag_getter (callable): A function `f(unicode) -> bool`, to get the flag
flag_getter (callable): A function `f(unicode) -> bool`, to get the
flag value.
flag_id (int): An integer between 1 and 63 (inclusive), specifying
the bit at which the flag will be stored. If -1, the lowest
available bit will be chosen.
RETURNS (int): The integer ID by which the flag value can be checked.
>>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'dislaCy'])
>>> my_product_getter = lambda text: text in ['spaCy', 'dislaCy']
>>> MY_PRODUCT = nlp.vocab.add_flag(my_product_getter)
>>> doc = nlp(u'I like spaCy')
>>> assert doc[2].check_flag(MY_PRODUCT) == True
@ -107,9 +98,10 @@ cdef class Vocab:
raise ValueError(
"Cannot find empty bit for new lexical flag. All bits between "
"0 and 63 are occupied. You can replace one by specifying the "
"flag_id explicitly, e.g. nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA")
"Cannot find empty bit for new lexical flag. All bits "
"between 0 and 63 are occupied. You can replace one by "
"specifying the flag_id explicitly, e.g. "
"`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
elif flag_id >= 64 or flag_id < 1:
raise ValueError(
"Invalid value for flag_id: %d. Flag IDs must be between "
@ -120,9 +112,9 @@ cdef class Vocab:
return flag_id
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
if string == u'':
@ -139,9 +131,9 @@ cdef class Vocab:
return self._new_lexeme(mem, string)
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
if orth == 0:
@ -203,8 +195,8 @@ cdef class Vocab:
for orth, addr in self._by_orth.items():
yield Lexeme(self, orth)
def __getitem__(self, id_or_string):
"""Retrieve a lexeme, given an int ID or a unicode string. If a
def __getitem__(self, id_or_string):
"""Retrieve a lexeme, given an int ID or a unicode string. If a
previously unseen unicode string is given, a new lexeme is created and
@ -229,13 +221,14 @@ cdef class Vocab:
cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True)
props = intify_attrs(props, strings_map=self.strings,
token = &tokens[i]
# Set the special tokens up to have arbitrary attributes
lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
token.lex = lex
if attrs.TAG in props:
self.morphology.assign_tag(token, props[attrs.TAG])
if TAG in props:
self.morphology.assign_tag(token, props[TAG])
for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value)
Lexeme.set_struct_attr(lex, attr_id, value)
@ -254,16 +247,13 @@ cdef class Vocab:
self.vectors = Vectors(self.strings, width=new_dim)
def get_vector(self, orth):
"""Retrieve a vector for a word in the vocabulary.
"""Retrieve a vector for a word in the vocabulary. Words can be looked
up by string or int ID. If no vectors data is loaded, ValueError is
Words can be looked up by string or int ID.
A word vector. Size and shape determined by the
vocab.vectors instance. Usually, a numpy ndarray
of shape (300,) and dtype float32.
RAISES: If no vectors data is loaded, ValueError is raised.
RETURNS (numpy.ndarray): A word vector. Size
and shape determined by the `vocab.vectors` instance. Usually, a
numpy ndarray of shape (300,) and dtype float32.
if isinstance(orth, basestring_):
orth = self.strings.add(orth)
@ -273,21 +263,16 @@ cdef class Vocab:
return numpy.zeros((self.vectors_length,), dtype='f')
def set_vector(self, orth, vector):
"""Set a vector for a word in the vocabulary.
Words can be referenced by string or int ID.
"""Set a vector for a word in the vocabulary. Words can be referenced
by string or int ID.
if not isinstance(orth, basestring_):
orth = self.strings[orth]
self.vectors.add(orth, vector=vector)
def has_vector(self, orth):
"""Check whether a word has a vector. Returns False if no
vectors have been loaded. Words can be looked up by string
or int ID."""
"""Check whether a word has a vector. Returns False if no vectors have
been loaded. Words can be looked up by string or int ID."""
if isinstance(orth, basestring_):
orth = self.strings.add(orth)
return orth in self.vectors
@ -296,7 +281,7 @@ cdef class Vocab:
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
it doesn't exist. Paths may be either strings or Path-like objects.
path = util.ensure_path(path)
if not path.exists():
@ -421,16 +406,13 @@ def pickle_vocab(vocab):
length = vocab.length
data_dir = vocab.data_dir
lex_attr_getters = dill.dumps(vocab.lex_attr_getters)
lexemes_data = vocab.lexemes_to_bytes()
return (unpickle_vocab,
(sstore, morph, data_dir, lex_attr_getters,
lexemes_data, length))
(sstore, morph, data_dir, lex_attr_getters, lexemes_data, length))
def unpickle_vocab(sstore, morphology, data_dir,
lex_attr_getters, bytes lexemes_data, int length):
lex_attr_getters, bytes lexemes_data, int length):
cdef Vocab vocab = Vocab()
vocab.length = length
vocab.strings = sstore
@ -450,12 +432,10 @@ class LookupError(Exception):
def mismatched_strings(cls, id_, id_string, original_string):
return cls(
"Error fetching a Lexeme from the Vocab. When looking up a string, "
"the lexeme returned had an orth ID that did not match the query string. "
"This means that the cached lexeme structs are mismatched to the "
"string encoding table. The mismatched:\n"
"Query string: {query}\n"
"Orth cached: {orth_str}\n"
"ID of orth: {orth_id}".format(
query=repr(original_string), orth_str=repr(id_string), orth_id=id_)
"Error fetching a Lexeme from the Vocab. When looking up a "
"string, the lexeme returned had an orth ID that did not match "
"the query string. This means that the cached lexeme structs are "
"mismatched to the string encoding table. The mismatched:\n"
"Query string: {}\n"
"Orth cached: {}\n"
"Orth ID: {}".format(repr(original_string), repr(id_string), id_))
@ -784,3 +784,10 @@ p
| A dictionary that allows customisation of properties of
| #[code Span] children.
+cell #[code _]
+cell #[code Underscore]
| User space for adding custom
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].
@ -157,27 +157,61 @@ p The L2 norm of the lexeme's vector representation.
+cell #[code vocab]
+cell #[code Vocab]
+cell The lexeme's vocabulary.
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+cell #[code orth]
+cell int
+cell ID of the verbatim text content.
+cell #[code orth_]
+cell unicode
| Verbatim text content (identical to #[code Lexeme.text]). Exists
| mostly for consistency with the other attributes.
+cell #[code lex_id]
+cell int
+cell ID of the lexeme's lexical type.
+cell #[code rank]
+cell int
| Sequential ID of the lexeme's lexical type, used to index into
| tables, e.g. for word vectors.
+cell #[code flags]
+cell int
+cell Container of the lexeme's binary flags.
+cell #[code norm]
+cell int
+cell The lexeme's norm, i.e. a normalised form of the lexeme text.
+cell #[code norm_]
+cell unicode
+cell The lexeme's norm, i.e. a normalised form of the lexeme text.
+cell #[code lower]
+cell int
+cell Lower-case form of the word.
+cell Lowercase form of the word.
+cell #[code lower_]
+cell unicode
+cell Lower-case form of the word.
+cell Lowercase form of the word.
+cell #[code shape]
@ -192,22 +226,30 @@ p The L2 norm of the lexeme's vector representation.
+cell #[code prefix]
+cell int
+cell Length-N substring from the start of the word. Defaults to #[code N=1].
| Length-N substring from the start of the word. Defaults to
| #[code N=1].
+cell #[code prefix_]
+cell unicode
+cell Length-N substring from the start of the word. Defaults to #[code N=1].
| Length-N substring from the start of the word. Defaults to
| #[code N=1].
+cell #[code suffix]
+cell int
+cell Length-N substring from the end of the word. Defaults to #[code N=3].
| Length-N substring from the end of the word. Defaults to
| #[code N=3].
+cell #[code suffix_]
+cell unicode
+cell Length-N substring from the start of the word. Defaults to #[code N=3].
| Length-N substring from the end of the word. Defaults to
| #[code N=3].
+cell #[code is_alpha]
@ -237,6 +279,13 @@ p The L2 norm of the lexeme's vector representation.
| Is the lexeme in lowercase? Equivalent to
| #[code lexeme.text.islower()].
+cell #[code is_upper]
+cell bool
| Is the lexeme in uppercase? Equivalent to
| #[code lexeme.text.isupper()].
+cell #[code is_title]
+cell bool
@ -249,6 +298,16 @@ p The L2 norm of the lexeme's vector representation.
+cell bool
+cell Is the lexeme punctuation?
+cell #[code is_left_punct]
+cell bool
+cell Is the lexeme a left punctuation mark, e.g. #[code (]?
+cell #[code is_right_punct]
+cell bool
+cell Is the lexeme a right punctuation mark, e.g. #[code )]?
+cell #[code is_space]
+cell bool
@ -256,6 +315,16 @@ p The L2 norm of the lexeme's vector representation.
| Does the lexeme consist of whitespace characters? Equivalent to
| #[code lexeme.text.isspace()].
+cell #[code is_bracket]
+cell bool
+cell Is the lexeme a bracket?
+cell #[code is_quote]
+cell bool
+cell Is the lexeme a quotation mark?
+cell #[code like_url]
+cell bool
@ -285,6 +354,7 @@ p The L2 norm of the lexeme's vector representation.
+cell #[code lang]
+cell int
+cell Language of the parent vocabulary.
+cell #[code lang_]
+cell unicode
@ -293,9 +363,16 @@ p The L2 norm of the lexeme's vector representation.
+cell #[code prob]
+cell float
+cell Smoothed log probability estimate of lexeme's type.
+cell Smoothed log probability estimate of the lexeme's type.
+cell #[code cluster]
+cell int
+cell Brown cluster ID.
+cell #[code sentiment]
+cell float
+cell A scalar value indicating the positivity or negativity of the lexeme.
| A scalar value indicating the positivity or negativity of the
| lexeme.
@ -248,6 +248,28 @@ p
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "get_lca_matrix") Span.get_lca_matrix
+tag method
| Calculates the lowest common ancestor matrix for a given #[code Span].
| Returns LCA matrix containing the integer index of the ancestor, or
| #[code -1] if no common ancestor is found, e.g. if span excludes a
| necessary ancestor.
doc = nlp(u'I like New York in Autumn')
span = doc[1:4]
matrix = span.get_lca_matrix()
# array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
+table(["Name", "Type", "Description"])
+cell returns
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
+cell The lowest common ancestor matrix of the #[code Span].
+h(2, "to_array") Span.to_array
+tag method
@ -347,7 +369,7 @@ p
+tag property
p Tokens that are to the left of the span, whose head is within the span.
p Tokens that are to the left of the span, whose heads are within the span.
doc = nlp(u'I like New York in Autumn.')
@ -364,7 +386,7 @@ p Tokens that are to the left of the span, whose head is within the span.
+tag property
p Tokens that are to the right of the span, whose head is within the span.
p Tokens that are to the right of the span, whose heads are within the span.
doc = nlp(u'I like New York in Autumn.')
@ -377,6 +399,42 @@ p Tokens that are to the right of the span, whose head is within the span.
+cell #[code Token]
+cell A right-child of a token of the span.
+h(2, "n_lefts") Span.n_lefts
+tag property
| The number of tokens that are to the left of the span, whose heads are
| within the span.
doc = nlp(u'I like New York in Autumn.')
assert doc[3:7].n_lefts == 1
+table(["Name", "Type", "Description"])
+cell returns
+cell int
+cell The number of left-child tokens.
+h(2, "n_rights") Span.n_rights
+tag property
| The number of tokens that are to the right of the span, whose heads are
| within the span.
doc = nlp(u'I like New York in Autumn.')
assert doc[2:4].n_rights == 1
+table(["Name", "Type", "Description"])
+cell returns
+cell int
+cell The number of right-child tokens.
+h(2, "subtree") Span.subtree
+tag property
@ -495,6 +553,18 @@ p
| The text content of the span with a trailing whitespace character
| if the last token has one.
+cell #[code orth]
+cell int
+cell ID of the verbatim text content.
+cell #[code orth_]
+cell unicode
| Verbatim text content (identical to #[code Span.text]). Exists
| mostly for consistency with the other attributes.
+cell #[code label]
+cell int
@ -519,3 +589,17 @@ p
+cell #[code ent_id_]
+cell unicode
+cell The string ID of the named entity the token is an instance of.
+cell #[code sentiment]
+cell float
| A scalar value indicating the positivity or negativity of the
| span.
+cell #[code _]
+cell #[code Underscore]
| User space for adding custom
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].
@ -302,6 +302,80 @@ p A sequence of the token's immediate syntactic children.
+cell #[code Token]
+cell A child token such that #[code child.head==self].
+h(2, "lefts") Token.lefts
+tag property
| The leftward immediate children of the word, in the syntactic dependency
| parse.
doc = nlp(u'I like New York in Autumn.')
lefts = [t.text for t in doc[3].lefts]
assert lefts == [u'New']
+table(["Name", "Type", "Description"])
+cell yields
+cell #[code Token]
+cell A left-child of the token.
+h(2, "rights") Token.rights
+tag property
| The rightward immediate children of the word, in the syntactic
| dependency parse.
doc = nlp(u'I like New York in Autumn.')
rights = [t.text for t in doc[3].rights]
assert rights == [u'in']
+table(["Name", "Type", "Description"])
+cell yields
+cell #[code Token]
+cell A right-child of the token.
+h(2, "n_lefts") Token.n_lefts
+tag property
| The number of leftward immediate children of the word, in the syntactic
| dependency parse.
doc = nlp(u'I like New York in Autumn.')
assert doc[3].n_lefts == 1
+table(["Name", "Type", "Description"])
+cell returns
+cell int
+cell The number of left-child tokens.
+h(2, "n_rights") Token.n_rights
+tag property
| The number of rightward immediate children of the word, in the syntactic
| dependency parse.
doc = nlp(u'I like New York in Autumn.')
assert doc[3].n_rights == 1
+table(["Name", "Type", "Description"])
+cell returns
+cell int
+cell The number of right-child tokens.
+h(2, "subtree") Token.subtree
+tag property
@ -489,15 +563,35 @@ p The L2 norm of the token's vector representation.
+cell unicode
+cell Base form of the token, with no inflectional suffixes.
+cell #[code norm]
+cell int
| The token's norm, i.e. a normalised form of the token text.
| Usually set in the language's
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+cell #[code norm_]
+cell unicode
| The token's norm, i.e. a normalised form of the token text.
| Usually set in the language's
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+cell #[code lower]
+cell int
+cell Lower-case form of the token.
+cell Lowercase form of the token.
+cell #[code lower_]
+cell unicode
+cell Lower-case form of the token.
| Lowercase form of the token text. Equivalent to
| #[code Token.text.lower()].
+cell #[code shape]
@ -537,7 +631,9 @@ p The L2 norm of the token's vector representation.
+cell #[code suffix_]
+cell unicode
+cell Length-N substring from the end of the token. Defaults to #[code N=3].
| Length-N substring from the end of the token. Defaults to
| #[code N=3].
+cell #[code is_alpha]
@ -672,6 +768,7 @@ p The L2 norm of the token's vector representation.
+cell #[code lang]
+cell int
+cell Language of the parent document's vocabulary.
+cell #[code lang_]
+cell unicode
@ -690,9 +787,30 @@ p The L2 norm of the token's vector representation.
+cell #[code sentiment]
+cell float
+cell A scalar value indicating the positivity or negativity of the token.
| A scalar value indicating the positivity or negativity of the
| token.
+cell #[code lex_id]
+cell int
+cell ID of the token's lexical type.
+cell Sequential ID of the token's lexical type.
+cell #[code rank]
+cell int
| Sequential ID of the token's lexical type, used to index into
| tables, e.g. for word vectors.
+cell #[code cluster]
+cell int
+cell Brown cluster ID.
+cell #[code _]
+cell #[code Underscore]
| User space for adding custom
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].
@ -36,12 +36,14 @@ p
| that maps strings to hash values, and vice versa.
+cell #[code data]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell #[code width]
+cell int
+cell Number of dimensions.
+cell #[code width]
+cell Number of dimensions.
+cell #[code data]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell The vector data.
+cell returns
@ -208,7 +210,7 @@ p
+cell returns
+cell tuple
+cell #[code (rows, dims)] pairs.
+cell A #[code (rows, dims)] pair.
+h(2, "from_glove") Vectors.from_glove
+tag method
@ -238,11 +240,16 @@ p Save the current state to a directory.
+table(["Name", "Type", "Description"])
+cell #[code path]
+cell unicode or #[code Path]
+cell unicode / #[code Path]
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being saved.
+h(2, "from_disk") Vectors.from_disk
+tag method
@ -255,7 +262,7 @@ p Loads state from a directory. Modifies the object in place and returns it.
+table(["Name", "Type", "Description"])
+cell #[code path]
+cell unicode or #[code Path]
+cell unicode / #[code Path]
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
@ -297,7 +304,7 @@ p Load state from a binary string.
+table(["Name", "Type", "Description"])
+cell #[code bytes_data]
+cell #[code data]
+cell bytes
+cell The data to load from.
@ -111,11 +111,13 @@ p
| A few more convenience attributes are provided for iterating around the
| local tree from the token. The #[code .lefts] and #[code .rights]
| attributes provide sequences of syntactic children that occur before and
| after the token. Both sequences are in sentences order. There are also
| two integer-typed attributes, #[code .n_rights] and #[code .n_lefts],
| that give the number of left and right children.
| local tree from the token. The #[+api("token#lefts") #[code Token.lefts]]
| and #[+api("token#rights") #[code Token.rights]] attributes provide
| sequences of syntactic children that occur before and after the token.
| Both sequences are in sentence order. There are also two integer-typed
| attributes, #[+api("token#n_rights") #[code Token.n_rights]] and
| #[+api("token#n_lefts") #[code Token.n_lefts]], that give the number of
| right and left children.
doc = nlp(u'bright red apples on the tree')
@ -126,10 +128,11 @@ p
| You can get a whole phrase by its syntactic head using the
| #[code .subtree] attribute. This returns an ordered sequence of tokens.
| You can walk up the tree with the #[code .ancestors] attribute, and
| check dominance with the #[+api("token#is_ancestor") #[code .is_ancestor()]]
| method.
| #[+api("token#subtree") #[code Token.subtree]] attribute. This returns an
| ordered sequence of tokens. You can walk up the tree with the
| #[+api("token#ancestors") #[code Token.ancestors]] attribute, and
| check dominance with
| #[+api("token#is_ancestor") #[code Token.is_ancestor()]].
+aside("Projective vs. non-projective")
| For the #[+a("/models/en") default English model], the
Reference in New Issue
Block a user