Merge pull request #1468 from explosion/feature/tidy-up

💫 Tidy up v2.0 code base
This commit is contained in:
Matthew Honnibal 2017-10-28 04:20:29 +02:00 committed by GitHub
commit 4b78c1762b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
71 changed files with 2003 additions and 3332 deletions

View File

@ -1,93 +0,0 @@
#!/usr/bin/env python
from __future__ import unicode_literals, print_function
import plac
import joblib
from os import path
import os
import bz2
import ujson
from preshed.counter import PreshCounter
from joblib import Parallel, delayed
import io
from spacy.en import English
from spacy.strings import StringStore
from spacy.attrs import ORTH
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab
def iter_comments(loc):
    """Yield one decoded JSON object per line of the bz2 file at `loc`."""
    with bz2.BZ2File(loc) as compressed:
        for raw_line in compressed:
            record = ujson.loads(raw_line)
            yield record
def count_freqs(input_loc, output_loc):
    """Count token frequencies for one comment dump and write them as TSV.

    Every comment yielded by `iter_comments(input_loc)` has its 'body'
    tokenized and counted by ORTH. The counts are written to `output_loc`
    as '<freq>\\t<string>' lines; strings consisting only of whitespace are
    left out.
    """
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))
    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)
    # The third positional parameter of io.open is `buffering`, so the
    # encoding has to be given by keyword (as merge_counts already does).
    with io.open(output_loc, 'w', encoding='utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
def parallelize(func, iterator, n_jobs):
    """Call `func(*item)` for every item of `iterator` using `n_jobs` joblib workers."""
    worker_pool = Parallel(n_jobs=n_jobs)
    pending_calls = (delayed(func)(*args) for args in iterator)
    worker_pool(pending_calls)
def merge_counts(locs, out_loc):
    """Sum the per-file frequency tables in `locs` into one table at `out_loc`.

    Each input line is '<freq>\\t<word>'; the output uses the same layout.
    """
    string_map = StringStore()
    totals = PreshCounter()
    for freq_path in locs:
        with io.open(freq_path, 'r', encoding='utf8') as freq_file:
            for row in freq_file:
                freq_text, word = row.strip().split('\t', 1)
                key = string_map[word]
                totals.inc(key, int(freq_text))
    with io.open(out_loc, 'w', encoding='utf8') as merged_file:
        for key, total in totals:
            word = string_map[key]
            merged_file.write('%d\t%s\n' % (total, word))
@plac.annotations(
    input_loc=("Location of input file list"),
    freqs_dir=("Directory for frequency files"),
    output_loc=("Location for output file"),
    n_jobs=("Number of workers", "option", "n", int),
    skip_existing=("Skip inputs where an output file exists", "flag", "s", bool),
)
def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
    """Count frequencies for every listed input, then merge the results.

    `input_loc` names a text file with one input path per line (blank lines
    are ignored). Each input gets a frequency file in `freqs_dir`, named
    after the input with 'bz2' replaced by 'freq'. With `skip_existing`,
    inputs whose frequency file already exists are not recounted, but their
    existing file still takes part in the merge written to `output_loc`.
    """
    tasks = []
    outputs = []
    # Read the list inside a context manager so the handle is closed as soon
    # as the paths have been collected.
    with open(input_loc) as list_file:
        for input_path in list_file:
            input_path = input_path.strip()
            if not input_path:
                continue
            filename = input_path.split('/')[-1]
            output_path = path.join(freqs_dir, filename.replace('bz2', 'freq'))
            outputs.append(output_path)
            if not path.exists(output_path) or not skip_existing:
                tasks.append((input_path, output_path))
    if tasks:
        parallelize(count_freqs, tasks, n_jobs)
    print("Merge")
    merge_counts(outputs, output_loc)


if __name__ == '__main__':
    plac.call(main)

View File

@ -1,89 +0,0 @@
#!/usr/bin/env python
from __future__ import unicode_literals
from xml.etree import cElementTree as ElementTree
import json
import re
import plac
from pathlib import Path
from os import path
# Treebank bracket escape tokens and the literal characters they stand for.
escaped_tokens = dict(zip(
    ('-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-'),
    ('(', ')', '[', ']', '{', '}'),
))
def read_parses(parse_loc):
    """Read the '.dep' file that belongs to `parse_loc`.

    The file holds one parse per blank-line-separated group. Token ids and
    heads of each parse are shifted by the number of token lines in the
    parses before it, so ids keep increasing across the whole file.

    Returns the list of renumbered parse strings.
    """
    # Read through a context manager so the handle is closed right away
    # instead of being left to the garbage collector.
    with open(str(parse_loc) + '.dep') as dep_file:
        contents = dep_file.read()
    offset = 0
    doc = []
    for parse in contents.strip().split('\n\n'):
        parse = _adjust_token_ids(parse, offset)
        offset += len(parse.split('\n'))
        doc.append(parse)
    return doc
def _adjust_token_ids(parse, offset):
output = []
for line in parse.split('\n'):
pieces = line.split()
pieces[0] = str(int(pieces[0]) + offset)
pieces[5] = str(int(pieces[5]) + offset) if pieces[5] != '0' else '0'
output.append('\t'.join(pieces))
return '\n'.join(output)
def _fmt_doc(filename, paras):
return {'id': filename, 'paragraphs': [_fmt_para(*para) for para in paras]}
def _fmt_para(raw, sents):
return {'raw': raw, 'sentences': [_fmt_sent(sent) for sent in sents]}
def _fmt_sent(sent):
    """Build the sentence dict from one parse string (one token per line).

    Each line is split on whitespace and handed to `_fmt_token`; 'brackets'
    is always an empty list.
    """
    rows = sent.strip().split('\n')
    tokens = [_fmt_token(*row.split()) for row in rows]
    return {'tokens': tokens, 'brackets': []}
def _fmt_token(id_, word, hyph, pos, ner, head, dep, blank1, blank2, blank3):
head = int(head) - 1
id_ = int(id_) - 1
head = (head - id_) if head != -1 else 0
return {'id': id_, 'orth': word, 'tag': pos, 'dep': dep, 'head': head}
# Matches markup tags: '<', then a word character, '?' or '/', then at least
# one more character that is not '>', then '>'.
tags_re = re.compile(r'<[\w\?/][^>]+>')


def main(out_dir, ewtb_dir='/usr/local/data/eng_web_tbk'):
    """Convert each genre directory under `ewtb_dir`/data into one JSON file.

    For every file in a genre's source/source_original directory, the
    matching parses are read from the genre's penntree directory and paired
    with the raw source text. Markup tags are removed from the source only
    for genre paths containing 'answers'. The documents of a genre are
    written to `out_dir`/<genre>.json.
    """
    ewtb_dir = Path(ewtb_dir)
    out_dir = Path(out_dir)
    if not out_dir.exists():
        out_dir.mkdir()
    for genre_dir in ewtb_dir.joinpath('data').iterdir():
        parse_dir = genre_dir.joinpath('penntree')
        docs = []
        for source_loc in genre_dir.joinpath('source').joinpath('source_original').iterdir():
            filename = source_loc.parts[-1].replace('.sgm.sgm', '')
            filename = filename.replace('.xml', '')
            filename = filename.replace('.txt', '')
            parse_loc = parse_dir.joinpath(filename + '.xml.tree')
            parses = read_parses(parse_loc)
            # Close each source file as soon as it has been read; a genre can
            # hold many files and the handles were previously never closed.
            with source_loc.open() as source_file:
                source = source_file.read().strip()
            if 'answers' in str(genre_dir):
                source = tags_re.sub('', source).strip()
            docs.append(_fmt_doc(filename, [[source, parses]]))
        out_loc = out_dir.joinpath(genre_dir.parts[-1] + '.json')
        with open(str(out_loc), 'w') as out_file:
            out_file.write(json.dumps(docs, indent=4))


if __name__ == '__main__':
    plac.call(main)

View File

@ -1,32 +0,0 @@
import io
import plac
from spacy.en import English
def main(text_loc):
    """Print the text at `text_loc` with entity spans wrapped in label tags.

    The file is split into paragraphs on blank lines. For each paragraph the
    tokens are printed space-separated, with '<LABEL>' before the first
    token of an entity and '</LABEL>' after its last token.
    """
    with io.open(text_loc, 'r', encoding='utf8') as file_:
        text = file_.read()
    NLU = English()
    for paragraph in text.split('\n\n'):
        tokens = NLU(paragraph)
        ent_starts = {}
        ent_ends = {}
        for span in tokens.ents:
            ent_starts[span.start] = span.label_
            ent_ends[span.end] = span.label_
        output = []
        for token in tokens:
            if token.i in ent_starts:
                output.append('<%s>' % ent_starts[token.i])
            output.append(token.orth_)
            if (token.i+1) in ent_ends:
                output.append('</%s>' % ent_ends[token.i+1])
        output.append('\n\n')
        # Call form instead of the print statement: the statement is a
        # SyntaxError on Python 3, while a single parenthesized argument
        # prints the same text on Python 2.
        print(' '.join(output))


if __name__ == '__main__':
    plac.call(main)

View File

@ -1,157 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
import os
from os import path
import shutil
import io
import random
import time
import gzip
import plac
import cProfile
import pstats
import spacy.util
from spacy.en import English
from spacy.gold import GoldParse
from spacy.syntax.util import Config
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.parser import Parser
from spacy.scorer import Scorer
from spacy.tagger import Tagger
# Last updated for spaCy v0.97
def read_conll(file_):
    """Read a standard CoNLL/MALT-style format.

    Returns one `(None, [(annot, [])])` entry per blank-line-separated
    sentence, where `annot` is `(ids, words, tags, heads, labels, ner)` and
    the ner column is filled with 'O'. A negative head index is replaced by
    the token's own position.
    """
    sents = []
    for sent_str in file_.read().strip().split('\n\n'):
        ids, words, tags, heads, labels = [], [], [], [], []
        for position, row in enumerate(sent_str.split('\n')):
            word, pos_string, head_idx, label = _parse_line(row)
            if head_idx < 0:
                head_idx = position
            ids.append(position)
            words.append(word)
            tags.append(pos_string)
            heads.append(head_idx)
            labels.append(label)
        annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
        sents.append((None, [(annot, [])]))
    return sents
# Parse one whitespace-separated token line into (word, pos, head_idx, label).
#
# The layout is chosen by the number of fields:
#   * 4 fields: word, pos, head, label, with the head used as given;
#   * 15 fields: word from field 1, pos from field 4, head from field 8,
#     label from field 10 (zero-based field numbers);
#   * any other count: word from field 1, pos from field 4, head from
#     field 6, label from field 7.
# In the two wider layouts the head is shifted down by one.
def _parse_line(line):
pieces = line.split()
if len(pieces) == 4:
word, pos, head_idx, label = pieces
head_idx = int(head_idx)
elif len(pieces) == 15:
# id_ is parsed here (and in the branch below) but not used afterwards.
id_ = int(pieces[0].split('_')[-1])
word = pieces[1]
pos = pieces[4]
head_idx = int(pieces[8])-1
label = pieces[10]
else:
id_ = int(pieces[0].split('_')[-1])
word = pieces[1]
pos = pieces[4]
head_idx = int(pieces[6])-1
label = pieces[7]
# NOTE(review): the nesting of this check is not recoverable from this view
# (it may belong to the else branch or to the whole function). It compares
# against 0, while read_conll treats a negative head as the special case --
# confirm which value is meant to mark the root.
if head_idx == 0:
label = 'ROOT'
return word, pos, head_idx, label
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    """Tag and parse the gold tokenization and add the result to `scorer`.

    `raw_text` is part of the signature but is not read; the words always
    come from `annot_tuples[1]`.
    """
    words = annot_tuples[1]
    doc = nlp.tokenizer.tokens_from_list(words)
    nlp.tagger(doc)
    nlp.parser(doc)
    reference = GoldParse(doc, annot_tuples, make_projective=False)
    scorer.score(doc, reference, verbose=verbose,
                 punct_labels=('--', 'p', 'punct'))
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, force_gold=False):
    """Train a tagger and a dependency parser on `gold_tuples`.

    The 'deps' and 'pos' directories under `model_dir` are recreated from
    scratch. Each iteration scores the current model on every sentence
    before updating on it, then shuffles `gold_tuples` in place and prints
    one progress row. Sentences with a single word are skipped.

    Raises Exception if a sentence is still non-projective after the gold
    parse was built with make_projective=True.
    """
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    # Drop any previous models, then create both directories afresh.
    for stale_dir in (dep_model_dir, pos_model_dir):
        if path.exists(stale_dir):
            shutil.rmtree(stale_dir)
    for fresh_dir in (dep_model_dir, pos_model_dir):
        os.mkdir(fresh_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples))

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, _ in sents:
                words = annot_tuples[1]
                if len(words) == 1:
                    continue
                score_model(scorer, nlp, None, annot_tuples, verbose=False)
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples
                    )
                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        progress_row = (itn, loss, scorer.uas, scorer.tags_acc, scorer.token_acc)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f' % progress_row)
    print('end training')
    nlp.end_training(model_dir)
    print('done')
@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, n_iter=15, eval_only=False):
    """Train on `train_loc` (unless `eval_only`) and report scores on `dev_loc`.

    `eval_only` was annotated and read but missing from the signature, so
    the function could not run; it is added as a trailing keyword argument
    so existing positional calls keep working.
    """
    with io.open(train_loc, 'r', encoding='utf8') as file_:
        train_sents = read_conll(file_)
    if not eval_only:
        train(English, train_sents, model_dir, n_iter=n_iter)
    nlp = English(data_dir=model_dir)
    # read_conll consumes the whole file before returning, so the handle can
    # be closed immediately afterwards.
    with io.open(dev_loc, 'r', encoding='utf8') as file_:
        dev_sents = read_conll(file_)
    scorer = Scorer()
    for _, sents in dev_sents:
        for annot_tuples, _ in sents:
            score_model(scorer, nlp, None, annot_tuples)
    print('TOK', 100-scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)


if __name__ == '__main__':
    plac.call(main)

View File

@ -1,187 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function
import os
from os import path
import shutil
import io
import random
import plac
import re
import spacy.util
from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.gold import merge_sents
from spacy.scorer import Scorer
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.ner import BiluoPushDown
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser
from spacy.syntax.nonproj import PseudoProjectivity
def _corrupt(c, noise_level):
if random.random() >= noise_level:
return c
elif c == ' ':
return '\n'
elif c == '\n':
return ' '
elif c in ['.', "'", "!", "?"]:
return ''
else:
return c.lower()
def add_noise(orig, noise_level):
    """Return `orig`, or a noised copy of it with probability `noise_level`.

    A list is corrupted item by item and items that become empty are
    dropped; anything else is corrupted element by element and joined into
    a string.
    """
    if random.random() >= noise_level:
        return orig
    if type(orig) == list:
        noisy_words = [_corrupt(word, noise_level) for word in orig]
        return [word for word in noisy_words if word]
    noisy_chars = (_corrupt(char, noise_level) for char in orig)
    return ''.join(noisy_chars)
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    """Run tagger, entity recognizer and parser on one example and score it.

    With `raw_text` set to None the gold words from `annot_tuples[1]` are
    used as the tokenization; otherwise `raw_text` is tokenized.
    """
    if raw_text is not None:
        tokens = nlp.tokenizer(raw_text)
    else:
        tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger(tokens)
    nlp.entity(tokens)
    nlp.parser(tokens)
    reference = GoldParse(tokens, annot_tuples)
    scorer.score(tokens, reference, verbose=verbose)
def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
          n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
    """Train via `Language.train` and print dev scores after every epoch.

    `seed`, `n_sents` and `corruption_level` are accepted but not read here.
    """
    print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
    format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
    with Language.train(model_dir, train_data,
                        tagger_cfg, parser_cfg, entity_cfg) as trainer:
        epochs = trainer.epochs(n_iter, gold_preproc=gold_preproc,
                                augment_data=None)
        for itn, epoch in enumerate(epochs):
            for doc, gold in epoch:
                trainer.update(doc, gold)
            dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
            row = format_str.format(
                itn,
                trainer.nlp.parser.model.nr_weight,
                trainer.nlp.parser.model.nr_active_feat,
                **dev_scores.scores)
            print(row)
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None, cand_preproc=None):
    """Load the model from `model_dir`, score it on `gold_tuples`, return the Scorer.

    With `gold_preproc` the gold tokenization and sentence split are used;
    otherwise each document's sentences are merged and the raw text (when
    present) is processed by the full pipeline. `cand_preproc` is accepted
    but not read.
    """
    print("Load parser", model_dir)
    nlp = Language(path=model_dir)
    if nlp.lang == 'de':
        # For German, use a lemmatizer that returns the string itself.
        nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if not gold_preproc:
            sents = merge_sents(sents)
        else:
            raw_text = None
        for annot_tuples, brackets in sents:
            if raw_text is not None:
                tokens = nlp(raw_text)
            else:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.parser(tokens)
                nlp.entity(tokens)
            reference = GoldParse.from_annot_tuples(tokens, annot_tuples)
            scorer.score(tokens, reference, verbose=verbose)
    return scorer
def write_parses(Language, dev_loc, model_dir, out_loc):
    """Parse the development data and write the predictions to `out_loc`.

    One tab-separated line is written per non-space token (index, text, tag,
    head text, dependency label), with a blank line after every predicted
    sentence.
    """
    nlp = Language(data_dir=model_dir)
    gold_tuples = read_json_file(dev_loc)
    # io.open takes `buffering` as its third positional parameter, so the
    # encoding must be passed by keyword. The context manager also makes
    # sure the output is flushed and closed.
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        for raw_text, sents in gold_tuples:
            # This script imports merge_sents from spacy.gold and defines no
            # _merge_sents of its own, so the imported helper is used.
            sents = merge_sents(sents)
            for annot_tuples, brackets in sents:
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text)
                for sent in tokens.sents:
                    for t in sent:
                        if not t.is_space:
                            out_file.write(
                                '%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
                            )
                    out_file.write('\n')
# Command-line entry point: train a model for `language` (unless eval_only is
# set), optionally write the development-set parses to `out_loc`, then
# evaluate on the development data and print the scores.
@plac.annotations(
language=("The language to train", "positional", None, str, ['en','de', 'zh']),
train_loc=("Location of training file or directory"),
dev_loc=("Location of development file or directory"),
model_dir=("Location of output model directory",),
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
corruption_level=("Amount of noise to add to training data", "option", "c", float),
gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
out_loc=("Out location", "option", "o", str),
n_sents=("Number of training sentences", "option", "n", int),
n_iter=("Number of training iterations", "option", "i", int),
verbose=("Verbose error reporting", "flag", "v", bool),
debug=("Debug mode", "flag", "d", bool),
pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
L1=("L1 regularization penalty", "option", "L", float),
)
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False,
L1=1e-6):
# Each cfg is a snapshot of the local variables at that point, so the second
# and third snapshots also contain the cfg dicts created before them.
# Renaming, adding or reordering locals above changes what these dicts hold.
# Within this function, debug, pseudoprojective and L1 are read only through
# these snapshots.
parser_cfg = dict(locals())
tagger_cfg = dict(locals())
entity_cfg = dict(locals())
lang = spacy.util.get_lang_class(language)
parser_cfg['features'] = lang.Defaults.parser_features
entity_cfg['features'] = lang.Defaults.entity_features
if not eval_only:
gold_train = list(read_json_file(train_loc))
gold_dev = list(read_json_file(dev_loc))
# n_sents truncates the list returned by read_json_file; 0 keeps everything.
if n_sents > 0:
gold_train = gold_train[:n_sents]
train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
n_iter=n_iter)
# NOTE(review): the nesting of the remaining statements is not recoverable
# from this view; presumably they run whether or not training happened --
# confirm.
if out_loc:
write_parses(lang, dev_loc, model_dir, out_loc)
scorer = evaluate(lang, list(read_json_file(dev_loc)),
model_dir, gold_preproc=gold_preproc, verbose=verbose)
print('TOK', scorer.token_acc)
print('POS', scorer.tags_acc)
print('UAS', scorer.uas)
print('LAS', scorer.las)
print('NER P', scorer.ents_p)
print('NER R', scorer.ents_r)
print('NER F', scorer.ents_f)
if __name__ == '__main__':
plac.call(main)

View File

@ -1,201 +0,0 @@
from __future__ import unicode_literals, print_function
import plac
import json
import random
import pathlib
from spacy.tokens import Doc
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.language import Language
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.pipeline import DependencyParser, TokenVectorEncoder
from spacy.syntax.parser import get_templates
from spacy.syntax.arc_eager import ArcEager
from spacy.scorer import Scorer
from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
import spacy.attrs
import io
from thinc.neural.ops import CupyOps
from thinc.neural import Model
from spacy.es import Spanish
from spacy.attrs import POS
from thinc.neural import Model
try:
import cupy
from thinc.neural.ops import CupyOps
except:
cupy = None
def read_conllx(loc, n=0):
    """Yield one `(None, [[tuples, []]])` entry per sentence of a CoNLL-X file.

    `tuples` holds the column-wise lists (ids, words, tags, heads, deps,
    ner). Leading '#' lines of a sentence are dropped and rows whose id
    contains '-' or '.' are skipped. The tag is built as pos__dep__morph and
    registered in `Spanish.Defaults.tag_map` as a side effect. With n >= 1
    iteration stops after n sentences.
    """
    with io.open(loc, 'r', encoding='utf8') as file_:
        text = file_.read()
    n_yielded = 0
    for block in text.strip().split('\n\n'):
        rows = block.strip().split('\n')
        if not rows:
            continue
        while rows[0].startswith('#'):
            rows.pop(0)
        tokens = []
        for row in rows:
            (id_, word, lemma, pos, tag, morph, head, dep,
             _1, _2) = row.split('\t')
            if '-' in id_ or '.' in id_:
                continue
            id_ = int(id_) - 1
            # A head of '0' makes the token its own head.
            head = (int(head) - 1) if head != '0' else id_
            if dep == 'root':
                dep = 'ROOT'
            tag = pos+'__'+dep+'__'+morph
            Spanish.Defaults.tag_map[tag] = {POS: pos}
            tokens.append((id_, word, tag, head, dep, 'O'))
        tuples = [list(column) for column in zip(*tokens)]
        yield (None, [[tuples, []]])
        n_yielded += 1
        if n >= 1 and n_yielded >= n:
            break
def score_model(vocab, encoder, parser, Xs, ys, verbose=False):
    """Re-parse every document in `Xs` and score it against `ys`.

    Each document is rebuilt from its words, run through the encoder and
    parser, deprojectivized and scored. Returns the Scorer. The coarse-tag
    tallies are computed but are not part of the return value.
    """
    scorer = Scorer()
    correct = 0.
    total = 0.
    for source_doc, gold in zip(Xs, ys):
        words = [w.text for w in source_doc]
        doc = Doc(vocab, words=words)
        encoder(doc)
        parser(doc)
        PseudoProjectivity.deprojectivize(doc)
        scorer.score(doc, gold, verbose=verbose)
        for token, tag in zip(doc, gold.tags):
            # Compare only the part of each tag before the first underscore.
            univ_guess = token.tag_.split('_', 1)[0] if '_' in token.tag_ else ''
            univ_truth, _ = tag.split('_', 1)
            correct += univ_guess == univ_truth
            total += 1
    return scorer
def organize_data(vocab, train_sents):
    """Turn gold tuples into parallel lists of Doc and GoldParse objects."""
    Xs, ys = [], []
    for _, doc_sents in train_sents:
        for annotations, _ in doc_sents:
            ids, words, tags, heads, deps, ner = annotations
            doc = Doc(vocab, words=words)
            gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
            Xs.append(doc)
            ys.append(gold)
    return Xs, ys
# Entry point: read CoNLL-X training and development data, prepare the model
# directory and vocabulary, then train a token-vector encoder together with a
# dependency parser and print progress scores.
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
LangClass = spacy.util.get_lang_class(lang_name)
train_sents = list(read_conllx(train_loc))
dev_sents = list(read_conllx(dev_loc))
train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
actions = ArcEager.get_actions(gold_parses=train_sents)
features = get_templates('basic')
# Create model_dir and its 'deps' and 'pos' subdirectories when missing.
model_dir = pathlib.Path(model_dir)
if not model_dir.exists():
model_dir.mkdir()
if not (model_dir / 'deps').exists():
(model_dir / 'deps').mkdir()
if not (model_dir / 'pos').exists():
(model_dir / 'pos').mkdir()
# Store the parser configuration as UTF-8 encoded JSON.
with (model_dir / 'deps' / 'config.json').open('wb') as file_:
file_.write(
json.dumps(
{'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))
vocab = LangClass.Defaults.create_vocab()
# A new 'vocab' directory is created empty; an existing one is used to load
# previously saved strings and lexemes when those files are present.
if not (model_dir / 'vocab').exists():
(model_dir / 'vocab').mkdir()
else:
if (model_dir / 'vocab' / 'strings.json').exists():
with (model_dir / 'vocab' / 'strings.json').open() as file_:
vocab.strings.load(file_)
if (model_dir / 'vocab' / 'lexemes.bin').exists():
vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
if clusters_loc is not None:
clusters_loc = pathlib.Path(clusters_loc)
with clusters_loc.open() as file_:
for line in file_:
# Lines that do not split into exactly three fields are skipped.
try:
cluster, word, freq = line.split()
except ValueError:
continue
lex = vocab[word]
# The cluster string is reversed and then read as a base-2 integer.
lex.cluster = int(cluster[::-1], 2)
# Populate vocab
for _, doc_sents in train_sents:
for (ids, words, tags, heads, deps, ner), _ in doc_sents:
for word in words:
_ = vocab[word]
for dep in deps:
_ = vocab[dep]
for tag in tags:
_ = vocab[tag]
# NOTE(review): the nesting of this block is not recoverable from this view.
# If it runs after the loop above, `tags` is whatever the last sentence left
# behind -- confirm the intended scope.
if vocab.morphology.tag_map:
for tag in tags:
vocab.morphology.tag_map[tag] = {POS: tag.split('__', 1)[0]}
tagger = Tagger(vocab)
encoder = TokenVectorEncoder(vocab, width=64)
parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)
Xs, ys = organize_data(vocab, train_sents)
dev_Xs, dev_ys = organize_data(vocab, dev_sents)
with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
docs = list(Xs)
for doc in docs:
encoder(doc)
# One loss slot per epoch; track_progress appends a new slot each time.
nn_loss = [0.]
def track_progress():
# NOTE(review): this reads `encoder.tagger` while the block below it uses
# `parser.model` -- confirm that attribute exists on the encoder.
with encoder.tagger.use_params(optimizer.averages):
with parser.model.use_params(optimizer.averages):
scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
itn = len(nn_loss)
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
nn_loss.append(0.)
track_progress()
trainer.each_epoch.append(track_progress)
trainer.batch_size = 24
trainer.nb_epoch = 40
for docs, golds in trainer.iterate(Xs, ys, progress_bar=True):
docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
tokvecs, upd_tokvecs = encoder.begin_update(docs)
for doc, tokvec in zip(docs, tokvecs):
doc.tensor = tokvec
d_tokvecs = parser.update(docs, golds, sgd=optimizer)
upd_tokvecs(d_tokvecs, sgd=optimizer)
encoder.update(docs, golds, sgd=optimizer)
nlp = LangClass(vocab=vocab, parser=parser)
# NOTE(review): score_model in this script is declared as
# (vocab, encoder, parser, Xs, ys, verbose=False) but only four positional
# arguments are passed here, and `itn` is assigned only inside
# track_progress -- confirm whether this final report can run as written.
scorer = score_model(vocab, encoder, parser, read_conllx(dev_loc))
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
#nlp.end_training(model_dir)
#scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
#print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
if __name__ == '__main__':
import cProfile
import pstats
# `if 1` always takes the first branch, so the profiling branch never runs.
if 1:
plac.call(main)
else:
cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
s.strip_dirs().sort_stats("time").print_stats()
# NOTE(review): the nesting of this last call is not recoverable from this
# view; if it sits outside the else branch, main is invoked a second time --
# confirm.
plac.call(main)

View File

@ -1,194 +0,0 @@
"""Convert OntoNotes into a json format.
doc: {
id: string,
paragraphs: [{
raw: string,
sents: [int],
tokens: [{
start: int,
tag: string,
head: int,
dep: string}],
ner: [{
start: int,
end: int,
label: string}],
brackets: [{
start: int,
end: int,
label: string}]}]}
Consumes output of spacy/munge/align_raw.py
"""
from __future__ import unicode_literals
import plac
import json
from os import path
import os
import re
import io
from collections import defaultdict
from spacy.munge import read_ptb
from spacy.munge import read_conll
from spacy.munge import read_ner
def _iter_raw_files(raw_loc):
files = json.load(open(raw_loc))
for f in files:
yield f
def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
    """Build the document dict for one file, or None on a sentence-count mismatch.

    Without `raw_paras` all sentences go into a single paragraph with no raw
    text. Otherwise each entry of `raw_paras` claims as many consecutive
    sentences as it has raw sentences, and paragraphs that end up with no
    sentences are left out.
    """
    ptb_sents = read_ptb.split(ptb_text)
    dep_sents = read_conll.split(dep_text)
    if len(ptb_sents) != len(dep_sents):
        return None
    if ner_text is None:
        ner_sents = [None] * len(ptb_sents)
    else:
        ner_sents = read_ner.split(ner_text)

    if raw_paras is None:
        only_para = format_para(None, ptb_sents, dep_sents, ner_sents)
        return {'id': file_id, 'paragraphs': [only_para]}

    paragraphs = []
    start = 0
    for raw_sents in raw_paras:
        stop = start + len(raw_sents)
        para = format_para(
            ' '.join(raw_sents).replace('<SEP>', ''),
            ptb_sents[start:stop],
            dep_sents[start:stop],
            ner_sents[start:stop])
        if para['sentences']:
            paragraphs.append(para)
        start = stop
    return {'id': file_id, 'paragraphs': paragraphs}
def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
    """Build the paragraph dict from parallel lists of per-sentence annotations.

    A sentence is skipped when its dependency parse contains a token tagged
    'VERB'. When no NER text is given, or its length disagrees with the
    dependency parse, every token gets the placeholder '-'.
    """
    assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
    sentences = []
    for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
        _, deps = read_conll.parse(dep_text, strip_bad_periods=True)
        if deps and 'VERB' in [t['tag'] for t in deps]:
            continue
        if ner_text is None:
            ner = ['-' for _ in deps]
        else:
            _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
        _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
        # Necessary because the ClearNLP converter deletes EDITED words.
        if len(ner) != len(deps):
            ner = ['-' for _ in deps]
        sentences.append(format_sentence(deps, ner, brackets))
    return {'raw': raw_text, 'sentences': sentences}
def format_sentence(deps, ner, brackets):
    """Build the sentence dict from tokens, entity tags and bracket spans.

    Brackets arrive as (label, start, end); spans with start == end are
    dropped and the stored 'last' index is end - 1.
    """
    tokens = [
        format_token(position, token, token_ent)
        for position, (token, token_ent) in enumerate(zip(deps, ner))
    ]
    spans = [
        {'label': label, 'first': start, 'last': (end-1)}
        for label, start, end in brackets
        if start != end
    ]
    return {'tokens': tokens, 'brackets': spans}
def format_token(token_id, token, ner):
    """Build the token dict, storing the head as an offset from the token.

    A head of -1 is stored as offset 0. `token_id` must equal token['id'].
    """
    assert token_id == token['id']
    absolute_head = token['head']
    if absolute_head == -1:
        relative_head = 0
    else:
        relative_head = absolute_head - token_id
    return {
        'id': token_id,
        'orth': token['word'],
        'tag': token['tag'],
        'head': relative_head,
        'dep': token['dep'],
        'ner': ner}
def read_file(*pieces):
    """Return the stripped UTF-8 text of the file at `path.join(*pieces)`.

    Returns None when the path does not exist.
    """
    loc = path.join(*pieces)
    if not path.exists(loc):
        return None
    # Read inside a context manager so the handle is closed immediately;
    # this function is called for many files in a row.
    with io.open(loc, 'r', encoding='utf8') as file_:
        return file_.read().strip()
def get_file_names(section_dir, subsection):
    """Return the sorted, de-duplicated file names in a subsection, minus the last extension."""
    listing = os.listdir(path.join(section_dir, subsection))
    stems = set(entry.rsplit('.', 1)[0] for entry in listing)
    return sorted(stems)
def read_wsj_with_source(onto_dir, raw_dir):
    """Format the WSJ documents of sections 00-24 together with their raw text.

    Raw paragraphs come from `raw_dir`/wsjNN.json. File '55' of section 04
    is skipped, and files lacking a .parse or .parse.dep file are left out.
    Returns a dict from file name to formatted document.
    """
    # Now do WSJ, with source alignment
    wsj_dir = path.join(onto_dir, 'data', 'english', 'annotations', 'nw', 'wsj')
    docs = {}
    for section_number in range(25):
        section = '%02d' % section_number
        raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
        for filename, raw_paras in _iter_raw_files(raw_loc):
            if section == '04' and filename == '55':
                continue
            ptb = read_file(wsj_dir, section, '%s.parse' % filename)
            dep = read_file(wsj_dir, section, '%s.parse.dep' % filename)
            ner = read_file(wsj_dir, section, '%s.name' % filename)
            if ptb is None or dep is None:
                continue
            docs[filename] = format_doc(filename, raw_paras, ptb, dep, ner)
    return docs
def get_doc(onto_dir, file_path, wsj_docs):
    """Return the formatted document for `file_path`.

    Documents already present in `wsj_docs` (keyed by the last path
    component) are reused. Otherwise the annotation files are read from
    `onto_dir`; None is returned when the .parse or .parse.dep file is
    missing.
    """
    filename = file_path.rsplit('/', 1)[1]
    if filename in wsj_docs:
        return wsj_docs[filename]
    ptb = read_file(onto_dir, file_path + '.parse')
    dep = read_file(onto_dir, file_path + '.parse.dep')
    ner = read_file(onto_dir, file_path + '.name')
    if ptb is None or dep is None:
        return None
    return format_doc(filename, None, ptb, dep, ner)
def read_ids(loc):
    """Return the lines of the file at `loc`, with surrounding whitespace of the whole text removed."""
    # Close the handle explicitly instead of leaving it to the garbage collector.
    with open(loc) as ids_file:
        return ids_file.read().strip().split('\n')
def main(onto_dir, raw_dir, out_dir):
    """Write one JSON file per genre for the train, test and development partitions.

    The ids of each partition are read from `onto_dir`/<partition>.id; the
    genre is the fourth '/'-separated component of each id. Output goes to
    `out_dir`/<partition>/<genre>.json.
    """
    wsj_docs = read_wsj_with_source(onto_dir, raw_dir)
    for partition in ('train', 'test', 'development'):
        id_list_loc = path.join(onto_dir, '%s.id' % partition)
        docs_by_genre = defaultdict(list)
        for file_path in read_ids(id_list_loc):
            doc = get_doc(onto_dir, file_path, wsj_docs)
            if doc is None:
                continue
            genre = file_path.split('/')[3]
            docs_by_genre[genre].append(doc)
        part_dir = path.join(out_dir, partition)
        if not path.exists(part_dir):
            os.mkdir(part_dir)
        for genre, docs in sorted(docs_by_genre.items()):
            genre_loc = path.join(part_dir, genre + '.json')
            with open(genre_loc, 'w') as file_:
                json.dump(docs, file_, indent=4)


if __name__ == '__main__':
    plac.call(main)

View File

@ -1,13 +0,0 @@
"""Read a vector file, and prepare it as binary data, for easy consumption"""
import plac
from spacy.vocab import write_binary_vectors
# Command-line wrapper: hands the input and output locations straight to
# write_binary_vectors and does nothing else.
def main(in_loc, out_loc):
write_binary_vectors(in_loc, out_loc)
# plac builds the command-line interface from main's signature.
if __name__ == '__main__':
plac.call(main)

View File

@ -1,175 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function
import os
from os import path
import shutil
import codecs
import random
import plac
import re
import spacy.util
from spacy.en import English
from spacy.tagger import Tagger
from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.scorer import Scorer
def score_model(scorer, nlp, raw_text, annot_tuples):
    """Tag one example and add the result to `scorer`.

    With `raw_text` set to None the gold words from `annot_tuples[1]` are
    used as the tokenization; otherwise `raw_text` is tokenized.
    """
    if raw_text is not None:
        tokens = nlp.tokenizer(raw_text)
    else:
        tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger(tokens)
    reference = GoldParse(tokens, annot_tuples)
    scorer.score(tokens, reference)
def _merge_sents(sents):
m_deps = [[], [], [], [], [], []]
m_brackets = []
i = 0
for (ids, words, tags, heads, labels, ner), brackets in sents:
m_deps[0].extend(id_ + i for id_ in ids)
m_deps[1].extend(words)
m_deps[2].extend(tags)
m_deps[3].extend(head + i for head in heads)
m_deps[4].extend(labels)
m_deps[5].extend(ner)
m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
i += len(ids)
return [(m_deps, m_brackets)]
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False):
    """Train a blank tagger on `gold_tuples` and save it via `nlp.end_training`.

    A positive `n_sents` keeps only that many leading entries. Every example
    is scored with the current model before the tagger is updated on it; the
    data is shuffled in place and one progress row printed per iteration.
    """
    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    templates = Tagger.default_templates()
    nlp = Language(data_dir=model_dir, tagger=False)
    nlp.tagger = Tagger.blank(nlp.vocab, templates)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if not gold_preproc:
                sents = _merge_sents(sents)
            else:
                raw_text = None
            for annot_tuples, ctnt in sents:
                words = annot_tuples[1]
                gold_tags = annot_tuples[2]
                score_model(scorer, nlp, raw_text, annot_tuples)
                if raw_text is not None:
                    tokens = nlp.tokenizer(raw_text)
                else:
                    tokens = nlp.tokenizer.tokens_from_list(words)
                loss += nlp.tagger.train(tokens, gold_tags)
        random.shuffle(gold_tuples)
        progress_row = (itn, loss, scorer.uas, scorer.ents_f,
                        scorer.tags_acc, scorer.token_acc)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % progress_row)
    nlp.end_training(model_dir)
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None):
    """Load the model from `model_dir`, score it on `gold_tuples`, return the Scorer.

    With `gold_preproc` the gold tokenization and sentence split are used;
    otherwise each document's sentences are merged and the raw text (when
    present) is processed by the full pipeline.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if not gold_preproc:
            sents = _merge_sents(sents)
        else:
            raw_text = None
        for annot_tuples, brackets in sents:
            if raw_text is not None:
                tokens = nlp(raw_text, merge_mwes=False)
            else:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            reference = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, reference, verbose=verbose)
    return scorer
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
    """Parse the development data, write the predictions and return the Scorer.

    One tab-separated line is written to `out_loc` per token: text, tag,
    head text and dependency label.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    # The output file was never closed before, so buffered lines could be
    # lost; the context manager flushes and closes it before returning.
    with codecs.open(out_loc, 'w', 'utf8') as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, brackets in sents:
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text, merge_mwes=False)
                gold = GoldParse(tokens, annot_tuples)
                scorer.score(tokens, gold, verbose=False)
                for t in tokens:
                    out_file.write(
                        '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                    )
    return scorer
# Command-line entry point: train the English tagger (unless eval_only is
# set), then evaluate on the development data and print the scores.
# `out_loc` is accepted but not read in this function.
@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        feature_set = 'basic' if not debug else 'debug'
        train(English, gold_train, model_dir,
              feat_set=feature_set,
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose)
    gold_dev = list(read_json_file(dev_loc))
    scorer = evaluate(English, gold_dev,
                      model_dir, gold_preproc=gold_preproc, verbose=verbose)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)
    print('NER P', scorer.ents_p)
    print('NER R', scorer.ents_r)
    print('NER F', scorer.ents_f)


if __name__ == '__main__':
    plac.call(main)

View File

@ -1,160 +0,0 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
import os
from os import path
import shutil
import io
import random
import time
import gzip
import ujson
import plac
import cProfile
import pstats
import spacy.util
from spacy.de import German
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.scorer import PRFScore
from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags
from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags
from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags
from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags
from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS
def default_templates():
    """Return the feature templates from `Tagger.default_templates()`."""
    tagger_cls = spacy.tagger.Tagger
    return tagger_cls.default_templates()
def default_templates_without_clusters():
    """Return tagger feature templates that use no `*_cluster` features.

    RETURNS (tuple): Tuple of templates, each a tuple of feature IDs.
    """
    # Word forms and lemma/POS of the surrounding context.
    templates = [
        (W_orth,),
        (P1_lemma, P1_pos),
        (P2_lemma, P2_pos),
        (N1_orth,),
        (N2_orth,),
    ]
    # Affixes of the current word.
    templates += [(W_suffix,), (W_prefix,)]
    # Previous tags, alone and in combination.
    templates += [(P1_pos,), (P2_pos,), (P1_pos, P2_pos), (P1_pos, W_orth)]
    # Suffixes of the neighbouring words, then the current word's shape.
    templates += [(P1_suffix,), (N1_suffix,), (W_shape,)]
    # Flags of the current word and its neighbours.
    templates += [(flags,) for flags in
                  (W_flags, N1_flags, N2_flags, P1_flags, P2_flags)]
    return tuple(templates)
def make_tagger(vocab, templates):
    """Create a `Tagger` for `vocab` backed by a new `TaggerModel`.

    vocab: Vocabulary passed to the tagger.
    templates: Feature templates passed to `TaggerModel`.
    RETURNS (Tagger): The newly constructed tagger.
    """
    tagger_model = spacy.tagger.TaggerModel(templates)
    tagger = spacy.tagger.Tagger(vocab, tagger_model)
    return tagger
def read_conll(file_):
    """Read sentences from tab-separated, blank-line-delimited lines.

    file_: Iterable of text lines. Empty (whitespace-only) lines end the
        current sentence.
    RETURNS (list): One `(words, tags)` tuple of parallel lists per sentence.
    """
    sentences = []
    words, tags = [], []
    for raw_line in file_:
        stripped = raw_line.strip()
        if not stripped:
            # Sentence boundary; repeated blank lines produce nothing.
            if words:
                sentences.append((words, tags))
                words, tags = [], []
            continue
        # Columns 1 and 4 hold the word form and the tag (CoNLL09).
        word, tag = stripped.split('\t')[1:5:3]
        words.append(word)
        tags.append(tag)
    # Input may end without a trailing blank line.
    if words:
        sentences.append((words, tags))
    return sentences
def score_model(score, nlp, words, gold_tags):
    """Tag `words` and record each predicted tag against its gold tag.

    score: Object with a `score_set(candidates, gold)` method.
    nlp: Pipeline providing `tokenizer.tokens_from_list` and `tagger`.
    words (list): The pre-tokenized words of one sentence.
    gold_tags (list): Gold-standard tags, one per word.
    """
    doc = nlp.tokenizer.tokens_from_list(words)
    assert len(doc) == len(gold_tags)
    nlp.tagger(doc)
    for token, expected in zip(doc, gold_tags):
        predicted = token.tag_
        score.score_set({predicted}, {expected})
def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21):
    """Train a tagger, printing train and dev accuracy after each iteration.

    Language: Language class, instantiated with `data_dir=model_dir` and
        with tagger, parser and entity recognizer switched off.
    train_sents (list): `(words, gold_tags)` pairs; shuffled in place after
        every iteration.
    dev_sents (list): `(words, gold_tags)` pairs used for evaluation.
    model_dir: Model directory; its 'pos' subdirectory is wiped and
        recreated before training.
    n_iter (int): Number of passes over the training data.
    seed (int): Seed for `random`, so shuffling is deterministic.
    """
    random.seed(seed)
    # Start from an empty 'pos' directory inside the model directory.
    pos_dir = path.join(model_dir, 'pos')
    if path.exists(pos_dir):
        shutil.rmtree(pos_dir)
    os.mkdir(pos_dir)
    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = make_tagger(nlp.vocab, default_templates())
    print("Itn.\ttrain acc %\tdev acc %")
    row = '%d:\t%6.2f\t%6.2f'
    for itn in range(n_iter):
        # One pass over the training set, accumulating the value returned
        # by the tagger's train() call for each sentence.
        n_correct = 0.
        n_words = 0.
        for words, gold_tags in train_sents:
            doc = nlp.tokenizer.tokens_from_list(words)
            n_correct += nlp.tagger.train(doc, gold_tags)
            n_words += len(words)
        train_acc = n_correct / n_words
        # Evaluate on the held-out sentences.
        dev_scores = PRFScore()
        for words, gold_tags in dev_sents:
            score_model(dev_scores, nlp, words, gold_tags)
        random.shuffle(train_sents)
        print(row % (itn, 100*train_acc, 100*dev_scores.precision))
    print('end training')
    nlp.end_training(model_dir)
    print('done')
@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15):
    """Train the German tagger (unless `eval_only`) and print dev accuracy.

    The development file is always re-read for the final evaluation, which
    loads the model from `model_dir`.
    """
    if not eval_only:
        # Both files are opened before either is read.
        with io.open(train_loc, 'r', encoding='utf8') as train_file:
            with io.open(dev_loc, 'r', encoding='utf8') as dev_file:
                train_sents = read_conll(train_file)
                dev_sents = read_conll(dev_file)
        train(German, train_sents, dev_sents, model_dir, n_iter=n_iter)
    # Final evaluation on the development data.
    with io.open(dev_loc, 'r', encoding='utf8') as dev_file:
        dev_sents = read_conll(dev_file)
    nlp = German(data_dir=model_dir)
    dev_scores = PRFScore()
    for words, gold_tags in dev_sents:
        score_model(dev_scores, nlp, words, gold_tags)
    print('POS: %6.2f %%' % (100*dev_scores.precision))


if __name__ == '__main__':
    plac.call(main)

View File

@ -24,7 +24,6 @@ MOD_NAMES = [
'spacy.vocab', 'spacy.vocab',
'spacy.attrs', 'spacy.attrs',
'spacy.morphology', 'spacy.morphology',
'spacy.tagger',
'spacy.pipeline', 'spacy.pipeline',
'spacy.syntax.stateclass', 'spacy.syntax.stateclass',
'spacy.syntax._state', 'spacy.syntax._state',

View File

@ -3,8 +3,6 @@ from __future__ import unicode_literals
from .cli.info import info as cli_info from .cli.info import info as cli_info
from .glossary import explain from .glossary import explain
from .deprecated import resolve_load_name
#from .about import __version__
from .about import __version__ from .about import __version__
from . import util from . import util

View File

@ -1,47 +1,40 @@
import ujson # coding: utf8
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU from __future__ import unicode_literals
import numpy
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
from thinc.i2v import HashEmbed, StaticVectors from thinc.i2v import HashEmbed, StaticVectors
from thinc.t2t import ExtractWindow, ParametricAttention from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool from thinc.t2v import Pooling, sum_pool
from thinc.misc import Residual from thinc.misc import Residual
from thinc.misc import BatchNorm as BN
from thinc.misc import LayerNorm as LN from thinc.misc import LayerNorm as LN
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.api import FeatureExtracter, with_getitem from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
from thinc.api import uniqued, wrap, flatten_add_lengths, noop from thinc.api import uniqued, wrap, noop
from thinc.linear.linear import LinearModel from thinc.linear.linear import LinearModel
from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module
import random
import cytoolz
from thinc import describe from thinc import describe
from thinc.describe import Dimension, Synapses, Biases, Gradient from thinc.describe import Dimension, Synapses, Biases, Gradient
from thinc.neural._classes.affine import _set_dimensions_if_needed from thinc.neural._classes.affine import _set_dimensions_if_needed
import thinc.extra.load_nlp import thinc.extra.load_nlp
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
from .tokens.doc import Doc
from . import util from . import util
import numpy
import io
# TODO: Unset this once we don't want to support models previous models.
import thinc.neural._classes.layernorm
thinc.neural._classes.layernorm.set_compat_six_eight(False)
VECTORS_KEY = 'spacy_pretrained_vectors' VECTORS_KEY = 'spacy_pretrained_vectors'
@layerize @layerize
def _flatten_add_lengths(seqs, pad=0, drop=0.): def _flatten_add_lengths(seqs, pad=0, drop=0.):
ops = Model.ops ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i') lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
def finish_update(d_X, sgd=None): def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=pad) return ops.unflatten(d_X, lengths, pad=pad)
X = ops.flatten(seqs, pad=pad) X = ops.flatten(seqs, pad=pad)
return (X, lengths), finish_update return (X, lengths), finish_update
@ -55,33 +48,14 @@ def _logistic(X, drop=0.):
X = xp.minimum(X, 10., X) X = xp.minimum(X, 10., X)
X = xp.maximum(X, -10., X) X = xp.maximum(X, -10., X)
Y = 1. / (1. + xp.exp(-X)) Y = 1. / (1. + xp.exp(-X))
def logistic_bwd(dY, sgd=None): def logistic_bwd(dY, sgd=None):
dX = dY * (Y * (1-Y)) dX = dY * (Y * (1-Y))
return dX return dX
return Y, logistic_bwd return Y, logistic_bwd
@layerize
def add_tuples(X, drop=0.):
    """Given inputs of sequence pairs, where each sequence is (vals, length),
    sum the values, returning a single sequence.

    If input is:
    ((vals1, length), (vals2, length))
    Output is:
    (vals1+vals2, length)

    vals are a single tensor for the whole batch.

    RETURNS (tuple): `((vals1+vals2, length), backprop)`, where `backprop`
        hands the incoming gradient to both inputs unchanged.
    """
    (vals1, length1), (vals2, length2) = X
    assert length1 == length2

    def add_tuples_bwd(dY, sgd=None):
        # d(a + b)/da == d(a + b)/db == 1, so both inputs get dY as-is.
        return (dY, dY)

    # The two lengths were just asserted equal, so return the first one
    # (there is no plain `length` name in scope).
    return (vals1+vals2, length1), add_tuples_bwd
def _zero_init(model): def _zero_init(model):
def _zero_init_impl(self, X, y): def _zero_init_impl(self, X, y):
self.W.fill(0) self.W.fill(0)
@ -120,8 +94,7 @@ def _init_for_precomputed(W, ops):
b=Biases("Bias vector", b=Biases("Bias vector",
lambda obj: (obj.nO,)), lambda obj: (obj.nO,)),
d_W=Gradient("W"), d_W=Gradient("W"),
d_b=Gradient("b") d_b=Gradient("b"))
)
class PrecomputableAffine(Model): class PrecomputableAffine(Model):
def __init__(self, nO=None, nI=None, nF=None, **kwargs): def __init__(self, nO=None, nI=None, nF=None, **kwargs):
Model.__init__(self, **kwargs) Model.__init__(self, **kwargs)
@ -138,6 +111,7 @@ class PrecomputableAffine(Model):
Yf = self.ops.xp.tensordot( Yf = self.ops.xp.tensordot(
X, self.W, axes=[[1], [2]]) X, self.W, axes=[[1], [2]])
Yf += self.b Yf += self.b
def backward(dY_ids, sgd=None): def backward(dY_ids, sgd=None):
tensordot = self.ops.xp.tensordot tensordot = self.ops.xp.tensordot
dY, ids = dY_ids dY, ids = dY_ids
@ -154,6 +128,7 @@ class PrecomputableAffine(Model):
if sgd is not None: if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id) sgd(self._mem.weights, self._mem.gradient, key=self.id)
return dXf return dXf
return Yf, backward return Yf, backward
@ -169,8 +144,7 @@ class PrecomputableAffine(Model):
b=Biases("Bias vector", b=Biases("Bias vector",
lambda obj: (obj.nO, obj.nP)), lambda obj: (obj.nO, obj.nP)),
d_W=Gradient("W"), d_W=Gradient("W"),
d_b=Gradient("b") d_b=Gradient("b"))
)
class PrecomputableMaxouts(Model): class PrecomputableMaxouts(Model):
def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs): def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs):
Model.__init__(self, **kwargs) Model.__init__(self, **kwargs)
@ -186,114 +160,26 @@ class PrecomputableMaxouts(Model):
# dYp: (b, o, p) # dYp: (b, o, p)
# W: (f, o, p, i) # W: (f, o, p, i)
# b: (o, p) # b: (o, p)
# bi,opfi->bfop # bi,opfi->bfop
# bop,fopi->bfi # bop,fopi->bfi
# bop,fbi->opfi : fopi # bop,fbi->opfi : fopi
tensordot = self.ops.xp.tensordot tensordot = self.ops.xp.tensordot
ascontiguous = self.ops.xp.ascontiguousarray
Yfp = tensordot(X, self.W, axes=[[1], [3]]) Yfp = tensordot(X, self.W, axes=[[1], [3]])
Yfp += self.b Yfp += self.b
def backward(dYp_ids, sgd=None): def backward(dYp_ids, sgd=None):
dYp, ids = dYp_ids dYp, ids = dYp_ids
Xf = X[ids] Xf = X[ids]
dXf = tensordot(dYp, self.W, axes=[[1, 2], [1, 2]]) dXf = tensordot(dYp, self.W, axes=[[1, 2], [1, 2]])
dW = tensordot(dYp, Xf, axes=[[0], [0]]) dW = tensordot(dYp, Xf, axes=[[0], [0]])
self.d_W += dW.transpose((2, 0, 1, 3)) self.d_W += dW.transpose((2, 0, 1, 3))
self.d_b += dYp.sum(axis=0) self.d_b += dYp.sum(axis=0)
if sgd is not None: if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id) sgd(self._mem.weights, self._mem.gradient, key=self.id)
return dXf return dXf
return Yfp, backward return Yfp, backward
# Thinc's Embed class is a bit broken atm, so drop this here.
from thinc import describe
from thinc.neural._classes.embed import _uniform_init
@describe.attributes(
    nV=describe.Dimension("Number of vectors"),
    nO=describe.Dimension("Size of output"),
    vectors=describe.Weights("Embedding table",
        lambda obj: (obj.nV, obj.nO),
        _uniform_init(-0.1, 0.1)
    ),
    d_vectors=describe.Gradient("vectors")
)
class Embed(Model):
    """Embedding layer that looks up rows of `vectors` by integer ID."""
    name = 'embed'

    def __init__(self, nO, nV=None, **kwargs):
        """Create the layer.

        nO (int): Size of output.
        nV (int): Number of vectors. When given, one extra row is added.
        **kwargs: Passed on to `Model.__init__`. `name` overrides the layer
            name; `column` (default 0) picks the ID column of 2d input.
        """
        if nV is not None:
            # NOTE(review): presumably reserves one extra table row for a
            # special ID -- confirm against callers.
            nV += 1
        Model.__init__(self, **kwargs)
        if 'name' in kwargs:
            self.name = kwargs['name']
        self.column = kwargs.get('column', 0)
        self.nO = nO
        self.nV = nV

    def _select_ids(self, ids):
        """Return 1d IDs: column `self.column` of 2d input, else `ids`."""
        if ids.ndim == 2:
            return ids[:, self.column]
        return ids

    def predict(self, ids):
        """Return the vectors for `ids` as a contiguous float array."""
        ids = self._select_ids(ids)
        return self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f')

    def begin_update(self, ids, drop=0.):
        """Look up the vectors for `ids` and return them with a callback
        that accumulates their gradient into `d_vectors`.
        """
        ids = self._select_ids(ids)
        vectors = self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f')

        def backprop_embed(d_vectors, sgd=None):
            # Rows that were looked up more than once receive the sum of
            # their gradients.
            self.ops.scatter_add(self.d_vectors, ids, d_vectors)
            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
            # The input is integer IDs, so there is no gradient to return.
            return None

        return vectors, backprop_embed
def HistoryFeatures(nr_class, hist_size=8, nr_dim=8):
    """Wrap a model, adding features representing action history.

    nr_class (int): Number of vectors requested for each embedding table.
    hist_size (int): Number of history columns to embed. With 0, a no-op
        layer is returned instead.
    nr_dim (int): Width of each history embedding.
    RETURNS: A model whose input is `(vectors, hist_ids)` and whose output
        is `vectors` with the embedded history features stacked on.
    """
    if hist_size == 0:
        return layerize(noop())
    # One table per history column; the column index is interpolated into
    # the name so each table gets its own name.
    embed_tables = [Embed(nr_dim, nr_class, column=i, name='embed%d' % i)
                    for i in range(hist_size)]
    embed = chain(concatenate(*embed_tables),
                  LN(Maxout(hist_size*nr_dim, hist_size*nr_dim)))
    ops = embed.ops

    def add_history_fwd(vectors_hists, drop=0.):
        vectors, hist_ids = vectors_hists
        hist_feats, bp_hists = embed.begin_update(hist_ids, drop=drop)
        outputs = ops.xp.hstack((vectors, hist_feats))

        def add_history_bwd(d_outputs, sgd=None):
            # Split the gradient back into the part that belongs to the
            # incoming vectors and the part for the history features.
            d_vectors = d_outputs[:, :vectors.shape[1]]
            d_hists = d_outputs[:, vectors.shape[1]:]
            bp_hists(d_hists, sgd=sgd)
            return embed.ops.xp.ascontiguousarray(d_vectors)

        return outputs, add_history_bwd

    return wrap(add_history_fwd, embed)
def drop_layer(layer, factor=2.):
    """Wrap `layer` so that it may be bypassed when dropout is active.

    With `drop <= 0` the layer always runs. Otherwise a random number is
    drawn and the layer runs only if `draw / factor >= drop`; when it is
    bypassed the input and the gradient pass through unchanged.

    layer: The layer to wrap; also installed as the wrapper's `predict`.
    factor (float): Divisor applied to the random draw.
    RETURNS: The wrapping model.
    """
    def passthrough_bwd(dX, sgd=None):
        return dX

    def drop_layer_fwd(X, drop=0.):
        run_layer = drop <= 0.
        if not run_layer:
            # Only draw a random number when dropout is actually active.
            draw = layer.ops.xp.random.random()
            run_layer = (draw / factor) >= drop
        if run_layer:
            return layer.begin_update(X, drop=drop)
        return X, passthrough_bwd

    model = wrap(drop_layer_fwd, layer)
    model.predict = layer
    return model
def link_vectors_to_models(vocab): def link_vectors_to_models(vocab):
vectors = vocab.vectors vectors = vocab.vectors
@ -308,16 +194,21 @@ def link_vectors_to_models(vocab):
# (unideal, I know) # (unideal, I know)
thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
def Tok2Vec(width, embed_size, **kwargs): def Tok2Vec(width, embed_size, **kwargs):
pretrained_dims = kwargs.get('pretrained_dims', 0) pretrained_dims = kwargs.get('pretrained_dims', 0)
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2) cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add, with Model.define_operators({'>>': chain, '|': concatenate, '**': clone,
'*': reapply}): '+': add, '*': reapply}):
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm') norm = HashEmbed(width, embed_size, column=cols.index(NORM),
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix') name='embed_norm')
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix') prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX),
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape') name='embed_prefix')
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX),
name='embed_suffix')
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
name='embed_shape')
if pretrained_dims is not None and pretrained_dims >= 1: if pretrained_dims is not None and pretrained_dims >= 1:
glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID)) glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
@ -329,7 +220,6 @@ def Tok2Vec(width, embed_size, **kwargs):
(norm | prefix | suffix | shape) (norm | prefix | suffix | shape)
>> LN(Maxout(width, width*4, pieces=3)), column=5) >> LN(Maxout(width, width*4, pieces=3)), column=5)
convolution = Residual( convolution = Residual(
ExtractWindow(nW=1) ExtractWindow(nW=1)
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces)) >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
@ -354,6 +244,7 @@ def reapply(layer, n_times):
Y, backprop = layer.begin_update(X, drop=drop) Y, backprop = layer.begin_update(X, drop=drop)
X = Y X = Y
backprops.append(backprop) backprops.append(backprop)
def reapply_bwd(dY, sgd=None): def reapply_bwd(dY, sgd=None):
dX = None dX = None
for backprop in reversed(backprops): for backprop in reversed(backprops):
@ -363,39 +254,20 @@ def reapply(layer, n_times):
else: else:
dX += dY dX += dY
return dX return dX
return Y, reapply_bwd return Y, reapply_bwd
return wrap(reapply_fwd, layer) return wrap(reapply_fwd, layer)
def asarray(ops, dtype): def asarray(ops, dtype):
def forward(X, drop=0.): def forward(X, drop=0.):
return ops.asarray(X, dtype=dtype), None return ops.asarray(X, dtype=dtype), None
return layerize(forward) return layerize(forward)
def foreach(layer):
    """Build a model that applies `layer` to every element of a sequence.

    layer: The layer to map; appended to the new model's `_layers`.
    RETURNS: A model whose forward pass returns the list of per-item
        results and whose backward pass returns the per-item gradients.
    """
    def forward(Xs, drop=0.):
        updates = [layer.begin_update(X, drop=drop) for X in Xs]
        results = [result for result, _ in updates]
        backprops = [bp for _, bp in updates]

        def backward(d_results, sgd=None):
            return [backprop(d_result, sgd)
                    for d_result, backprop in zip(d_results, backprops)]

        return results, backward

    model = layerize(forward)
    model._layers.append(layer)
    return model
def rebatch(size, layer): def rebatch(size, layer):
ops = layer.ops ops = layer.ops
def forward(X, drop=0.): def forward(X, drop=0.):
if X.shape[0] < size: if X.shape[0] < size:
return layer.begin_update(X) return layer.begin_update(X)
@ -403,6 +275,7 @@ def rebatch(size, layer):
results, bp_results = zip(*[layer.begin_update(p, drop=drop) results, bp_results = zip(*[layer.begin_update(p, drop=drop)
for p in parts]) for p in parts])
y = ops.flatten(results) y = ops.flatten(results)
def backward(dy, sgd=None): def backward(dy, sgd=None):
d_parts = [bp(y, sgd=sgd) for bp, y in d_parts = [bp(y, sgd=sgd) for bp, y in
zip(bp_results, _divide_array(dy, size))] zip(bp_results, _divide_array(dy, size))]
@ -413,6 +286,7 @@ def rebatch(size, layer):
except ValueError: except ValueError:
dX = None dX = None
return dX return dX
return y, backward return y, backward
model = layerize(forward) model = layerize(forward)
model._layers.append(layer) model._layers.append(layer)
@ -430,6 +304,7 @@ def _divide_array(X, size):
def get_col(idx): def get_col(idx):
assert idx >= 0, idx assert idx >= 0, idx
def forward(X, drop=0.): def forward(X, drop=0.):
assert idx >= 0, idx assert idx >= 0, idx
if isinstance(X, numpy.ndarray): if isinstance(X, numpy.ndarray):
@ -437,30 +312,28 @@ def get_col(idx):
else: else:
ops = CupyOps() ops = CupyOps()
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype) output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
def backward(y, sgd=None): def backward(y, sgd=None):
assert idx >= 0, idx assert idx >= 0, idx
dX = ops.allocate(X.shape) dX = ops.allocate(X.shape)
dX[:, idx] += y dX[:, idx] += y
return dX return dX
return output, backward return output, backward
return layerize(forward) return layerize(forward)
def zero_init(model):
    """Register a data hook on `model` that fills its `W` with zeros.

    model: Object with an `on_data_hooks` list and a `W` weights array.
    RETURNS: The same `model`, with the hook appended.
    """
    def _fill_zeros(self, X, y=None):
        self.W.fill(0)

    hooks = model.on_data_hooks
    hooks.append(_fill_zeros)
    return model
def doc2feats(cols=None): def doc2feats(cols=None):
if cols is None: if cols is None:
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
def forward(docs, drop=0.): def forward(docs, drop=0.):
feats = [] feats = []
for doc in docs: for doc in docs:
feats.append(doc.to_array(cols)) feats.append(doc.to_array(cols))
return feats, None return feats, None
model = layerize(forward) model = layerize(forward)
model.cols = cols model.cols = cols
return model return model
@ -474,28 +347,14 @@ def print_shape(prefix):
@layerize @layerize
def get_token_vectors(tokens_attrs_vectors, drop=0.): def get_token_vectors(tokens_attrs_vectors, drop=0.):
ops = Model.ops
tokens, attrs, vectors = tokens_attrs_vectors tokens, attrs, vectors = tokens_attrs_vectors
def backward(d_output, sgd=None): def backward(d_output, sgd=None):
return (tokens, d_output) return (tokens, d_output)
return vectors, backward return vectors, backward
@layerize
def flatten(seqs, drop=0.):
    """Stack a list of arrays into one array, with a callback to undo it.

    The ops backend is picked from the type of the first sequence (numpy
    or cupy array); any other type raises ValueError.

    seqs (list): Arrays to stack with `vstack`.
    RETURNS (tuple): `(X, finish_update)`, where `finish_update` splits a
        gradient back into pieces of the original lengths.
    """
    first = seqs[0]
    cupy_array = getattr(CupyOps.xp, 'ndarray', None)
    if isinstance(first, numpy.ndarray):
        ops = NumpyOps()
    elif hasattr(CupyOps.xp, 'ndarray') and isinstance(first, cupy_array):
        ops = CupyOps()
    else:
        raise ValueError("Unable to flatten sequence of type %s" % type(first))
    lengths = [len(seq) for seq in seqs]
    X = ops.xp.vstack(seqs)

    def finish_update(d_X, sgd=None):
        return ops.unflatten(d_X, lengths)

    return X, finish_update
@layerize @layerize
def logistic(X, drop=0.): def logistic(X, drop=0.):
xp = get_array_module(X) xp = get_array_module(X)
@ -505,9 +364,11 @@ def logistic(X, drop=0.):
X = xp.minimum(X, 10., X) X = xp.minimum(X, 10., X)
X = xp.maximum(X, -10., X) X = xp.maximum(X, -10., X)
Y = 1. / (1. + xp.exp(-X)) Y = 1. / (1. + xp.exp(-X))
def logistic_bwd(dY, sgd=None): def logistic_bwd(dY, sgd=None):
dX = dY * (Y * (1-Y)) dX = dY * (Y * (1-Y))
return dX return dX
return Y, logistic_bwd return Y, logistic_bwd
@ -517,6 +378,7 @@ def zero_init(model):
model.on_data_hooks.append(_zero_init_impl) model.on_data_hooks.append(_zero_init_impl)
return model return model
@layerize @layerize
def preprocess_doc(docs, drop=0.): def preprocess_doc(docs, drop=0.):
keys = [doc.to_array([LOWER]) for doc in docs] keys = [doc.to_array([LOWER]) for doc in docs]
@ -526,11 +388,13 @@ def preprocess_doc(docs, drop=0.):
vals = ops.allocate(keys.shape[0]) + 1 vals = ops.allocate(keys.shape[0]) + 1
return (keys, vals, lengths), None return (keys, vals, lengths), None
def getitem(i): def getitem(i):
def getitem_fwd(X, drop=0.): def getitem_fwd(X, drop=0.):
return X[i], None return X[i], None
return layerize(getitem_fwd) return layerize(getitem_fwd)
def build_tagger_model(nr_class, **cfg): def build_tagger_model(nr_class, **cfg):
embed_size = util.env_opt('embed_size', 7000) embed_size = util.env_opt('embed_size', 7000)
if 'token_vector_width' in cfg: if 'token_vector_width' in cfg:
@ -555,8 +419,6 @@ def build_tagger_model(nr_class, **cfg):
@layerize @layerize
def SpacyVectors(docs, drop=0.): def SpacyVectors(docs, drop=0.):
xp = get_array_module(docs[0].vocab.vectors.data)
width = docs[0].vocab.vectors.data.shape[1]
batch = [] batch = []
for doc in docs: for doc in docs:
indices = numpy.zeros((len(doc),), dtype='i') indices = numpy.zeros((len(doc),), dtype='i')
@ -570,29 +432,6 @@ def SpacyVectors(docs, drop=0.):
return batch, None return batch, None
def foreach(layer, drop_factor=1.0):
    """Map a layer across elements in a list.

    layer: The layer applied to every element.
    drop_factor (float): Multiplier applied to the dropout rate before it
        is passed to `layer`.
    RETURNS: A model whose forward pass returns the list of per-element
        outputs, and whose backward pass returns the per-element input
        gradients (None where no gradient or callback is available).
    """
    def foreach_fwd(Xs, drop=0.):
        drop *= drop_factor
        ys = []
        backprops = []
        for X in Xs:
            y, bp_y = layer.begin_update(X, drop=drop)
            ys.append(y)
            backprops.append(bp_y)

        def foreach_bwd(d_ys, sgd=None):
            d_Xs = []
            for d_y, bp_y in zip(d_ys, backprops):
                # Run the element's callback on its gradient; skip elements
                # that have no gradient or no callback.
                if d_y is not None and bp_y is not None:
                    d_Xs.append(bp_y(d_y, sgd=sgd))
                else:
                    d_Xs.append(None)
            return d_Xs

        return ys, foreach_bwd

    model = wrap(foreach_fwd, layer)
    return model
def build_text_classifier(nr_class, width=64, **cfg): def build_text_classifier(nr_class, width=64, **cfg):
nr_vector = cfg.get('nr_vector', 5000) nr_vector = cfg.get('nr_vector', 5000)
pretrained_dims = cfg.get('pretrained_dims', 0) pretrained_dims = cfg.get('pretrained_dims', 0)
@ -602,9 +441,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
model = ( model = (
SpacyVectors SpacyVectors
>> flatten_add_lengths >> flatten_add_lengths
>> with_getitem(0, >> with_getitem(0, Affine(width, pretrained_dims))
Affine(width, pretrained_dims)
)
>> ParametricAttention(width) >> ParametricAttention(width)
>> Pooling(sum_pool) >> Pooling(sum_pool)
>> Residual(ReLu(width, width)) ** 2 >> Residual(ReLu(width, width)) ** 2
@ -613,7 +450,6 @@ def build_text_classifier(nr_class, width=64, **cfg):
) )
return model return model
lower = HashEmbed(width, nr_vector, column=1) lower = HashEmbed(width, nr_vector, column=1)
prefix = HashEmbed(width//2, nr_vector, column=2) prefix = HashEmbed(width//2, nr_vector, column=2)
suffix = HashEmbed(width//2, nr_vector, column=3) suffix = HashEmbed(width//2, nr_vector, column=3)
@ -671,33 +507,40 @@ def build_text_classifier(nr_class, width=64, **cfg):
model.lsuv = False model.lsuv = False
return model return model
@layerize @layerize
def flatten(seqs, drop=0.): def flatten(seqs, drop=0.):
ops = Model.ops ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i') lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
def finish_update(d_X, sgd=None): def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=0) return ops.unflatten(d_X, lengths, pad=0)
X = ops.flatten(seqs, pad=0) X = ops.flatten(seqs, pad=0)
return X, finish_update return X, finish_update
def concatenate_lists(*layers, **kwargs): # pragma: no cover def concatenate_lists(*layers, **kwargs): # pragma: no cover
'''Compose two or more models `f`, `g`, etc, such that their outputs are """Compose two or more models `f`, `g`, etc, such that their outputs are
concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))` concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
''' """
if not layers: if not layers:
return noop() return noop()
drop_factor = kwargs.get('drop_factor', 1.0) drop_factor = kwargs.get('drop_factor', 1.0)
ops = layers[0].ops ops = layers[0].ops
layers = [chain(layer, flatten) for layer in layers] layers = [chain(layer, flatten) for layer in layers]
concat = concatenate(*layers) concat = concatenate(*layers)
def concatenate_lists_fwd(Xs, drop=0.): def concatenate_lists_fwd(Xs, drop=0.):
drop *= drop_factor drop *= drop_factor
lengths = ops.asarray([len(X) for X in Xs], dtype='i') lengths = ops.asarray([len(X) for X in Xs], dtype='i')
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop) flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
ys = ops.unflatten(flat_y, lengths) ys = ops.unflatten(flat_y, lengths)
def concatenate_lists_bwd(d_ys, sgd=None): def concatenate_lists_bwd(d_ys, sgd=None):
return bp_flat_y(ops.flatten(d_ys), sgd=sgd) return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
return ys, concatenate_lists_bwd return ys, concatenate_lists_bwd
model = wrap(concatenate_lists_fwd, concat) model = wrap(concatenate_lists_fwd, concat)
return model return model

View File

@ -101,17 +101,12 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
""" """
Normalize a dictionary of attributes, converting them to ints. Normalize a dictionary of attributes, converting them to ints.
Arguments: stringy_attrs (dict): Dictionary keyed by attribute string names. Values
stringy_attrs (dict): can be ints or strings.
Dictionary keyed by attribute string names. Values can be ints or strings. strings_map (StringStore): Defaults to None. If provided, encodes string
values into ints.
strings_map (StringStore): RETURNS (dict): Attributes dictionary with keys and optionally values
Defaults to None. If provided, encodes string values into ints. converted to ints.
Returns:
inty_attrs (dict):
Attributes dictionary with keys and optionally values converted to
ints.
""" """
inty_attrs = {} inty_attrs = {}
if _do_deprecated: if _do_deprecated:

View File

@ -7,10 +7,9 @@ from pathlib import Path
from .converters import conllu2json, iob2json, conll_ner2json from .converters import conllu2json, iob2json, conll_ner2json
from ..util import prints from ..util import prints
# Converters are matched by file extension. To add a converter, add a new entry # Converters are matched by file extension. To add a converter, add a new
# to this dict with the file extension mapped to the converter function imported # entry to this dict with the file extension mapped to the converter function
# from /converters. # imported from /converters.
CONVERTERS = { CONVERTERS = {
'conllu': conllu2json, 'conllu': conllu2json,
'conll': conllu2json, 'conll': conllu2json,
@ -24,8 +23,7 @@ CONVERTERS = {
output_dir=("output directory for converted file", "positional", None, str), output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", int), n_sents=("Number of sentences per doc", "option", "n", int),
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str), converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool) morphology=("Enable appending morphology to tags", "flag", "m", bool))
)
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False, def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
converter='auto'): converter='auto'):
""" """
@ -40,7 +38,7 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
prints(output_path, title="Output directory not found", exits=1) prints(output_path, title="Output directory not found", exits=1)
if converter == 'auto': if converter == 'auto':
converter = input_path.suffix[1:] converter = input_path.suffix[1:]
if not converter in CONVERTERS: if converter not in CONVERTERS:
prints("Can't find converter for %s" % converter, prints("Can't find converter for %s" % converter,
title="Unknown format", exits=1) title="Unknown format", exits=1)
func = CONVERTERS[converter] func = CONVERTERS[converter]

View File

@ -8,7 +8,8 @@ from ...gold import iob_to_biluo
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False): def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
""" """
Convert files in the CoNLL-2003 NER format into JSON format for use with train cli. Convert files in the CoNLL-2003 NER format into JSON format for use with
train cli.
""" """
docs = read_conll_ner(input_path) docs = read_conll_ner(input_path)

View File

@ -13,10 +13,9 @@ from .. import about
@plac.annotations( @plac.annotations(
model=("model to download (shortcut or model name)", "positional", None, str), model=("model to download, shortcut or name)", "positional", None, str),
direct=("force direct download. Needs model name with version and won't " direct=("force direct download. Needs model name with version and won't "
"perform compatibility check", "flag", "d", bool) "perform compatibility check", "flag", "d", bool))
)
def download(cmd, model, direct=False): def download(cmd, model, direct=False):
""" """
Download compatible model from default download path using pip. Model Download compatible model from default download path using pip. Model
@ -30,21 +29,25 @@ def download(cmd, model, direct=False):
model_name = shortcuts.get(model, model) model_name = shortcuts.get(model, model)
compatibility = get_compatibility() compatibility = get_compatibility()
version = get_version(model_name, compatibility) version = get_version(model_name, compatibility)
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version)) dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name,
v=version))
if dl == 0: if dl == 0:
try: try:
# Get package path here because link uses # Get package path here because link uses
# pip.get_installed_distributions() to check if model is a package, # pip.get_installed_distributions() to check if model is a
# which fails if model was just installed via subprocess # package, which fails if model was just installed via
# subprocess
package_path = get_package_path(model_name) package_path = get_package_path(model_name)
link(None, model_name, model, force=True, model_path=package_path) link(None, model_name, model, force=True,
model_path=package_path)
except: except:
# Dirty, but since spacy.download and the auto-linking is mostly # Dirty, but since spacy.download and the auto-linking is
# a convenience wrapper, it's best to show a success message and # mostly a convenience wrapper, it's best to show a success
# loading instructions, even if linking fails. # message and loading instructions, even if linking fails.
prints("Creating a shortcut link for 'en' didn't work (maybe you " prints(
"don't have admin permissions?), but you can still load " "Creating a shortcut link for 'en' didn't work (maybe "
"the model via its full package name:", "you don't have admin permissions?), but you can still "
"load the model via its full package name:",
"nlp = spacy.load('%s')" % model_name, "nlp = spacy.load('%s')" % model_name,
title="Download successful") title="Download successful")
@ -52,9 +55,10 @@ def download(cmd, model, direct=False):
def get_json(url, desc): def get_json(url, desc):
r = requests.get(url) r = requests.get(url)
if r.status_code != 200: if r.status_code != 200:
prints("Couldn't fetch %s. Please find a model for your spaCy installation " msg = ("Couldn't fetch %s. Please find a model for your spaCy "
"(v%s), and download it manually." % (desc, about.__version__), "installation (v%s), and download it manually.")
about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1) prints(msg % (desc, about.__version__), about.__docs_models__,
title="Server error (%d)" % r.status_code, exits=1)
return r.json() return r.json()
@ -71,13 +75,13 @@ def get_compatibility():
def get_version(model, comp): def get_version(model, comp):
if model not in comp: if model not in comp:
version = about.__version__ version = about.__version__
prints("No compatible model found for '%s' (spaCy v%s)." % (model, version), msg = "No compatible model found for '%s' (spaCy v%s)."
title="Compatibility error", exits=1) prints(msg % (model, version), title="Compatibility error", exits=1)
return comp[model][0] return comp[model][0]
def download_model(filename): def download_model(filename):
download_url = about.__download_url__ + '/' + filename download_url = about.__download_url__ + '/' + filename
return subprocess.call([sys.executable, '-m', return subprocess.call(
'pip', 'install', '--no-cache-dir', download_url], [sys.executable, '-m', 'pip', 'install', '--no-cache-dir',
env=os.environ.copy()) download_url], env=os.environ.copy())

View File

@ -2,27 +2,15 @@
from __future__ import unicode_literals, division, print_function from __future__ import unicode_literals, division, print_function
import plac import plac
import json
from collections import defaultdict
import cytoolz
from pathlib import Path
import dill
import tqdm
from thinc.neural._classes.model import Model
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer from timeit import default_timer as timer
import random import random
import numpy.random import numpy.random
from ..tokens.doc import Doc from ..gold import GoldCorpus
from ..scorer import Scorer
from ..gold import GoldParse, merge_sents
from ..gold import GoldCorpus, minibatch
from ..util import prints from ..util import prints
from .. import util from .. import util
from .. import about
from .. import displacy from .. import displacy
from ..compat import json_dumps
random.seed(0) random.seed(0)
numpy.random.seed(0) numpy.random.seed(0)
@ -30,17 +18,18 @@ numpy.random.seed(0)
@plac.annotations( @plac.annotations(
model=("Model name or path", "positional", None, str), model=("Model name or path", "positional", None, str),
data_path=("Location of JSON-formatted evaluation data", "positional", None, str), data_path=("Location of JSON-formatted evaluation data", "positional",
None, str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool), gold_preproc=("Use gold preprocessing", "flag", "G", bool),
gpu_id=("Use GPU", "option", "g", int), gpu_id=("Use GPU", "option", "g", int),
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str), displacy_path=("Directory to output rendered parses as HTML", "option",
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int) "dp", str),
) displacy_limit=("Limit of parses to render as HTML", "option", "dl", int))
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False, def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
displacy_path=None, displacy_limit=25): displacy_path=None, displacy_limit=25):
""" """
Evaluate a model. To render a sample of parses in a HTML file, set an output Evaluate a model. To render a sample of parses in a HTML file, set an
directory as the displacy_path argument. output directory as the displacy_path argument.
""" """
if gpu_id >= 0: if gpu_id >= 0:
util.use_gpu(gpu_id) util.use_gpu(gpu_id)
@ -50,7 +39,8 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
if not data_path.exists(): if not data_path.exists():
prints(data_path, title="Evaluation data not found", exits=1) prints(data_path, title="Evaluation data not found", exits=1)
if displacy_path and not displacy_path.exists(): if displacy_path and not displacy_path.exists():
prints(displacy_path, title="Visualization output directory not found", exits=1) prints(displacy_path, title="Visualization output directory not found",
exits=1)
corpus = GoldCorpus(data_path, data_path) corpus = GoldCorpus(data_path, data_path)
nlp = util.load_model(model) nlp = util.load_model(model)
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
@ -64,12 +54,14 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
docs, golds = zip(*dev_docs) docs, golds = zip(*dev_docs)
render_deps = 'parser' in nlp.meta.get('pipeline', []) render_deps = 'parser' in nlp.meta.get('pipeline', [])
render_ents = 'ner' in nlp.meta.get('pipeline', []) render_ents = 'ner' in nlp.meta.get('pipeline', [])
render_parses(docs, displacy_path, model_name=model, limit=displacy_limit, render_parses(docs, displacy_path, model_name=model,
deps=render_deps, ents=render_ents) limit=displacy_limit, deps=render_deps, ents=render_ents)
prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit) msg = "Generated %s parses as HTML" % displacy_limit
prints(displacy_path, title=msg)
def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True): def render_parses(docs, output_path, model_name='', limit=250, deps=True,
ents=True):
docs[0].user_data['title'] = model_name docs[0].user_data['title'] = model_name
if ents: if ents:
with (output_path / 'entities.html').open('w') as file_: with (output_path / 'entities.html').open('w') as file_:
@ -77,7 +69,8 @@ def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=T
file_.write(html) file_.write(html)
if deps: if deps:
with (output_path / 'parses.html').open('w') as file_: with (output_path / 'parses.html').open('w') as file_:
html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True}) html = displacy.render(docs[:limit], style='dep', page=True,
options={'compact': True})
file_.write(html) file_.write(html)

View File

@ -12,8 +12,7 @@ from .. import util
@plac.annotations( @plac.annotations(
model=("optional: shortcut link of model", "positional", None, str), model=("optional: shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str) markdown=("generate Markdown for GitHub issues", "flag", "md", str))
)
def info(cmd, model=None, markdown=False): def info(cmd, model=None, markdown=False):
"""Print info about spaCy installation. If a model shortcut link is """Print info about spaCy installation. If a model shortcut link is
speficied as an argument, print model information. Flag --markdown speficied as an argument, print model information. Flag --markdown

View File

@ -12,8 +12,7 @@ from .. import util
@plac.annotations( @plac.annotations(
origin=("package name or local path to model", "positional", None, str), origin=("package name or local path to model", "positional", None, str),
link_name=("name of shortuct link to create", "positional", None, str), link_name=("name of shortuct link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool) force=("force overwriting of existing link", "flag", "f", bool))
)
def link(cmd, origin, link_name, force=False, model_path=None): def link(cmd, origin, link_name, force=False, model_path=None):
""" """
Create a symlink for models within the spacy/data directory. Accepts Create a symlink for models within the spacy/data directory. Accepts
@ -46,8 +45,9 @@ def link(cmd, origin, link_name, force=False, model_path=None):
# This is quite dirty, but just making sure other errors are caught. # This is quite dirty, but just making sure other errors are caught.
prints("Creating a symlink in spacy/data failed. Make sure you have " prints("Creating a symlink in spacy/data failed. Make sure you have "
"the required permissions and try re-running the command as " "the required permissions and try re-running the command as "
"admin, or use a virtualenv. You can still import the model as a " "admin, or use a virtualenv. You can still import the model as "
"module and call its load() method, or create the symlink manually.", "a module and call its load() method, or create the symlink "
"manually.",
"%s --> %s" % (path2str(model_path), path2str(link_path)), "%s --> %s" % (path2str(model_path), path2str(link_path)),
title="Error: Couldn't link model to '%s'" % link_name) title="Error: Couldn't link model to '%s'" % link_name)
raise raise

View File

@ -16,10 +16,12 @@ from .. import about
input_dir=("directory with model data", "positional", None, str), input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str), output_dir=("output parent directory", "positional", None, str),
meta_path=("path to meta.json", "option", "m", str), meta_path=("path to meta.json", "option", "m", str),
create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool), create_meta=("create meta.json, even if one exists in directory", "flag",
force=("force overwriting of existing folder in output directory", "flag", "f", bool) "c", bool),
) force=("force overwriting of existing folder in output directory", "flag",
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False): "f", bool))
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
force=False):
""" """
Generate Python package for model data, including meta and required Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified installation files. A new directory will be created in the specified
@ -52,13 +54,15 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force
package_path = main_path / model_name package_path = main_path / model_name
create_dirs(package_path, force) create_dirs(package_path, force)
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v)) shutil.copytree(path2str(input_path),
path2str(package_path / model_name_v))
create_file(main_path / 'meta.json', json_dumps(meta)) create_file(main_path / 'meta.json', json_dumps(meta))
create_file(main_path / 'setup.py', template_setup) create_file(main_path / 'setup.py', template_setup)
create_file(main_path / 'MANIFEST.in', template_manifest) create_file(main_path / 'MANIFEST.in', template_manifest)
create_file(package_path / '__init__.py', template_init) create_file(package_path / '__init__.py', template_init)
prints(main_path, "To build the package, run `python setup.py sdist` in this " prints(main_path, "To build the package, run `python setup.py sdist` in "
"directory.", title="Successfully created package '%s'" % model_name_v) "this directory.",
title="Successfully created package '%s'" % model_name_v)
def create_dirs(package_path, force): def create_dirs(package_path, force):
@ -66,9 +70,10 @@ def create_dirs(package_path, force):
if force: if force:
shutil.rmtree(path2str(package_path)) shutil.rmtree(path2str(package_path))
else: else:
prints(package_path, "Please delete the directory and try again, or " prints(package_path, "Please delete the directory and try again, "
"use the --force flag to overwrite existing directories.", "or use the --force flag to overwrite existing "
title="Package directory already exists", exits=1) "directories.", title="Package directory already exists",
exits=1)
Path.mkdir(package_path, parents=True) Path.mkdir(package_path, parents=True)
@ -82,7 +87,8 @@ def generate_meta(model_path):
settings = [('lang', 'Model language', 'en'), settings = [('lang', 'Model language', 'en'),
('name', 'Model name', 'model'), ('name', 'Model name', 'model'),
('version', 'Model version', '0.0.0'), ('version', 'Model version', '0.0.0'),
('spacy_version', 'Required spaCy version', '>=%s,<3.0.0' % about.__version__), ('spacy_version', 'Required spaCy version',
'>=%s,<3.0.0' % about.__version__),
('description', 'Model description', False), ('description', 'Model description', False),
('author', 'Author', False), ('author', 'Author', False),
('email', 'Author email', False), ('email', 'Author email', False),

View File

@ -27,15 +27,15 @@ def read_inputs(loc):
@plac.annotations( @plac.annotations(
lang=("model/language", "positional", None, str), lang=("model/language", "positional", None, str),
inputs=("Location of input file", "positional", None, read_inputs) inputs=("Location of input file", "positional", None, read_inputs))
)
def profile(cmd, lang, inputs=None): def profile(cmd, lang, inputs=None):
""" """
Profile a spaCy pipeline, to find out which functions take the most time. Profile a spaCy pipeline, to find out which functions take the most time.
""" """
nlp = spacy.load(lang) nlp = spacy.load(lang)
texts = list(cytoolz.take(10000, inputs)) texts = list(cytoolz.take(10000, inputs))
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof") cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
"Profile.prof")
s = pstats.Stats("Profile.prof") s = pstats.Stats("Profile.prof")
s.strip_dirs().sort_stats("time").print_stats() s.strip_dirs().sort_stats("time").print_stats()

View File

@ -2,21 +2,14 @@
from __future__ import unicode_literals, division, print_function from __future__ import unicode_literals, division, print_function
import plac import plac
import json
from collections import defaultdict
import cytoolz
from pathlib import Path from pathlib import Path
import dill import dill
import tqdm import tqdm
from thinc.neural._classes.model import Model from thinc.neural._classes.model import Model
from thinc.neural.optimizers import linear_decay
from timeit import default_timer as timer from timeit import default_timer as timer
import random import random
import numpy.random import numpy.random
from ..tokens.doc import Doc
from ..scorer import Scorer
from ..gold import GoldParse, merge_sents
from ..gold import GoldCorpus, minibatch from ..gold import GoldCorpus, minibatch
from ..util import prints from ..util import prints
from .. import util from .. import util
@ -31,8 +24,10 @@ numpy.random.seed(0)
@plac.annotations( @plac.annotations(
lang=("model language", "positional", None, str), lang=("model language", "positional", None, str),
output_dir=("output directory to store model in", "positional", None, str), output_dir=("output directory to store model in", "positional", None, str),
train_data=("location of JSON-formatted training data", "positional", None, str), train_data=("location of JSON-formatted training data", "positional",
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str), None, str),
dev_data=("location of JSON-formatted development data (optional)",
"positional", None, str),
n_iter=("number of iterations", "option", "n", int), n_iter=("number of iterations", "option", "n", int),
n_sents=("number of sentences", "option", "ns", int), n_sents=("number of sentences", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int), use_gpu=("Use GPU", "option", "g", int),
@ -42,11 +37,12 @@ numpy.random.seed(0)
no_entities=("Don't train NER", "flag", "N", bool), no_entities=("Don't train NER", "flag", "N", bool),
gold_preproc=("Use gold preprocessing", "flag", "G", bool), gold_preproc=("Use gold preprocessing", "flag", "G", bool),
version=("Model version", "option", "V", str), version=("Model version", "option", "V", str),
meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path) meta_path=("Optional path to meta.json. All relevant properties will be "
) "overwritten.", "option", "m", Path))
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False, use_gpu=-1, vectors=None, no_tagger=False, no_parser=False,
gold_preproc=False, version="0.0.0", meta_path=None): no_entities=False, gold_preproc=False, version="0.0.0",
meta_path=None):
""" """
Train a model. Expects data in spaCy's JSON format. Train a model. Expects data in spaCy's JSON format.
""" """
@ -72,9 +68,12 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
meta.setdefault('name', 'unnamed') meta.setdefault('name', 'unnamed')
pipeline = ['tagger', 'parser', 'ner'] pipeline = ['tagger', 'parser', 'ner']
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger') if no_tagger and 'tagger' in pipeline:
if no_parser and 'parser' in pipeline: pipeline.remove('parser') pipeline.remove('tagger')
if no_entities and 'ner' in pipeline: pipeline.remove('ner') if no_parser and 'parser' in pipeline:
pipeline.remove('parser')
if no_entities and 'ner' in pipeline:
pipeline.remove('ner')
# Take dropout and batch size as generators of values -- dropout # Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore. # starts high and decays sharply, to force the optimizer to explore.
@ -157,7 +156,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
with meta_loc.open('w') as file_: with meta_loc.open('w') as file_:
file_.write(json_dumps(meta)) file_.write(json_dumps(meta))
util.set_env_log(True) util.set_env_log(True)
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps) print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
gpu_wps=gpu_wps)
finally: finally:
print("Saving model...") print("Saving model...")
try: try:

View File

@ -1,5 +1,5 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals, print_function
import requests import requests
import pkg_resources import pkg_resources
@ -29,8 +29,10 @@ def validate(cmd):
model_links = get_model_links(current_compat) model_links = get_model_links(current_compat)
model_pkgs = get_model_pkgs(current_compat, all_models) model_pkgs = get_model_pkgs(current_compat, all_models)
incompat_links = {l for l, d in model_links.items() if not d['compat']} incompat_links = {l for l, d in model_links.items() if not d['compat']}
incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']} incompat_models = {d['name'] for _, d in model_pkgs.items()
incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']]) if not d['compat']}
incompat_models.update([d['name'] for _, d in model_links.items()
if not d['compat']])
na_models = [m for m in incompat_models if m not in current_compat] na_models = [m for m in incompat_models if m not in current_compat]
update_models = [m for m in incompat_models if m in current_compat] update_models = [m for m in incompat_models if m in current_compat]
@ -90,7 +92,6 @@ def get_model_pkgs(compat, all_models):
def get_model_row(compat, name, data, type='package'): def get_model_row(compat, name, data, type='package'):
tpl_row = ' {:<10}' + (' {:<20}' * 4)
tpl_red = '\x1b[38;5;1m{}\x1b[0m' tpl_red = '\x1b[38;5;1m{}\x1b[0m'
tpl_green = '\x1b[38;5;2m{}\x1b[0m' tpl_green = '\x1b[38;5;2m{}\x1b[0m'
if data['compat']: if data['compat']:
@ -110,7 +111,8 @@ def get_row(*args):
def is_model_path(model_path): def is_model_path(model_path):
exclude = ['cache', 'pycache', '__pycache__'] exclude = ['cache', 'pycache', '__pycache__']
name = model_path.parts[-1] name = model_path.parts[-1]
return model_path.is_dir() and name not in exclude and not name.startswith('.') return (model_path.is_dir() and name not in exclude
and not name.startswith('.'))
def is_compat(compat, name, version): def is_compat(compat, name, version):
@ -118,6 +120,7 @@ def is_compat(compat, name, version):
def reformat_version(version): def reformat_version(version):
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
if version.endswith('-alpha'): if version.endswith('-alpha'):
return version.replace('-alpha', 'a0') return version.replace('-alpha', 'a0')
return version.replace('-alpha', 'a') return version.replace('-alpha', 'a')

View File

@ -87,15 +87,15 @@ def symlink_to(orig, dest):
def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
return ((python2 == None or python2 == is_python2) and return ((python2 is None or python2 == is_python2) and
(python3 == None or python3 == is_python3) and (python3 is None or python3 == is_python3) and
(windows == None or windows == is_windows) and (windows is None or windows == is_windows) and
(linux == None or linux == is_linux) and (linux is None or linux == is_linux) and
(osx == None or osx == is_osx)) (osx is None or osx == is_osx))
def normalize_string_keys(old): def normalize_string_keys(old):
'''Given a dictionary, make sure keys are unicode strings, not bytes.''' """Given a dictionary, make sure keys are unicode strings, not bytes."""
new = {} new = {}
for key, value in old.items(): for key, value in old.items():
if isinstance(key, bytes_): if isinstance(key, bytes_):

View File

@ -24,7 +24,7 @@ def depr_model_download(lang):
def resolve_load_name(name, **overrides): def resolve_load_name(name, **overrides):
"""Resolve model loading if deprecated path kwarg is specified in overrides. """Resolve model loading if deprecated path kwarg in overrides.
name (unicode): Name of model to load. name (unicode): Name of model to load.
**overrides: Overrides specified in spacy.load(). **overrides: Overrides specified in spacy.load().
@ -32,8 +32,9 @@ def resolve_load_name(name, **overrides):
""" """
if overrides.get('path') not in (None, False, True): if overrides.get('path') not in (None, False, True):
name = overrides.get('path') name = overrides.get('path')
prints("To load a model from a path, you can now use the first argument. " prints("To load a model from a path, you can now use the first "
"The model meta is used to load the required Language class.", "argument. The model meta is used to load the Language class.",
"OLD: spacy.load('en', path='/some/path')", "NEW: spacy.load('/some/path')", "OLD: spacy.load('en', path='/some/path')",
"NEW: spacy.load('/some/path')",
title="Warning: deprecated argument 'path'") title="Warning: deprecated argument 'path'")
return name return name

View File

@ -21,7 +21,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
minify (bool): Minify HTML markup. minify (bool): Minify HTML markup.
jupyter (bool): Experimental, use Jupyter's `display()` to output markup. jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
options (dict): Visualiser-specific options, e.g. colors. options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
RETURNS (unicode): Rendered HTML markup. RETURNS (unicode): Rendered HTML markup.
""" """
factories = {'dep': (DependencyRenderer, parse_deps), factories = {'dep': (DependencyRenderer, parse_deps),
@ -50,13 +50,15 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
page (bool): Render markup as full HTML page. page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup. minify (bool): Minify HTML markup.
options (dict): Visualiser-specific options, e.g. colors. options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
port (int): Port to serve visualisation. port (int): Port to serve visualisation.
""" """
from wsgiref import simple_server from wsgiref import simple_server
render(docs, style=style, page=page, minify=minify, options=options, manual=manual) render(docs, style=style, page=page, minify=minify, options=options,
manual=manual)
httpd = simple_server.make_server('0.0.0.0', port, app) httpd = simple_server.make_server('0.0.0.0', port, app)
prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port) prints("Using the '%s' visualizer" % style,
title="Serving on port %d..." % port)
try: try:
httpd.serve_forever() httpd.serve_forever()
except KeyboardInterrupt: except KeyboardInterrupt:
@ -67,7 +69,8 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
def app(environ, start_response): def app(environ, start_response):
# headers and status need to be bytes in Python 2, see #1227 # headers and status need to be bytes in Python 2, see #1227
headers = [(b_to_str(b'Content-type'), b_to_str(b'text/html; charset=utf-8'))] headers = [(b_to_str(b'Content-type'),
b_to_str(b'text/html; charset=utf-8'))]
start_response(b_to_str(b'200 OK'), headers) start_response(b_to_str(b'200 OK'), headers)
res = _html['parsed'].encode(encoding='utf-8') res = _html['parsed'].encode(encoding='utf-8')
return [res] return [res]
@ -114,5 +117,6 @@ def parse_ents(doc, options={}):
""" """
ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_} ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
for ent in doc.ents] for ent in doc.ents]
title = doc.user_data.get('title', None) if hasattr(doc, 'user_data') else None title = (doc.user_data.get('title', None)
if hasattr(doc, 'user_data') else None)
return {'text': doc.text, 'ents': ents, 'title': title} return {'text': doc.text, 'ents': ents, 'title': title}

View File

@ -14,13 +14,15 @@ class DependencyRenderer(object):
"""Initialise dependency renderer. """Initialise dependency renderer.
options (dict): Visualiser-specific options (compact, word_spacing, options (dict): Visualiser-specific options (compact, word_spacing,
arrow_spacing, arrow_width, arrow_stroke, distance, arrow_spacing, arrow_width, arrow_stroke, distance, offset_x,
offset_x, color, bg, font) color, bg, font)
""" """
self.compact = options.get('compact', False) self.compact = options.get('compact', False)
self.word_spacing = options.get('word_spacing', 45) self.word_spacing = options.get('word_spacing', 45)
self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20) self.arrow_spacing = options.get('arrow_spacing',
self.arrow_width = options.get('arrow_width', 6 if self.compact else 10) 12 if self.compact else 20)
self.arrow_width = options.get('arrow_width',
6 if self.compact else 10)
self.arrow_stroke = options.get('arrow_stroke', 2) self.arrow_stroke = options.get('arrow_stroke', 2)
self.distance = options.get('distance', 150 if self.compact else 175) self.distance = options.get('distance', 150 if self.compact else 175)
self.offset_x = options.get('offset_x', 50) self.offset_x = options.get('offset_x', 50)
@ -39,7 +41,8 @@ class DependencyRenderer(object):
rendered = [self.render_svg(i, p['words'], p['arcs']) rendered = [self.render_svg(i, p['words'], p['arcs'])
for i, p in enumerate(parsed)] for i, p in enumerate(parsed)]
if page: if page:
content = ''.join([TPL_FIGURE.format(content=svg) for svg in rendered]) content = ''.join([TPL_FIGURE.format(content=svg)
for svg in rendered])
markup = TPL_PAGE.format(content=content) markup = TPL_PAGE.format(content=content)
else: else:
markup = ''.join(rendered) markup = ''.join(rendered)
@ -63,12 +66,13 @@ class DependencyRenderer(object):
self.id = render_id self.id = render_id
words = [self.render_word(w['text'], w['tag'], i) words = [self.render_word(w['text'], w['tag'], i)
for i, w in enumerate(words)] for i, w in enumerate(words)]
arcs = [self.render_arrow(a['label'], a['start'], a['end'], a['dir'], i) arcs = [self.render_arrow(a['label'], a['start'],
a['end'], a['dir'], i)
for i, a in enumerate(arcs)] for i, a in enumerate(arcs)]
content = ''.join(words) + ''.join(arcs) content = ''.join(words) + ''.join(arcs)
return TPL_DEP_SVG.format(id=self.id, width=self.width, height=self.height, return TPL_DEP_SVG.format(id=self.id, width=self.width,
color=self.color, bg=self.bg, font=self.font, height=self.height, color=self.color,
content=content) bg=self.bg, font=self.font, content=content)
def render_word(self, text, tag, i): def render_word(self, text, tag, i):
"""Render individual word. """Render individual word.
@ -133,8 +137,10 @@ class DependencyRenderer(object):
if direction is 'left': if direction is 'left':
pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2) pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2)
else: else:
pos1, pos2, pos3 = (end, end+self.arrow_width-2, end-self.arrow_width+2) pos1, pos2, pos3 = (end, end+self.arrow_width-2,
arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3, y-self.arrow_width) end-self.arrow_width+2)
arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3,
y-self.arrow_width)
return "M{},{} L{},{} {},{}".format(*arrowhead) return "M{},{} L{},{} {},{}".format(*arrowhead)
def get_levels(self, arcs): def get_levels(self, arcs):
@ -159,9 +165,10 @@ class EntityRenderer(object):
""" """
colors = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74', colors = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74',
'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb', 'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb',
'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LANGUAGE': '#ff8197', 'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LAW': '#ff8197',
'WORK_OF_ART': '#f0d0ff', 'DATE': '#bfe1d9', 'TIME': '#bfe1d9', 'LANGUAGE': '#ff8197', 'WORK_OF_ART': '#f0d0ff',
'MONEY': '#e4e7d2', 'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2', 'DATE': '#bfe1d9', 'TIME': '#bfe1d9', 'MONEY': '#e4e7d2',
'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2',
'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'} 'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'}
colors.update(options.get('colors', {})) colors.update(options.get('colors', {}))
self.default_color = '#ddd' self.default_color = '#ddd'
@ -176,9 +183,11 @@ class EntityRenderer(object):
minify (bool): Minify HTML markup. minify (bool): Minify HTML markup.
RETURNS (unicode): Rendered HTML markup. RETURNS (unicode): Rendered HTML markup.
""" """
rendered = [self.render_ents(p['text'], p['ents'], p.get('title', None)) for p in parsed] rendered = [self.render_ents(p['text'], p['ents'],
p.get('title', None)) for p in parsed]
if page: if page:
docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered]) docs = ''.join([TPL_FIGURE.format(content=doc)
for doc in rendered])
markup = TPL_PAGE.format(content=docs) markup = TPL_PAGE.format(content=docs)
else: else:
markup = ''.join(rendered) markup = ''.join(rendered)

View File

@ -264,7 +264,6 @@ GLOSSARY = {
'nk': 'noun kernel element', 'nk': 'noun kernel element',
'nmc': 'numerical component', 'nmc': 'numerical component',
'oa': 'accusative object', 'oa': 'accusative object',
'oa': 'second accusative object',
'oc': 'clausal object', 'oc': 'clausal object',
'og': 'genitive object', 'og': 'genitive object',
'op': 'prepositional object', 'op': 'prepositional object',

View File

@ -2,7 +2,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
import io
import re import re
import ujson import ujson
import random import random
@ -10,9 +9,8 @@ import cytoolz
import itertools import itertools
from .syntax import nonproj from .syntax import nonproj
from .util import ensure_path
from . import util
from .tokens import Doc from .tokens import Doc
from . import util
def tags_to_entities(tags): def tags_to_entities(tags):
@ -54,7 +52,8 @@ def merge_sents(sents):
m_deps[3].extend(head + i for head in heads) m_deps[3].extend(head + i for head in heads)
m_deps[4].extend(labels) m_deps[4].extend(labels)
m_deps[5].extend(ner) m_deps[5].extend(ner)
m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) m_brackets.extend((b['first'] + i, b['last'] + i, b['label'])
for b in brackets)
i += len(ids) i += len(ids)
return [(m_deps, m_brackets)] return [(m_deps, m_brackets)]
@ -80,6 +79,8 @@ def align(cand_words, gold_words):
punct_re = re.compile(r'\W') punct_re = re.compile(r'\W')
def _min_edit_path(cand_words, gold_words): def _min_edit_path(cand_words, gold_words):
cdef: cdef:
Pool mem Pool mem
@ -98,9 +99,9 @@ def _min_edit_path(cand_words, gold_words):
mem = Pool() mem = Pool()
n_cand = len(cand_words) n_cand = len(cand_words)
n_gold = len(gold_words) n_gold = len(gold_words)
# Levenshtein distance, except we need the history, and we may want different # Levenshtein distance, except we need the history, and we may want
# costs. # different costs. Mark operations with a string, and score the history
# Mark operations with a string, and score the history using _edit_cost. # using _edit_cost.
previous_row = [] previous_row = []
prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int)) prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int)) curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
@ -144,9 +145,9 @@ def _min_edit_path(cand_words, gold_words):
def minibatch(items, size=8): def minibatch(items, size=8):
'''Iterate over batches of items. `size` may be an iterator, """Iterate over batches of items. `size` may be an iterator,
so that batch-size can vary on each step. so that batch-size can vary on each step.
''' """
if isinstance(size, int): if isinstance(size, int):
size_ = itertools.repeat(8) size_ = itertools.repeat(8)
else: else:
@ -168,6 +169,7 @@ class GoldCorpus(object):
train_path (unicode or Path): File or directory of training data. train_path (unicode or Path): File or directory of training data.
dev_path (unicode or Path): File or directory of development data. dev_path (unicode or Path): File or directory of development data.
RETURNS (GoldCorpus): The newly created object.
""" """
self.train_path = util.ensure_path(train_path) self.train_path = util.ensure_path(train_path)
self.dev_path = util.ensure_path(dev_path) self.dev_path = util.ensure_path(dev_path)
@ -222,7 +224,6 @@ class GoldCorpus(object):
def dev_docs(self, nlp, gold_preproc=False): def dev_docs(self, nlp, gold_preproc=False):
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc) gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
#gold_docs = nlp.preprocess_gold(gold_docs)
yield from gold_docs yield from gold_docs
@classmethod @classmethod
@ -233,7 +234,6 @@ class GoldCorpus(object):
raw_text = None raw_text = None
else: else:
paragraph_tuples = merge_sents(paragraph_tuples) paragraph_tuples = merge_sents(paragraph_tuples)
docs = cls._make_docs(nlp, raw_text, paragraph_tuples, docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
gold_preproc, noise_level=noise_level) gold_preproc, noise_level=noise_level)
golds = cls._make_golds(docs, paragraph_tuples) golds = cls._make_golds(docs, paragraph_tuples)
@ -248,17 +248,20 @@ class GoldCorpus(object):
raw_text = add_noise(raw_text, noise_level) raw_text = add_noise(raw_text, noise_level)
return [nlp.make_doc(raw_text)] return [nlp.make_doc(raw_text)]
else: else:
return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level)) return [Doc(nlp.vocab,
words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples] for (sent_tuples, brackets) in paragraph_tuples]
@classmethod @classmethod
def _make_golds(cls, docs, paragraph_tuples): def _make_golds(cls, docs, paragraph_tuples):
assert len(docs) == len(paragraph_tuples) assert len(docs) == len(paragraph_tuples)
if len(docs) == 1: if len(docs) == 1:
return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])] return [GoldParse.from_annot_tuples(docs[0],
paragraph_tuples[0][0])]
else: else:
return [GoldParse.from_annot_tuples(doc, sent_tuples) return [GoldParse.from_annot_tuples(doc, sent_tuples)
for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)] for doc, (sent_tuples, brackets)
in zip(docs, paragraph_tuples)]
@staticmethod @staticmethod
def walk_corpus(path): def walk_corpus(path):
@ -305,7 +308,7 @@ def _corrupt(c, noise_level):
def read_json_file(loc, docs_filter=None, limit=None): def read_json_file(loc, docs_filter=None, limit=None):
loc = ensure_path(loc) loc = util.ensure_path(loc)
if loc.is_dir(): if loc.is_dir():
for filename in loc.iterdir(): for filename in loc.iterdir():
yield from read_json_file(loc / filename, limit=limit) yield from read_json_file(loc / filename, limit=limit)
@ -382,19 +385,21 @@ cdef class GoldParse:
@classmethod @classmethod
def from_annot_tuples(cls, doc, annot_tuples, make_projective=False): def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
_, words, tags, heads, deps, entities = annot_tuples _, words, tags, heads, deps, entities = annot_tuples
return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities, return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
make_projective=make_projective) entities=entities, make_projective=make_projective)
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None, def __init__(self, doc, annot_tuples=None, words=None, tags=None,
deps=None, entities=None, make_projective=False, heads=None, deps=None, entities=None, make_projective=False,
cats=None): cats=None):
"""Create a GoldParse. """Create a GoldParse.
doc (Doc): The document the annotations refer to. doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings. words (iterable): A sequence of unicode word strings.
tags (iterable): A sequence of strings, representing tag annotations. tags (iterable): A sequence of strings, representing tag annotations.
heads (iterable): A sequence of integers, representing syntactic head offsets. heads (iterable): A sequence of integers, representing syntactic
deps (iterable): A sequence of strings, representing the syntactic relation types. head offsets.
deps (iterable): A sequence of strings, representing the syntactic
relation types.
entities (iterable): A sequence of named entity annotations, either as entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples, BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions. representing the entity positions.
@ -404,9 +409,10 @@ cdef class GoldParse:
document (usually a sentence). Unlike entity annotations, label document (usually a sentence). Unlike entity annotations, label
annotations can overlap, i.e. a single word can be covered by annotations can overlap, i.e. a single word can be covered by
multiple labelled spans. The TextCategorizer component expects multiple labelled spans. The TextCategorizer component expects
true examples of a label to have the value 1.0, and negative examples true examples of a label to have the value 1.0, and negative
of a label to have the value 0.0. Labels not in the dictionary are examples of a label to have the value 0.0. Labels not in the
treated as missing -- the gradient for those labels will be zero. dictionary are treated as missing - the gradient for those labels
will be zero.
RETURNS (GoldParse): The newly constructed object. RETURNS (GoldParse): The newly constructed object.
""" """
if words is None: if words is None:
@ -470,7 +476,7 @@ cdef class GoldParse:
self.ner[i] = entities[gold_i] self.ner[i] = entities[gold_i]
cycle = nonproj.contains_cycle(self.heads) cycle = nonproj.contains_cycle(self.heads)
if cycle != None: if cycle is not None:
raise Exception("Cycle found: %s" % cycle) raise Exception("Cycle found: %s" % cycle)
if make_projective: if make_projective:
@ -497,20 +503,19 @@ cdef class GoldParse:
def biluo_tags_from_offsets(doc, entities, missing='O'): def biluo_tags_from_offsets(doc, entities, missing='O'):
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out """Encode labelled spans into per-token tags, using the
scheme (BILUO). Begin/In/Last/Unit/Out scheme (BILUO).
doc (Doc): The document that the entity offsets refer to. The output tags doc (Doc): The document that the entity offsets refer to. The output tags
will refer to the token boundaries within the document. will refer to the token boundaries within the document.
entities (iterable): A sequence of `(start, end, label)` triples. `start` and entities (iterable): A sequence of `(start, end, label)` triples. `start`
`end` should be character-offset integers denoting the slice into the and `end` should be character-offset integers denoting the slice into
original string. the original string.
RETURNS (list): A list of unicode strings, describing the tags. Each tag RETURNS (list): A list of unicode strings, describing the tags. Each tag
string will be of the form either "", "O" or "{action}-{label}", where string will be of the form either "", "O" or "{action}-{label}", where
action is one of "B", "I", "L", "U". The string "-" is used where the action is one of "B", "I", "L", "U". The string "-" is used where the
entity offsets don't align with the tokenization in the `Doc` object. The entity offsets don't align with the tokenization in the `Doc` object.
training algorithm will view these as missing values. "O" denotes a The training algorithm will view these as missing values. "O" denotes a
non-entity token. "B" denotes the beginning of a multi-token entity, non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end "I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity. of an entity of two or more tokens. "U" denotes a single-token entity.

View File

@ -1,31 +1,28 @@
# coding: utf8 # coding: utf8
from __future__ import absolute_import, unicode_literals from __future__ import absolute_import, unicode_literals
from contextlib import contextmanager
import copy
from thinc.neural import Model
from thinc.neural.optimizers import Adam
import random import random
import ujson import ujson
from collections import OrderedDict
import itertools import itertools
import weakref import weakref
import functools import functools
import tqdm from collections import OrderedDict
from contextlib import contextmanager
from copy import copy
from thinc.neural import Model
from thinc.neural.optimizers import Adam
from .tokenizer import Tokenizer from .tokenizer import Tokenizer
from .vocab import Vocab from .vocab import Vocab
from .tagger import Tagger
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
from .pipeline import DependencyParser, Tensorizer, Tagger from .pipeline import SimilarityHook, TextCategorizer
from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer from .compat import json_dumps, izip
from .compat import json_dumps, izip, copy_reg
from .scorer import Scorer from .scorer import Scorer
from ._ml import link_vectors_to_models from ._ml import link_vectors_to_models
from .attrs import IS_STOP from .attrs import IS_STOP
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP from .lang.tag_map import TAG_MAP
from .lang.lex_attrs import LEX_ATTRS, is_stop from .lang.lex_attrs import LEX_ATTRS, is_stop
@ -57,16 +54,18 @@ class BaseDefaults(object):
def create_tokenizer(cls, nlp=None): def create_tokenizer(cls, nlp=None):
rules = cls.tokenizer_exceptions rules = cls.tokenizer_exceptions
token_match = cls.token_match token_match = cls.token_match
prefix_search = util.compile_prefix_regex(cls.prefixes).search \ prefix_search = (util.compile_prefix_regex(cls.prefixes).search
if cls.prefixes else None if cls.prefixes else None)
suffix_search = util.compile_suffix_regex(cls.suffixes).search \ suffix_search = (util.compile_suffix_regex(cls.suffixes).search
if cls.suffixes else None if cls.suffixes else None)
infix_finditer = util.compile_infix_regex(cls.infixes).finditer \ infix_finditer = (util.compile_infix_regex(cls.infixes).finditer
if cls.infixes else None if cls.infixes else None)
vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
return Tokenizer(vocab, rules=rules, return Tokenizer(vocab, rules=rules,
prefix_search=prefix_search, suffix_search=suffix_search, prefix_search=prefix_search,
infix_finditer=infix_finditer, token_match=token_match) suffix_search=suffix_search,
infix_finditer=infix_finditer,
token_match=token_match)
pipe_names = ['tensorizer', 'tagger', 'parser', 'ner'] pipe_names = ['tensorizer', 'tagger', 'parser', 'ner']
token_match = TOKEN_MATCH token_match = TOKEN_MATCH
@ -98,7 +97,7 @@ class Language(object):
factories = { factories = {
'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), 'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg), 'tensorizer': lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg), 'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg), 'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg), 'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
@ -218,14 +217,14 @@ class Language(object):
def add_pipe(self, component, name=None, before=None, after=None, def add_pipe(self, component, name=None, before=None, after=None,
first=None, last=None): first=None, last=None):
"""Add a component to the processing pipeline. Valid components are """Add a component to the processing pipeline. Valid components are
callables that take a `Doc` object, modify it and return it. Only one of callables that take a `Doc` object, modify it and return it. Only one
before, after, first or last can be set. Default behaviour is "last". of before/after/first/last can be set. Default behaviour is "last".
component (callable): The pipeline component. component (callable): The pipeline component.
name (unicode): Name of pipeline component. Overwrites existing name (unicode): Name of pipeline component. Overwrites existing
component.name attribute if available. If no name is set and component.name attribute if available. If no name is set and
the component exposes no name attribute, component.__name__ is the component exposes no name attribute, component.__name__ is
used. An error is raised if the name already exists in the pipeline. used. An error is raised if a name already exists in the pipeline.
before (unicode): Component name to insert component directly before. before (unicode): Component name to insert component directly before.
after (unicode): Component name to insert component directly after. after (unicode): Component name to insert component directly after.
first (bool): Insert component first / not first in the pipeline. first (bool): Insert component first / not first in the pipeline.
@ -240,7 +239,8 @@ class Language(object):
name = component.name name = component.name
elif hasattr(component, '__name__'): elif hasattr(component, '__name__'):
name = component.__name__ name = component.__name__
elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'): elif (hasattr(component, '__class__') and
hasattr(component.__class__, '__name__')):
name = component.__class__.__name__ name = component.__class__.__name__
else: else:
name = repr(component) name = repr(component)
@ -269,7 +269,7 @@ class Language(object):
`name in nlp.pipe_names`. `name in nlp.pipe_names`.
name (unicode): Name of the component. name (unicode): Name of the component.
RETURNS (bool): Whether a component of that name exists in the pipeline. RETURNS (bool): Whether a component of the name exists in the pipeline.
""" """
return name in self.pipe_names return name in self.pipe_names
@ -332,15 +332,12 @@ class Language(object):
return doc return doc
def disable_pipes(self, *names): def disable_pipes(self, *names):
'''Disable one or more pipeline components. """Disable one or more pipeline components. If used as a context
manager, the pipeline will be restored to the initial state at the end
If used as a context manager, the pipeline will be restored to the initial of the block. Otherwise, a DisabledPipes object is returned, that has
state at the end of the block. Otherwise, a DisabledPipes object is a `.restore()` method you can use to undo your changes.
returned, that has a `.restore()` method you can use to undo your
changes.
EXAMPLE: EXAMPLE:
>>> nlp.add_pipe('parser') >>> nlp.add_pipe('parser')
>>> nlp.add_pipe('tagger') >>> nlp.add_pipe('tagger')
>>> with nlp.disable_pipes('parser', 'tagger'): >>> with nlp.disable_pipes('parser', 'tagger'):
@ -351,7 +348,7 @@ class Language(object):
>>> assert not nlp.has_pipe('parser') >>> assert not nlp.has_pipe('parser')
>>> disabled.restore() >>> disabled.restore()
>>> assert nlp.has_pipe('parser') >>> assert nlp.has_pipe('parser')
''' """
return DisabledPipes(self, *names) return DisabledPipes(self, *names)
def make_doc(self, text): def make_doc(self, text):
@ -367,7 +364,7 @@ class Language(object):
RETURNS (dict): Results from the update. RETURNS (dict): Results from the update.
EXAMPLE: EXAMPLE:
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer): >>> with nlp.begin_training(gold) as (trainer, optimizer):
>>> for epoch in trainer.epochs(gold): >>> for epoch in trainer.epochs(gold):
>>> for docs, golds in epoch: >>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer) >>> state = nlp.update(docs, golds, sgd=optimizer)
@ -382,8 +379,10 @@ class Language(object):
self._optimizer = Adam(Model.ops, 0.001) self._optimizer = Adam(Model.ops, 0.001)
sgd = self._optimizer sgd = self._optimizer
grads = {} grads = {}
def get_grads(W, dW, key=None): def get_grads(W, dW, key=None):
grads[key] = (W, dW) grads[key] = (W, dW)
pipes = list(self.pipeline) pipes = list(self.pipeline)
random.shuffle(pipes) random.shuffle(pipes)
for name, proc in pipes: for name, proc in pipes:
@ -513,16 +512,16 @@ class Language(object):
def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000, def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
disable=[]): disable=[]):
"""Process texts as a stream, and yield `Doc` objects in order. Supports """Process texts as a stream, and yield `Doc` objects in order.
GIL-free multi-threading. Supports GIL-free multi-threading.
texts (iterator): A sequence of texts to process. texts (iterator): A sequence of texts to process.
as_tuples (bool): as_tuples (bool):
If set to True, inputs should be a sequence of If set to True, inputs should be a sequence of
(text, context) tuples. Output will then be a sequence of (text, context) tuples. Output will then be a sequence of
(doc, context) tuples. Defaults to False. (doc, context) tuples. Defaults to False.
n_threads (int): The number of worker threads to use. If -1, OpenMP will n_threads (int): The number of worker threads to use. If -1, OpenMP
decide how many to use at run time. Default is 2. will decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer. batch_size (int): The number of texts to buffer.
disable (list): Names of the pipeline components to disable. disable (list): Names of the pipeline components to disable.
YIELDS (Doc): Documents in the order of the original text. YIELDS (Doc): Documents in the order of the original text.
@ -546,7 +545,8 @@ class Language(object):
if name in disable: if name in disable:
continue continue
if hasattr(proc, 'pipe'): if hasattr(proc, 'pipe'):
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size) docs = proc.pipe(docs, n_threads=n_threads,
batch_size=batch_size)
else: else:
# Apply the function, but yield the doc # Apply the function, but yield the doc
docs = _pipe(proc, docs) docs = _pipe(proc, docs)
@ -583,7 +583,7 @@ class Language(object):
will include the model. will include the model.
path (unicode or Path): A path to a directory, which will be created if path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects. it doesn't exist. Paths may be strings or `Path`-like objects.
disable (list): Names of pipeline components to disable and prevent disable (list): Names of pipeline components to disable and prevent
from being saved. from being saved.
@ -649,7 +649,7 @@ class Language(object):
serializers = OrderedDict(( serializers = OrderedDict((
('vocab', lambda: self.vocab.to_bytes()), ('vocab', lambda: self.vocab.to_bytes()),
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)), ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
('meta', lambda: ujson.dumps(self.meta)) ('meta', lambda: json_dumps(self.meta))
)) ))
for i, (name, proc) in enumerate(self.pipeline): for i, (name, proc) in enumerate(self.pipeline):
if name in disable: if name in disable:
@ -682,14 +682,14 @@ class Language(object):
class DisabledPipes(list): class DisabledPipes(list):
'''Manager for temporary pipeline disabling.''' """Manager for temporary pipeline disabling."""
def __init__(self, nlp, *names): def __init__(self, nlp, *names):
self.nlp = nlp self.nlp = nlp
self.names = names self.names = names
# Important! Not deep copy -- we just want the container (but we also # Important! Not deep copy -- we just want the container (but we also
# want to support people providing arbitrarily typed nlp.pipeline # want to support people providing arbitrarily typed nlp.pipeline
# objects.) # objects.)
self.original_pipeline = copy.copy(nlp.pipeline) self.original_pipeline = copy(nlp.pipeline)
list.__init__(self) list.__init__(self)
self.extend(nlp.remove_pipe(name) for name in names) self.extend(nlp.remove_pipe(name) for name in names)
@ -702,7 +702,8 @@ class DisabledPipes(list):
def restore(self): def restore(self):
'''Restore the pipeline to its state when DisabledPipes was created.''' '''Restore the pipeline to its state when DisabledPipes was created.'''
current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline
unexpected = [name for name, pipe in current if not self.nlp.has_pipe(name)] unexpected = [name for name, pipe in current
if not self.nlp.has_pipe(name)]
if unexpected: if unexpected:
# Don't change the pipeline if we're raising an error. # Don't change the pipeline if we're raising an error.
self.nlp.pipeline = current self.nlp.pipeline = current

View File

@ -43,16 +43,15 @@ class Lemmatizer(object):
morphology = {} if morphology is None else morphology morphology = {} if morphology is None else morphology
others = [key for key in morphology others = [key for key in morphology
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')] if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
true_morph_key = morphology.get('morph', 0)
if univ_pos == 'noun' and morphology.get('Number') == 'sing': if univ_pos == 'noun' and morphology.get('Number') == 'sing':
return True return True
elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf': elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
return True return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE' # This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology # morphology
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \ elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and
morphology.get('Tense') == 'pres' and \ morphology.get('Tense') == 'pres' and
morphology.get('Number') is None and \ morphology.get('Number') is None and
not others): not others):
return True return True
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
@ -89,9 +88,6 @@ class Lemmatizer(object):
def lemmatize(string, index, exceptions, rules): def lemmatize(string, index, exceptions, rules):
string = string.lower() string = string.lower()
forms = [] forms = []
# TODO: Is this correct? See discussion in Issue #435.
#if string in index:
# forms.append(string)
forms.extend(exceptions.get(string, [])) forms.extend(exceptions.get(string, []))
oov_forms = [] oov_forms = []
if not forms: if not forms:

View File

@ -2,27 +2,17 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
from libc.math cimport sqrt
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
# Compiler crashes on memory view coercion without this. Should report bug. # Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray from cython.view cimport array as cvarray
cimport numpy as np cimport numpy as np
np.import_array() np.import_array()
from libc.string cimport memset from libc.string cimport memset
import numpy import numpy
from .typedefs cimport attr_t, flags_t from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
from .attrs cimport IS_QUOTE
from .attrs cimport IS_LEFT_PUNCT
from .attrs cimport IS_RIGHT_PUNCT
from .attrs cimport IS_OOV
from . import about from . import about
@ -32,8 +22,8 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef class Lexeme: cdef class Lexeme:
"""An entry in the vocabulary. A `Lexeme` has no string context it's a """An entry in the vocabulary. A `Lexeme` has no string context it's a
word-type, as opposed to a word token. It therefore has no part-of-speech word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech tag, dependency parse, or lemma (lemmatization depends on the
tag). part-of-speech tag).
""" """
def __init__(self, Vocab vocab, attr_t orth): def __init__(self, Vocab vocab, attr_t orth):
"""Create a Lexeme object. """Create a Lexeme object.
@ -104,7 +94,8 @@ cdef class Lexeme:
""" """
if self.vector_norm == 0 or other.vector_norm == 0: if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0 return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return (numpy.dot(self.vector, other.vector) /
(self.vector_norm * other.vector_norm))
def to_bytes(self): def to_bytes(self):
lex_data = Lexeme.c_to_bytes(self.c) lex_data = Lexeme.c_to_bytes(self.c)
@ -130,19 +121,13 @@ cdef class Lexeme:
self.orth = self.c.orth self.orth = self.c.orth
property has_vector: property has_vector:
"""A boolean value indicating whether a word vector is associated with """RETURNS (bool): Whether a word vector is associated with the object.
the object.
RETURNS (bool): Whether a word vector is associated with the object.
""" """
def __get__(self): def __get__(self):
return self.vocab.has_vector(self.c.orth) return self.vocab.has_vector(self.c.orth)
property vector_norm: property vector_norm:
"""The L2 norm of the lexeme's vector representation. """RETURNS (float): The L2 norm of the vector representation."""
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self): def __get__(self):
vector = self.vector vector = self.vector
return numpy.sqrt((vector**2).sum()) return numpy.sqrt((vector**2).sum())
@ -169,149 +154,320 @@ cdef class Lexeme:
self.vocab.set_vector(self.c.orth, vector) self.vocab.set_vector(self.c.orth, vector)
property rank: property rank:
"""RETURNS (unicode): Sequential ID of the lexemes's lexical type, used
to index into tables, e.g. for word vectors."""
def __get__(self): def __get__(self):
return self.c.id return self.c.id
def __set__(self, value): def __set__(self, value):
self.c.id = value self.c.id = value
property sentiment: property sentiment:
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the lexeme."""
def __get__(self): def __get__(self):
return self.c.sentiment return self.c.sentiment
def __set__(self, float sentiment): def __set__(self, float sentiment):
self.c.sentiment = sentiment self.c.sentiment = sentiment
property orth_: property orth_:
"""RETURNS (unicode): The original verbatim text of the lexeme
(identical to `Lexeme.text`). Exists mostly for consistency with
the other attributes."""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.orth] return self.vocab.strings[self.c.orth]
property text: property text:
"""A unicode representation of the token text. """RETURNS (unicode): The original verbatim text of the lexeme."""
RETURNS (unicode): The original verbatim text of the token.
"""
def __get__(self): def __get__(self):
return self.orth_ return self.orth_
property lower: property lower:
def __get__(self): return self.c.lower """RETURNS (unicode): Lowercase form of the lexeme."""
def __set__(self, attr_t x): self.c.lower = x def __get__(self):
return self.c.lower
def __set__(self, attr_t x):
self.c.lower = x
property norm: property norm:
def __get__(self): return self.c.norm """RETURNS (uint64): The lexemes's norm, i.e. a normalised form of the
def __set__(self, attr_t x): self.c.norm = x lexeme text.
"""
def __get__(self):
return self.c.norm
def __set__(self, attr_t x):
self.c.norm = x
property shape: property shape:
def __get__(self): return self.c.shape """RETURNS (uint64): Transform of the word's string, to show
def __set__(self, attr_t x): self.c.shape = x orthographic features.
"""
def __get__(self):
return self.c.shape
def __set__(self, attr_t x):
self.c.shape = x
property prefix: property prefix:
def __get__(self): return self.c.prefix """RETURNS (uint64): Length-N substring from the start of the word.
def __set__(self, attr_t x): self.c.prefix = x Defaults to `N=1`.
"""
def __get__(self):
return self.c.prefix
def __set__(self, attr_t x):
self.c.prefix = x
property suffix: property suffix:
def __get__(self): return self.c.suffix """RETURNS (uint64): Length-N substring from the end of the word.
def __set__(self, attr_t x): self.c.suffix = x Defaults to `N=3`.
"""
def __get__(self):
return self.c.suffix
def __set__(self, attr_t x):
self.c.suffix = x
property cluster: property cluster:
def __get__(self): return self.c.cluster """RETURNS (int): Brown cluster ID."""
def __set__(self, attr_t x): self.c.cluster = x def __get__(self):
return self.c.cluster
def __set__(self, attr_t x):
self.c.cluster = x
property lang: property lang:
def __get__(self): return self.c.lang """RETURNS (uint64): Language of the parent vocabulary."""
def __set__(self, attr_t x): self.c.lang = x def __get__(self):
return self.c.lang
def __set__(self, attr_t x):
self.c.lang = x
property prob: property prob:
def __get__(self): return self.c.prob """RETURNS (float): Smoothed log probability estimate of the lexeme's
def __set__(self, float x): self.c.prob = x type."""
def __get__(self):
return self.c.prob
def __set__(self, float x):
self.c.prob = x
property lower_: property lower_:
def __get__(self): return self.vocab.strings[self.c.lower] """RETURNS (unicode): Lowercase form of the word."""
def __set__(self, unicode x): self.c.lower = self.vocab.strings.add(x) def __get__(self):
return self.vocab.strings[self.c.lower]
def __set__(self, unicode x):
self.c.lower = self.vocab.strings.add(x)
property norm_: property norm_:
def __get__(self): return self.vocab.strings[self.c.norm] """RETURNS (unicode): The lexeme's norm, i.e. a normalised form of the
def __set__(self, unicode x): self.c.norm = self.vocab.strings.add(x) lexeme text.
"""
def __get__(self):
return self.vocab.strings[self.c.norm]
def __set__(self, unicode x):
self.c.norm = self.vocab.strings.add(x)
property shape_: property shape_:
def __get__(self): return self.vocab.strings[self.c.shape] """RETURNS (unicode): Transform of the word's string, to show
def __set__(self, unicode x): self.c.shape = self.vocab.strings.add(x) orthographic features.
"""
def __get__(self):
return self.vocab.strings[self.c.shape]
def __set__(self, unicode x):
self.c.shape = self.vocab.strings.add(x)
property prefix_: property prefix_:
def __get__(self): return self.vocab.strings[self.c.prefix] """RETURNS (unicode): Length-N substring from the start of the word.
def __set__(self, unicode x): self.c.prefix = self.vocab.strings.add(x) Defaults to `N=1`.
"""
def __get__(self):
return self.vocab.strings[self.c.prefix]
def __set__(self, unicode x):
self.c.prefix = self.vocab.strings.add(x)
property suffix_: property suffix_:
def __get__(self): return self.vocab.strings[self.c.suffix] """RETURNS (unicode): Length-N substring from the end of the word.
def __set__(self, unicode x): self.c.suffix = self.vocab.strings.add(x) Defaults to `N=3`.
"""
def __get__(self):
return self.vocab.strings[self.c.suffix]
def __set__(self, unicode x):
self.c.suffix = self.vocab.strings.add(x)
property lang_: property lang_:
def __get__(self): return self.vocab.strings[self.c.lang] """RETURNS (unicode): Language of the parent vocabulary."""
def __set__(self, unicode x): self.c.lang = self.vocab.strings.add(x) def __get__(self):
return self.vocab.strings[self.c.lang]
def __set__(self, unicode x):
self.c.lang = self.vocab.strings.add(x)
property flags: property flags:
def __get__(self): return self.c.flags """RETURNS (uint64): Container of the lexeme's binary flags."""
def __set__(self, flags_t x): self.c.flags = x def __get__(self):
return self.c.flags
def __set__(self, flags_t x):
self.c.flags = x
property is_oov: property is_oov:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV) """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_OOV)
def __set__(self, attr_t x):
Lexeme.c_set_flag(self.c, IS_OOV, x)
property is_stop: property is_stop:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP) """RETURNS (bool): Whether the lexeme is a stop word."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_STOP, x) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_STOP)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_STOP, x)
property is_alpha: property is_alpha:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_ALPHA) """RETURNS (bool): Whether the lexeme consists of alphabetic
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ALPHA, x) characters. Equivalent to `lexeme.text.isalpha()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ALPHA)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
property is_ascii: property is_ascii:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_ASCII) """RETURNS (bool): Whether the lexeme consists of ASCII characters.
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ASCII, x) Equivalent to `not any(ord(c) >= 128 for c in lexeme.text)`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ASCII)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_ASCII, x)
property is_digit: property is_digit:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_DIGIT) """RETURNS (bool): Whether the lexeme consists of digits. Equivalent
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_DIGIT, x) to `lexeme.text.isdigit()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_DIGIT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
property is_lower: property is_lower:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_LOWER) """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LOWER, x) `lexeme.text.islower()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LOWER)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_LOWER, x)
property is_upper:
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
`lexeme.text.isupper()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_UPPER)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_UPPER, x)
property is_title: property is_title:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_TITLE) """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_TITLE, x) `lexeme.text.istitle()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_TITLE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_TITLE, x)
property is_punct: property is_punct:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_PUNCT) """RETURNS (bool): Whether the lexeme is punctuation."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_PUNCT, x) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
property is_space: property is_space:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE) """RETURNS (bool): Whether the lexeme consists of whitespace characters.
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x) Equivalent to `lexeme.text.isspace()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_SPACE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_SPACE, x)
property is_bracket: property is_bracket:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET) """RETURNS (bool): Whether the lexeme is a bracket."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_BRACKET)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
property is_quote: property is_quote:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE) """RETURNS (bool): Whether the lexeme is a quotation mark."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_QUOTE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
property is_left_punct: property is_left_punct:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
property is_right_punct: property is_right_punct:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) """RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) def __get__(self):
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
property like_url: property like_url:
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL) """RETURNS (bool): Whether the lexeme resembles a URL."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x) def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_URL)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_URL, x)
property like_num: property like_num:
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_NUM) """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_NUM, x) "10", "ten", etc.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_NUM)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
property like_email: property like_email:
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_EMAIL) """RETURNS (bool): Whether the lexeme resembles an email address."""
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_EMAIL, x) def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)

View File

@ -4,12 +4,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import ujson import ujson
from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .attrs cimport attr_id_t
from .structs cimport TokenC
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
from libcpp.vector cimport vector from libcpp.vector cimport vector
@ -17,14 +11,15 @@ from libcpp.pair cimport pair
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64
from libc.stdint cimport int32_t from libc.stdint cimport int32_t
from .attrs cimport ID, NULL_ATTR, ENT_TYPE from .typedefs cimport attr_t
from . import attrs from .typedefs cimport hash_t
from .tokens.doc cimport get_token_attr from .structs cimport TokenC
from .tokens.doc cimport Doc from .tokens.doc cimport Doc, get_token_attr
from .vocab cimport Vocab from .vocab cimport Vocab
from .attrs import IDS
from .attrs cimport attr_id_t, ID, NULL_ATTR
from .attrs import FLAG61 as U_ENT from .attrs import FLAG61 as U_ENT
from .attrs import FLAG60 as B2_ENT from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT from .attrs import FLAG59 as B3_ENT
from .attrs import FLAG58 as B4_ENT from .attrs import FLAG58 as B4_ENT
@ -34,7 +29,6 @@ from .attrs import FLAG55 as B7_ENT
from .attrs import FLAG54 as B8_ENT from .attrs import FLAG54 as B8_ENT
from .attrs import FLAG53 as B9_ENT from .attrs import FLAG53 as B9_ENT
from .attrs import FLAG52 as B10_ENT from .attrs import FLAG52 as B10_ENT
from .attrs import FLAG51 as I3_ENT from .attrs import FLAG51 as I3_ENT
from .attrs import FLAG50 as I4_ENT from .attrs import FLAG50 as I4_ENT
from .attrs import FLAG49 as I5_ENT from .attrs import FLAG49 as I5_ENT
@ -43,7 +37,6 @@ from .attrs import FLAG47 as I7_ENT
from .attrs import FLAG46 as I8_ENT from .attrs import FLAG46 as I8_ENT
from .attrs import FLAG45 as I9_ENT from .attrs import FLAG45 as I9_ENT
from .attrs import FLAG44 as I10_ENT from .attrs import FLAG44 as I10_ENT
from .attrs import FLAG43 as L2_ENT from .attrs import FLAG43 as L2_ENT
from .attrs import FLAG42 as L3_ENT from .attrs import FLAG42 as L3_ENT
from .attrs import FLAG41 as L4_ENT from .attrs import FLAG41 as L4_ENT
@ -168,10 +161,10 @@ def _convert_strings(token_specs, string_store):
if value in operators: if value in operators:
ops = operators[value] ops = operators[value]
else: else:
raise KeyError( msg = "Unknown operator '%s'. Options: %s"
"Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys()))) raise KeyError(msg % (value, ', '.join(operators.keys())))
if isinstance(attr, basestring): if isinstance(attr, basestring):
attr = attrs.IDS.get(attr.upper()) attr = IDS.get(attr.upper())
if isinstance(value, basestring): if isinstance(value, basestring):
value = string_store.add(value) value = string_store.add(value)
if isinstance(value, bool): if isinstance(value, bool):
@ -233,13 +226,13 @@ cdef class Matcher:
return self._normalize_key(key) in self._patterns return self._normalize_key(key) in self._patterns
def add(self, key, on_match, *patterns): def add(self, key, on_match, *patterns):
"""Add a match-rule to the matcher. A match-rule consists of: an ID key, """Add a match-rule to the matcher. A match-rule consists of: an ID
an on_match callback, and one or more patterns. key, an on_match callback, and one or more patterns.
If the key exists, the patterns are appended to the previous ones, and If the key exists, the patterns are appended to the previous ones, and
the previous on_match callback is replaced. The `on_match` callback will the previous on_match callback is replaced. The `on_match` callback
receive the arguments `(matcher, doc, i, matches)`. You can also set will receive the arguments `(matcher, doc, i, matches)`. You can also
`on_match` to `None` to not perform any actions. set `on_match` to `None` to not perform any actions.
A pattern consists of one or more `token_specs`, where a `token_spec` A pattern consists of one or more `token_specs`, where a `token_spec`
is a dictionary mapping attribute IDs to values, and optionally a is a dictionary mapping attribute IDs to values, and optionally a
@ -253,8 +246,8 @@ cdef class Matcher:
The + and * operators are usually interpreted "greedily", i.e. longer The + and * operators are usually interpreted "greedily", i.e. longer
matches are returned where possible. However, if you specify two '+' matches are returned where possible. However, if you specify two '+'
and '*' patterns in a row and their matches overlap, the first and '*' patterns in a row and their matches overlap, the first
operator will behave non-greedily. This quirk in the semantics operator will behave non-greedily. This quirk in the semantics makes
makes the matcher more efficient, by avoiding the need for back-tracking. the matcher more efficient, by avoiding the need for back-tracking.
key (unicode): The match ID. key (unicode): The match ID.
on_match (callable): Callback executed on match. on_match (callable): Callback executed on match.
@ -268,7 +261,6 @@ cdef class Matcher:
key = self._normalize_key(key) key = self._normalize_key(key)
self._patterns.setdefault(key, []) self._patterns.setdefault(key, [])
self._callbacks[key] = on_match self._callbacks[key] = on_match
for pattern in patterns: for pattern in patterns:
specs = _convert_strings(pattern, self.vocab.strings) specs = _convert_strings(pattern, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, key, specs)) self.patterns.push_back(init_pattern(self.mem, key, specs))
@ -315,9 +307,9 @@ cdef class Matcher:
"""Match a stream of documents, yielding them in turn. """Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents. docs (iterable): A stream of documents.
batch_size (int): The number of documents to accumulate into a working set. batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer n_threads (int): The number of threads with which to work on the buffer
in parallel, if the `Matcher` implementation supports multi-threading. in parallel, if the implementation supports multi-threading.
YIELDS (Doc): Documents, in order. YIELDS (Doc): Documents, in order.
""" """
for doc in docs: for doc in docs:
@ -325,7 +317,7 @@ cdef class Matcher:
yield doc yield doc
def __call__(self, Doc doc): def __call__(self, Doc doc):
"""Find all token sequences matching the supplied patterns on the `Doc`. """Find all token sequences matching the supplied pattern.
doc (Doc): The document to match over. doc (Doc): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples, RETURNS (list): A list of `(key, start, end)` tuples,
@ -342,8 +334,8 @@ cdef class Matcher:
for token_i in range(doc.length): for token_i in range(doc.length):
token = &doc.c[token_i] token = &doc.c[token_i]
q = 0 q = 0
# Go over the open matches, extending or finalizing if able. Otherwise, # Go over the open matches, extending or finalizing if able.
# we over-write them (q doesn't advance) # Otherwise, we over-write them (q doesn't advance)
for state in partials: for state in partials:
action = get_action(state.second, token) action = get_action(state.second, token)
if action == PANIC: if action == PANIC:
@ -356,8 +348,8 @@ cdef class Matcher:
if action == REPEAT: if action == REPEAT:
# Leave the state in the queue, and advance to next slot # Leave the state in the queue, and advance to next slot
# (i.e. we don't overwrite -- we want to greedily match more # (i.e. we don't overwrite -- we want to greedily match
# pattern. # more pattern.
q += 1 q += 1
elif action == REJECT: elif action == REJECT:
pass pass
@ -366,8 +358,8 @@ cdef class Matcher:
partials[q].second += 1 partials[q].second += 1
q += 1 q += 1
elif action in (ACCEPT, ACCEPT_PREV): elif action in (ACCEPT, ACCEPT_PREV):
# TODO: What to do about patterns starting with ZERO? Need to # TODO: What to do about patterns starting with ZERO? Need
# adjust the start position. # to adjust the start position.
start = state.first start = state.first
end = token_i+1 if action == ACCEPT else token_i end = token_i+1 if action == ACCEPT else token_i
ent_id = state.second[1].attrs[0].value ent_id = state.second[1].attrs[0].value
@ -388,8 +380,8 @@ cdef class Matcher:
state.second = pattern state.second = pattern
partials.push_back(state) partials.push_back(state)
elif action == ADVANCE: elif action == ADVANCE:
# TODO: What to do about patterns starting with ZERO? Need to # TODO: What to do about patterns starting with ZERO? Need
# adjust the start position. # to adjust the start position.
state.first = token_i state.first = token_i
state.second = pattern + 1 state.second = pattern + 1
partials.push_back(state) partials.push_back(state)
@ -413,7 +405,6 @@ cdef class Matcher:
on_match = self._callbacks.get(ent_id) on_match = self._callbacks.get(ent_id)
if on_match is not None: if on_match is not None:
on_match(self, doc, i, matches) on_match(self, doc, i, matches)
# TODO: only return (match_id, start, end)
return matches return matches
def _normalize_key(self, key): def _normalize_key(self, key):
@ -441,7 +432,8 @@ def get_bilou(length):
elif length == 8: elif length == 8:
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
elif length == 9: elif length == 9:
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT] return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
L9_ENT]
elif length == 10: elif length == 10:
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
I10_ENT, I10_ENT, L10_ENT] I10_ENT, I10_ENT, L10_ENT]
@ -454,10 +446,8 @@ cdef class PhraseMatcher:
cdef Vocab vocab cdef Vocab vocab
cdef Matcher matcher cdef Matcher matcher
cdef PreshMap phrase_ids cdef PreshMap phrase_ids
cdef int max_length cdef int max_length
cdef attr_t* _phrase_key cdef attr_t* _phrase_key
cdef public object _callbacks cdef public object _callbacks
cdef public object _patterns cdef public object _patterns
@ -470,7 +460,8 @@ cdef class PhraseMatcher:
self.phrase_ids = PreshMap() self.phrase_ids = PreshMap()
abstract_patterns = [] abstract_patterns = []
for length in range(1, max_length): for length in range(1, max_length):
abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) abstract_patterns.append([{tag: True}
for tag in get_bilou(length)])
self.matcher.add('Candidate', None, *abstract_patterns) self.matcher.add('Candidate', None, *abstract_patterns)
self._callbacks = {} self._callbacks = {}
@ -496,8 +487,8 @@ cdef class PhraseMatcher:
return (self.__class__, (self.vocab,), None, None) return (self.__class__, (self.vocab,), None, None)
def add(self, key, on_match, *docs): def add(self, key, on_match, *docs):
"""Add a match-rule to the matcher. A match-rule consists of: an ID key, """Add a match-rule to the matcher. A match-rule consists of: an ID
an on_match callback, and one or more patterns. key, an on_match callback, and one or more patterns.
key (unicode): The match ID. key (unicode): The match ID.
on_match (callable): Callback executed on match. on_match (callable): Callback executed on match.
@ -513,7 +504,6 @@ cdef class PhraseMatcher:
raise ValueError(msg % (len(doc), self.max_length)) raise ValueError(msg % (len(doc), self.max_length))
cdef hash_t ent_id = self.matcher._normalize_key(key) cdef hash_t ent_id = self.matcher._normalize_key(key)
self._callbacks[ent_id] = on_match self._callbacks[ent_id] = on_match
cdef int length cdef int length
cdef int i cdef int i
cdef hash_t phrase_hash cdef hash_t phrase_hash
@ -553,9 +543,9 @@ cdef class PhraseMatcher:
"""Match a stream of documents, yielding them in turn. """Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents. docs (iterable): A stream of documents.
batch_size (int): The number of documents to accumulate into a working set. batch_size (int): Number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer n_threads (int): The number of threads with which to work on the buffer
in parallel, if the `Matcher` implementation supports multi-threading. in parallel, if the implementation supports multi-threading.
YIELDS (Doc): Documents, in order. YIELDS (Doc): Documents, in order.
""" """
for doc in stream: for doc in stream:
@ -569,7 +559,8 @@ cdef class PhraseMatcher:
self._phrase_key[i] = 0 self._phrase_key[i] = 0
for i, j in enumerate(range(start, end)): for i, j in enumerate(range(start, end)):
self._phrase_key[i] = doc.c[j].lex.orth self._phrase_key[i] = doc.c[j].lex.orth
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) cdef hash_t key = hash64(self._phrase_key,
self.max_length * sizeof(attr_t), 0)
ent_id = <hash_t>self.phrase_ids.get(key) ent_id = <hash_t>self.phrase_ids.get(key)
if ent_id == 0: if ent_id == 0:
return None return None

View File

@ -4,17 +4,15 @@ from __future__ import unicode_literals
from libc.string cimport memset from libc.string cimport memset
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
from .attrs cimport POS, IS_SPACE from .attrs cimport POS, IS_SPACE
from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .attrs import LEMMA, intify_attrs
def _normalize_props(props): def _normalize_props(props):
""" """Transform deprecated string keys to correct names."""
Transform deprecated string keys to correct names.
"""
out = {} out = {}
for key, value in props.items(): for key, value in props.items():
if key == POS: if key == POS:
@ -77,7 +75,8 @@ cdef class Morphology:
cdef int assign_untagged(self, TokenC* token) except -1: cdef int assign_untagged(self, TokenC* token) except -1:
"""Set morphological attributes on a token without a POS tag. Uses """Set morphological attributes on a token without a POS tag. Uses
the lemmatizer's lookup() method, which looks up the string in the the lemmatizer's lookup() method, which looks up the string in the
table provided by the language data as lemma_lookup (if available).""" table provided by the language data as lemma_lookup (if available).
"""
if token.lemma == 0: if token.lemma == 0:
orth_str = self.strings[token.lex.orth] orth_str = self.strings[token.lex.orth]
lemma = self.lemmatizer.lookup(orth_str) lemma = self.lemmatizer.lookup(orth_str)
@ -95,11 +94,10 @@ cdef class Morphology:
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id > self.n_tags: if tag_id > self.n_tags:
raise ValueError("Unknown tag ID: %s" % tag_id) raise ValueError("Unknown tag ID: %s" % tag_id)
# TODO: It's pretty arbitrary to put this logic here. I guess the justification # TODO: It's pretty arbitrary to put this logic here. I guess the
# is that this is where the specific word and the tag interact. Still, # justification is that this is where the specific word and the tag
# we should have a better way to enforce this rule, or figure out why # interact. Still, we should have a better way to enforce this rule, or
# the statistical model fails. # figure out why the statistical model fails. Related to Issue #220
# Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE): if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings.add('_SP')] tag_id = self.reverse_index[self.strings.add('_SP')]
rich_tag = self.rich_tags[tag_id] rich_tag = self.rich_tags[tag_id]
@ -123,12 +121,11 @@ cdef class Morphology:
else: else:
flags[0] &= ~(one << flag_id) flags[0] &= ~(one << flag_id)
def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False): def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
""" force=False):
Add a special-case rule to the morphological analyser. Tokens whose """Add a special-case rule to the morphological analyser. Tokens whose
tag and orth match the rule will receive the specified properties. tag and orth match the rule will receive the specified properties.
Arguments:
tag (unicode): The part-of-speech tag to key the exception. tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception. orth (unicode): The word-form to key the exception.
""" """
@ -144,10 +141,9 @@ cdef class Morphology:
elif force: elif force:
memset(cached, 0, sizeof(cached[0])) memset(cached, 0, sizeof(cached[0]))
else: else:
msg = ("Conflicting morphology exception for (%s, %s). Use force=True " raise ValueError(
"to overwrite.") "Conflicting morphology exception for (%s, %s). Use "
msg = msg % (tag_str, orth_str) "force=True to overwrite." % (tag_str, orth_str))
raise ValueError(msg)
cached.tag = rich_tag cached.tag = rich_tag
# TODO: Refactor this to take arbitrary attributes. # TODO: Refactor this to take arbitrary attributes.

View File

@ -3,26 +3,17 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from thinc.api import chain, layerize, with_getitem
import numpy import numpy
cimport numpy as np cimport numpy as np
import cytoolz import cytoolz
import util
from collections import OrderedDict from collections import OrderedDict
import ujson import ujson
import msgpack import msgpack
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten from thinc.api import chain
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU from thinc.v2v import Softmax
from thinc.i2v import HashEmbed from thinc.t2v import Pooling, max_pool, mean_pool
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.misc import Residual
from thinc.misc import BatchNorm as BN
from thinc.misc import LayerNorm as LN
from thinc.neural.util import to_categorical from thinc.neural.util import to_categorical
from thinc.neural._classes.difference import Siamese, CauchySimilarity from thinc.neural._classes.difference import Siamese, CauchySimilarity
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
@ -30,29 +21,23 @@ from .syntax.nn_parser cimport Parser
from .syntax import nonproj from .syntax import nonproj
from .syntax.ner cimport BiluoPushDown from .syntax.ner cimport BiluoPushDown
from .syntax.arc_eager cimport ArcEager from .syntax.arc_eager cimport ArcEager
from .tagger import Tagger
from .syntax.stateclass cimport StateClass
from .gold cimport GoldParse
from .morphology cimport Morphology from .morphology cimport Morphology
from .vocab cimport Vocab from .vocab cimport Vocab
from .syntax import nonproj from .syntax import nonproj
from .compat import json_dumps from .compat import json_dumps
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS from .attrs import POS
from ._ml import rebatch, Tok2Vec, flatten
from ._ml import build_text_classifier, build_tagger_model
from ._ml import link_vectors_to_models
from .parts_of_speech import X from .parts_of_speech import X
from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
from ._ml import link_vectors_to_models
from . import util
class SentenceSegmenter(object): class SentenceSegmenter(object):
"""A simple spaCy hook, to allow custom sentence boundary detection logic """A simple spaCy hook, to allow custom sentence boundary detection logic
(that doesn't require the dependency parse). (that doesn't require the dependency parse). To change the sentence
boundary detection strategy, pass a generator function `strategy` on
To change the sentence boundary detection strategy, pass a generator initialization, or assign a new strategy to the .strategy attribute.
function `strategy` on initialization, or assign a new strategy to
the .strategy attribute.
Sentence detection strategies should be generators that take `Doc` objects Sentence detection strategies should be generators that take `Doc` objects
and yield `Span` objects for each sentence. and yield `Span` objects for each sentence.
""" """
@ -84,6 +69,10 @@ class SentenceSegmenter(object):
class Pipe(object): class Pipe(object):
"""This class is not instantiated directly. Components inherit from it, and
it defines the interface that components should follow to function as
components in a spaCy analysis pipeline.
"""
name = None name = None
@classmethod @classmethod
@ -149,8 +138,7 @@ class Pipe(object):
link_vectors_to_models(self.vocab) link_vectors_to_models(self.vocab)
def use_params(self, params): def use_params(self, params):
"""Modify the pipe's model, to use the given parameter values. """Modify the pipe's model, to use the given parameter values."""
"""
with self.model.use_params(params): with self.model.use_params(params):
yield yield
@ -235,8 +223,8 @@ class Tensorizer(Pipe):
"""Construct a new statistical model. Weights are not allocated on """Construct a new statistical model. Weights are not allocated on
initialisation. initialisation.
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab` vocab (Vocab): A `Vocab` instance. The model must share the same
instance with the `Doc` objects it will process. `Vocab` instance with the `Doc` objects it will process.
model (Model): A `Model` instance or `True` to allocate one later. model (Model): A `Model` instance or `True` to allocate one later.
**cfg: Config parameters. **cfg: Config parameters.
@ -280,7 +268,7 @@ class Tensorizer(Pipe):
"""Return a single tensor for a batch of documents. """Return a single tensor for a batch of documents.
docs (iterable): A sequence of `Doc` objects. docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the documents. RETURNS (object): Vector representations for each token in the docs.
""" """
tokvecs = self.model(docs) tokvecs = self.model(docs)
return tokvecs return tokvecs
@ -289,7 +277,7 @@ class Tensorizer(Pipe):
"""Set the tensor attribute for a batch of documents. """Set the tensor attribute for a batch of documents.
docs (iterable): A sequence of `Doc` objects. docs (iterable): A sequence of `Doc` objects.
tokvecs (object): Vector representation for each token in the documents. tokvecs (object): Vector representation for each token in the docs.
""" """
for doc, tokvecs in zip(docs, tokvecses): for doc, tokvecs in zip(docs, tokvecses):
assert tokvecs.shape[0] == len(doc) assert tokvecs.shape[0] == len(doc)
@ -328,12 +316,14 @@ class Tensorizer(Pipe):
class Tagger(Pipe): class Tagger(Pipe):
name = 'tagger' name = 'tagger'
def __init__(self, vocab, model=True, **cfg): def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab self.vocab = vocab
self.model = model self.model = model
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2) self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1]) self.cfg.setdefault('pretrained_dims',
self.vocab.vectors.data.shape[1])
def __call__(self, doc): def __call__(self, doc):
tags = self.predict([doc]) tags = self.predict([doc])
@ -353,8 +343,7 @@ class Tagger(Pipe):
guesses = scores.argmax(axis=1) guesses = scores.argmax(axis=1)
if not isinstance(guesses, numpy.ndarray): if not isinstance(guesses, numpy.ndarray):
guesses = guesses.get() guesses = guesses.get()
guesses = self.model.ops.unflatten(guesses, guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs])
[len(d) for d in docs])
return guesses return guesses
def set_annotations(self, docs, batch_tag_ids): def set_annotations(self, docs, batch_tag_ids):
@ -387,8 +376,8 @@ class Tagger(Pipe):
def get_loss(self, docs, golds, scores): def get_loss(self, docs, golds, scores):
scores = self.model.ops.flatten(scores) scores = self.model.ops.flatten(scores)
tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)} tag_index = {tag: i
for i, tag in enumerate(self.vocab.morphology.tag_names)}
cdef int idx = 0 cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype='i') correct = numpy.zeros((scores.shape[0],), dtype='i')
guesses = scores.argmax(axis=1) guesses = scores.argmax(axis=1)
@ -443,17 +432,18 @@ class Tagger(Pipe):
serialize['model'] = self.model.to_bytes serialize['model'] = self.model.to_bytes
serialize['vocab'] = self.vocab.to_bytes serialize['vocab'] = self.vocab.to_bytes
serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map, serialize['tag_map'] = lambda: msgpack.dumps(
use_bin_type=True, self.vocab.morphology.tag_map, use_bin_type=True, encoding='utf8')
encoding='utf8')
return util.to_bytes(serialize, exclude) return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, **exclude):
def load_model(b): def load_model(b):
if self.model is True: if self.model is True:
token_vector_width = util.env_opt('token_vector_width', token_vector_width = util.env_opt(
'token_vector_width',
self.cfg.get('token_vector_width', 128)) self.cfg.get('token_vector_width', 128))
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) self.model = self.Model(self.vocab.morphology.n_tags,
**self.cfg)
self.model.from_bytes(b) self.model.from_bytes(b)
def load_tag_map(b): def load_tag_map(b):
@ -509,11 +499,11 @@ class Tagger(Pipe):
class MultitaskObjective(Tagger): class MultitaskObjective(Tagger):
'''Assist training of a parser or tagger, by training a side-objective. """Experimental: Assist training of a parser or tagger, by training a
side-objective.
Experimental """
'''
name = 'nn_labeller' name = 'nn_labeller'
def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg): def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
self.vocab = vocab self.vocab = vocab
self.model = model self.model = model
@ -530,12 +520,12 @@ class MultitaskObjective(Tagger):
elif hasattr(target, '__call__'): elif hasattr(target, '__call__'):
self.make_label = target self.make_label = target
else: else:
raise ValueError( raise ValueError("MultitaskObjective target should be function or "
"MultitaskObjective target should be function or one of " "one of: dep, tag, ent, dep_tag_offset, ent_tag.")
"['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2) self.cfg.setdefault('cnn_maxout_pieces', 2)
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1]) self.cfg.setdefault('pretrained_dims',
self.vocab.vectors.data.shape[1])
@property @property
def labels(self): def labels(self):
@ -623,20 +613,19 @@ class MultitaskObjective(Tagger):
class SimilarityHook(Pipe): class SimilarityHook(Pipe):
""" """
Experimental Experimental: A pipeline component to install a hook for supervised
similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
documents. The similarity model can be any object obeying the Thinc `Model`
interface. By default, the model concatenates the elementwise mean and
elementwise max of the two tensors, and compares them using the
Cauchy-like similarity function from Chen (2013):
A pipeline component to install a hook for supervised similarity into >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
Doc objects. Requires a Tensorizer to pre-process documents. The similarity
model can be any object obeying the Thinc Model interface. By default,
the model concatenates the elementwise mean and elementwise max of the two
tensors, and compares them using the Cauchy-like similarity function
from Chen (2013):
similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
Where W is a vector of dimension weights, initialized to 1. Where W is a vector of dimension weights, initialized to 1.
""" """
name = 'similarity' name = 'similarity'
def __init__(self, vocab, model=True, **cfg): def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab self.vocab = vocab
self.model = model self.model = model
@ -662,8 +651,7 @@ class SimilarityHook(Pipe):
sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop) sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
def begin_training(self, _=tuple(), pipeline=None): def begin_training(self, _=tuple(), pipeline=None):
""" """Allocate model, using width from tensorizer in pipeline.
Allocate model, using width from tensorizer in pipeline.
gold_tuples (iterable): Gold-standard training data. gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of. pipeline (list): The pipeline the model is part of.
@ -763,12 +751,14 @@ cdef class DependencyParser(Parser):
for target in []: for target in []:
labeller = MultitaskObjective(self.vocab, target=target) labeller = MultitaskObjective(self.vocab, target=target)
tok2vec = self.model[0] tok2vec = self.model[0]
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec) labeller.begin_training(gold_tuples, pipeline=pipeline,
tok2vec=tok2vec)
pipeline.append(labeller) pipeline.append(labeller)
self._multitasks.append(labeller) self._multitasks.append(labeller)
def __reduce__(self): def __reduce__(self):
return (DependencyParser, (self.vocab, self.moves, self.model), None, None) return (DependencyParser, (self.vocab, self.moves, self.model),
None, None)
cdef class EntityRecognizer(Parser): cdef class EntityRecognizer(Parser):
@ -781,12 +771,14 @@ cdef class EntityRecognizer(Parser):
for target in []: for target in []:
labeller = MultitaskObjective(self.vocab, target=target) labeller = MultitaskObjective(self.vocab, target=target)
tok2vec = self.model[0] tok2vec = self.model[0]
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec) labeller.begin_training(gold_tuples, pipeline=pipeline,
tok2vec=tok2vec)
pipeline.append(labeller) pipeline.append(labeller)
self._multitasks.append(labeller) self._multitasks.append(labeller)
def __reduce__(self): def __reduce__(self):
return (EntityRecognizer, (self.vocab, self.moves, self.model), None, None) return (EntityRecognizer, (self.vocab, self.moves, self.model),
None, None)
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer'] __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']

View File

@ -74,18 +74,21 @@ class Scorer(object):
@property @property
def scores(self): def scores(self):
return { return {
'uas': self.uas, 'las': self.las, 'uas': self.uas,
'ents_p': self.ents_p, 'ents_r': self.ents_r, 'ents_f': self.ents_f, 'las': self.las,
'ents_p': self.ents_p,
'ents_r': self.ents_r,
'ents_f': self.ents_f,
'tags_acc': self.tags_acc, 'tags_acc': self.tags_acc,
'token_acc': self.token_acc 'token_acc': self.token_acc
} }
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')): def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
assert len(tokens) == len(gold) assert len(tokens) == len(gold)
gold_deps = set() gold_deps = set()
gold_tags = set() gold_tags = set()
gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot])) gold_ents = set(tags_to_entities([annot[-1]
for annot in gold.orig_annot]))
for id_, word, tag, head, dep, ner in gold.orig_annot: for id_, word, tag, head, dep, ner in gold.orig_annot:
gold_tags.add((id_, tag)) gold_tags.add((id_, tag))
if dep not in (None, "") and dep.lower() not in punct_labels: if dep not in (None, "") and dep.lower() not in punct_labels:

View File

@ -4,19 +4,15 @@ from __future__ import unicode_literals, absolute_import
cimport cython cimport cython
from libc.string cimport memcpy from libc.string cimport memcpy
from libc.stdint cimport uint64_t, uint32_t
from murmurhash.mrmr cimport hash64, hash32
from preshed.maps cimport map_iter, key_t
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
from murmurhash.mrmr cimport hash64, hash32
import ujson import ujson
import dill
from .symbols import IDS as SYMBOLS_BY_STR from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT from .symbols import NAMES as SYMBOLS_BY_INT
from .typedefs cimport hash_t from .typedefs cimport hash_t
from . import util
from .compat import json_dumps from .compat import json_dumps
from . import util
cpdef hash_t hash_string(unicode string) except 0: cpdef hash_t hash_string(unicode string) except 0:
@ -195,7 +191,7 @@ cdef class StringStore:
"""Save the current state to a directory. """Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects. it doesn't exist. Paths may be either strings or Path-like objects.
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
strings = list(self) strings = list(self)
@ -225,7 +221,7 @@ cdef class StringStore:
**exclude: Named attributes to prevent from being serialized. **exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `StringStore` object. RETURNS (bytes): The serialized form of the `StringStore` object.
""" """
return ujson.dumps(list(self)) return json_dumps(list(self))
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string. """Load state from a binary string.

View File

@ -1,8 +1,8 @@
# coding: utf8 # coding: utf8
#cython: optimize.unpack_method_calls=False #cython: optimize.unpack_method_calls=False
from __future__ import unicode_literals from __future__ import unicode_literals
IDS = { IDS = {
"": NIL, "": NIL,
"IS_ALPHA": IS_ALPHA, "IS_ALPHA": IS_ALPHA,
@ -464,9 +464,11 @@ IDS = {
"LAW": LAW "LAW": LAW
} }
def sort_nums(x): def sort_nums(x):
return x[1] return x[1]
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)] NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
# Unfortunate hack here, to work around problem with long cpdef enum # Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+) # (which is generating an enormous amount of C++ in Cython 0.24+)

View File

@ -2,7 +2,7 @@
# cython: profile=True # cython: profile=True
cimport numpy as np cimport numpy as np
import numpy import numpy
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cpython.ref cimport PyObject, Py_XDECREF
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation from thinc.extra.search import MaxViolation
from thinc.typedefs cimport hash_t, class_t from thinc.typedefs cimport hash_t, class_t
@ -11,7 +11,6 @@ from thinc.extra.search cimport MaxViolation
from .transition_system cimport TransitionSystem, Transition from .transition_system cimport TransitionSystem, Transition
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ..gold cimport GoldParse from ..gold cimport GoldParse
from ..tokens.doc cimport Doc
# These are passed as callbacks to thinc.search.Beam # These are passed as callbacks to thinc.search.Beam
@ -59,7 +58,8 @@ cdef class ParserBeam(object):
cdef StateClass state, st cdef StateClass state, st
for state in states: for state in states:
beam = Beam(self.moves.n_moves, width, density) beam = Beam(self.moves.n_moves, width, density)
beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent) beam.initialize(self.moves.init_beam_state, state.c.length,
state.c._sent)
for i in range(beam.width): for i in range(beam.width):
st = <StateClass>beam.at(i) st = <StateClass>beam.at(i)
st.c.offset = state.c.offset st.c.offset = state.c.offset
@ -74,7 +74,8 @@ cdef class ParserBeam(object):
@property @property
def is_done(self): def is_done(self):
return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams)) return all(b.is_done or self.dones[i]
for i, b in enumerate(self.beams))
def __getitem__(self, i): def __getitem__(self, i):
return self.beams[i] return self.beams[i]
@ -126,7 +127,8 @@ cdef class ParserBeam(object):
for i in range(beam.size): for i in range(beam.size):
state = <StateClass>beam.at(i) state = <StateClass>beam.at(i)
if not state.c.is_final(): if not state.c.is_final():
self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold) self.moves.set_costs(beam.is_valid[i], beam.costs[i],
state, gold)
if follow_gold: if follow_gold:
for j in range(beam.nr_class): for j in range(beam.nr_class):
if beam.costs[i][j] >= 1: if beam.costs[i][j] >= 1:
@ -146,7 +148,10 @@ def get_token_ids(states, int n_tokens):
c_ids += ids.shape[1] c_ids += ids.shape[1]
return ids return ids
nr_update = 0 nr_update = 0
def update_beam(TransitionSystem moves, int nr_feature, int max_steps, def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
states, golds, states, golds,
state2vec, vec2scores, state2vec, vec2scores,
@ -167,23 +172,27 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
if pbeam.is_done and gbeam.is_done: if pbeam.is_done and gbeam.is_done:
break break
# The beam maps let us find the right row in the flattened scores # The beam maps let us find the right row in the flattened scores
# arrays for each state. States are identified by (example id, history). # arrays for each state. States are identified by (example id,
# We keep a different beam map for each step (since we'll have a flat # history). We keep a different beam map for each step (since we'll
# scores array for each step). The beam map will let us take the per-state # have a flat scores array for each step). The beam map will let us
# losses, and compute the gradient for each (step, state, class). # take the per-state losses, and compute the gradient for each (step,
# state, class).
beam_maps.append({}) beam_maps.append({})
# Gather all states from the two beams in a list. Some states may occur # Gather all states from the two beams in a list. Some states may occur
# in both beams. To figure out which beam each state belonged to, # in both beams. To figure out which beam each state belonged to,
# we keep two lists of indices, p_indices and g_indices # we keep two lists of indices, p_indices and g_indices
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update) states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1],
nr_update)
if not states: if not states:
break break
# Now that we have our flat list of states, feed them through the model # Now that we have our flat list of states, feed them through the model
token_ids = get_token_ids(states, nr_feature) token_ids = get_token_ids(states, nr_feature)
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
if hist_feats: if hist_feats:
hists = numpy.asarray([st.history[:hist_feats] for st in states], dtype='i') hists = numpy.asarray([st.history[:hist_feats] for st in states],
scores, bp_scores = vec2scores.begin_update((vectors, hists), drop=drop) dtype='i')
scores, bp_scores = vec2scores.begin_update((vectors, hists),
drop=drop)
else: else:
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
@ -192,8 +201,10 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
# Unpack the flat scores into lists for the two beams. The indices arrays # Unpack the flat scores into lists for the two beams. The indices arrays
# tell us which example and state the scores-row refers to. # tell us which example and state the scores-row refers to.
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices] p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices] for indices in p_indices]
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
for indices in g_indices]
# Now advance the states in the beams. The gold beam is constrained # Now advance the states in the beams. The gold beam is constrained
# to follow only gold analyses. # to follow only gold analyses.
pbeam.advance(p_scores) pbeam.advance(p_scores)
@ -249,8 +260,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
def get_gradient(nr_class, beam_maps, histories, losses): def get_gradient(nr_class, beam_maps, histories, losses):
""" """The global model assigns a loss to each parse. The beam scores
The global model assigns a loss to each parse. The beam scores
are additive, so the same gradient is applied to each action are additive, so the same gradient is applied to each action
in the history. This gives the gradient of a single *action* in the history. This gives the gradient of a single *action*
for a beam state -- so we have "the gradient of loss for taking for a beam state -- so we have "the gradient of loss for taking
@ -270,7 +280,8 @@ def get_gradient(nr_class, beam_maps, histories, losses):
if loss != 0.0 and not numpy.isnan(loss): if loss != 0.0 and not numpy.isnan(loss):
nr_step = max(nr_step, len(hist)) nr_step = max(nr_step, len(hist))
for i in range(nr_step): for i in range(nr_step):
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f')) grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
dtype='f'))
assert len(histories) == len(losses) assert len(histories) == len(losses)
for eg_id, hists in enumerate(histories): for eg_id, hists in enumerate(histories):
for loss, hist in zip(losses[eg_id], hists): for loss, hist in zip(losses[eg_id], hists):
@ -287,5 +298,3 @@ def get_gradient(nr_class, beam_maps, histories, losses):
grads[j][i, clas] += loss grads[j][i, clas] += loss
key = key + tuple([clas]) key = key + tuple([clas])
return grads return grads

View File

@ -1 +0,0 @@
# test

View File

@ -4,24 +4,16 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cpython.ref cimport Py_INCREF
import ctypes
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from collections import OrderedDict from collections import OrderedDict
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
import numpy
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token from ._state cimport StateC
from .nonproj import is_nonproj_tree from .nonproj import is_nonproj_tree
from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse from ..gold cimport GoldParse, GoldParseC
from ..gold cimport GoldParseC
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
from ..lexeme cimport Lexeme
from ..structs cimport TokenC from ..structs cimport TokenC
@ -316,14 +308,13 @@ cdef class ArcEager(TransitionSystem):
@classmethod @classmethod
def get_actions(cls, **kwargs): def get_actions(cls, **kwargs):
actions = kwargs.get('actions', actions = kwargs.get('actions', OrderedDict((
OrderedDict((
(SHIFT, ['']), (SHIFT, ['']),
(REDUCE, ['']), (REDUCE, ['']),
(RIGHT, []), (RIGHT, []),
(LEFT, []), (LEFT, []),
(BREAK, ['ROOT']) (BREAK, ['ROOT']))
))) ))
seen_actions = set() seen_actions = set()
for label in kwargs.get('left_labels', []): for label in kwargs.get('left_labels', []):
if label.upper() != 'ROOT': if label.upper() != 'ROOT':
@ -363,7 +354,8 @@ cdef class ArcEager(TransitionSystem):
if gold.cand_to_gold[i] is None: if gold.cand_to_gold[i] is None:
continue continue
if state.safe_get(i).dep: if state.safe_get(i).dep:
predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep])) predicted.add((i, state.H(i),
self.strings[state.safe_get(i).dep]))
else: else:
predicted.add((i, state.H(i), 'ROOT')) predicted.add((i, state.H(i), 'ROOT'))
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
@ -381,7 +373,8 @@ cdef class ArcEager(TransitionSystem):
if not self.has_gold(gold): if not self.has_gold(gold):
return None return None
for i in range(gold.length): for i in range(gold.length):
if gold.heads[i] is None or gold.labels[i] is None: # Missing values # Missing values
if gold.heads[i] is None or gold.labels[i] is None:
gold.c.heads[i] = i gold.c.heads[i] = i
gold.c.has_dep[i] = False gold.c.has_dep[i] = False
else: else:
@ -517,14 +510,15 @@ cdef class ArcEager(TransitionSystem):
# Check projectivity --- leading cause # Check projectivity --- leading cause
if is_nonproj_tree(gold.heads): if is_nonproj_tree(gold.heads):
raise ValueError( raise ValueError(
"Could not find a gold-standard action to supervise the dependency " "Could not find a gold-standard action to supervise the "
"parser.\n" "dependency parser. Likely cause: the tree is "
"Likely cause: the tree is non-projective (i.e. it has crossing " "non-projective (i.e. it has crossing arcs -- see "
"arcs -- see spacy/syntax/nonproj.pyx for definitions)\n" "spacy/syntax/nonproj.pyx for definitions). The ArcEager "
"The ArcEager transition system only supports projective trees.\n" "transition system only supports projective trees. To "
"To learn non-projective representations, transform the data " "learn non-projective representations, transform the data "
"before training and after parsing. Either pass make_projective=True " "before training and after parsing. Either pass "
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data") "make_projective=True to the GoldParse class, or use "
"spacy.syntax.nonproj.preprocess_training_data.")
else: else:
print(gold.orig_annot) print(gold.orig_annot)
print(gold.words) print(gold.words)
@ -532,12 +526,10 @@ cdef class ArcEager(TransitionSystem):
print(gold.labels) print(gold.labels)
print(gold.sent_starts) print(gold.sent_starts)
raise ValueError( raise ValueError(
"Could not find a gold-standard action to supervise the dependency " "Could not find a gold-standard action to supervise the"
"parser.\n" "dependency parser. The GoldParse was projective. The "
"The GoldParse was projective.\n" "transition system has %d actions. State at failure: %s"
"The transition system has %d actions.\n" % (self.n_moves, stcls.print_state(gold.words)))
"State at failure:\n"
"%s" % (self.n_moves, stcls.print_state(gold.words)))
assert n_gold >= 1 assert n_gold >= 1
def get_beam_annot(self, Beam beam): def get_beam_annot(self, Beam beam):
@ -558,4 +550,3 @@ cdef class ArcEager(TransitionSystem):
deps[j].setdefault(dep, 0.0) deps[j].setdefault(dep, 0.0)
deps[j][dep] += prob deps[j][dep] += prob
return heads, deps return heads, deps

View File

@ -1,144 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX
def english_noun_chunks(obj):
    """Detect base noun phrases from a dependency parse.

    Works on both Doc and Span. Yields one
    (word.left_edge.i, word.i + 1, np_label) triple per detected chunk.
    """
    doc = obj.doc  # Both Doc and Span give access to the document here.
    strings = doc.vocab.strings
    np_deps = [strings.add(name) for name in
               ('nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'ROOT')]
    conj = strings.add('conj')
    np_label = strings.add('NP')
    seen = set()
    for word in obj:
        # Only nominal words that are not already covered by an emitted
        # chunk may head a new one (prevents nested chunks).
        if word.pos not in (NOUN, PROPN, PRON) or word.i in seen:
            continue
        if word.dep not in np_deps:
            if word.dep != conj:
                continue
            # Walk up through chained conjuncts whose heads lie to the left.
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep not in np_deps:
                continue
        if any(w.i in seen for w in word.subtree):
            continue
        start, end = word.left_edge.i, word.i + 1
        seen.update(range(start, end))
        yield start, end, np_label
# This iterator extracts spans headed by a noun, running from the noun's
# left-most syntactic dependent up to the noun itself. For close apposition
# and measurement constructions, the span is sometimes extended to the right
# of the noun. Example: "eine Tasse Tee" (a cup (of) tea) returns
# "eine Tasse Tee" and not just "eine Tasse"; same for "das Thema Familie".
def german_noun_chunks(obj):
    """Yield (word.left_edge.i, rbracket, np_label) triples for noun chunks.

    Works on both Doc and Span.
    """
    doc = obj.doc  # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    np_label = strings.add('NP')
    np_deps = {strings.add(name) for name in
               ('sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj',
                'pd', 'og', 'app')}
    close_app = strings.add('nk')
    rbracket = 0
    # NOTE(review): `i` counts positions within `obj`, while `rbracket` is
    # built from `word.i`; these presumably coincide only when `obj` starts
    # at the first token -- confirm this is intended for Span input.
    for i, word in enumerate(obj):
        if i < rbracket:
            continue
        if word.pos not in (NOUN, PROPN, PRON) or word.dep not in np_deps:
            continue
        rbracket = word.i + 1
        # Try to extend the span to the right to capture close
        # apposition/measurement constructions.
        for rdep in doc[word.i].rights:
            if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:
                rbracket = rdep.i + 1
        yield word.left_edge.i, rbracket, np_label
def es_noun_chunks(obj):
    """Yield (left.i, right.i + 1, np_label) triples for noun chunks.

    A chunk is built around each PROPN/NOUN/PRON token: it is extended to the
    left over dependents labelled det/fixed/neg, and to the right over
    dependents labelled flat/fixed/compound/neg, stopping when the extension
    would cross a verb/auxiliary or a punct dependency.

    NOTE(review): scanning always starts at doc[0] of the parent document and
    advances with Token.nbor(), so a Span argument is only used to reach its
    document -- confirm this is intended.
    """
    doc = obj.doc
    np_label = doc.vocab.strings['NP']
    left_labels = ['det', 'fixed', 'neg']  # ['nunmod', 'det', 'appos', 'fixed']
    right_labels = ['flat', 'fixed', 'compound', 'neg']
    stop_labels = ['punct']
    np_left_deps = [doc.vocab.strings[label] for label in left_labels]
    np_right_deps = [doc.vocab.strings[label] for label in right_labels]
    stop_deps = [doc.vocab.strings[label] for label in stop_labels]

    def next_token(token):
        """Return the token following `token`, or None at the end."""
        try:
            return token.nbor()
        except IndexError:
            # Only the "no neighbour" case is treated as end-of-document;
            # a bare `except:` would also hide KeyboardInterrupt and real bugs.
            # nbor() is expected to raise IndexError past the last token --
            # confirm against Token.nbor.
            return None

    def noun_bounds(root):
        """Return the (left-most, right-most) tokens of the chunk at `root`."""
        def is_verb_token(token):
            return token.pos in (VERB, AUX)

        left_bound = root
        # Iterating the left dependents in reverse leaves the left-most
        # matching dependent as the final value of left_bound.
        for token in reversed(list(root.lefts)):
            if token.dep in np_left_deps:
                left_bound = token
        right_bound = root
        for token in root.rights:
            if token.dep in np_right_deps:
                _, right = noun_bounds(token)
                # Stop extending once a verb/auxiliary or a stop dependency
                # lies between the current left bound and the candidate.
                if any(is_verb_token(t) or t.dep in stop_deps
                       for t in doc[left_bound.i: right.i]):
                    break
                right_bound = right
        return left_bound, right_bound

    # Nothing to scan in an empty document; avoid indexing doc[0].
    if len(doc) == 0:
        return
    token = doc[0]
    while token and token.i < len(doc):
        if token.pos in (PROPN, NOUN, PRON):
            left, right = noun_bounds(token)
            yield left.i, right.i + 1, np_label
            # Resume scanning after the chunk that was just emitted.
            token = right
        token = next_token(token)
def french_noun_chunks(obj):
    """Detect noun phrases from a dependency parse.

    Works on both Doc and Span. Yields one
    (word.left_edge.i, word.right_edge.i + 1, np_label) triple per chunk.
    """
    doc = obj.doc  # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    np_deps = [strings[name] for name in
               ('nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos',
                'nmod', 'nmod:poss')]
    conj = strings.add('conj')
    np_label = strings.add('NP')
    seen = set()
    for word in obj:
        # Only nominal words that are not already covered by an emitted
        # chunk may head a new one (prevents nested chunks).
        if word.pos not in (NOUN, PROPN, PRON) or word.i in seen:
            continue
        if word.dep not in np_deps:
            if word.dep != conj:
                continue
            # Walk up through chained conjuncts whose heads lie to the left.
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            if head.dep not in np_deps:
                continue
        if any(w.i in seen for w in word.subtree):
            continue
        start, end = word.left_edge.i, word.right_edge.i + 1
        seen.update(range(start, end))
        yield start, end, np_label
# Registry mapping a language code to the noun-chunk iterator defined above.
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
            'es': es_noun_chunks, 'fr': french_noun_chunks}

View File

@ -4,17 +4,12 @@ from __future__ import unicode_literals
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
from collections import OrderedDict from collections import OrderedDict
import numpy
from thinc.neural.ops import NumpyOps
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
from .transition_system cimport Transition from .transition_system cimport Transition
from .transition_system cimport do_func_t from .transition_system cimport do_func_t
from ..structs cimport TokenC, Entity from ..gold cimport GoldParseC, GoldParse
from ..gold cimport GoldParseC
from ..gold cimport GoldParse
from ..attrs cimport ENT_TYPE, ENT_IOB
cdef enum: cdef enum:
@ -69,8 +64,7 @@ cdef class BiluoPushDown(TransitionSystem):
@classmethod @classmethod
def get_actions(cls, **kwargs): def get_actions(cls, **kwargs):
actions = kwargs.get('actions', actions = kwargs.get('actions', OrderedDict((
OrderedDict((
(MISSING, ['']), (MISSING, ['']),
(BEGIN, []), (BEGIN, []),
(IN, []), (IN, []),
@ -160,7 +154,7 @@ cdef class BiluoPushDown(TransitionSystem):
cdef Transition lookup_transition(self, object name) except *: cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label cdef attr_t label
if name == '-' or name == None: if name == '-' or name is None:
return Transition(clas=0, move=MISSING, label=0, score=0) return Transition(clas=0, move=MISSING, label=0, score=0)
elif name == '!O': elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0) return Transition(clas=0, move=ISNT, label=0, score=0)
@ -328,8 +322,8 @@ cdef class In:
return False return False
elif preset_ent_iob == 3: elif preset_ent_iob == 3:
return False return False
# TODO: Is this quite right? # TODO: Is this quite right? I think it's supposed to be ensuring the
# I think it's supposed to be ensuring the gazetteer matches are maintained # gazetteer matches are maintained
elif st.B_(1).ent_iob != preset_ent_iob: elif st.B_(1).ent_iob != preset_ent_iob:
return False return False
# Don't allow entities to extend across sentence boundaries # Don't allow entities to extend across sentence boundaries
@ -354,10 +348,12 @@ cdef class In:
if g_act == MISSING: if g_act == MISSING:
return 0 return 0
elif g_act == BEGIN: elif g_act == BEGIN:
# I, Gold B --> True (P of bad open entity sunk, R of this entity sunk) # I, Gold B --> True
# (P of bad open entity sunk, R of this entity sunk)
return 0 return 0
elif g_act == IN: elif g_act == IN:
# I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk) # I, Gold I --> True
# (label forced by prev, if mismatch, P and R both sunk)
return 0 return 0
elif g_act == LAST: elif g_act == LAST:
# I, Gold L --> True iff this entity sunk and next tag == O # I, Gold L --> True iff this entity sunk and next tag == O
@ -505,11 +501,3 @@ cdef class Out:
return 1 return 1
else: else:
return 1 return 1
class OracleError(Exception):
pass
class UnknownMove(Exception):
pass

View File

@ -5,79 +5,55 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
from collections import Counter, OrderedDict from collections import OrderedDict
import ujson import ujson
import json import json
import contextlib
import numpy import numpy
from libc.math cimport exp
cimport cython
cimport cython.parallel cimport cython.parallel
import cytoolz import cytoolz
import dill
import numpy.random import numpy.random
cimport numpy as np cimport numpy as np
from cpython.ref cimport PyObject, Py_XDECREF
from libcpp.vector cimport vector
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from libc.stdint cimport uint32_t, uint64_t from libc.math cimport exp
from libc.string cimport memset, memcpy from libcpp.vector cimport vector
from libc.stdlib cimport malloc, calloc, free from libc.string cimport memset
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t from libc.stdlib cimport calloc, free
from thinc.linear.avgtron cimport AveragedPerceptron from cymem.cymem cimport Pool
from thinc.linalg cimport Vec, VecVec from thinc.typedefs cimport weight_t, class_t, hash_t
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
from thinc.extra.eg cimport Example
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
from thinc.api import chain, clone
from cymem.cymem cimport Pool, Address from thinc.v2v import Model, Maxout, Affine
from murmurhash.mrmr cimport hash64
from preshed.maps cimport MapStruct
from preshed.maps cimport map_get
from thinc.api import layerize, chain, clone, with_flatten
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
from thinc.misc import LayerNorm from thinc.misc import LayerNorm
from thinc.neural.ops import CupyOps
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module
from .. import util from .._ml import zero_init, PrecomputableMaxouts, Tok2Vec, flatten
from ..util import get_async, get_cuda_stream
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
from .._ml import Tok2Vec, doc2feats, rebatch
from .._ml import Residual, drop_layer, flatten
from .._ml import link_vectors_to_models from .._ml import link_vectors_to_models
from .._ml import HistoryFeatures
from ..compat import json_dumps, copy_array from ..compat import json_dumps, copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
from .. import util
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
from . import nonproj from .transition_system cimport Transition
from .transition_system import OracleError from . import _beam_utils, nonproj
from .transition_system cimport TransitionSystem, Transition
from ..structs cimport TokenC
from ..tokens.doc cimport Doc
from ..strings cimport StringStore
from ..gold cimport GoldParse
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
from . import _beam_utils
def get_templates(*args, **kwargs):
    """Return a new empty list; all arguments are accepted and ignored."""
    return list()
# Module-level debugging switch; off until `set_debug` changes it.
DEBUG = False


def set_debug(val):
    """Rebind the module-level `DEBUG` flag to `val`."""
    global DEBUG
    DEBUG = val
cdef class precompute_hiddens: cdef class precompute_hiddens:
'''Allow a model to be "primed" by pre-computing input features in bulk. """Allow a model to be "primed" by pre-computing input features in bulk.
This is used for the parser, where we want to take a batch of documents, This is used for the parser, where we want to take a batch of documents,
and compute vectors for each (token, position) pair. These vectors can then and compute vectors for each (token, position) pair. These vectors can then
@ -92,7 +68,7 @@ cdef class precompute_hiddens:
so we can save the factor k. This also gives a nice CPU/GPU division: so we can save the factor k. This also gives a nice CPU/GPU division:
we can do all our hard maths up front, packed into large multiplications, we can do all our hard maths up front, packed into large multiplications,
and do the hard-to-program parsing on the CPU. and do the hard-to-program parsing on the CPU.
''' """
cdef int nF, nO, nP cdef int nF, nO, nP
cdef bint _is_synchronized cdef bint _is_synchronized
cdef public object ops cdef public object ops
@ -101,7 +77,8 @@ cdef class precompute_hiddens:
cdef object _cuda_stream cdef object _cuda_stream
cdef object _bp_hiddens cdef object _bp_hiddens
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, drop=0.): def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
drop=0.):
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop) gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
cdef np.ndarray cached cdef np.ndarray cached
if not isinstance(gpu_cached, numpy.ndarray): if not isinstance(gpu_cached, numpy.ndarray):
@ -121,8 +98,7 @@ cdef class precompute_hiddens:
self._bp_hiddens = bp_features self._bp_hiddens = bp_features
cdef const float* get_feat_weights(self) except NULL: cdef const float* get_feat_weights(self) except NULL:
if not self._is_synchronized \ if not self._is_synchronized and self._cuda_stream is not None:
and self._cuda_stream is not None:
self._cuda_stream.synchronize() self._cuda_stream.synchronize()
self._is_synchronized = True self._is_synchronized = True
return <float*>self._cached.data return <float*>self._cached.data
@ -131,7 +107,8 @@ cdef class precompute_hiddens:
return self.begin_update(X)[0] return self.begin_update(X)[0]
def begin_update(self, token_ids, drop=0.): def begin_update(self, token_ids, drop=0.):
cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f') cdef np.ndarray state_vector = numpy.zeros(
(token_ids.shape[0], self.nO*self.nP), dtype='f')
# This is tricky, but (assuming GPU available); # This is tricky, but (assuming GPU available);
# - Input to forward on CPU # - Input to forward on CPU
# - Output from forward on CPU # - Output from forward on CPU
@ -162,10 +139,11 @@ cdef class precompute_hiddens:
state_vector = state_vector.reshape( state_vector = state_vector.reshape(
(state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP)) (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
best, which = self.ops.maxout(state_vector) best, which = self.ops.maxout(state_vector)
def backprop(d_best, sgd=None): def backprop(d_best, sgd=None):
return self.ops.backprop_maxout(d_best, which, self.nP) return self.ops.backprop_maxout(d_best, which, self.nP)
return best, backprop
return best, backprop
cdef void sum_state_features(float* output, cdef void sum_state_features(float* output,
@ -240,11 +218,15 @@ cdef class Parser:
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1)) depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
if depth != 1: if depth != 1:
raise ValueError("Currently parser depth is hard-coded to 1.") raise ValueError("Currently parser depth is hard-coded to 1.")
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2)) parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
cfg.get('maxout_pieces', 2))
if parser_maxout_pieces != 2: if parser_maxout_pieces != 2:
raise ValueError("Currently parser_maxout_pieces is hard-coded to 2") raise ValueError("Currently parser_maxout_pieces is hard-coded "
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128)) "to 2")
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200)) token_vector_width = util.env_opt('token_vector_width',
cfg.get('token_vector_width', 128))
hidden_width = util.env_opt('hidden_width',
cfg.get('hidden_width', 200))
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000)) embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0)) hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0)) hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
@ -280,23 +262,19 @@ cdef class Parser:
return (tok2vec, lower, upper), cfg return (tok2vec, lower, upper), cfg
def __init__(self, Vocab vocab, moves=True, model=True, **cfg): def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
""" """Create a Parser.
Create a Parser.
Arguments: vocab (Vocab): The vocabulary object. Must be shared with documents
vocab (Vocab): to be processed. The value is set to the `.vocab` attribute.
The vocabulary object. Must be shared with documents to be processed. moves (TransitionSystem): Defines how the parse-state is created,
The value is set to the .vocab attribute. updated and evaluated. The value is set to the .moves attribute
moves (TransitionSystem): unless True (default), in which case a new instance is created with
Defines how the parse-state is created, updated and evaluated. `Parser.Moves()`.
The value is set to the .moves attribute unless True (default), model (object): Defines how the parse-state is created, updated and
in which case a new instance is created with Parser.Moves(). evaluated. The value is set to the .model attribute unless True
model (object): (default), in which case a new instance is created with
Defines how the parse-state is created, updated and evaluated. `Parser.Model()`.
The value is set to the .model attribute unless True (default), **cfg: Arbitrary configuration parameters. Set to the `.cfg` attribute
in which case a new instance is created with Parser.Model().
**cfg:
Arbitrary configuration parameters. Set to the .cfg attribute
""" """
self.vocab = vocab self.vocab = vocab
if moves is True: if moves is True:
@ -322,13 +300,10 @@ cdef class Parser:
return (Parser, (self.vocab, self.moves, self.model), None, None) return (Parser, (self.vocab, self.moves, self.model), None, None)
def __call__(self, Doc doc, beam_width=None, beam_density=None): def __call__(self, Doc doc, beam_width=None, beam_density=None):
""" """Apply the parser or entity recognizer, setting the annotations onto
Apply the parser or entity recognizer, setting the annotations onto the Doc object. the `Doc` object.
Arguments:
doc (Doc): The document to be processed. doc (Doc): The document to be processed.
Returns:
None
""" """
if beam_width is None: if beam_width is None:
beam_width = self.cfg.get('beam_width', 1) beam_width = self.cfg.get('beam_width', 1)
@ -350,16 +325,13 @@ cdef class Parser:
def pipe(self, docs, int batch_size=256, int n_threads=2, def pipe(self, docs, int batch_size=256, int n_threads=2,
beam_width=None, beam_density=None): beam_width=None, beam_density=None):
""" """Process a stream of documents.
Process a stream of documents.
Arguments:
stream: The sequence of documents to process. stream: The sequence of documents to process.
batch_size (int): batch_size (int): Number of documents to accumulate into a working set.
The number of documents to accumulate into a working set. n_threads (int): The number of threads with which to work on the buffer
n_threads (int): in parallel.
The number of threads with which to work on the buffer in parallel. YIELDS (Doc): Documents, in order.
Yields (Doc): Documents, in order.
""" """
if beam_width is None: if beam_width is None:
beam_width = self.cfg.get('beam_width', 1) beam_width = self.cfg.get('beam_width', 1)
@ -376,8 +348,8 @@ cdef class Parser:
parse_states = self.parse_batch(subbatch) parse_states = self.parse_batch(subbatch)
beams = [] beams = []
else: else:
beams = self.beam_parse(subbatch, beams = self.beam_parse(subbatch, beam_width=beam_width,
beam_width=beam_width, beam_density=beam_density) beam_density=beam_density)
parse_states = [] parse_states = []
for beam in beams: for beam in beams:
parse_states.append(<StateClass>beam.at(0)) parse_states.append(<StateClass>beam.at(0))
@ -397,9 +369,9 @@ cdef class Parser:
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
cuda_stream = get_cuda_stream() cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
0.0) docs, cuda_stream, 0.0)
nr_state = len(docs) nr_state = len(docs)
nr_class = self.moves.n_moves nr_class = self.moves.n_moves
nr_dim = tokvecs.shape[1] nr_dim = tokvecs.shape[1]
@ -413,7 +385,8 @@ cdef class Parser:
feat_weights = state2vec.get_feat_weights() feat_weights = state2vec.get_feat_weights()
cdef int i cdef int i
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(vec2scores._layers[-1].W.T) cdef np.ndarray hidden_weights = numpy.ascontiguousarray(
vec2scores._layers[-1].W.T)
cdef np.ndarray hidden_bias = vec2scores._layers[-1].b cdef np.ndarray hidden_bias = vec2scores._layers[-1].b
hW = <float*>hidden_weights.data hW = <float*>hidden_weights.data
@ -473,9 +446,9 @@ cdef class Parser:
cdef Doc doc cdef Doc doc
cdef int nr_class = self.moves.n_moves cdef int nr_class = self.moves.n_moves
cdef StateClass stcls, output cdef StateClass stcls, output
cuda_stream = get_cuda_stream() cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
0.0) docs, cuda_stream, 0.0)
beams = [] beams = []
cdef int offset = 0 cdef int offset = 0
cdef int j = 0 cdef int j = 0
@ -530,9 +503,7 @@ cdef class Parser:
if isinstance(docs, Doc) and isinstance(golds, GoldParse): if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs] docs = [docs]
golds = [golds] golds = [golds]
cuda_stream = util.get_cuda_stream()
cuda_stream = get_cuda_stream()
states, golds, max_steps = self._init_gold_batch(docs, golds) states, golds, max_steps = self._init_gold_batch(docs, golds)
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
drop) drop)
@ -547,7 +518,6 @@ cdef class Parser:
n_steps = 0 n_steps = 0
while todo: while todo:
states, golds = zip(*todo) states, golds = zip(*todo)
token_ids = self.get_token_ids(states) token_ids = self.get_token_ids(states)
vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0) vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
if drop != 0: if drop != 0:
@ -569,8 +539,8 @@ cdef class Parser:
and not isinstance(token_ids, state2vec.ops.xp.ndarray): and not isinstance(token_ids, state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously # Move token_ids and d_vector to GPU, asynchronously
backprops.append(( backprops.append((
get_async(cuda_stream, token_ids), util.get_async(cuda_stream, token_ids),
get_async(cuda_stream, d_vector), util.get_async(cuda_stream, d_vector),
bp_vector bp_vector
)) ))
else: else:
@ -603,14 +573,12 @@ cdef class Parser:
states = self.moves.init_batch(docs) states = self.moves.init_batch(docs)
for gold in golds: for gold in golds:
self.moves.preprocess_gold(gold) self.moves.preprocess_gold(gold)
cuda_stream = util.get_cuda_stream()
cuda_stream = get_cuda_stream() (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop) docs, cuda_stream, drop)
states_d_scores, backprops = _beam_utils.update_beam(
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500, self.moves, self.nr_feature, 500, states, golds, state2vec,
states, golds, vec2scores, width, density, self.cfg.get('hist_size', 0),
state2vec, vec2scores,
width, density, self.cfg.get('hist_size', 0),
drop=drop, losses=losses) drop=drop, losses=losses)
backprop_lower = [] backprop_lower = []
cdef float batch_size = len(docs) cdef float batch_size = len(docs)
@ -623,13 +591,14 @@ cdef class Parser:
if isinstance(self.model[0].ops, CupyOps) \ if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(ids, state2vec.ops.xp.ndarray): and not isinstance(ids, state2vec.ops.xp.ndarray):
backprop_lower.append(( backprop_lower.append((
get_async(cuda_stream, ids), util.get_async(cuda_stream, ids),
get_async(cuda_stream, d_vector), util.get_async(cuda_stream, d_vector),
bp_vectors)) bp_vectors))
else: else:
backprop_lower.append((ids, d_vector, bp_vectors)) backprop_lower.append((ids, d_vector, bp_vectors))
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream) self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd,
cuda_stream)
def _init_gold_batch(self, whole_docs, whole_golds): def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long """Make a square batch, of length equal to the shortest doc. A long
@ -779,7 +748,8 @@ cdef class Parser:
def begin_training(self, gold_tuples, pipeline=None, **cfg): def begin_training(self, gold_tuples, pipeline=None, **cfg):
if 'model' in cfg: if 'model' in cfg:
self.model = cfg['model'] self.model = cfg['model']
gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100) gold_tuples = nonproj.preprocess_training_data(gold_tuples,
label_freq_cutoff=100)
actions = self.moves.get_actions(gold_parses=gold_tuples) actions = self.moves.get_actions(gold_parses=gold_tuples)
for action, labels in actions.items(): for action, labels in actions.items():
for label in labels: for label in labels:

View File

@ -1,36 +1,34 @@
# coding: utf-8 # coding: utf-8
""" """Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
for doing pseudo-projective parsing implementation uses the HEAD decoration for doing pseudo-projective parsing implementation uses the HEAD decoration
scheme. scheme.
""" """
from __future__ import unicode_literals from __future__ import unicode_literals
from copy import copy from copy import copy
from ..tokens.doc cimport Doc
from ..attrs import DEP, HEAD
DELIMITER = '||' DELIMITER = '||'
def ancestors(tokenid, heads):
    """Yield the heads on the path from a token up towards the root.

    tokenid (int): Index of the token to start from.
    heads (list): Head index of every token. A root points to itself; an
        unattached token has `None` as its head.
    YIELDS: Each successive head, starting with the head of `tokenid`. The
        walk stops at a token that is its own head, after yielding `None`
        for an unattached token, or after `len(heads)` steps. The step
        limit is what keeps the walk finite if `heads` contains a cycle.
    """
    head = tokenid
    cnt = 0
    while heads[head] != head and cnt < len(heads):
        head = heads[head]
        cnt += 1
        yield head
        # `None` cannot be used as an index, so it has to end the walk here.
        if head is None:
            break
def contains_cycle(heads): def contains_cycle(heads):
# in an acyclic tree, the path from each word following # in an acyclic tree, the path from each word following the head relation
# the head relation upwards always ends at the root node # upwards always ends at the root node
for tokenid in range(len(heads)): for tokenid in range(len(heads)):
seen = set([tokenid]) seen = set([tokenid])
for ancestor in ancestors(tokenid, heads): for ancestor in ancestors(tokenid, heads):
@ -47,13 +45,13 @@ def is_nonproj_arc(tokenid, heads):
head = heads[tokenid] head = heads[tokenid]
if head == tokenid: # root arcs cannot be non-projective if head == tokenid: # root arcs cannot be non-projective
return False return False
elif head == None: # unattached tokens cannot be non-projective elif head is None: # unattached tokens cannot be non-projective
return False return False
start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head) start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
for k in range(start, end): for k in range(start, end):
for ancestor in ancestors(k, heads): for ancestor in ancestors(k, heads):
if ancestor == None: # for unattached tokens/subtrees if ancestor is None: # for unattached tokens/subtrees
break break
elif ancestor == head: # normal case: k dominated by h elif ancestor == head: # normal case: k dominated by h
break break
@ -83,30 +81,30 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
for (ids, words, tags, heads, labels, iob), ctnts in sents: for (ids, words, tags, heads, labels, iob), ctnts in sents:
proj_heads, deco_labels = projectivize(heads, labels) proj_heads, deco_labels = projectivize(heads, labels)
# set the label to ROOT for each root dependent # set the label to ROOT for each root dependent
deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ] deco_labels = ['ROOT' if head == i else deco_labels[i]
for i, head in enumerate(proj_heads)]
# count label frequencies # count label frequencies
if label_freq_cutoff > 0: if label_freq_cutoff > 0:
for label in deco_labels: for label in deco_labels:
if is_decorated(label): if is_decorated(label):
freqs[label] = freqs.get(label, 0) + 1 freqs[label] = freqs.get(label, 0) + 1
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts)) prepro_sents.append(
((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
preprocessed.append((raw_text, prepro_sents)) preprocessed.append((raw_text, prepro_sents))
if label_freq_cutoff > 0: if label_freq_cutoff > 0:
return _filter_labels(preprocessed, label_freq_cutoff, freqs) return _filter_labels(preprocessed, label_freq_cutoff, freqs)
return preprocessed return preprocessed
def projectivize(heads, labels): def projectivize(heads, labels):
# use the algorithm by Nivre & Nilsson 2005 # Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
# assumes heads to be a proper tree, i.e. connected and cycle-free # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
# returns a new pair (heads,labels) which encode # which encode a projective and decorated tree.
# a projective and decorated tree
proj_heads = copy(heads) proj_heads = copy(heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
if smallest_np_arc == None: # this sentence is already projective if smallest_np_arc is None: # this sentence is already projective
return proj_heads, copy(labels) return proj_heads, copy(labels)
while smallest_np_arc != None: while smallest_np_arc is not None:
_lift(smallest_np_arc, proj_heads) _lift(smallest_np_arc, proj_heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
deco_labels = _decorate(heads, proj_heads, labels) deco_labels = _decorate(heads, proj_heads, labels)
@ -114,9 +112,9 @@ def projectivize(heads, labels):
def deprojectivize(tokens): def deprojectivize(tokens):
# reattach arcs with decorated labels (following HEAD scheme) # Reattach arcs with decorated labels (following HEAD scheme). For each
# for each decorated arc X||Y, search top-down, left-to-right, # decorated arc X||Y, search top-down, left-to-right, breadth-first until
# breadth-first until hitting a Y then make this the new head # hitting a Y then make this the new head.
for token in tokens: for token in tokens:
if is_decorated(token.dep_): if is_decorated(token.dep_):
newlabel, headlabel = decompose(token.dep_) newlabel, headlabel = decompose(token.dep_)
@ -125,13 +123,15 @@ def deprojectivize(tokens):
token.dep_ = newlabel token.dep_ = newlabel
return tokens return tokens
def _decorate(heads, proj_heads, labels): def _decorate(heads, proj_heads, labels):
# uses decoration scheme HEAD from Nivre & Nilsson 2005 # uses decoration scheme HEAD from Nivre & Nilsson 2005
assert(len(heads) == len(proj_heads) == len(labels)) assert(len(heads) == len(proj_heads) == len(labels))
deco_labels = [] deco_labels = []
for tokenid, head in enumerate(heads): for tokenid, head in enumerate(heads):
if head != proj_heads[tokenid]: if head != proj_heads[tokenid]:
deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head])) deco_labels.append(
'%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
else: else:
deco_labels.append(labels[tokenid]) deco_labels.append(labels[tokenid])
return deco_labels return deco_labels
@ -168,8 +168,10 @@ def _find_new_head(token, headlabel):
next_queue = [] next_queue = []
for qtoken in queue: for qtoken in queue:
for child in qtoken.children: for child in qtoken.children:
if child.is_space: continue if child.is_space:
if child == token: continue continue
if child == token:
continue
if child.dep_ == headlabel: if child.dep_ == headlabel:
return child return child
next_queue.append(child) next_queue.append(child)
@ -184,7 +186,10 @@ def _filter_labels(gold_tuples, cutoff, freqs):
for raw_text, sents in gold_tuples: for raw_text, sents in gold_tuples:
filtered_sents = [] filtered_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents: for (ids, words, tags, heads, labels, iob), ctnts in sents:
filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ] filtered_labels = [decompose(label)[0]
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts)) if freqs.get(label, cutoff) < cutoff
else label for label in labels]
filtered_sents.append(
((ids, words, tags, heads, filtered_labels, iob), ctnts))
filtered.append((raw_text, filtered_sents)) filtered.append((raw_text, filtered_sents))
return filtered return filtered

View File

@ -2,17 +2,8 @@
# cython: infer_types=True # cython: infer_types=True
from __future__ import unicode_literals from __future__ import unicode_literals
from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t, uint64_t
import numpy import numpy
from ..vocab cimport EMPTY_LEXEME
from ..structs cimport Entity
from ..lexeme cimport Lexeme
from ..symbols cimport punct
from ..attrs cimport IS_SPACE
from ..attrs cimport attr_id_t
from ..tokens.token cimport Token
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc

View File

@ -2,17 +2,17 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from collections import defaultdict, OrderedDict from collections import OrderedDict
import ujson import ujson
from .. import util
from ..structs cimport TokenC from ..structs cimport TokenC
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
from ..typedefs cimport attr_t from ..typedefs cimport attr_t
from ..compat import json_dumps
from .. import util
cdef weight_t MIN_SCORE = -90000 cdef weight_t MIN_SCORE = -90000
@ -136,11 +136,12 @@ cdef class TransitionSystem:
print([gold.c.ner[i].clas for i in range(gold.length)]) print([gold.c.ner[i].clas for i in range(gold.length)])
print([gold.c.ner[i].move for i in range(gold.length)]) print([gold.c.ner[i].move for i in range(gold.length)])
print([gold.c.ner[i].label for i in range(gold.length)]) print([gold.c.ner[i].label for i in range(gold.length)])
print("Self labels", [self.c[i].label for i in range(self.n_moves)]) print("Self labels",
[self.c[i].label for i in range(self.n_moves)])
raise ValueError( raise ValueError(
"Could not find a gold-standard action to supervise " "Could not find a gold-standard action to supervise "
"the entity recognizer\n" "the entity recognizer. The transition system has "
"The transition system has %d actions." % (self.n_moves)) "%d actions." % (self.n_moves))
def get_class_name(self, int clas): def get_class_name(self, int clas):
act = self.c[clas] act = self.c[clas]
@ -186,7 +187,7 @@ cdef class TransitionSystem:
'name': self.move_name(trans.move, trans.label) 'name': self.move_name(trans.move, trans.label)
}) })
serializers = { serializers = {
'transitions': lambda: ujson.dumps(transitions), 'transitions': lambda: json_dumps(transitions),
'strings': lambda: self.strings.to_bytes() 'strings': lambda: self.strings.to_bytes()
} }
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)

View File

@ -1,17 +0,0 @@
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.extra.eg cimport Example
from thinc.structs cimport ExampleC
from .structs cimport TokenC
from .vocab cimport Vocab
# Declaration of the tagger's model type, a subclass of thinc's
# AveragedPerceptron, so that other Cython modules can cimport it.
cdef class TaggerModel(AveragedPerceptron):
# C-level method taking the example struct, the token array and the index of
# the token being processed. It returns nothing; `except *` lets an exception
# raised inside it propagate to the caller.
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *
# Declaration of the Tagger extension type and the attributes it stores.
cdef class Tagger:
# Vocabulary object; readable from Python but not assignable.
cdef readonly Vocab vocab
# The TaggerModel instance; readable from Python but not assignable.
cdef readonly TaggerModel model
# Dictionary of frequencies; readable and assignable from Python.
cdef public dict freqs
# Configuration object; readable and assignable from Python.
cdef public object cfg

View File

@ -1,253 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from collections import defaultdict
from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t
from thinc.extra.eg cimport Example
from thinc.structs cimport ExampleC
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec
from .tokens.doc cimport Doc
from .attrs cimport TAG
from .gold cimport GoldParse
from .attrs cimport *
# Slot indices into the context (atom) array used by the tagger's features.
# There are five groups of eight consecutive slots, one group per token
# position: P2, P1, W, N1 and N2 are filled from tokens[i-2], tokens[i-1],
# tokens[i], tokens[i+1] and tokens[i+2] by TaggerModel.set_featuresC.
# _fill_from_token writes the eight slots of a group in this order, starting
# at the group's `*_orth` entry. Note that the `*_orth` slot receives the
# lexeme's `lower` value and the `*_pos` slot receives the token's `tag`.
cpdef enum:
# Token two positions before the current one.
P2_orth
P2_cluster
P2_shape
P2_prefix
P2_suffix
P2_pos
P2_lemma
P2_flags
# Token one position before the current one.
P1_orth
P1_cluster
P1_shape
P1_prefix
P1_suffix
P1_pos
P1_lemma
P1_flags
# The current token.
W_orth
W_cluster
W_shape
W_prefix
W_suffix
W_pos
W_lemma
W_flags
# Token one position after the current one.
N1_orth
N1_cluster
N1_shape
N1_prefix
N1_suffix
N1_pos
N1_lemma
N1_flags
# Token two positions after the current one.
N2_orth
N2_cluster
N2_shape
N2_prefix
N2_suffix
N2_pos
N2_lemma
N2_flags
# Total number of slots; Tagger.__call__ passes it as `nr_atom` when it
# creates its Example.
N_CONTEXT_FIELDS
cdef class TaggerModel(AveragedPerceptron):
    """Averaged perceptron whose features come from a five-token window."""

    def update(self, Example eg):
        """Adjust the weights for one example if the guess was not the best.

        eg (Example): The example, with its scores, costs and features
            already filled in.
        """
        self.time += 1
        predicted = eg.guess
        target = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs,
                                        eg.c.nr_class)
        if predicted == target:
            # Nothing to correct.
            return
        for feat in eg.c.features[:eg.c.nr_feat]:
            self.update_weight(feat.key, target, -feat.value)
            self.update_weight(feat.key, predicted, feat.value)

    cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
        # Fill the context slots from the tokens two before to two after
        # token `i`, then let the extracter turn them into features.
        # NOTE(review): tokens[i-2] and tokens[i+2] are read for every `i`,
        # so the array presumably has padding on both sides; confirm with
        # the caller.
        _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
        _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
        _fill_from_token(&eg.atoms[W_orth], &tokens[i])
        _fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])
        _fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])
        eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    # Write the eight context values for token `t` into context[0..7].
    context[0] = t.lex.lower
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.tag
    context[6] = t.lemma
    # The last slot holds a code for the first lexeme flag that is set, in
    # the order alpha (1), punct (2), URL-like (3), number-like (4). It stays
    # 0 when none of them is set.
    # NOTE(review): each mask is built by shifting a plain `1`; confirm the
    # flag ids stay below the bit width of that integer type.
    context[7] = 0
    if t.lex.flags & (1 << IS_ALPHA):
        context[7] = 1
    elif t.lex.flags & (1 << IS_PUNCT):
        context[7] = 2
    elif t.lex.flags & (1 << LIKE_URL):
        context[7] = 3
    elif t.lex.flags & (1 << LIKE_NUM):
        context[7] = 4
cdef class Tagger:
    """Annotate part-of-speech tags on Doc objects."""

    def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
        """Create a Tagger.

        vocab (Vocab): The vocabulary object. Must be shared with documents to
            be processed.
        model (thinc.linear.AveragedPerceptron): The statistical model. If
            `None`, a new `TaggerModel` is built from `cfg['features']`,
            falling back to `Tagger.feature_templates`.
        **cfg: Extra settings, stored unchanged on `self.cfg`. Only the
            'features' key is read here.
        RETURNS (Tagger): The newly constructed object.
        """
        if model is None:
            model = TaggerModel(cfg.get('features', self.feature_templates),
                                L1=0.0)
        self.vocab = vocab
        self.model = model
        self.model.l1_penalty = 0.0
        # TODO: Move this to tag map
        # Seed the frequency table so that every known tag, and the ID 0,
        # starts with a count of 1.
        self.freqs = {TAG: defaultdict(int)}
        for tag in self.tag_names:
            self.freqs[TAG][self.vocab.strings[tag]] = 1
        self.freqs[TAG][0] = 1
        self.cfg = cfg

    @property
    def tag_names(self):
        """The tag names known to the vocabulary's morphology."""
        return self.vocab.morphology.tag_names

    def __reduce__(self):
        """Support pickling by rebuilding the tagger from (vocab, model).

        Note that `cfg` and `freqs` are not part of the reduced state.
        """
        return (self.__class__, (self.vocab, self.model), None, None)

    def tag_from_strings(self, Doc tokens, object tag_strs):
        """Assign pre-determined tags to a document.

        tokens (Doc): The document to annotate.
        tag_strs (sequence): One tag per token, indexed by token position. It
            must provide at least `tokens.length` entries.
        """
        cdef int i
        for i in range(tokens.length):
            self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i])
        tokens.is_tagged = True
        # Reset the per-token Python object slots (presumably cached Token
        # views that would otherwise expose stale annotations).
        tokens._py_tokens = [None] * tokens.length

    def __call__(self, Doc tokens):
        """Apply the tagger, setting the POS tags onto the Doc object.

        Only tokens whose `pos` is still 0 are tagged; tokens that already
        carry a part-of-speech are left as they are.

        tokens (Doc): The tokens to be tagged.
        RETURNS: 0 if the document is empty, otherwise `None`.
        """
        if tokens.length == 0:
            return 0
        cdef int i
        cdef Example eg = Example(nr_atom=N_CONTEXT_FIELDS,
                                  nr_class=self.vocab.morphology.n_tags,
                                  nr_feat=self.model.nr_feat)
        for i in range(tokens.length):
            if tokens.c[i].pos == 0:
                self.model.set_featuresC(&eg.c, tokens.c, i)
                self.model.set_scoresC(eg.c.scores,
                                       eg.c.features, eg.c.nr_feat)
                guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid,
                                               eg.c.nr_class)
                self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
                # The Example is reused for every token, so clear the scores
                # before the next one is scored.
                eg.fill_scores(0, eg.c.nr_class)
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def pipe(self, stream, batch_size=1000, n_threads=2):
        """Tag a stream of documents.

        stream (iterable): The sequence of documents to tag.
        batch_size (int): Accepted for API compatibility; currently unused,
            documents are tagged one at a time.
        n_threads (int): Accepted for API compatibility; currently unused,
            the Tagger runs single-threaded.
        YIELDS (Doc): Documents, in order.
        """
        for doc in stream:
            self(doc)
            yield doc

    def update(self, Doc tokens, GoldParse gold, itn=0):
        """Update the statistical model, with tags supplied for the given
        document.

        tokens (Doc): The document to update on.
        gold (GoldParse): Manager for the gold-standard tags. `gold.tags`
            must have one entry per token; `None` marks a token without a
            gold tag, for which every class is treated as zero-cost.
        itn (int): Accepted but not used by this method.
        RETURNS (int): Number of tags predicted correctly.
        RAISES (ValueError): If a gold tag is not one of `self.tag_names`.
        """
        gold_tag_strs = gold.tags
        assert len(tokens) == len(gold_tag_strs)
        # Build the name -> class index table once, instead of scanning
        # `tag_names` linearly for every gold tag. `setdefault` keeps the
        # first index of a name, which is what `.index()` would return.
        tag_ids = {}
        for tag_id, name in enumerate(self.tag_names):
            tag_ids.setdefault(name, tag_id)
        for tag in gold_tag_strs:
            if tag is not None and tag not in tag_ids:
                msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "
                       "gold tags, to maintain coarse-grained mapping.")
                raise ValueError(msg % tag)
        # -1 stands for "no gold tag available for this token".
        golds = [tag_ids[g] if g is not None else -1 for g in gold_tag_strs]
        cdef int correct = 0
        cdef Example eg = Example(
            nr_atom=N_CONTEXT_FIELDS,
            nr_class=self.vocab.morphology.n_tags,
            nr_feat=self.model.nr_feat)
        for i in range(tokens.length):
            self.model.set_featuresC(&eg.c, tokens.c, i)
            gold_id = golds[i]
            # Cost 0 for the gold class (or for every class when the gold
            # tag is missing), cost 1 for everything else.
            eg.costs = [1 if gold_id not in (c, -1) else 0
                        for c in range(eg.nr_class)]
            self.model.set_scoresC(eg.c.scores,
                                   eg.c.features, eg.c.nr_feat)
            self.model.update(eg)
            self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)
            correct += eg.cost == 0
            self.freqs[TAG][tokens.c[i].tag] += 1
            # The Example is reused for every token: reset it for the next.
            eg.fill_scores(0, eg.c.nr_class)
            eg.fill_costs(0, eg.c.nr_class)
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length
        return correct

    # Default feature templates. Each entry is a tuple of atom slots whose
    # values are combined into a single feature.
    feature_templates = (
        (W_orth,),
        (P1_lemma, P1_pos),
        (P2_lemma, P2_pos),
        (N1_orth,),
        (N2_orth,),
        (W_suffix,),
        (W_prefix,),
        (P1_pos,),
        (P2_pos,),
        (P1_pos, P2_pos),
        (P1_pos, W_orth),
        (P1_suffix,),
        (N1_suffix,),
        (W_shape,),
        (W_cluster,),
        (N1_cluster,),
        (N2_cluster,),
        (P1_cluster,),
        (P2_cluster,),
        (W_flags,),
        (N1_flags,),
        (N2_flags,),
        (P1_flags,),
        (P2_flags,),
    )

View File

@ -8,12 +8,11 @@ from cython.operator cimport preincrement as preinc
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
import regex as re import regex as re
from .strings cimport hash_string
from . import util
cimport cython cimport cython
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
from .strings cimport hash_string
from . import util
cdef class Tokenizer: cdef class Tokenizer:
@ -74,9 +73,8 @@ cdef class Tokenizer:
RETURNS (Doc): A container for linguistic annotations. RETURNS (Doc): A container for linguistic annotations.
""" """
if len(string) >= (2 ** 30): if len(string) >= (2 ** 30):
raise ValueError( msg = "String is too long: %d characters. Max is 2**30."
"String is too long: %d characters. Max is 2**30." % len(string) raise ValueError(msg % len(string))
)
cdef int length = len(string) cdef int length = len(string)
cdef Doc doc = Doc(self.vocab) cdef Doc doc = Doc(self.vocab)
if length == 0: if length == 0:
@ -122,8 +120,8 @@ cdef class Tokenizer:
"""Tokenize a stream of texts. """Tokenize a stream of texts.
texts: A sequence of unicode texts. texts: A sequence of unicode texts.
batch_size (int): The number of texts to accumulate in an internal buffer. batch_size (int): Number of texts to accumulate in an internal buffer.
n_threads (int): The number of threads to use, if the implementation n_threads (int): Number of threads to use, if the implementation
supports multi-threading. The default tokenizer is single-threaded. supports multi-threading. The default tokenizer is single-threaded.
YIELDS (Doc): A sequence of Doc objects, in order. YIELDS (Doc): A sequence of Doc objects, in order.
""" """
@ -232,8 +230,8 @@ cdef class Tokenizer:
if not matches: if not matches:
tokens.push_back(self.vocab.get(tokens.mem, string), False) tokens.push_back(self.vocab.get(tokens.mem, string), False)
else: else:
# let's say we have dyn-o-mite-dave # let's say we have dyn-o-mite-dave - the regex finds the
# the regex finds the start and end positions of the hyphens # start and end positions of the hyphens
start = 0 start = 0
for match in matches: for match in matches:
infix_start = match.start() infix_start = match.start()
@ -293,8 +291,8 @@ cdef class Tokenizer:
return list(self.infix_finditer(string)) return list(self.infix_finditer(string))
def find_prefix(self, unicode string): def find_prefix(self, unicode string):
"""Find the length of a prefix that should be segmented from the string, """Find the length of a prefix that should be segmented from the
or None if no prefix rules match. string, or None if no prefix rules match.
string (unicode): The string to segment. string (unicode): The string to segment.
RETURNS (int): The length of the prefix if present, otherwise `None`. RETURNS (int): The length of the prefix if present, otherwise `None`.
@ -305,8 +303,8 @@ cdef class Tokenizer:
return (match.end() - match.start()) if match is not None else 0 return (match.end() - match.start()) if match is not None else 0
def find_suffix(self, unicode string): def find_suffix(self, unicode string):
"""Find the length of a suffix that should be segmented from the string, """Find the length of a suffix that should be segmented from the
or None if no suffix rules match. string, or None if no suffix rules match.
string (unicode): The string to segment. string (unicode): The string to segment.
Returns (int): The length of the suffix if present, otherwise `None`. Returns (int): The length of the suffix if present, otherwise `None`.
@ -326,8 +324,8 @@ cdef class Tokenizer:
string (unicode): The string to specially tokenize. string (unicode): The string to specially tokenize.
token_attrs (iterable): A sequence of dicts, where each dict describes token_attrs (iterable): A sequence of dicts, where each dict describes
a token and its attributes. The `ORTH` fields of the attributes must a token and its attributes. The `ORTH` fields of the attributes
exactly match the string when they are concatenated. must exactly match the string when they are concatenated.
""" """
substrings = list(substrings) substrings = list(substrings)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
@ -343,7 +341,7 @@ cdef class Tokenizer:
"""Save the current state to a directory. """Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects. it doesn't exist. Paths may be either strings or Path-like objects.
""" """
with path.open('wb') as file_: with path.open('wb') as file_:
file_.write(self.to_bytes(**exclude)) file_.write(self.to_bytes(**exclude))

View File

@ -2,4 +2,4 @@ from .doc import Doc
from .token import Token from .token import Token
from .span import Span from .span import Span
__all__ = [Doc, Token, Span] __all__ = ['Doc', 'Token', 'Span']

View File

@ -1,21 +0,0 @@
cdef class Binder:
    # Placeholder extension type: every method below is an empty stub that
    # ignores its arguments and returns None.
    def __init__(self, *docs):
        # Accepts any number of positional arguments; none of them is stored.
        pass

    def __iter__(self):
        # Stub: returns None rather than an iterator, so iterating over a
        # Binder would fail.
        pass

    def __reduce__(self):
        # Stub: returns None, which is not a valid reduce value, so pickling
        # a Binder would fail.
        pass

    def to_bytes(self):
        # Stub: produces no data.
        pass

    def from_bytes(cls, data):
        # NOTE(review): the first parameter is named `cls`, but no
        # @classmethod decorator is present, so this is bound as an ordinary
        # instance method. `data` is ignored.
        pass

    def to_disk(self):
        # Stub: takes no path and writes nothing.
        pass

    def from_disk(self, path):
        # Stub: `path` is ignored and nothing is read.
        pass

View File

@ -23,9 +23,9 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t from ..typedefs cimport attr_t, flags_t
from ..attrs import intify_attrs, IDS from ..attrs import intify_attrs, IDS
from ..attrs cimport attr_id_t from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport SENT_START from ..attrs cimport ENT_TYPE, SENT_START
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..util import normalize_slice from ..util import normalize_slice
from ..compat import is_config, copy_reg, pickle from ..compat import is_config, copy_reg, pickle
@ -78,17 +78,18 @@ def _get_chunker(lang):
cdef class Doc: cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export """A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary strings. annotations to numpy arrays, losslessly serialize to compressed binary
The `Doc` object holds an array of `TokenC` structs. The Python-level strings. The `Doc` object holds an array of `TokenC` structs. The
`Token` and `Span` objects are views of this array, i.e. they don't own Python-level `Token` and `Span` objects are views of this array, i.e.
the data themselves. they don't own the data themselves.
EXAMPLE: Construction 1 EXAMPLE: Construction 1
>>> doc = nlp(u'Some text') >>> doc = nlp(u'Some text')
Construction 2 Construction 2
>>> from spacy.tokens import Doc >>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False]) >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
spaces=[True, False, False])
""" """
@classmethod @classmethod
def set_extension(cls, name, default=None, method=None, def set_extension(cls, name, default=None, method=None,
@ -109,15 +110,14 @@ cdef class Doc:
orths_and_spaces=None): orths_and_spaces=None):
"""Create a Doc object. """Create a Doc object.
vocab (Vocab): A vocabulary object, which must match any models you want vocab (Vocab): A vocabulary object, which must match any models you
to use (e.g. tokenizer, parser, entity recognizer). want to use (e.g. tokenizer, parser, entity recognizer).
words (list or None): A list of unicode strings to add to the document words (list or None): A list of unicode strings to add to the document
as words. If `None`, defaults to empty list. as words. If `None`, defaults to empty list.
spaces (list or None): A list of boolean values, of the same length as spaces (list or None): A list of boolean values, of the same length as
words. True means that the word is followed by a space, False means words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)` it is not. If `None`, defaults to `[True]*len(words)`
user_data (dict or None): Optional extra data to attach to the Doc. user_data (dict or None): Optional extra data to attach to the Doc.
RETURNS (Doc): The newly constructed object. RETURNS (Doc): The newly constructed object.
""" """
self.vocab = vocab self.vocab = vocab
@ -153,10 +153,10 @@ cdef class Doc:
spaces = [True] * len(words) spaces = [True] * len(words)
elif len(spaces) != len(words): elif len(spaces) != len(words):
raise ValueError( raise ValueError(
"Arguments 'words' and 'spaces' should be sequences of the " "Arguments 'words' and 'spaces' should be sequences of "
"same length, or 'spaces' should be left default at None. " "the same length, or 'spaces' should be left default at "
"spaces should be a sequence of booleans, with True meaning " "None. spaces should be a sequence of booleans, with True "
"that the word owns a ' ' character following it.") "meaning that the word owns a ' ' character following it.")
orths_and_spaces = zip(words, spaces) orths_and_spaces = zip(words, spaces)
if orths_and_spaces is not None: if orths_and_spaces is not None:
for orth_space in orths_and_spaces: for orth_space in orths_and_spaces:
@ -166,7 +166,8 @@ cdef class Doc:
elif isinstance(orth_space, bytes): elif isinstance(orth_space, bytes):
raise ValueError( raise ValueError(
"orths_and_spaces expects either List(unicode) or " "orths_and_spaces expects either List(unicode) or "
"List((unicode, bool)). Got bytes instance: %s" % (str(orth_space))) "List((unicode, bool)). "
"Got bytes instance: %s" % (str(orth_space)))
else: else:
orth, has_space = orth_space orth, has_space = orth_space
# Note that we pass self.mem here --- we have ownership, if LexemeC # Note that we pass self.mem here --- we have ownership, if LexemeC
@ -186,7 +187,8 @@ cdef class Doc:
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a `Token` or `Span` object. """Get a `Token` or `Span` object.
i (int or tuple) The index of the token, or the slice of the document to get. i (int or tuple) The index of the token, or the slice of the document
to get.
RETURNS (Token or Span): The token at `doc[i]]`, or the span at RETURNS (Token or Span): The token at `doc[i]]`, or the span at
`doc[start : end]`. `doc[start : end]`.
@ -199,11 +201,11 @@ cdef class Doc:
>>> doc[start : end]] >>> doc[start : end]]
Get a `Span` object, starting at position `start` and ending at Get a `Span` object, starting at position `start` and ending at
position `end`, where `start` and `end` are token indices. For position `end`, where `start` and `end` are token indices. For
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4. instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and
Stepped slices (e.g. `doc[start : end : step]`) are not supported, 4. Stepped slices (e.g. `doc[start : end : step]`) are not
as `Span` objects must be contiguous (cannot have gaps). You can use supported, as `Span` objects must be contiguous (cannot have gaps).
negative indices and open-ended ranges, which have their normal You can use negative indices and open-ended ranges, which have
Python semantics. their normal Python semantics.
""" """
if isinstance(i, slice): if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step) start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
@ -262,8 +264,10 @@ cdef class Doc:
doc (Doc): The parent document. doc (Doc): The parent document.
start (int): The index of the first character of the span. start (int): The index of the first character of the span.
end (int): The index of the first character after the span. end (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for named entities. label (uint64 or string): A label to attach to the Span, e.g. for
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span.
RETURNS (Span): The newly constructed object. RETURNS (Span): The newly constructed object.
""" """
if not isinstance(label, int): if not isinstance(label, int):
@ -322,7 +326,8 @@ cdef class Doc:
if self._vector is not None: if self._vector is not None:
return self._vector return self._vector
elif not len(self): elif not len(self):
self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f') self._vector = numpy.zeros((self.vocab.vectors_length,),
dtype='f')
return self._vector return self._vector
elif self.has_vector: elif self.has_vector:
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f') vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
@ -334,7 +339,8 @@ cdef class Doc:
self._vector = self.tensor.mean(axis=0) self._vector = self.tensor.mean(axis=0)
return self._vector return self._vector
else: else:
return numpy.zeros((self.vocab.vectors_length,), dtype='float32') return numpy.zeros((self.vocab.vectors_length,),
dtype='float32')
def __set__(self, value): def __set__(self, value):
self._vector = value self._vector = value
@ -377,13 +383,14 @@ cdef class Doc:
return self.text return self.text
property ents: property ents:
"""Iterate over the entities in the document. Yields named-entity `Span` """Iterate over the entities in the document. Yields named-entity
objects, if the entity recognizer has been applied to the document. `Span` objects, if the entity recognizer has been applied to the
document.
YIELDS (Span): Entities in the document. YIELDS (Span): Entities in the document.
EXAMPLE: Iterate over the span to get individual Token objects, or access EXAMPLE: Iterate over the span to get individual Token objects,
the label: or access the label:
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
>>> ents = list(tokens.ents) >>> ents = list(tokens.ents)
@ -419,7 +426,8 @@ cdef class Doc:
def __set__(self, ents): def __set__(self, ents):
# TODO: # TODO:
# 1. Allow negative matches # 1. Allow negative matches
# 2. Ensure pre-set NERs are not over-written during statistical prediction # 2. Ensure pre-set NERs are not over-written during statistical
# prediction
# 3. Test basic data-driven ORTH gazetteer # 3. Test basic data-driven ORTH gazetteer
# 4. Test more nuanced date and currency regex # 4. Test more nuanced date and currency regex
cdef int i cdef int i
@ -456,10 +464,11 @@ cdef class Doc:
property noun_chunks: property noun_chunks:
"""Iterate over the base noun phrases in the document. Yields base """Iterate over the base noun phrases in the document. Yields base
noun-phrase #[code Span] objects, if the document has been syntactically noun-phrase #[code Span] objects, if the document has been
parsed. A base noun phrase, or "NP chunk", is a noun phrase that does syntactically parsed. A base noun phrase, or "NP chunk", is a noun
not permit other NPs to be nested within it so no NP-level phrase that does not permit other NPs to be nested within it so no
coordination, no prepositional phrases, and no relative clauses. NP-level coordination, no prepositional phrases, and no relative
clauses.
YIELDS (Span): Noun chunks in the document. YIELDS (Span): Noun chunks in the document.
""" """
@ -467,12 +476,14 @@ cdef class Doc:
if not self.is_parsed: if not self.is_parsed:
raise ValueError( raise ValueError(
"noun_chunks requires the dependency parse, which " "noun_chunks requires the dependency parse, which "
"requires data to be installed. For more info, see the " "requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__) "documentation: \n%s\n" % about.__docs_models__)
# Accumulate the result before beginning to iterate over it. This prevents # Accumulate the result before beginning to iterate over it. This
# the tokenisation from being changed out from under us during the iteration. # prevents the tokenisation from being changed out from under us
# The tricky thing here is that Span accepts its tokenisation changing, # during the iteration. The tricky thing here is that Span accepts
# so it's okay once we have the Span objects. See Issue #375 # its tokenisation changing, so it's okay once we have the Span
# objects. See Issue #375.
spans = [] spans = []
for start, end, label in self.noun_chunks_iterator(self): for start, end, label in self.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label)) spans.append(Span(self, start, end, label=label))
@ -497,8 +508,9 @@ cdef class Doc:
if not self.is_parsed: if not self.is_parsed:
raise ValueError( raise ValueError(
"sentence boundary detection requires the dependency parse, which " "Sentence boundary detection requires the dependency "
"requires data to be installed. For more info, see the " "parse, which requires a statistical model to be "
"installed and loaded. For more info, see the "
"documentation: \n%s\n" % about.__docs_models__) "documentation: \n%s\n" % about.__docs_models__)
cdef int i cdef int i
start = 0 start = 0
@ -537,12 +549,11 @@ cdef class Doc:
@cython.boundscheck(False) @cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids): cpdef np.ndarray to_array(self, object py_attr_ids):
"""Export given token attributes to a numpy `ndarray`. """Export given token attributes to a numpy `ndarray`.
If `attr_ids` is a sequence of M attributes, the output array will be
If `attr_ids` is a sequence of M attributes, the output array will of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If
be of shape `(N, M)`, where N is the length of the `Doc` `attr_ids` is a single attribute, the output shape will be (N,). You
(in tokens). If `attr_ids` is a single attribute, the output shape will can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) string name (e.g. 'LEMMA' or 'lemma').
or string name (e.g. 'LEMMA' or 'lemma').
attr_ids (list[]): A list of attributes (int IDs or string names). attr_ids (list[]): A list of attributes (int IDs or string names).
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
@ -566,18 +577,19 @@ cdef class Doc:
# Allow strings, e.g. 'lemma' or 'LEMMA' # Allow strings, e.g. 'lemma' or 'LEMMA'
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_) py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_)
for id_ in py_attr_ids] for id_ in py_attr_ids]
# Make an array from the attributes --- otherwise our inner loop is Python # Make an array from the attributes --- otherwise our inner loop is
# dict iteration. # Python dict iteration.
attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64) output = numpy.ndarray(shape=(self.length, len(attr_ids)),
dtype=numpy.uint64)
for i in range(self.length): for i in range(self.length):
for j, feature in enumerate(attr_ids): for j, feature in enumerate(attr_ids):
output[i, j] = get_token_attr(&self.c[i], feature) output[i, j] = get_token_attr(&self.c[i], feature)
# Handle 1d case # Handle 1d case
return output if len(attr_ids) >= 2 else output.reshape((self.length,)) return output if len(attr_ids) >= 2 else output.reshape((self.length,))
def count_by(self, attr_id_t attr_id, exclude=None,
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): PreshCounter counts=None):
"""Count the frequencies of a given attribute. Produces a dict of """Count the frequencies of a given attribute. Produces a dict of
`{attribute (int): count (ints)}` frequencies, keyed by the values of `{attribute (int): count (ints)}` frequencies, keyed by the values of
the given attribute ID. the given attribute ID.
@ -641,13 +653,12 @@ cdef class Doc:
def from_array(self, attrs, array): def from_array(self, attrs, array):
if SENT_START in attrs and HEAD in attrs: if SENT_START in attrs and HEAD in attrs:
raise ValueError( raise ValueError(
"Conflicting attributes specified in doc.from_array():\n" "Conflicting attributes specified in doc.from_array(): "
"(HEAD, SENT_START)\n" "(HEAD, SENT_START)\n"
"The HEAD attribute currently sets sentence boundaries implicitly,\n" "The HEAD attribute currently sets sentence boundaries "
"based on the tree structure. This means the HEAD attribute would " "implicitly, based on the tree structure. This means the HEAD "
"potentially override the sentence boundaries set by SENT_START.\n" "attribute would potentially override the sentence boundaries "
"See https://github.com/spacy-io/spaCy/issues/235 for details and " "set by SENT_START.")
"workarounds, and to propose solutions.")
cdef int i, col cdef int i, col
cdef attr_id_t attr_id cdef attr_id_t attr_id
cdef TokenC* tokens = self.c cdef TokenC* tokens = self.c
@ -675,18 +686,14 @@ cdef class Doc:
return self return self
def get_lca_matrix(self): def get_lca_matrix(self):
''' """Calculates the lowest common ancestor matrix for a given `Doc`.
Calculates the lowest common ancestor matrix Returns LCA matrix containing the integer index of the ancestor, or -1
for a given Spacy doc. if no common ancestor is found (ex if span excludes a necessary
Returns LCA matrix containing the integer index ancestor). Apologies about the recursion, but the impact on
of the ancestor, or -1 if no common ancestor is performance is negligible given the natural limitations on the depth
found (ex if span excludes a necessary ancestor). of a typical human sentence.
Apologies about the recursion, but the """
impact on performance is negligible given
the natural limitations on the depth of a typical human sentence.
'''
# Efficiency notes: # Efficiency notes:
#
# We can easily improve the performance here by iterating in Cython. # We can easily improve the performance here by iterating in Cython.
# To loop over the tokens in Cython, the easiest way is: # To loop over the tokens in Cython, the easiest way is:
# for token in doc.c[:doc.c.length]: # for token in doc.c[:doc.c.length]:
@ -705,7 +712,8 @@ cdef class Doc:
elif (token_j.head == token_j) and (token_k.head == token_k): elif (token_j.head == token_j) and (token_k.head == token_k):
lca_index = -1 lca_index = -1
else: else:
lca_index = __pairwise_lca(token_j.head, token_k.head, lca_matrix) lca_index = __pairwise_lca(token_j.head, token_k.head,
lca_matrix)
lca_matrix[token_j.i][token_k.i] = lca_index lca_matrix[token_j.i][token_k.i] = lca_index
lca_matrix[token_k.i][token_j.i] = lca_index lca_matrix[token_k.i][token_j.i] = lca_index
@ -719,14 +727,13 @@ cdef class Doc:
token_k = self[k] token_k = self[k]
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix) lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix)
lca_matrix[k][j] = lca_matrix[j][k] lca_matrix[k][j] = lca_matrix[j][k]
return lca_matrix return lca_matrix
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
"""Save the current state to a directory. """Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects. it doesn't exist. Paths may be either strings or Path-like objects.
""" """
with path.open('wb') as file_: with path.open('wb') as file_:
file_.write(self.to_bytes(**exclude)) file_.write(self.to_bytes(**exclude))
@ -792,7 +799,8 @@ cdef class Doc:
# keys, we must have tuples. In values we just have to hope # keys, we must have tuples. In values we just have to hope
# users don't mind getting a list instead of a tuple. # users don't mind getting a list instead of a tuple.
if 'user_data' not in exclude and 'user_data_keys' in msg: if 'user_data' not in exclude and 'user_data_keys' in msg:
user_data_keys = msgpack.loads(msg['user_data_keys'], use_list=False) user_data_keys = msgpack.loads(msg['user_data_keys'],
use_list=False)
user_data_values = msgpack.loads(msg['user_data_values']) user_data_values = msgpack.loads(msg['user_data_values'])
for key, value in zip(user_data_keys, user_data_values): for key, value in zip(user_data_keys, user_data_values):
self.user_data[key] = value self.user_data[key] = value
@ -819,14 +827,15 @@ cdef class Doc:
return self return self
def merge(self, int start_idx, int end_idx, *args, **attributes): def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at `doc.text[start_idx : end_idx]` """Retokenize the document, such that the span at
is merged into a single token. If `start_idx` and `end_idx `do not mark `doc.text[start_idx : end_idx]` is merged into a single token. If
start and end token boundaries, the document remains unchanged. `start_idx` and `end_idx `do not mark start and end token boundaries,
the document remains unchanged.
start_idx (int): The character index of the start of the slice to merge. start_idx (int): Character index of the start of the slice to merge.
end_idx (int): The character index after the end of the slice to merge. end_idx (int): Character index after the end of the slice to merge.
**attributes: Attributes to assign to the merged token. By default, **attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root token of the span. attributes are inherited from the syntactic root of the span.
RETURNS (Token): The newly merged token, or `None` if the start and end RETURNS (Token): The newly merged token, or `None` if the start and end
indices did not fall at token boundaries. indices did not fall at token boundaries.
""" """
@ -847,10 +856,11 @@ cdef class Doc:
attributes[ENT_TYPE] = attributes['ent_type'] attributes[ENT_TYPE] = attributes['ent_type']
elif args: elif args:
raise ValueError( raise ValueError(
"Doc.merge received %d non-keyword arguments. " "Doc.merge received %d non-keyword arguments. Expected either "
"Expected either 3 arguments (deprecated), or 0 (use keyword arguments). " "3 arguments (deprecated), or 0 (use keyword arguments). "
"Arguments supplied:\n%s\n" "Arguments supplied:\n%s\n"
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes))) "Keyword arguments: %s\n" % (len(args), repr(args),
repr(attributes)))
# More deprecated attribute handling =/ # More deprecated attribute handling =/
if 'label' in attributes: if 'label' in attributes:
@ -882,8 +892,9 @@ cdef class Doc:
Token.set_struct_attr(token, attr_name, attr_value) Token.set_struct_attr(token, attr_name, attr_value)
# Begin by setting all the head indices to absolute token positions # Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets # This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a dependency # Before thinking of something simpler, beware the case where a
# bridges over the entity. Here the alignment of the tokens changes. # dependency bridges over the entity. Here the alignment of the
# tokens changes.
span_root = span.root.i span_root = span.root.i
token.dep = span.root.dep token.dep = span.root.dep
# We update token.lex after keeping span root and dep, since # We update token.lex after keeping span root and dep, since
@ -932,8 +943,9 @@ cdef class Doc:
>>> trees = doc.print_tree() >>> trees = doc.print_tree()
>>> trees[1] >>> trees[1]
{'modifiers': [ {'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
'lemma': 'Alice'},
{'modifiers': [ {'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
@ -1018,4 +1030,3 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data):
copy_reg.pickle(Doc, pickle_doc, unpickle_doc) copy_reg.pickle(Doc, pickle_doc, unpickle_doc)

View File

@ -43,8 +43,8 @@ def POS_tree(root, light=False, flat=False):
def parse_tree(doc, light=False, flat=False): def parse_tree(doc, light=False, flat=False):
"""Makes a copy of the doc, then construct a syntactic parse tree, similar to """Make a copy of the doc and construct a syntactic parse tree similar to
the one used in displaCy. Generates the POS tree for all sentences in a doc. displaCy. Generates the POS tree for all sentences in a doc.
doc (Doc): The doc for parsing. doc (Doc): The doc for parsing.
RETURNS (dict): The parse tree. RETURNS (dict): The parse tree.
@ -70,4 +70,5 @@ def parse_tree(doc, light=False, flat=False):
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE])) doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]))
merge_ents(doc_clone) # merge the entities into single tokens first merge_ents(doc_clone) # merge the entities into single tokens first
return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents] return [POS_tree(sent.root, light=light, flat=flat)
for sent in doc_clone.sents]

View File

@ -35,15 +35,16 @@ cdef class Span:
def has_extension(cls, name): def has_extension(cls, name):
return name in Underscore.span_extensions return name in Underscore.span_extensions
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None, def __cinit__(self, Doc doc, int start, int end, attr_t label=0,
vector_norm=None): vector=None, vector_norm=None):
"""Create a `Span` object from the slice `doc[start : end]`. """Create a `Span` object from the slice `doc[start : end]`.
doc (Doc): The parent document. doc (Doc): The parent document.
start (int): The index of the first token of the span. start (int): The index of the first token of the span.
end (int): The index of the first token after the span. end (int): The index of the first token after the span.
label (uint64): A label to attach to the Span, e.g. for named entities. label (uint64): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. vector (ndarray[ndim=1, dtype='float32']): A meaning representation
of the span.
RETURNS (Span): The newly constructed object. RETURNS (Span): The newly constructed object.
""" """
if not (0 <= start <= end <= len(doc)): if not (0 <= start <= end <= len(doc)):
@ -127,14 +128,17 @@ cdef class Span:
@property @property
def _(self): def _(self):
"""User space for adding custom attribute extensions."""
return Underscore(Underscore.span_extensions, self, return Underscore(Underscore.span_extensions, self,
start=self.start_char, end=self.end_char) start=self.start_char, end=self.end_char)
def as_doc(self): def as_doc(self):
'''Create a Doc object view of the Span's data. # TODO: fix
"""Create a `Doc` object view of the Span's data. This is mostly
useful for C-typed interfaces.
This is mostly useful for C-typed interfaces. RETURNS (Doc): The `Doc` view of the span.
''' """
cdef Doc doc = Doc(self.doc.vocab) cdef Doc doc = Doc(self.doc.vocab)
doc.length = self.end-self.start doc.length = self.end-self.start
doc.c = &self.doc.c[self.start] doc.c = &self.doc.c[self.start]
@ -162,7 +166,8 @@ cdef class Span:
attributes are inherited from the syntactic root token of the span. attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token. RETURNS (Token): The newly merged token.
""" """
return self.doc.merge(self.start_char, self.end_char, *args, **attributes) return self.doc.merge(self.start_char, self.end_char, *args,
**attributes)
def similarity(self, other): def similarity(self, other):
"""Make a semantic similarity estimate. The default estimate is cosine """Make a semantic similarity estimate. The default estimate is cosine
@ -179,24 +184,19 @@ cdef class Span:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
def get_lca_matrix(self): def get_lca_matrix(self):
''' """Calculates the lowest common ancestor matrix for a given `Span`.
Calculates the lowest common ancestor matrix Returns LCA matrix containing the integer index of the ancestor, or -1
for a given Spacy span. if no common ancestor is found (ex if span excludes a necessary
Returns LCA matrix containing the integer index ancestor). Apologies about the recursion, but the impact on
of the ancestor, or -1 if no common ancestor is performance is negligible given the natural limitations on the depth
found (ex if span excludes a necessary ancestor). of a typical human sentence.
Apologies about the recursion, but the """
impact on performance is negligible given
the natural limitations on the depth of a typical human sentence.
'''
def __pairwise_lca(token_j, token_k, lca_matrix, margins): def __pairwise_lca(token_j, token_k, lca_matrix, margins):
offset = margins[0] offset = margins[0]
token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k
token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j
token_j_i = token_j.i - offset token_j_i = token_j.i - offset
token_k_i = token_k.i - offset token_k_i = token_k.i - offset
if lca_matrix[token_j_i][token_k_i] != -2: if lca_matrix[token_j_i][token_k_i] != -2:
return lca_matrix[token_j_i][token_k_i] return lca_matrix[token_j_i][token_k_i]
elif token_j == token_k: elif token_j == token_k:
@ -209,23 +209,19 @@ cdef class Span:
lca_index = -1 lca_index = -1
else: else:
lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins) lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins)
lca_matrix[token_j_i][token_k_i] = lca_index lca_matrix[token_j_i][token_k_i] = lca_index
lca_matrix[token_k_i][token_j_i] = lca_index lca_matrix[token_k_i][token_j_i] = lca_index
return lca_index return lca_index
lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32)
lca_matrix.fill(-2) lca_matrix.fill(-2)
margins = [self.start, self.end] margins = [self.start, self.end]
for j in range(len(self)): for j in range(len(self)):
token_j = self[j] token_j = self[j]
for k in range(len(self)): for k in range(len(self)):
token_k = self[k] token_k = self[k]
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins) lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins)
lca_matrix[k][j] = lca_matrix[j][k] lca_matrix[k][j] = lca_matrix[j][k]
return lca_matrix return lca_matrix
cpdef np.ndarray to_array(self, object py_attr_ids): cpdef np.ndarray to_array(self, object py_attr_ids):
@ -266,10 +262,7 @@ cdef class Span:
self.end = end + 1 self.end = end + 1
property sent: property sent:
"""The sentence span that this span is a part of. """RETURNS (Span): The sentence span that the span is a part of."""
RETURNS (Span): The sentence span that the span is a part of.
"""
def __get__(self): def __get__(self):
if 'sent' in self.doc.user_span_hooks: if 'sent' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sent'](self) return self.doc.user_span_hooks['sent'](self)
@ -285,10 +278,7 @@ cdef class Span:
return self.doc[root.l_edge:root.r_edge + 1] return self.doc[root.l_edge:root.r_edge + 1]
property has_vector: property has_vector:
"""A boolean value indicating whether a word vector is associated with """RETURNS (bool): Whether a word vector is associated with the object.
the object.
RETURNS (bool): Whether a word vector is associated with the object.
""" """
def __get__(self): def __get__(self):
if 'has_vector' in self.doc.user_span_hooks: if 'has_vector' in self.doc.user_span_hooks:
@ -310,10 +300,7 @@ cdef class Span:
return self._vector return self._vector
property vector_norm: property vector_norm:
"""The L2 norm of the document's vector representation. """RETURNS (float): The L2 norm of the vector representation."""
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self): def __get__(self):
if 'vector_norm' in self.doc.user_span_hooks: if 'vector_norm' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['vector'](self) return self.doc.user_span_hooks['vector'](self)
@ -327,7 +314,9 @@ cdef class Span:
return self._vector_norm return self._vector_norm
property sentiment: property sentiment:
# TODO: docstring """RETURNS (float): A scalar value indicating the positivity or
negativity of the span.
"""
def __get__(self): def __get__(self):
if 'sentiment' in self.doc.user_span_hooks: if 'sentiment' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sentiment'](self) return self.doc.user_span_hooks['sentiment'](self)
@ -335,10 +324,7 @@ cdef class Span:
return sum([token.sentiment for token in self]) / len(self) return sum([token.sentiment for token in self]) / len(self)
property text: property text:
"""A unicode representation of the span text. """RETURNS (unicode): The original verbatim text of the span."""
RETURNS (unicode): The original verbatim text of the span.
"""
def __get__(self): def __get__(self):
text = self.text_with_ws text = self.text_with_ws
if self[-1].whitespace_: if self[-1].whitespace_:
@ -349,7 +335,8 @@ cdef class Span:
"""The text content of the span with a trailing whitespace character if """The text content of the span with a trailing whitespace character if
the last token has one. the last token has one.
RETURNS (unicode): The text content of the span (with trailing whitespace). RETURNS (unicode): The text content of the span (with trailing
whitespace).
""" """
def __get__(self): def __get__(self):
return u''.join([t.text_with_ws for t in self]) return u''.join([t.text_with_ws for t in self])
@ -358,7 +345,8 @@ cdef class Span:
"""Yields base noun-phrase `Span` objects, if the document has been """Yields base noun-phrase `Span` objects, if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun syntactically parsed. A base noun phrase, or "NP chunk", is a noun
phrase that does not permit other NPs to be nested within it so no phrase that does not permit other NPs to be nested within it so no
NP-level coordination, no prepositional phrases, and no relative clauses. NP-level coordination, no prepositional phrases, and no relative
clauses.
YIELDS (Span): Base noun-phrase `Span` objects YIELDS (Span): Base noun-phrase `Span` objects
""" """
@ -366,12 +354,14 @@ cdef class Span:
if not self.doc.is_parsed: if not self.doc.is_parsed:
raise ValueError( raise ValueError(
"noun_chunks requires the dependency parse, which " "noun_chunks requires the dependency parse, which "
"requires data to be installed. For more info, see the " "requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__) "documentation: \n%s\n" % about.__docs_models__)
# Accumulate the result before beginning to iterate over it. This prevents # Accumulate the result before beginning to iterate over it. This
# the tokenisation from being changed out from under us during the iteration. # prevents the tokenisation from being changed out from under us
# The tricky thing here is that Span accepts its tokenisation changing, # during the iteration. The tricky thing here is that Span accepts
# so it's okay once we have the Span objects. See Issue #375 # its tokenisation changing, so it's okay once we have the Span
# objects. See Issue #375
spans = [] spans = []
cdef attr_t label cdef attr_t label
for start, end, label in self.doc.noun_chunks_iterator(self): for start, end, label in self.doc.noun_chunks_iterator(self):
@ -385,9 +375,9 @@ cdef class Span:
RETURNS (Token): The root token. RETURNS (Token): The root token.
EXAMPLE: The root token has the shortest path to the root of the sentence EXAMPLE: The root token has the shortest path to the root of the
(or is the root itself). If multiple words are equally high in the sentence (or is the root itself). If multiple words are equally
tree, the first word is taken. For example: high in the tree, the first word is taken. For example:
>>> toks = nlp(u'I like New York in Autumn.') >>> toks = nlp(u'I like New York in Autumn.')
@ -437,11 +427,11 @@ cdef class Span:
if self.doc.c[i].head == 0: if self.doc.c[i].head == 0:
return self.doc[i] return self.doc[i]
# If we don't have a sentence root, we do something that's not so # If we don't have a sentence root, we do something that's not so
# algorithmically clever, but I think should be quite fast, especially # algorithmically clever, but I think should be quite fast,
# for short spans. # especially for short spans.
# For each word, we count the path length, and arg min this measure. # For each word, we count the path length, and arg min this measure.
# We could use better tree logic to save steps here...But I think this # We could use better tree logic to save steps here...But I
# should be okay. # think this should be okay.
cdef int current_best = self.doc.length cdef int current_best = self.doc.length
cdef int root = -1 cdef int root = -1
for i in range(self.start, self.end): for i in range(self.start, self.end):
@ -463,7 +453,7 @@ cdef class Span:
YIELDS (Token):A left-child of a token of the span. YIELDS (Token):A left-child of a token of the span.
""" """
def __get__(self): def __get__(self):
for token in reversed(self): # Reverse, so we get the tokens in order for token in reversed(self): # Reverse, so we get tokens in order
for left in token.lefts: for left in token.lefts:
if left.i < self.start: if left.i < self.start:
yield left yield left
@ -480,6 +470,22 @@ cdef class Span:
if right.i >= self.end: if right.i >= self.end:
yield right yield right
property n_lefts:
"""RETURNS (int): The number of leftward immediate children of the
span, in the syntactic dependency parse.
"""
# TODO: implement
def __get__(self):
raise NotImplementedError
property n_rights:
"""RETURNS (int): The number of rightward immediate children of the
span, in the syntactic dependency parse.
"""
# TODO: implement
def __get__(self):
raise NotImplementedError
property subtree: property subtree:
"""Tokens that descend from tokens in the span, but fall outside it. """Tokens that descend from tokens in the span, but fall outside it.
@ -493,66 +499,55 @@ cdef class Span:
yield from word.subtree yield from word.subtree
property ent_id: property ent_id:
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`. """RETURNS (uint64): The entity ID."""
RETURNS (uint64): The entity ID.
"""
def __get__(self): def __get__(self):
return self.root.ent_id return self.root.ent_id
def __set__(self, hash_t key): def __set__(self, hash_t key):
# TODO
raise NotImplementedError( raise NotImplementedError(
"Can't yet set ent_id from Span. Vote for this feature on the issue " "Can't yet set ent_id from Span. Vote for this feature on "
"tracker: http://github.com/explosion/spaCy/issues") "the issue tracker: http://github.com/explosion/spaCy/issues")
property ent_id_: property ent_id_:
"""A (string) entity ID. Usually assigned by patterns in the `Matcher`. """RETURNS (unicode): The (string) entity ID."""
RETURNS (unicode): The entity ID.
"""
def __get__(self): def __get__(self):
return self.root.ent_id_ return self.root.ent_id_
def __set__(self, hash_t key): def __set__(self, hash_t key):
# TODO
raise NotImplementedError( raise NotImplementedError(
"Can't yet set ent_id_ from Span. Vote for this feature on the issue " "Can't yet set ent_id_ from Span. Vote for this feature on the "
"tracker: http://github.com/explosion/spaCy/issues") "issue tracker: http://github.com/explosion/spaCy/issues")
property orth_: property orth_:
# TODO: docstring """Verbatim text content (identical to Span.text). Exists mostly for
consistency with other attributes.
RETURNS (unicode): The span's text."""
def __get__(self): def __get__(self):
return ''.join([t.string for t in self]).strip() return ''.join([t.orth_ for t in self]).strip()
property lemma_: property lemma_:
"""The span's lemma. """RETURNS (unicode): The span's lemma."""
RETURNS (unicode): The span's lemma.
"""
def __get__(self): def __get__(self):
return ' '.join([t.lemma_ for t in self]).strip() return ' '.join([t.lemma_ for t in self]).strip()
property upper_: property upper_:
# TODO: docstring """Deprecated. Use Span.text.upper() instead."""
def __get__(self): def __get__(self):
return ''.join([t.string.upper() for t in self]).strip() return ''.join([t.text_with_ws.upper() for t in self]).strip()
property lower_: property lower_:
# TODO: docstring """Deprecated. Use Span.text.lower() instead."""
def __get__(self): def __get__(self):
return ''.join([t.string.lower() for t in self]).strip() return ''.join([t.text_with_ws.lower() for t in self]).strip()
property string: property string:
# TODO: docstring """Deprecated: Use Span.text_with_ws instead."""
def __get__(self): def __get__(self):
return ''.join([t.string for t in self]) return ''.join([t.text_with_ws for t in self])
property label_: property label_:
"""The span's label. """RETURNS (unicode): The span's label."""
RETURNS (unicode): The span's label.
"""
def __get__(self): def __get__(self):
return self.doc.vocab.strings[self.label] return self.doc.vocab.strings[self.label]
@ -570,7 +565,8 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
n += 1 n += 1
if n >= sent_length: if n >= sent_length:
raise RuntimeError( raise RuntimeError(
"Array bounds exceeded while searching for root word. This likely " "Array bounds exceeded while searching for root word. This "
"means the parse tree is in an invalid state. Please report this " "likely means the parse tree is in an invalid state. Please "
"issue here: http://github.com/explosion/spaCy/issues") "report this issue here: "
"http://github.com/explosion/spaCy/issues")
return n return n

View File

@ -14,17 +14,18 @@ from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from .. import parts_of_speech from .. import parts_of_speech
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
from ..attrs cimport LEMMA, POS, TAG, DEP from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
from ..compat import is_config from ..compat import is_config
from .. import about from .. import about
from .underscore import Underscore from .underscore import Underscore
cdef class Token: cdef class Token:
"""An individual token i.e. a word, punctuation symbol, whitespace, etc.""" """An individual token i.e. a word, punctuation symbol, whitespace,
etc."""
@classmethod @classmethod
def set_extension(cls, name, default=None, method=None, def set_extension(cls, name, default=None, method=None,
getter=None, setter=None): getter=None, setter=None):
@ -144,37 +145,33 @@ cdef class Token:
return self.doc.user_token_hooks['similarity'](self) return self.doc.user_token_hooks['similarity'](self)
if self.vector_norm == 0 or other.vector_norm == 0: if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0 return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return (numpy.dot(self.vector, other.vector) /
(self.vector_norm * other.vector_norm))
property lex_id: property lex_id:
"""ID of the token's lexical type. """RETURNS (int): Sequential ID of the token's lexical type."""
RETURNS (int): ID of the token's lexical type."""
def __get__(self): def __get__(self):
return self.c.lex.id return self.c.lex.id
property rank: property rank:
# TODO: add docstring """RETURNS (int): Sequential ID of the token's lexical type, used to
index into tables, e.g. for word vectors."""
def __get__(self): def __get__(self):
return self.c.lex.id return self.c.lex.id
property string: property string:
"""Deprecated: Use Token.text_with_ws instead."""
def __get__(self): def __get__(self):
return self.text_with_ws return self.text_with_ws
property text: property text:
"""A unicode representation of the token text. """RETURNS (unicode): The original verbatim text of the token."""
RETURNS (unicode): The original verbatim text of the token.
"""
def __get__(self): def __get__(self):
return self.orth_ return self.orth_
property text_with_ws: property text_with_ws:
"""The text content of the token with a trailing whitespace character if """RETURNS (unicode): The text content of the span (with trailing
it has one. whitespace).
RETURNS (unicode): The text content of the span (with trailing whitespace).
""" """
def __get__(self): def __get__(self):
cdef unicode orth = self.vocab.strings[self.c.lex.orth] cdef unicode orth = self.vocab.strings[self.c.lex.orth]
@ -184,74 +181,104 @@ cdef class Token:
return orth return orth
property prob: property prob:
"""RETURNS (float): Smoothed log probability estimate of token type."""
def __get__(self): def __get__(self):
return self.c.lex.prob return self.c.lex.prob
property sentiment: property sentiment:
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the token."""
def __get__(self): def __get__(self):
if 'sentiment' in self.doc.user_token_hooks: if 'sentiment' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['sentiment'](self) return self.doc.user_token_hooks['sentiment'](self)
return self.c.lex.sentiment return self.c.lex.sentiment
property lang: property lang:
"""RETURNS (uint64): ID of the language of the parent document's
vocabulary.
"""
def __get__(self): def __get__(self):
return self.c.lex.lang return self.c.lex.lang
property idx: property idx:
"""RETURNS (int): The character offset of the token within the parent
document.
"""
def __get__(self): def __get__(self):
return self.c.idx return self.c.idx
property cluster: property cluster:
"""RETURNS (int): Brown cluster ID."""
def __get__(self): def __get__(self):
return self.c.lex.cluster return self.c.lex.cluster
property orth: property orth:
"""RETURNS (uint64): ID of the verbatim text content."""
def __get__(self): def __get__(self):
return self.c.lex.orth return self.c.lex.orth
property lower: property lower:
"""RETURNS (uint64): ID of the lowercase token text."""
def __get__(self): def __get__(self):
return self.c.lex.lower return self.c.lex.lower
property norm: property norm:
"""RETURNS (uint64): ID of the token's norm, i.e. a normalised form of
the token text. Usually set in the language's tokenizer exceptions
or norm exceptions.
"""
def __get__(self): def __get__(self):
return self.c.lex.norm return self.c.lex.norm
property shape: property shape:
"""RETURNS (uint64): ID of the token's shape, a transform of the
tokens's string, to show orthographic features (e.g. "Xxxx", "dd").
"""
def __get__(self): def __get__(self):
return self.c.lex.shape return self.c.lex.shape
property prefix: property prefix:
"""RETURNS (uint64): ID of a length-N substring from the start of the
token. Defaults to `N=1`.
"""
def __get__(self): def __get__(self):
return self.c.lex.prefix return self.c.lex.prefix
property suffix: property suffix:
"""RETURNS (uint64): ID of a length-N substring from the end of the
token. Defaults to `N=3`.
"""
def __get__(self): def __get__(self):
return self.c.lex.suffix return self.c.lex.suffix
property lemma: property lemma:
"""Base form of the word, with no inflectional suffixes. """RETURNS (uint64): ID of the base form of the word, with no
inflectional suffixes.
RETURNS (uint64): Token lemma.
""" """
def __get__(self): def __get__(self):
return self.c.lemma return self.c.lemma
def __set__(self, attr_t lemma): def __set__(self, attr_t lemma):
self.c.lemma = lemma self.c.lemma = lemma
property pos: property pos:
"""RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
def __get__(self): def __get__(self):
return self.c.pos return self.c.pos
property tag: property tag:
"""RETURNS (uint64): ID of fine-grained part-of-speech tag."""
def __get__(self): def __get__(self):
return self.c.tag return self.c.tag
def __set__(self, attr_t tag): def __set__(self, attr_t tag):
self.vocab.morphology.assign_tag(self.c, tag) self.vocab.morphology.assign_tag(self.c, tag)
property dep: property dep:
"""RETURNS (uint64): ID of syntactic dependency label."""
def __get__(self): def __get__(self):
return self.c.dep return self.c.dep
def __set__(self, attr_t label): def __set__(self, attr_t label):
self.c.dep = label self.c.dep = label
@ -292,23 +319,29 @@ cdef class Token:
return numpy.sqrt((vector ** 2).sum()) return numpy.sqrt((vector ** 2).sum())
property n_lefts: property n_lefts:
"""RETURNS (int): The number of leftward immediate children of the
word, in the syntactic dependency parse.
"""
def __get__(self): def __get__(self):
return self.c.l_kids return self.c.l_kids
property n_rights: property n_rights:
"""RETURNS (int): The number of rightward immediate children of the
word, in the syntactic dependency parse.
"""
def __get__(self): def __get__(self):
return self.c.r_kids return self.c.r_kids
property sent_start: property sent_start:
# TODO: fix and document
def __get__(self): def __get__(self):
return self.c.sent_start return self.c.sent_start
def __set__(self, value): def __set__(self, value):
if self.doc.is_parsed: if self.doc.is_parsed:
raise ValueError( raise ValueError(
'Refusing to write to token.sent_start if its document is parsed, ' "Refusing to write to token.sent_start if its document "
'because this may cause inconsistent state. ' "is parsed, because this may cause inconsistent state.")
'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
if value is None: if value is None:
self.c.sent_start = 0 self.c.sent_start = 0
elif value is True: elif value is True:
@ -316,15 +349,16 @@ cdef class Token:
elif value is False: elif value is False:
self.c.sent_start = -1 self.c.sent_start = -1
else: else:
raise ValueError("Invalid value for token.sent_start -- must be one of " raise ValueError("Invalid value for token.sent_start. Must be "
"None, True, False") "one of: None, True, False")
property lefts: property lefts:
def __get__(self): """The leftward immediate children of the word, in the syntactic
"""
The leftward immediate children of the word, in the syntactic
dependency parse. dependency parse.
YIELDS (Token): A left-child of the token.
""" """
def __get__(self):
cdef int nr_iter = 0 cdef int nr_iter = 0
cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge) cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
while ptr < self.c: while ptr < self.c:
@ -334,15 +368,16 @@ cdef class Token:
nr_iter += 1 nr_iter += 1
# This is ugly, but it's a way to guard out infinite loops # This is ugly, but it's a way to guard out infinite loops
if nr_iter >= 10000000: if nr_iter >= 10000000:
raise RuntimeError( raise RuntimeError("Possibly infinite loop encountered "
"Possibly infinite loop encountered while looking for token.lefts") "while looking for token.lefts")
property rights: property rights:
def __get__(self): """The rightward immediate children of the word, in the syntactic
"""
The rightward immediate children of the word, in the syntactic
dependency parse. dependency parse.
YIELDS (Token): A right-child of the token.
""" """
def __get__(self):
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
tokens = [] tokens = []
cdef int nr_iter = 0 cdef int nr_iter = 0
@ -352,27 +387,26 @@ cdef class Token:
ptr -= 1 ptr -= 1
nr_iter += 1 nr_iter += 1
if nr_iter >= 10000000: if nr_iter >= 10000000:
raise RuntimeError( raise RuntimeError("Possibly infinite loop encountered "
"Possibly infinite loop encountered while looking for token.rights") "while looking for token.rights")
tokens.reverse() tokens.reverse()
for t in tokens: for t in tokens:
yield t yield t
property children: property children:
""" """A sequence of the token's immediate syntactic children.
A sequence of the token's immediate syntactic children.
Yields: Token A child token such that child.head==self YIELDS (Token): A child token such that child.head==self
""" """
def __get__(self): def __get__(self):
yield from self.lefts yield from self.lefts
yield from self.rights yield from self.rights
property subtree: property subtree:
""" """A sequence of all the token's syntactic descendents.
A sequence of all the token's syntactic descendents.
Yields: Token A descendent token such that self.is_ancestor(descendent) YIELDS (Token): A descendent token such that
`self.is_ancestor(descendent)`.
""" """
def __get__(self): def __get__(self):
for word in self.lefts: for word in self.lefts:
@ -427,13 +461,12 @@ cdef class Token:
property head: property head:
"""The syntactic parent, or "governor", of this token. """The syntactic parent, or "governor", of this token.
RETURNS (Token): The token head. RETURNS (Token): The token predicted by the parser to be the head of
the current token.
""" """
def __get__(self): def __get__(self):
"""The token predicted by the parser to be the head of the current
token.
"""
return self.doc[self.i + self.c.head] return self.doc[self.i + self.c.head]
def __set__(self, Token new_head): def __set__(self, Token new_head):
# this function sets the head of self to new_head # this function sets the head of self to new_head
# and updates the counters for left/right dependents # and updates the counters for left/right dependents
@ -456,13 +489,15 @@ cdef class Token:
if self.c.head > 0: # left dependent if self.c.head > 0: # left dependent
old_head.c.l_kids -= 1 old_head.c.l_kids -= 1
if self.c.l_edge == old_head.c.l_edge: if self.c.l_edge == old_head.c.l_edge:
# the token dominates the left edge so the left edge of the head # the token dominates the left edge so the left edge of
# may change when the token is reattached # the head may change when the token is reattached, it may
# it may not change if the new head is a descendant of the current head # not change if the new head is a descendant of the current
# head
new_edge = self.c.l_edge new_edge = self.c.l_edge
# the new l_edge is the left-most l_edge on any of the other dependents # the new l_edge is the left-most l_edge on any of the
# where the l_edge is left of the head, otherwise it is the head # other dependents where the l_edge is left of the head,
# otherwise it is the head
if not is_desc: if not is_desc:
new_edge = old_head.i new_edge = old_head.i
for child in old_head.children: for child in old_head.children:
@ -472,8 +507,9 @@ cdef class Token:
new_edge = child.c.l_edge new_edge = child.c.l_edge
old_head.c.l_edge = new_edge old_head.c.l_edge = new_edge
# walk up the tree from old_head and assign new l_edge to ancestors # walk up the tree from old_head and assign new l_edge to
# until an ancestor already has an l_edge that's further left # ancestors until an ancestor already has an l_edge that's
# further left
for anc in old_head.ancestors: for anc in old_head.ancestors:
if anc.c.l_edge <= new_edge: if anc.c.l_edge <= new_edge:
break break
@ -542,12 +578,10 @@ cdef class Token:
yield from word.conjuncts yield from word.conjuncts
property ent_type: property ent_type:
"""Named entity type. """RETURNS (uint64): Named entity type."""
RETURNS (uint64): Named entity type.
"""
def __get__(self): def __get__(self):
return self.c.ent_type return self.c.ent_type
def __set__(self, ent_type): def __set__(self, ent_type):
self.c.ent_type = ent_type self.c.ent_type = ent_type
@ -561,19 +595,17 @@ cdef class Token:
return self.c.ent_iob return self.c.ent_iob
property ent_type_: property ent_type_:
"""Named entity type. """RETURNS (unicode): Named entity type."""
RETURNS (unicode): Named entity type.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.ent_type] return self.vocab.strings[self.c.ent_type]
def __set__(self, ent_type): def __set__(self, ent_type):
self.c.ent_type = self.vocab.strings.add(ent_type) self.c.ent_type = self.vocab.strings.add(ent_type)
property ent_iob_: property ent_iob_:
"""IOB code of named entity tag. "B" means the token begins an entity, """IOB code of named entity tag. "B" means the token begins an entity,
"I" means it is inside an entity, "O" means it is outside an entity, and "I" means it is inside an entity, "O" means it is outside an entity,
"" means no entity tag is set. and "" means no entity tag is set.
RETURNS (unicode): IOB code of named entity tag. RETURNS (unicode): IOB code of named entity tag.
""" """
@ -582,10 +614,8 @@ cdef class Token:
return iob_strings[self.c.ent_iob] return iob_strings[self.c.ent_iob]
property ent_id: property ent_id:
"""ID of the entity the token is an instance of, if any. Usually """RETURNS (uint64): ID of the entity the token is an instance of,
assigned by patterns in the Matcher. if any.
RETURNS (uint64): ID of the entity.
""" """
def __get__(self): def __get__(self):
return self.c.ent_id return self.c.ent_id
@ -594,10 +624,8 @@ cdef class Token:
self.c.ent_id = key self.c.ent_id = key
property ent_id_: property ent_id_:
"""ID of the entity the token is an instance of, if any. Usually """RETURNS (unicode): ID of the entity the token is an instance of,
assigned by patterns in the Matcher. if any.
RETURNS (unicode): ID of the entity.
""" """
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.ent_id] return self.vocab.strings[self.c.ent_id]
@ -606,107 +634,192 @@ cdef class Token:
self.c.ent_id = self.vocab.strings.add(name) self.c.ent_id = self.vocab.strings.add(name)
property whitespace_: property whitespace_:
"""RETURNS (unicode): The trailing whitespace character, if present.
"""
def __get__(self): def __get__(self):
return ' ' if self.c.spacy else '' return ' ' if self.c.spacy else ''
property orth_: property orth_:
"""RETURNS (unicode): Verbatim text content (identical to
`Token.text`). Exists mostly for consistency with the other
attributes.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lex.orth] return self.vocab.strings[self.c.lex.orth]
property lower_: property lower_:
"""RETURNS (unicode): The lowercase token text. Equivalent to
`Token.text.lower()`.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lex.lower] return self.vocab.strings[self.c.lex.lower]
property norm_: property norm_:
"""RETURNS (unicode): The token's norm, i.e. a normalised form of the
token text. Usually set in the language's tokenizer exceptions or
norm exceptions.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lex.norm] return self.vocab.strings[self.c.lex.norm]
property shape_: property shape_:
"""RETURNS (unicode): Transform of the token's string, to show
orthographic features. For example, "Xxxx" or "dd".
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lex.shape] return self.vocab.strings[self.c.lex.shape]
property prefix_: property prefix_:
"""RETURNS (unicode): A length-N substring from the start of the token.
Defaults to `N=1`.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lex.prefix] return self.vocab.strings[self.c.lex.prefix]
property suffix_: property suffix_:
"""RETURNS (unicode): A length-N substring from the end of the token.
Defaults to `N=3`.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lex.suffix] return self.vocab.strings[self.c.lex.suffix]
property lang_: property lang_:
"""RETURNS (unicode): Language of the parent document's vocabulary,
e.g. 'en'.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lex.lang] return self.vocab.strings[self.c.lex.lang]
property lemma_: property lemma_:
"""Base form of the word, with no inflectional suffixes. """RETURNS (unicode): The token lemma, i.e. the base form of the word,
with no inflectional suffixes.
RETURNS (unicode): Token lemma.
""" """
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lemma] return self.vocab.strings[self.c.lemma]
def __set__(self, unicode lemma_): def __set__(self, unicode lemma_):
self.c.lemma = self.vocab.strings.add(lemma_) self.c.lemma = self.vocab.strings.add(lemma_)
property pos_: property pos_:
"""RETURNS (unicode): Coarse-grained part-of-speech tag."""
def __get__(self): def __get__(self):
return parts_of_speech.NAMES[self.c.pos] return parts_of_speech.NAMES[self.c.pos]
property tag_: property tag_:
"""RETURNS (unicode): Fine-grained part-of-speech tag."""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.tag] return self.vocab.strings[self.c.tag]
def __set__(self, tag): def __set__(self, tag):
self.tag = self.vocab.strings.add(tag) self.tag = self.vocab.strings.add(tag)
property dep_: property dep_:
"""RETURNS (unicode): The syntactic dependency label."""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.dep] return self.vocab.strings[self.c.dep]
def __set__(self, unicode label): def __set__(self, unicode label):
self.c.dep = self.vocab.strings.add(label) self.c.dep = self.vocab.strings.add(label)
property is_oov: property is_oov:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV) """RETURNS (bool): Whether the token is out-of-vocabulary."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_OOV)
property is_stop: property is_stop:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP) """RETURNS (bool): Whether the token is a stop word, i.e. part of a
"stop list" defined by the language data.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_STOP)
property is_alpha: property is_alpha:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA) """RETURNS (bool): Whether the token consists of alpha characters.
Equivalent to `token.text.isalpha()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
property is_ascii: property is_ascii:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII) """RETURNS (bool): Whether the token consists of ASCII characters.
Equivalent to `all(ord(c) < 128 for c in token.text)`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
property is_digit: property is_digit:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT) """RETURNS (bool): Whether the token consists of digits. Equivalent to
`token.text.isdigit()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
property is_lower: property is_lower:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER) """RETURNS (bool): Whether the token is in lowercase. Equivalent to
`token.text.islower()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
property is_upper:
"""RETURNS (bool): Whether the token is in uppercase. Equivalent to
`token.text.isupper()`
"""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
property is_title: property is_title:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE) """RETURNS (bool): Whether the token is in titlecase. Equivalent to
`token.text.istitle()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
property is_punct: property is_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) """RETURNS (bool): Whether the token is punctuation."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
property is_space: property is_space:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE) """RETURNS (bool): Whether the token consists of whitespace characters.
Equivalent to `token.text.isspace()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
property is_bracket: property is_bracket:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) """RETURNS (bool): Whether the token is a bracket."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
property is_quote: property is_quote:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) """RETURNS (bool): Whether the token is a quotation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
property is_left_punct: property is_left_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) """RETURNS (bool): Whether the token is a left punctuation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
property is_right_punct: property is_right_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) """RETURNS (bool): Whether the token is a right punctuation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
property like_url: property like_url:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL) """RETURNS (bool): Whether the token resembles a URL."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
property like_num: property like_num:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM) """RETURNS (bool): Whether the token resembles a number, e.g. "10.9",
"10", "ten", etc.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
property like_email: property like_email:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL) """RETURNS (bool): Whether the token resembles an email address."""
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)

View File

@ -1,5 +1,9 @@
# coding: utf8
from __future__ import unicode_literals
import functools import functools
class Underscore(object): class Underscore(object):
doc_extensions = {} doc_extensions = {}
span_extensions = {} span_extensions = {}

View File

@ -1 +0,0 @@

View File

@ -10,25 +10,27 @@ from pathlib import Path
import sys import sys
import textwrap import textwrap
import random import random
import numpy
import io
import dill
from collections import OrderedDict from collections import OrderedDict
from thinc.neural._classes.model import Model from thinc.neural._classes.model import Model
import functools import functools
from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
from .compat import import_file
import msgpack import msgpack
import msgpack_numpy import msgpack_numpy
msgpack_numpy.patch() msgpack_numpy.patch()
import ujson
from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
from .compat import copy_array, normalize_string_keys, getattr_, import_file
LANGUAGES = {} LANGUAGES = {}
_data_path = Path(__file__).parent / 'data' _data_path = Path(__file__).parent / 'data'
_PRINT_ENV = False
def set_env_log(value):
global _PRINT_ENV
_PRINT_ENV = value
def get_lang_class(lang): def get_lang_class(lang):
@ -38,11 +40,12 @@ def get_lang_class(lang):
RETURNS (Language): Language class. RETURNS (Language): Language class.
""" """
global LANGUAGES global LANGUAGES
if not lang in LANGUAGES: if lang not in LANGUAGES:
try: try:
module = importlib.import_module('.lang.%s' % lang, 'spacy') module = importlib.import_module('.lang.%s' % lang, 'spacy')
except ImportError: except ImportError:
raise ImportError("Can't import language %s from spacy.lang." %lang) msg = "Can't import language %s from spacy.lang."
raise ImportError(msg % lang)
LANGUAGES[lang] = getattr(module, module.__all__[0]) LANGUAGES[lang] = getattr(module, module.__all__[0])
return LANGUAGES[lang] return LANGUAGES[lang]
@ -100,8 +103,8 @@ def load_model(name, **overrides):
data_path = get_data_path() data_path = get_data_path()
if not data_path or not data_path.exists(): if not data_path or not data_path.exists():
raise IOError("Can't find spaCy data path: %s" % path2str(data_path)) raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
if isinstance(name, basestring_): if isinstance(name, basestring_): # in data dir / shortcut
if name in set([d.name for d in data_path.iterdir()]): # in data dir / shortcut if name in set([d.name for d in data_path.iterdir()]):
return load_model_from_link(name, **overrides) return load_model_from_link(name, **overrides)
if is_package(name): # installed as package if is_package(name): # installed as package
return load_model_from_package(name, **overrides) return load_model_from_package(name, **overrides)
@ -120,7 +123,7 @@ def load_model_from_link(name, **overrides):
except AttributeError: except AttributeError:
raise IOError( raise IOError(
"Cant' load '%s'. If you're using a shortcut link, make sure it " "Cant' load '%s'. If you're using a shortcut link, make sure it "
"points to a valid model package (not just a data directory)." % name) "points to a valid package (not just a data directory)." % name)
return cls.load(**overrides) return cls.load(**overrides)
@ -164,7 +167,8 @@ def load_model_from_init_py(init_file, **overrides):
data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version']) data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
data_path = model_path / data_dir data_path = model_path / data_dir
if not model_path.exists(): if not model_path.exists():
raise ValueError("Can't find model directory: %s" % path2str(data_path)) msg = "Can't find model directory: %s"
raise ValueError(msg % path2str(data_path))
return load_model_from_path(data_path, meta, **overrides) return load_model_from_path(data_path, meta, **overrides)
@ -176,14 +180,16 @@ def get_model_meta(path):
""" """
model_path = ensure_path(path) model_path = ensure_path(path)
if not model_path.exists(): if not model_path.exists():
raise ValueError("Can't find model directory: %s" % path2str(model_path)) msg = "Can't find model directory: %s"
raise ValueError(msg % path2str(model_path))
meta_path = model_path / 'meta.json' meta_path = model_path / 'meta.json'
if not meta_path.is_file(): if not meta_path.is_file():
raise IOError("Could not read meta.json from %s" % meta_path) raise IOError("Could not read meta.json from %s" % meta_path)
meta = read_json(meta_path) meta = read_json(meta_path)
for setting in ['lang', 'name', 'version']: for setting in ['lang', 'name', 'version']:
if setting not in meta or not meta[setting]: if setting not in meta or not meta[setting]:
raise ValueError("No valid '%s' setting found in model meta.json" % setting) msg = "No valid '%s' setting found in model meta.json"
raise ValueError(msg % setting)
return meta return meta
@ -274,12 +280,6 @@ def itershuffle(iterable, bufsize=1000):
raise StopIteration raise StopIteration
_PRINT_ENV = False
def set_env_log(value):
global _PRINT_ENV
_PRINT_ENV = value
def env_opt(name, default=None): def env_opt(name, default=None):
if type(default) is float: if type(default) is float:
type_convert = float type_convert = float
@ -305,17 +305,20 @@ def read_regex(path):
path = ensure_path(path) path = ensure_path(path)
with path.open() as file_: with path.open() as file_:
entries = file_.read().split('\n') entries = file_.read().split('\n')
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) expression = '|'.join(['^' + re.escape(piece)
for piece in entries if piece.strip()])
return re.compile(expression) return re.compile(expression)
def compile_prefix_regex(entries): def compile_prefix_regex(entries):
if '(' in entries: if '(' in entries:
# Handle deprecated data # Handle deprecated data
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) expression = '|'.join(['^' + re.escape(piece)
for piece in entries if piece.strip()])
return re.compile(expression) return re.compile(expression)
else: else:
expression = '|'.join(['^' + piece for piece in entries if piece.strip()]) expression = '|'.join(['^' + piece
for piece in entries if piece.strip()])
return re.compile(expression) return re.compile(expression)
@ -359,16 +362,15 @@ def update_exc(base_exceptions, *addition_dicts):
exc = dict(base_exceptions) exc = dict(base_exceptions)
for additions in addition_dicts: for additions in addition_dicts:
for orth, token_attrs in additions.items(): for orth, token_attrs in additions.items():
if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs): if not all(isinstance(attr[ORTH], unicode_)
msg = "Invalid value for ORTH in exception: key='%s', orths='%s'" for attr in token_attrs):
msg = "Invalid ORTH value in exception: key='%s', orths='%s'"
raise ValueError(msg % (orth, token_attrs)) raise ValueError(msg % (orth, token_attrs))
described_orth = ''.join(attr[ORTH] for attr in token_attrs) described_orth = ''.join(attr[ORTH] for attr in token_attrs)
if orth != described_orth: if orth != described_orth:
raise ValueError("Invalid tokenizer exception: ORTH values " msg = ("Invalid tokenizer exception: ORTH values combined "
"combined don't match original string. " "don't match original string. key='%s', orths='%s'")
"key='%s', orths='%s'" % (orth, described_orth)) raise ValueError(msg % (orth, described_orth))
# overlap = set(exc.keys()).intersection(set(additions))
# assert not overlap, overlap
exc.update(additions) exc.update(additions)
exc = expand_exc(exc, "'", "") exc = expand_exc(exc, "'", "")
return exc return exc
@ -405,13 +407,11 @@ def normalize_slice(length, start, stop, step=None):
elif start < 0: elif start < 0:
start += length start += length
start = min(length, max(0, start)) start = min(length, max(0, start))
if stop is None: if stop is None:
stop = length stop = length
elif stop < 0: elif stop < 0:
stop += length stop += length
stop = min(length, max(start, stop)) stop = min(length, max(start, stop))
assert 0 <= start <= stop <= length assert 0 <= start <= stop <= length
return start, stop return start, stop
@ -530,17 +530,19 @@ def print_markdown(data, title=None):
if isinstance(data, dict): if isinstance(data, dict):
data = list(data.items()) data = list(data.items())
markdown = ["* **{}:** {}".format(l, unicode_(v)) for l, v in data if not excl_value(v)] markdown = ["* **{}:** {}".format(l, unicode_(v))
for l, v in data if not excl_value(v)]
if title: if title:
print("\n## {}".format(title)) print("\n## {}".format(title))
print('\n{}\n'.format('\n'.join(markdown))) print('\n{}\n'.format('\n'.join(markdown)))
def prints(*texts, **kwargs): def prints(*texts, **kwargs):
"""Print formatted message (manual ANSI escape sequences to avoid dependency) """Print formatted message (manual ANSI escape sequences to avoid
dependency)
*texts (unicode): Texts to print. Each argument is rendered as paragraph. *texts (unicode): Texts to print. Each argument is rendered as paragraph.
**kwargs: 'title' becomes coloured headline. 'exits'=True performs sys exit. **kwargs: 'title' becomes coloured headline. exits=True performs sys exit.
""" """
exits = kwargs.get('exits', None) exits = kwargs.get('exits', None)
title = kwargs.get('title', None) title = kwargs.get('title', None)
@ -570,7 +572,8 @@ def _wrap(text, wrap_max=80, indent=4):
def minify_html(html): def minify_html(html):
"""Perform a template-specific, rudimentary HTML minification for displaCy. """Perform a template-specific, rudimentary HTML minification for displaCy.
Disclaimer: NOT a general-purpose solution, only removes indentation/newlines. Disclaimer: NOT a general-purpose solution, only removes indentation and
newlines.
html (unicode): Markup to minify. html (unicode): Markup to minify.
RETURNS (unicode): "Minified" HTML. RETURNS (unicode): "Minified" HTML.

View File

@ -1,5 +1,6 @@
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from libc.stdint cimport int32_t, uint64_t
import numpy import numpy
from collections import OrderedDict from collections import OrderedDict
import msgpack import msgpack
@ -9,23 +10,20 @@ cimport numpy as np
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module
from thinc.neural._classes.model import Model from thinc.neural._classes.model import Model
from .typedefs cimport attr_t
from .strings cimport StringStore from .strings cimport StringStore
from . import util
from .compat import basestring_, path2str from .compat import basestring_, path2str
from . import util
cdef class Vectors: cdef class Vectors:
'''Store, save and load word vectors. """Store, save and load word vectors.
Vectors data is kept in the vectors.data attribute, which should be an Vectors data is kept in the vectors.data attribute, which should be an
instance of numpy.ndarray (for CPU vectors) instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
or cupy.ndarray (for GPU vectors). (for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
rows in the vectors.data table. The array `vectors.keys` keeps the keys in
vectors.key2row is a dictionary mapping word hashes to rows order, such that `keys[vectors.key2row[key]] == key`.
in the vectors.data table. The array `vectors.keys` keeps """
the keys in order, such that keys[vectors.key2row[key]] == key.
'''
cdef public object data cdef public object data
cdef readonly StringStore strings cdef readonly StringStore strings
cdef public object key2row cdef public object key2row
@ -33,6 +31,16 @@ cdef class Vectors:
cdef public int i cdef public int i
def __init__(self, strings, width=0, data=None): def __init__(self, strings, width=0, data=None):
"""Create a new vector store. To keep the vector table empty, pass
`width=0`. You can also create the vector table and add vectors one by
one, or set the vector values directly on initialisation.
strings (StringStore or list): List of strings or StringStore that maps
strings to hash values, and vice versa.
width (int): Number of dimensions.
data (numpy.ndarray): The vector data.
RETURNS (Vectors): The newly created object.
"""
if isinstance(strings, StringStore): if isinstance(strings, StringStore):
self.strings = strings self.strings = strings
else: else:
@ -55,11 +63,13 @@ cdef class Vectors:
return (Vectors, (self.strings, self.data)) return (Vectors, (self.strings, self.data))
def __getitem__(self, key): def __getitem__(self, key):
'''Get a vector by key. If key is a string, it is hashed """Get a vector by key. If key is a string, it is hashed to an integer
to an integer ID using the vectors.strings table. ID using the vectors.strings table. If the integer key is not found in
the table, a KeyError is raised.
If the integer key is not found in the table, a KeyError is raised. key (unicode / int): The key to get the vector for.
''' RETURNS (numpy.ndarray): The vector for the key.
"""
if isinstance(key, basestring): if isinstance(key, basestring):
key = self.strings[key] key = self.strings[key]
i = self.key2row[key] i = self.key2row[key]
@ -69,30 +79,47 @@ cdef class Vectors:
return self.data[i] return self.data[i]
def __setitem__(self, key, vector): def __setitem__(self, key, vector):
'''Set a vector for the given key. If key is a string, it is hashed """Set a vector for the given key. If key is a string, it is hashed
to an integer ID using the vectors.strings table. to an integer ID using the vectors.strings table.
'''
key (unicode / int): The key to set the vector for.
vector (numpy.ndarray): The vector to set.
"""
if isinstance(key, basestring): if isinstance(key, basestring):
key = self.strings.add(key) key = self.strings.add(key)
i = self.key2row[key] i = self.key2row[key]
self.data[i] = vector self.data[i] = vector
def __iter__(self): def __iter__(self):
'''Yield vectors from the table.''' """Yield vectors from the table.
YIELDS (numpy.ndarray): A vector.
"""
yield from self.data yield from self.data
def __len__(self): def __len__(self):
'''Return the number of vectors that have been assigned.''' """Return the number of vectors that have been assigned.
RETURNS (int): The number of vectors in the data.
"""
return self.i return self.i
def __contains__(self, key): def __contains__(self, key):
'''Check whether a key has a vector entry in the table.''' """Check whether a key has a vector entry in the table.
key (unicode / int): The key to check.
RETURNS (bool): Whether the key has a vector entry.
"""
if isinstance(key, basestring_): if isinstance(key, basestring_):
key = self.strings[key] key = self.strings[key]
return key in self.key2row return key in self.key2row
def add(self, key, vector=None): def add(self, key, vector=None):
'''Add a key to the table, optionally setting a vector value as well.''' """Add a key to the table, optionally setting a vector value as well.
key (unicode / int): The key to add.
vector (numpy.ndarray): An optional vector to add.
"""
if isinstance(key, basestring_): if isinstance(key, basestring_):
key = self.strings.add(key) key = self.strings.add(key)
if key not in self.key2row: if key not in self.key2row:
@ -110,24 +137,36 @@ cdef class Vectors:
return i return i
def items(self): def items(self):
'''Iterate over (string key, vector) pairs, in order.''' """Iterate over `(string key, vector)` pairs, in order.
YIELDS (tuple): A key/vector pair.
"""
for i, key in enumerate(self.keys): for i, key in enumerate(self.keys):
string = self.strings[key] string = self.strings[key]
yield string, self.data[i] yield string, self.data[i]
@property @property
def shape(self): def shape(self):
"""Get a `(rows, dims)` tuple of the number of rows and number of dimensions
in the vector table.
RETURNS (tuple): A `(rows, dims)` pair.
"""
return self.data.shape return self.data.shape
def most_similar(self, key): def most_similar(self, key):
# TODO: implement
raise NotImplementedError raise NotImplementedError
def from_glove(self, path): def from_glove(self, path):
'''Load GloVe vectors from a directory. Assumes binary format, """Load GloVe vectors from a directory. Assumes binary format,
that the vocab is in a vocab.txt, and that vectors are named that the vocab is in a vocab.txt, and that vectors are named
vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32 vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc. vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
By default GloVe outputs 64-bit vectors.''' By default GloVe outputs 64-bit vectors.
path (unicode / Path): The path to load the GloVe vectors from.
"""
path = util.ensure_path(path) path = util.ensure_path(path)
for name in path.iterdir(): for name in path.iterdir():
if name.parts[-1].startswith('vectors'): if name.parts[-1].startswith('vectors'):
@ -150,9 +189,15 @@ cdef class Vectors:
self.data self.data
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
"""Save the current state to a directory.
path (unicode / Path): A path to a directory, which will be created if
it doesn't exist. Either a string or a Path-like object.
"""
xp = get_array_module(self.data) xp = get_array_module(self.data)
if xp is numpy: if xp is numpy:
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) save_array = lambda arr, file_: xp.save(file_, arr,
allow_pickle=False)
else: else:
save_array = lambda arr, file_: xp.save(file_, arr) save_array = lambda arr, file_: xp.save(file_, arr)
serializers = OrderedDict(( serializers = OrderedDict((
@ -162,6 +207,12 @@ cdef class Vectors:
return util.to_disk(path, serializers, exclude) return util.to_disk(path, serializers, exclude)
def from_disk(self, path, **exclude): def from_disk(self, path, **exclude):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode / Path): Directory path, string or Path-like object.
RETURNS (Vectors): The modified object.
"""
def load_keys(path): def load_keys(path):
if path.exists(): if path.exists():
self.keys = numpy.load(path2str(path)) self.keys = numpy.load(path2str(path))
@ -182,6 +233,11 @@ cdef class Vectors:
return self return self
def to_bytes(self, **exclude): def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Vectors` object.
"""
def serialize_weights(): def serialize_weights():
if hasattr(self.data, 'to_bytes'): if hasattr(self.data, 'to_bytes'):
return self.data.to_bytes() return self.data.to_bytes()
@ -194,6 +250,12 @@ cdef class Vectors:
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes(self, data, **exclude): def from_bytes(self, data, **exclude):
"""Load state from a binary string.
data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (Vectors): The `Vectors` object.
"""
def deserialize_weights(b): def deserialize_weights(b):
if hasattr(self.data, 'from_bytes'): if hasattr(self.data, 'from_bytes'):
self.data.from_bytes() self.data.from_bytes()

View File

@ -1,33 +1,24 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import bz2
import ujson
import re
import numpy import numpy
import dill import dill
from libc.string cimport memset, memcpy
from libc.stdint cimport int32_t
from libc.math cimport sqrt
from cymem.cymem cimport Address
from collections import OrderedDict from collections import OrderedDict
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .strings cimport hash_string from .strings cimport hash_string
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .tokens.token cimport Token from .tokens.token cimport Token
from .attrs cimport PROB, LANG from .attrs cimport PROB, LANG, ORTH, TAG
from .structs cimport SerializedLexemeC from .structs cimport SerializedLexemeC
from .compat import copy_reg, pickle, basestring_ from .compat import copy_reg, basestring_
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .attrs import intify_attrs from .attrs import intify_attrs
from .vectors import Vectors from .vectors import Vectors
from . import util
from . import attrs
from . import symbols
from ._ml import link_vectors_to_models from ._ml import link_vectors_to_models
from . import util
cdef class Vocab: cdef class Vocab:
@ -39,20 +30,19 @@ cdef class Vocab:
strings=tuple(), **deprecated_kwargs): strings=tuple(), **deprecated_kwargs):
"""Create the vocabulary. """Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to functions lex_attr_getters (dict): A dictionary mapping attribute IDs to
to compute them. Defaults to `None`. functions to compute them. Defaults to `None`.
tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained
parts-of-speech, and optionally morphological attributes. parts-of-speech, and optionally morphological attributes.
lemmatizer (object): A lemmatizer. Defaults to `None`. lemmatizer (object): A lemmatizer. Defaults to `None`.
strings (StringStore): StringStore that maps strings to integers, and strings (StringStore): StringStore that maps strings to integers, and
vice versa. vice versa.
RETURNS (Vocab): The newly constructed vocab object. RETURNS (Vocab): The newly constructed object.
""" """
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
tag_map = tag_map if tag_map is not None else {} tag_map = tag_map if tag_map is not None else {}
if lemmatizer in (None, True, False): if lemmatizer in (None, True, False):
lemmatizer = Lemmatizer({}, {}, {}) lemmatizer = Lemmatizer({}, {}, {})
self.mem = Pool() self.mem = Pool()
self._by_hash = PreshMap() self._by_hash = PreshMap()
self._by_orth = PreshMap() self._by_orth = PreshMap()
@ -84,19 +74,20 @@ cdef class Vocab:
The flag_getter function will be called over the words currently in the The flag_getter function will be called over the words currently in the
vocab, and then applied to new words as they occur. You'll then be able vocab, and then applied to new words as they occur. You'll then be able
to access the flag value on each token, using token.check_flag(flag_id). to access the flag value on each token using token.check_flag(flag_id).
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`, See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
`Token.check_flag`. `Token.check_flag`.
flag_getter (callable): A function `f(unicode) -> bool`, to get the flag flag_getter (callable): A function `f(unicode) -> bool`, to get the
value. flag value.
flag_id (int): An integer between 1 and 63 (inclusive), specifying flag_id (int): An integer between 1 and 63 (inclusive), specifying
the bit at which the flag will be stored. If -1, the lowest the bit at which the flag will be stored. If -1, the lowest
available bit will be chosen. available bit will be chosen.
RETURNS (int): The integer ID by which the flag value can be checked. RETURNS (int): The integer ID by which the flag value can be checked.
EXAMPLE: EXAMPLE:
>>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'dislaCy']) >>> my_product_getter = lambda text: text in ['spaCy', 'dislaCy']
>>> MY_PRODUCT = nlp.vocab.add_flag(my_product_getter)
>>> doc = nlp(u'I like spaCy') >>> doc = nlp(u'I like spaCy')
>>> assert doc[2].check_flag(MY_PRODUCT) == True >>> assert doc[2].check_flag(MY_PRODUCT) == True
""" """
@ -107,9 +98,10 @@ cdef class Vocab:
break break
else: else:
raise ValueError( raise ValueError(
"Cannot find empty bit for new lexical flag. All bits between " "Cannot find empty bit for new lexical flag. All bits "
"0 and 63 are occupied. You can replace one by specifying the " "between 0 and 63 are occupied. You can replace one by "
"flag_id explicitly, e.g. nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA") "specifying the flag_id explicitly, e.g. "
"`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
elif flag_id >= 64 or flag_id < 1: elif flag_id >= 64 or flag_id < 1:
raise ValueError( raise ValueError(
"Invalid value for flag_id: %d. Flag IDs must be between " "Invalid value for flag_id: %d. Flag IDs must be between "
@ -120,9 +112,9 @@ cdef class Vocab:
return flag_id return flag_id
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme` """Get a pointer to a `LexemeC` from the lexicon, creating a new
if necessary, using memory acquired from the given pool. If the pool `Lexeme` if necessary using memory acquired from the given pool. If the
is the lexicon's own memory, the lexeme is saved in the lexicon. pool is the lexicon's own memory, the lexeme is saved in the lexicon.
""" """
if string == u'': if string == u'':
return &EMPTY_LEXEME return &EMPTY_LEXEME
@ -139,9 +131,9 @@ cdef class Vocab:
return self._new_lexeme(mem, string) return self._new_lexeme(mem, string)
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme` """Get a pointer to a `LexemeC` from the lexicon, creating a new
if necessary, using memory acquired from the given pool. If the pool `Lexeme` if necessary using memory acquired from the given pool. If the
is the lexicon's own memory, the lexeme is saved in the lexicon. pool is the lexicon's own memory, the lexeme is saved in the lexicon.
""" """
if orth == 0: if orth == 0:
return &EMPTY_LEXEME return &EMPTY_LEXEME
@ -229,13 +221,14 @@ cdef class Vocab:
cdef int i cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings): for i, props in enumerate(substrings):
props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True) props = intify_attrs(props, strings_map=self.strings,
_do_deprecated=True)
token = &tokens[i] token = &tokens[i]
# Set the special tokens up to have arbitrary attributes # Set the special tokens up to have arbitrary attributes
lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH]) lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
token.lex = lex token.lex = lex
if attrs.TAG in props: if TAG in props:
self.morphology.assign_tag(token, props[attrs.TAG]) self.morphology.assign_tag(token, props[TAG])
for attr_id, value in props.items(): for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value) Token.set_struct_attr(token, attr_id, value)
Lexeme.set_struct_attr(lex, attr_id, value) Lexeme.set_struct_attr(lex, attr_id, value)
@ -254,16 +247,13 @@ cdef class Vocab:
self.vectors = Vectors(self.strings, width=new_dim) self.vectors = Vectors(self.strings, width=new_dim)
def get_vector(self, orth): def get_vector(self, orth):
"""Retrieve a vector for a word in the vocabulary. """Retrieve a vector for a word in the vocabulary. Words can be looked
up by string or int ID. If no vectors data is loaded, ValueError is
raised.
Words can be looked up by string or int ID. RETURNS (numpy.ndarray): A word vector. Size
and shape determined by the `vocab.vectors` instance. Usually, a
RETURNS: numpy ndarray of shape (300,) and dtype float32.
A word vector. Size and shape determined by the
vocab.vectors instance. Usually, a numpy ndarray
of shape (300,) and dtype float32.
RAISES: If no vectors data is loaded, ValueError is raised.
""" """
if isinstance(orth, basestring_): if isinstance(orth, basestring_):
orth = self.strings.add(orth) orth = self.strings.add(orth)
@ -273,21 +263,16 @@ cdef class Vocab:
return numpy.zeros((self.vectors_length,), dtype='f') return numpy.zeros((self.vectors_length,), dtype='f')
def set_vector(self, orth, vector): def set_vector(self, orth, vector):
"""Set a vector for a word in the vocabulary. """Set a vector for a word in the vocabulary. Words can be referenced
by string or int ID.
Words can be referenced by string or int ID.
RETURNS:
None
""" """
if not isinstance(orth, basestring_): if not isinstance(orth, basestring_):
orth = self.strings[orth] orth = self.strings[orth]
self.vectors.add(orth, vector=vector) self.vectors.add(orth, vector=vector)
def has_vector(self, orth): def has_vector(self, orth):
"""Check whether a word has a vector. Returns False if no """Check whether a word has a vector. Returns False if no vectors have
vectors have been loaded. Words can be looked up by string been loaded. Words can be looked up by string or int ID."""
or int ID."""
if isinstance(orth, basestring_): if isinstance(orth, basestring_):
orth = self.strings.add(orth) orth = self.strings.add(orth)
return orth in self.vectors return orth in self.vectors
@ -296,7 +281,7 @@ cdef class Vocab:
"""Save the current state to a directory. """Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects. it doesn't exist. Paths may be either strings or Path-like objects.
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
if not path.exists(): if not path.exists():
@ -421,12 +406,9 @@ def pickle_vocab(vocab):
length = vocab.length length = vocab.length
data_dir = vocab.data_dir data_dir = vocab.data_dir
lex_attr_getters = dill.dumps(vocab.lex_attr_getters) lex_attr_getters = dill.dumps(vocab.lex_attr_getters)
lexemes_data = vocab.lexemes_to_bytes() lexemes_data = vocab.lexemes_to_bytes()
return (unpickle_vocab, return (unpickle_vocab,
(sstore, morph, data_dir, lex_attr_getters, (sstore, morph, data_dir, lex_attr_getters, lexemes_data, length))
lexemes_data, length))
def unpickle_vocab(sstore, morphology, data_dir, def unpickle_vocab(sstore, morphology, data_dir,
@ -450,12 +432,10 @@ class LookupError(Exception):
@classmethod @classmethod
def mismatched_strings(cls, id_, id_string, original_string): def mismatched_strings(cls, id_, id_string, original_string):
return cls( return cls(
"Error fetching a Lexeme from the Vocab. When looking up a string, " "Error fetching a Lexeme from the Vocab. When looking up a "
"the lexeme returned had an orth ID that did not match the query string. " "string, the lexeme returned had an orth ID that did not match "
"This means that the cached lexeme structs are mismatched to the " "the query string. This means that the cached lexeme structs are "
"string encoding table. The mismatched:\n" "mismatched to the string encoding table. The mismatched:\n"
"Query string: {query}\n" "Query string: {}\n"
"Orth cached: {orth_str}\n" "Orth cached: {}\n"
"ID of orth: {orth_id}".format( "Orth ID: {}".format(repr(original_string), repr(id_string), id_))
query=repr(original_string), orth_str=repr(id_string), orth_id=id_)
)

View File

@ -784,3 +784,10 @@ p
+cell +cell
| A dictionary that allows customisation of properties of | A dictionary that allows customisation of properties of
| #[code Span] children. | #[code Span] children.
+row
+cell #[code _]
+cell #[code Underscore]
+cell
| User space for adding custom
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].

View File

@ -157,27 +157,61 @@ p The L2 norm of the lexeme's vector representation.
+row +row
+cell #[code vocab] +cell #[code vocab]
+cell #[code Vocab] +cell #[code Vocab]
+cell +cell The lexeme's vocabulary.
+row +row
+cell #[code text] +cell #[code text]
+cell unicode +cell unicode
+cell Verbatim text content. +cell Verbatim text content.
+row
+cell #[code orth]
+cell int
+cell ID of the verbatim text content.
+row
+cell #[code orth_]
+cell unicode
+cell
| Verbatim text content (identical to #[code Lexeme.text]). Exists
| mostly for consistency with the other attributes.
+row +row
+cell #[code lex_id] +cell #[code lex_id]
+cell int +cell int
+cell ID of the lexeme's lexical type. +cell ID of the lexeme's lexical type.
+row
+cell #[code rank]
+cell int
+cell
| Sequential ID of the lexeme's lexical type, used to index into
| tables, e.g. for word vectors.
+row
+cell #[code flags]
+cell int
+cell Container of the lexeme's binary flags.
+row
+cell #[code norm]
+cell int
+cell The lexeme's norm, i.e. a normalised form of the lexeme text.
+row
+cell #[code norm_]
+cell unicode
+cell The lexeme's norm, i.e. a normalised form of the lexeme text.
+row +row
+cell #[code lower] +cell #[code lower]
+cell int +cell int
+cell Lower-case form of the word. +cell Lowercase form of the word.
+row +row
+cell #[code lower_] +cell #[code lower_]
+cell unicode +cell unicode
+cell Lower-case form of the word. +cell Lowercase form of the word.
+row +row
+cell #[code shape] +cell #[code shape]
@ -192,22 +226,30 @@ p The L2 norm of the lexeme's vector representation.
+row +row
+cell #[code prefix] +cell #[code prefix]
+cell int +cell int
+cell Length-N substring from the start of the word. Defaults to #[code N=1]. +cell
| Length-N substring from the start of the word. Defaults to
| #[code N=1].
+row +row
+cell #[code prefix_] +cell #[code prefix_]
+cell unicode +cell unicode
+cell Length-N substring from the start of the word. Defaults to #[code N=1]. +cell
| Length-N substring from the start of the word. Defaults to
| #[code N=1].
+row +row
+cell #[code suffix] +cell #[code suffix]
+cell int +cell int
+cell Length-N substring from the end of the word. Defaults to #[code N=3]. +cell
| Length-N substring from the end of the word. Defaults to
| #[code N=3].
+row +row
+cell #[code suffix_] +cell #[code suffix_]
+cell unicode +cell unicode
+cell Length-N substring from the start of the word. Defaults to #[code N=3]. +cell
| Length-N substring from the end of the word. Defaults to
| #[code N=3].
+row +row
+cell #[code is_alpha] +cell #[code is_alpha]
@ -237,6 +279,13 @@ p The L2 norm of the lexeme's vector representation.
| Is the lexeme in lowercase? Equivalent to | Is the lexeme in lowercase? Equivalent to
| #[code lexeme.text.islower()]. | #[code lexeme.text.islower()].
+row
+cell #[code is_upper]
+cell bool
+cell
| Is the lexeme in uppercase? Equivalent to
| #[code lexeme.text.isupper()].
+row +row
+cell #[code is_title] +cell #[code is_title]
+cell bool +cell bool
@ -249,6 +298,16 @@ p The L2 norm of the lexeme's vector representation.
+cell bool +cell bool
+cell Is the lexeme punctuation? +cell Is the lexeme punctuation?
+row
+cell #[code is_left_punct]
+cell bool
+cell Is the lexeme a left punctuation mark, e.g. #[code (]?
+row
+cell #[code is_right_punct]
+cell bool
+cell Is the lexeme a right punctuation mark, e.g. #[code )]?
+row +row
+cell #[code is_space] +cell #[code is_space]
+cell bool +cell bool
@ -256,6 +315,16 @@ p The L2 norm of the lexeme's vector representation.
| Does the lexeme consist of whitespace characters? Equivalent to | Does the lexeme consist of whitespace characters? Equivalent to
| #[code lexeme.text.isspace()]. | #[code lexeme.text.isspace()].
+row
+cell #[code is_bracket]
+cell bool
+cell Is the lexeme a bracket?
+row
+cell #[code is_quote]
+cell bool
+cell Is the lexeme a quotation mark?
+row +row
+cell #[code like_url] +cell #[code like_url]
+cell bool +cell bool
@ -285,6 +354,7 @@ p The L2 norm of the lexeme's vector representation.
+cell #[code lang] +cell #[code lang]
+cell int +cell int
+cell Language of the parent vocabulary. +cell Language of the parent vocabulary.
+row +row
+cell #[code lang_] +cell #[code lang_]
+cell unicode +cell unicode
@ -293,9 +363,16 @@ p The L2 norm of the lexeme's vector representation.
+row +row
+cell #[code prob] +cell #[code prob]
+cell float +cell float
+cell Smoothed log probability estimate of lexeme's type. +cell Smoothed log probability estimate of the lexeme's type.
+row
+cell #[code cluster]
+cell int
+cell Brown cluster ID.
+row +row
+cell #[code sentiment] +cell #[code sentiment]
+cell float +cell float
+cell A scalar value indicating the positivity or negativity of the lexeme. +cell
| A scalar value indicating the positivity or negativity of the
| lexeme.

View File

@ -248,6 +248,28 @@ p
+cell float +cell float
+cell A scalar similarity score. Higher is more similar. +cell A scalar similarity score. Higher is more similar.
+h(2, "get_lca_matrix") Span.get_lca_matrix
+tag method
p
| Calculates the lowest common ancestor matrix for a given #[code Span].
| Returns LCA matrix containing the integer index of the ancestor, or
| #[code -1] if no common ancestor is found, e.g. if span excludes a
| necessary ancestor.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn')
span = doc[1:4]
matrix = span.get_lca_matrix()
# array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
+cell The lowest common ancestor matrix of the #[code Span].
+h(2, "to_array") Span.to_array +h(2, "to_array") Span.to_array
+tag method +tag method
+tag-new(2) +tag-new(2)
@ -347,7 +369,7 @@ p
+tag property +tag property
+tag-model("parse") +tag-model("parse")
p Tokens that are to the left of the span, whose head is within the span. p Tokens that are to the left of the span, whose heads are within the span.
+aside-code("Example"). +aside-code("Example").
doc = nlp(u'I like New York in Autumn.') doc = nlp(u'I like New York in Autumn.')
@ -364,7 +386,7 @@ p Tokens that are to the left of the span, whose head is within the span.
+tag property +tag property
+tag-model("parse") +tag-model("parse")
p Tokens that are to the right of the span, whose head is within the span. p Tokens that are to the right of the span, whose heads are within the span.
+aside-code("Example"). +aside-code("Example").
doc = nlp(u'I like New York in Autumn.') doc = nlp(u'I like New York in Autumn.')
@ -377,6 +399,42 @@ p Tokens that are to the right of the span, whose head is within the span.
+cell #[code Token] +cell #[code Token]
+cell A right-child of a token of the span. +cell A right-child of a token of the span.
+h(2, "n_lefts") Span.n_lefts
+tag property
+tag-model("parse")
p
| The number of tokens that are to the left of the span, whose heads are
| within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
assert doc[3:7].n_lefts == 1
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of left-child tokens.
+h(2, "n_rights") Span.n_rights
+tag property
+tag-model("parse")
p
| The number of tokens that are to the right of the span, whose heads are
| within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
assert doc[2:4].n_rights == 1
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of right-child tokens.
+h(2, "subtree") Span.subtree +h(2, "subtree") Span.subtree
+tag property +tag property
+tag-model("parse") +tag-model("parse")
@ -495,6 +553,18 @@ p
| The text content of the span with a trailing whitespace character | The text content of the span with a trailing whitespace character
| if the last token has one. | if the last token has one.
+row
+cell #[code orth]
+cell int
+cell ID of the verbatim text content.
+row
+cell #[code orth_]
+cell unicode
+cell
| Verbatim text content (identical to #[code Span.text]). Exists
| mostly for consistency with the other attributes.
+row +row
+cell #[code label] +cell #[code label]
+cell int +cell int
@ -519,3 +589,17 @@ p
+cell #[code ent_id_] +cell #[code ent_id_]
+cell unicode +cell unicode
+cell The string ID of the named entity the token is an instance of. +cell The string ID of the named entity the token is an instance of.
+row
+cell #[code sentiment]
+cell float
+cell
| A scalar value indicating the positivity or negativity of the
| span.
+row
+cell #[code _]
+cell #[code Underscore]
+cell
| User space for adding custom
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].

View File

@ -302,6 +302,80 @@ p A sequence of the token's immediate syntactic children.
+cell #[code Token] +cell #[code Token]
+cell A child token such that #[code child.head==self]. +cell A child token such that #[code child.head==self].
+h(2, "lefts") Token.lefts
+tag property
+tag-model("parse")
p
| The leftward immediate children of the word, in the syntactic dependency
| parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
lefts = [t.text for t in doc[3].lefts]
assert lefts == [u'New']
+table(["Name", "Type", "Description"])
+row("foot")
+cell yields
+cell #[code Token]
+cell A left-child of the token.
+h(2, "rights") Token.rights
+tag property
+tag-model("parse")
p
| The rightward immediate children of the word, in the syntactic
| dependency parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
rights = [t.text for t in doc[3].rights]
assert rights == [u'in']
+table(["Name", "Type", "Description"])
+row("foot")
+cell yields
+cell #[code Token]
+cell A right-child of the token.
+h(2, "n_lefts") Token.n_lefts
+tag property
+tag-model("parse")
p
| The number of leftward immediate children of the word, in the syntactic
| dependency parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
assert doc[3].n_lefts == 1
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of left-child tokens.
+h(2, "n_rights") Token.n_rights
+tag property
+tag-model("parse")
p
| The number of rightward immediate children of the word, in the syntactic
| dependency parse.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
assert doc[3].n_rights == 1
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
+cell The number of right-child tokens.
+h(2, "subtree") Token.subtree +h(2, "subtree") Token.subtree
+tag property +tag property
+tag-model("parse") +tag-model("parse")
@ -489,15 +563,35 @@ p The L2 norm of the token's vector representation.
+cell unicode +cell unicode
+cell Base form of the token, with no inflectional suffixes. +cell Base form of the token, with no inflectional suffixes.
+row
+cell #[code norm]
+cell int
+cell
| The token's norm, i.e. a normalised form of the token text.
| Usually set in the language's
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+row
+cell #[code norm_]
+cell unicode
+cell
| The token's norm, i.e. a normalised form of the token text.
| Usually set in the language's
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
+row +row
+cell #[code lower] +cell #[code lower]
+cell int +cell int
+cell Lower-case form of the token. +cell Lowercase form of the token.
+row +row
+cell #[code lower_] +cell #[code lower_]
+cell unicode +cell unicode
+cell Lower-case form of the token. +cell
| Lowercase form of the token text. Equivalent to
| #[code Token.text.lower()].
+row +row
+cell #[code shape] +cell #[code shape]
@ -537,7 +631,9 @@ p The L2 norm of the token's vector representation.
+row +row
+cell #[code suffix_] +cell #[code suffix_]
+cell unicode +cell unicode
+cell Length-N substring from the end of the token. Defaults to #[code N=3]. +cell
| Length-N substring from the end of the token. Defaults to
| #[code N=3].
+row +row
+cell #[code is_alpha] +cell #[code is_alpha]
@ -672,6 +768,7 @@ p The L2 norm of the token's vector representation.
+cell #[code lang] +cell #[code lang]
+cell int +cell int
+cell Language of the parent document's vocabulary. +cell Language of the parent document's vocabulary.
+row +row
+cell #[code lang_] +cell #[code lang_]
+cell unicode +cell unicode
@ -690,9 +787,30 @@ p The L2 norm of the token's vector representation.
+row +row
+cell #[code sentiment] +cell #[code sentiment]
+cell float +cell float
+cell A scalar value indicating the positivity or negativity of the token. +cell
| A scalar value indicating the positivity or negativity of the
| token.
+row +row
+cell #[code lex_id] +cell #[code lex_id]
+cell int +cell int
+cell ID of the token's lexical type. +cell Sequential ID of the token's lexical type.
+row
+cell #[code rank]
+cell int
+cell
| Sequential ID of the token's lexical type, used to index into
| tables, e.g. for word vectors.
+row
+cell #[code cluster]
+cell int
+cell Brown cluster ID.
+row
+cell #[code _]
+cell #[code Underscore]
+cell
| User space for adding custom
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].

View File

@ -36,12 +36,14 @@ p
| that maps strings to hash values, and vice versa. | that maps strings to hash values, and vice versa.
+row +row
+cell #[code data] +cell #[code width]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell int
+cell Number of dimensions.
+row +row
+cell #[code width] +cell #[code data]
+cell Number of dimensions. +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell The vector data.
+row("foot") +row("foot")
+cell returns +cell returns
@ -208,7 +210,7 @@ p
+row("foot") +row("foot")
+cell returns +cell returns
+cell tuple +cell tuple
+cell #[code (rows, dims)] pairs. +cell A #[code (rows, dims)] pair.
+h(2, "from_glove") Vectors.from_glove +h(2, "from_glove") Vectors.from_glove
+tag method +tag method
@ -238,11 +240,16 @@ p Save the current state to a directory.
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code path] +cell #[code path]
+cell unicode or #[code Path] +cell unicode / #[code Path]
+cell +cell
| A path to a directory, which will be created if it doesn't exist. | A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects. | Paths may be either strings or #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being saved.
+h(2, "from_disk") Vectors.from_disk +h(2, "from_disk") Vectors.from_disk
+tag method +tag method
@ -255,7 +262,7 @@ p Loads state from a directory. Modifies the object in place and returns it.
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code path] +cell #[code path]
+cell unicode or #[code Path] +cell unicode / #[code Path]
+cell +cell
| A path to a directory. Paths may be either strings or | A path to a directory. Paths may be either strings or
| #[code Path]-like objects. | #[code Path]-like objects.
@ -297,7 +304,7 @@ p Load state from a binary string.
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code bytes_data] +cell #[code data]
+cell bytes +cell bytes
+cell The data to load from. +cell The data to load from.

View File

@ -111,11 +111,13 @@ p
p p
| A few more convenience attributes are provided for iterating around the | A few more convenience attributes are provided for iterating around the
| local tree from the token. The #[code .lefts] and #[code .rights] | local tree from the token. The #[+api("token#lefts") #[code Token.lefts]]
| attributes provide sequences of syntactic children that occur before and | and #[+api("token#rights") #[code Token.rights]] attributes provide
| after the token. Both sequences are in sentences order. There are also | sequences of syntactic children that occur before and after the token.
| two integer-typed attributes, #[code .n_rights] and #[code .n_lefts], | Both sequences are in sentence order. There are also two integer-typed
| that give the number of left and right children. | attributes, #[+api("token#n_rights") #[code Token.n_rights]] and
| #[+api("token#n_lefts") #[code Token.n_lefts]], that give the number of
| left and right children.
+code. +code.
doc = nlp(u'bright red apples on the tree') doc = nlp(u'bright red apples on the tree')
@ -126,10 +128,11 @@ p
p p
| You can get a whole phrase by its syntactic head using the | You can get a whole phrase by its syntactic head using the
| #[code .subtree] attribute. This returns an ordered sequence of tokens. | #[+api("token#subtree") #[code Token.subtree]] attribute. This returns an
| You can walk up the tree with the #[code .ancestors] attribute, and | ordered sequence of tokens. You can walk up the tree with the
| check dominance with the #[+api("token#is_ancestor") #[code .is_ancestor()]] | #[+api("token#ancestors") #[code Token.ancestors]] attribute, and
| method. | check dominance with
| #[+api("token#is_ancestor") #[code Token.is_ancestor()]].
+aside("Projective vs. non-projective") +aside("Projective vs. non-projective")
| For the #[+a("/models/en") default English model], the | For the #[+a("/models/en") default English model], the