mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Merge pull request #1468 from explosion/feature/tidy-up
💫 Tidy up v2.0 code base
This commit is contained in:
commit
4b78c1762b
|
@ -1,93 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import joblib
|
|
||||||
from os import path
|
|
||||||
import os
|
|
||||||
import bz2
|
|
||||||
import ujson
|
|
||||||
from preshed.counter import PreshCounter
|
|
||||||
from joblib import Parallel, delayed
|
|
||||||
import io
|
|
||||||
|
|
||||||
from spacy.en import English
|
|
||||||
from spacy.strings import StringStore
|
|
||||||
from spacy.attrs import ORTH
|
|
||||||
from spacy.tokenizer import Tokenizer
|
|
||||||
from spacy.vocab import Vocab
|
|
||||||
|
|
||||||
|
|
||||||
def iter_comments(loc):
    """Yield one parsed JSON object per line of the bz2 file at `loc`."""
    with bz2.BZ2File(loc) as archive:
        for raw_line in archive:
            yield ujson.loads(raw_line)
|
|
||||||
|
|
||||||
|
|
||||||
def count_freqs(input_loc, output_loc):
    """Count token frequencies for one comment dump and write them to disk.

    Every record yielded by `iter_comments(input_loc)` has its 'body' field
    tokenized; the ORTH counts are accumulated and written to `output_loc`
    as '<freq>\\t<string>' lines. Whitespace-only strings are not written.
    """
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))

    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)

    # The third positional parameter of io.open is `buffering`, so the
    # encoding has to be passed by keyword.
    with io.open(output_loc, 'w', encoding='utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
|
|
||||||
|
|
||||||
|
|
||||||
def parallelize(func, iterator, n_jobs):
    """Run `func(*item)` for each item of `iterator` through joblib.

    The results of the calls are discarded.
    """
    jobs = (delayed(func)(*item) for item in iterator)
    Parallel(n_jobs=n_jobs)(jobs)
|
|
||||||
|
|
||||||
|
|
||||||
def merge_counts(locs, out_loc):
    """Sum the per-file frequency tables in `locs` into one table at `out_loc`.

    Each input line is '<freq>\\t<word>'; the output uses the same layout.
    """
    strings = StringStore()
    totals = PreshCounter()
    for freq_loc in locs:
        with io.open(freq_loc, 'r', encoding='utf8') as freq_file:
            for row in freq_file:
                freq, word = row.strip().split('\t', 1)
                key = strings[word]
                totals.inc(key, int(freq))
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        for key, total in totals:
            word = strings[key]
            out_file.write('%d\t%s\n' % (total, word))
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    input_loc=("Location of input file list"),
    freqs_dir=("Directory for frequency files"),
    output_loc=("Location for output file"),
    n_jobs=("Number of workers", "option", "n", int),
    skip_existing=("Skip inputs where an output file exists", "flag", "s", bool),
)
def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
    """Count frequencies for every listed input, then merge the results.

    `input_loc` is a text file with one input path per line (blank lines are
    ignored). Each input gets a frequency file inside `freqs_dir`; all of
    them are finally merged into `output_loc`. With `skip_existing`, inputs
    whose frequency file already exists are not recounted but are still
    merged.
    """
    tasks = []
    outputs = []
    # Close the list file deterministically instead of leaking the handle.
    with open(input_loc) as input_list:
        for input_path in input_list:
            input_path = input_path.strip()
            if not input_path:
                continue
            filename = input_path.split('/')[-1]
            # Swap only the trailing extension; str.replace would also
            # rewrite any 'bz2' that occurs earlier in the file name.
            if filename.endswith('bz2'):
                filename = filename[:-len('bz2')] + 'freq'
            output_path = path.join(freqs_dir, filename)
            outputs.append(output_path)
            if not path.exists(output_path) or not skip_existing:
                tasks.append((input_path, output_path))

    if tasks:
        parallelize(count_freqs, tasks, n_jobs)

    print("Merge")
    merge_counts(outputs, output_loc)


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -1,89 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from xml.etree import cElementTree as ElementTree
|
|
||||||
import json
|
|
||||||
import re
|
|
||||||
|
|
||||||
import plac
|
|
||||||
from pathlib import Path
|
|
||||||
from os import path
|
|
||||||
|
|
||||||
|
|
||||||
# Bracket escape codes mapped to the literal bracket characters.
escaped_tokens = dict([
    ('-LRB-', '('),
    ('-RRB-', ')'),
    ('-LSB-', '['),
    ('-RSB-', ']'),
    ('-LCB-', '{'),
    ('-RCB-', '}'),
])
|
|
||||||
|
|
||||||
def read_parses(parse_loc):
    """Read the '.dep' file next to `parse_loc` as a list of sentence parses.

    Sentences are separated by blank lines. Token ids and head ids of each
    sentence are shifted by the number of token lines that precede it, so
    ids are unique across the returned list. An empty file yields [].
    """
    # Use a context manager so the handle is closed after reading.
    with open(str(parse_loc) + '.dep') as file_:
        text = file_.read().strip()
    if not text:
        # ''.split('\n\n') would give [''], which cannot be parsed.
        return []
    offset = 0
    doc = []
    for parse in text.split('\n\n'):
        parse = _adjust_token_ids(parse, offset)
        offset += len(parse.split('\n'))
        doc.append(parse)
    return doc
|
|
||||||
|
|
||||||
def _adjust_token_ids(parse, offset):
|
|
||||||
output = []
|
|
||||||
for line in parse.split('\n'):
|
|
||||||
pieces = line.split()
|
|
||||||
pieces[0] = str(int(pieces[0]) + offset)
|
|
||||||
pieces[5] = str(int(pieces[5]) + offset) if pieces[5] != '0' else '0'
|
|
||||||
output.append('\t'.join(pieces))
|
|
||||||
return '\n'.join(output)
|
|
||||||
|
|
||||||
|
|
||||||
def _fmt_doc(filename, paras):
|
|
||||||
return {'id': filename, 'paragraphs': [_fmt_para(*para) for para in paras]}
|
|
||||||
|
|
||||||
|
|
||||||
def _fmt_para(raw, sents):
|
|
||||||
return {'raw': raw, 'sentences': [_fmt_sent(sent) for sent in sents]}
|
|
||||||
|
|
||||||
|
|
||||||
def _fmt_sent(sent):
    """Build the sentence dict from a block of whitespace-separated token lines.

    'brackets' is always an empty list.
    """
    token_lines = sent.strip().split('\n')
    tokens = [_fmt_token(*line.split()) for line in token_lines]
    return {'tokens': tokens, 'brackets': []}
|
|
||||||
|
|
||||||
|
|
||||||
def _fmt_token(id_, word, hyph, pos, ner, head, dep, blank1, blank2, blank3):
|
|
||||||
head = int(head) - 1
|
|
||||||
id_ = int(id_) - 1
|
|
||||||
head = (head - id_) if head != -1 else 0
|
|
||||||
return {'id': id_, 'orth': word, 'tag': pos, 'dep': dep, 'head': head}
|
|
||||||
|
|
||||||
|
|
||||||
# Matches a markup tag; used to strip tags from 'answers' sources.
tags_re = re.compile(r'<[\w\?/][^>]+>')


def main(out_dir, ewtb_dir='/usr/local/data/eng_web_tbk'):
    """Write one '<genre>.json' file per genre directory under `ewtb_dir`/data.

    For each file in '<genre>/source/source_original', the matching parses
    are read from '<genre>/penntree/<name>.xml.tree' (via `read_parses`) and
    paired with the raw source text as a single paragraph. `out_dir` is
    created if it does not exist.
    """
    ewtb_dir = Path(ewtb_dir)
    out_dir = Path(out_dir)
    if not out_dir.exists():
        out_dir.mkdir()
    for genre_dir in ewtb_dir.joinpath('data').iterdir():
        parse_dir = genre_dir.joinpath('penntree')
        docs = []
        for source_loc in genre_dir.joinpath('source').joinpath('source_original').iterdir():
            filename = source_loc.parts[-1].replace('.sgm.sgm', '')
            filename = filename.replace('.xml', '')
            filename = filename.replace('.txt', '')
            parse_loc = parse_dir.joinpath(filename + '.xml.tree')
            parses = read_parses(parse_loc)
            # Close each source file instead of leaking one handle per file.
            with source_loc.open() as source_file:
                source = source_file.read().strip()
            if 'answers' in str(genre_dir):
                source = tags_re.sub('', source).strip()
            docs.append(_fmt_doc(filename, [[source, parses]]))

        out_loc = out_dir.joinpath(genre_dir.parts[-1] + '.json')
        with open(str(out_loc), 'w') as out_file:
            out_file.write(json.dumps(docs, indent=4))


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -1,32 +0,0 @@
|
||||||
import io
|
|
||||||
import plac
|
|
||||||
|
|
||||||
from spacy.en import English
|
|
||||||
|
|
||||||
|
|
||||||
def main(text_loc):
    """Print each paragraph of `text_loc` with entities wrapped in label tags.

    Paragraphs are separated by blank lines. Each entity span is wrapped as
    '<LABEL> ... </LABEL>' and tokens are joined with single spaces.
    """
    with io.open(text_loc, 'r', encoding='utf8') as file_:
        text = file_.read()
    NLU = English()
    for paragraph in text.split('\n\n'):
        tokens = NLU(paragraph)

        ent_starts = {}
        ent_ends = {}
        for span in tokens.ents:
            ent_starts[span.start] = span.label_
            ent_ends[span.end] = span.label_

        output = []
        for token in tokens:
            if token.i in ent_starts:
                output.append('<%s>' % ent_starts[token.i])
            output.append(token.orth_)
            if (token.i+1) in ent_ends:
                output.append('</%s>' % ent_ends[token.i+1])
        output.append('\n\n')
        # Parenthesised single argument: valid on Python 3 and prints the
        # same string on Python 2, unlike the bare print statement.
        print(' '.join(output))


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -1,157 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import os
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import io
|
|
||||||
import random
|
|
||||||
import time
|
|
||||||
import gzip
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import cProfile
|
|
||||||
import pstats
|
|
||||||
|
|
||||||
import spacy.util
|
|
||||||
from spacy.en import English
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
|
|
||||||
from spacy.syntax.util import Config
|
|
||||||
from spacy.syntax.arc_eager import ArcEager
|
|
||||||
from spacy.syntax.parser import Parser
|
|
||||||
from spacy.scorer import Scorer
|
|
||||||
from spacy.tagger import Tagger
|
|
||||||
|
|
||||||
# Last updated for spaCy v0.97
|
|
||||||
|
|
||||||
|
|
||||||
def read_conll(file_):
    """Read a standard CoNLL/MALT-style format

    `file_` is an open text file. Sentences are separated by blank lines and
    each line is parsed by `_parse_line`. Returns a list with one
    `(None, [(annot, [])])` entry per sentence, where `annot` is
    `(ids, words, tags, heads, labels, ner)`; `ner` is all 'O'. A negative
    head index is replaced by the token's own index. Empty input yields [].
    """
    sents = []
    for sent_str in file_.read().strip().split('\n\n'):
        if not sent_str.strip():
            # ''.split('\n\n') gives [''], which has no token lines to parse.
            continue
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(sent_str.split('\n')):
            word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            if head_idx < 0:
                head_idx = i
            ids.append(i)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
        sents.append((None, [(annot, [])]))
    return sents
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_line(line):
|
|
||||||
pieces = line.split()
|
|
||||||
if len(pieces) == 4:
|
|
||||||
word, pos, head_idx, label = pieces
|
|
||||||
head_idx = int(head_idx)
|
|
||||||
elif len(pieces) == 15:
|
|
||||||
id_ = int(pieces[0].split('_')[-1])
|
|
||||||
word = pieces[1]
|
|
||||||
pos = pieces[4]
|
|
||||||
head_idx = int(pieces[8])-1
|
|
||||||
label = pieces[10]
|
|
||||||
else:
|
|
||||||
id_ = int(pieces[0].split('_')[-1])
|
|
||||||
word = pieces[1]
|
|
||||||
pos = pieces[4]
|
|
||||||
head_idx = int(pieces[6])-1
|
|
||||||
label = pieces[7]
|
|
||||||
if head_idx == 0:
|
|
||||||
label = 'ROOT'
|
|
||||||
return word, pos, head_idx, label
|
|
||||||
|
|
||||||
|
|
||||||
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    """Tag and parse the gold tokens, then score them against the gold parse.

    `raw_text` is not used; the words always come from `annot_tuples[1]`.
    """
    words = annot_tuples[1]
    doc = nlp.tokenizer.tokens_from_list(words)
    nlp.tagger(doc)
    nlp.parser(doc)
    gold_parse = GoldParse(doc, annot_tuples, make_projective=False)
    scorer.score(doc, gold_parse, verbose=verbose,
                 punct_labels=('--', 'p', 'punct'))
|
|
||||||
|
|
||||||
|
|
||||||
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, force_gold=False):
    """Train a tagger and parser on `gold_tuples` and save them in `model_dir`.

    The 'deps' and 'pos' sub-directories of `model_dir` are deleted if present
    and recreated. Each iteration first scores the current model on every
    sentence, then updates parser and tagger on it; single-token sentences are
    skipped. `gold_tuples` is shuffled in place after each iteration.
    `gold_preproc` and `force_gold` are accepted but not used here.
    """
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    model_dirs = (dep_model_dir, pos_model_dir)
    for stale_dir in model_dirs:
        if path.exists(stale_dir):
            shutil.rmtree(stale_dir)
    for fresh_dir in model_dirs:
        os.mkdir(fresh_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples))

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, _ in sents:
                words = annot_tuples[1]
                if len(words) == 1:
                    continue

                # Score with the current weights before updating on this sentence.
                score_model(scorer, nlp, None, annot_tuples, verbose=False)

                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples
                    )

                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        row = (itn, loss, scorer.uas, scorer.tags_acc, scorer.token_acc)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f' % row)
    print('end training')
    nlp.end_training(model_dir)
    print('done')
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, n_iter=15, eval_only=False):
    """Train on `train_loc` (unless `eval_only`) and score on `dev_loc`.

    `eval_only` was annotated and read by the body but missing from the
    signature, so every run ended in a NameError; it is now a keyword
    parameter defaulting to False.
    """
    with io.open(train_loc, 'r', encoding='utf8') as file_:
        train_sents = read_conll(file_)
    if not eval_only:
        train(English, train_sents, model_dir, n_iter=n_iter)
    nlp = English(data_dir=model_dir)
    # Read the dev set inside a context manager so the handle is closed.
    with io.open(dev_loc, 'r', encoding='utf8') as file_:
        dev_sents = read_conll(file_)
    scorer = Scorer()
    for _, sents in dev_sents:
        for annot_tuples, _ in sents:
            score_model(scorer, nlp, None, annot_tuples)
    print('TOK', 100-scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -1,187 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import os
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import io
|
|
||||||
import random
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import re
|
|
||||||
|
|
||||||
import spacy.util
|
|
||||||
|
|
||||||
from spacy.syntax.util import Config
|
|
||||||
from spacy.gold import read_json_file
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
from spacy.gold import merge_sents
|
|
||||||
|
|
||||||
from spacy.scorer import Scorer
|
|
||||||
|
|
||||||
from spacy.syntax.arc_eager import ArcEager
|
|
||||||
from spacy.syntax.ner import BiluoPushDown
|
|
||||||
from spacy.tagger import Tagger
|
|
||||||
from spacy.syntax.parser import Parser
|
|
||||||
from spacy.syntax.nonproj import PseudoProjectivity
|
|
||||||
|
|
||||||
|
|
||||||
def _corrupt(c, noise_level):
|
|
||||||
if random.random() >= noise_level:
|
|
||||||
return c
|
|
||||||
elif c == ' ':
|
|
||||||
return '\n'
|
|
||||||
elif c == '\n':
|
|
||||||
return ' '
|
|
||||||
elif c in ['.', "'", "!", "?"]:
|
|
||||||
return ''
|
|
||||||
else:
|
|
||||||
return c.lower()
|
|
||||||
|
|
||||||
|
|
||||||
def add_noise(orig, noise_level):
    """Return `orig`, or a corrupted copy with probability `noise_level`.

    A list is corrupted element by element with `_corrupt`, dropping elements
    that become empty; anything else is treated as a string and corrupted
    character by character.
    """
    if random.random() >= noise_level:
        return orig
    # isinstance also accepts list subclasses, which `type(orig) == list`
    # would have sent down the string branch.
    elif isinstance(orig, list):
        corrupted = [_corrupt(word, noise_level) for word in orig]
        return [w for w in corrupted if w]
    else:
        return ''.join(_corrupt(c, noise_level) for c in orig)
|
|
||||||
|
|
||||||
|
|
||||||
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
    """Run tagger, entity recognizer and parser, then score against gold.

    With `raw_text` None the gold words from `annot_tuples[1]` are used;
    otherwise `raw_text` is tokenized.
    """
    use_gold_tokens = raw_text is None
    if use_gold_tokens:
        doc = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    else:
        doc = nlp.tokenizer(raw_text)
    for component in (nlp.tagger, nlp.entity, nlp.parser):
        component(doc)
    gold_parse = GoldParse(doc, annot_tuples)
    scorer.score(doc, gold_parse, verbose=verbose)
|
|
||||||
|
|
||||||
|
|
||||||
def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
          n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
    """Train through `Language.train` and print dev scores after each epoch.

    `seed`, `n_sents` and `corruption_level` are accepted but not used here.
    """
    print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
    format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
    with Language.train(model_dir, train_data,
                        tagger_cfg, parser_cfg, entity_cfg) as trainer:
        epochs = trainer.epochs(n_iter, gold_preproc=gold_preproc,
                                augment_data=None)
        for itn, epoch in enumerate(epochs):
            for doc, gold in epoch:
                trainer.update(doc, gold)
            dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
            model = trainer.nlp.parser.model
            print(format_str.format(itn, model.nr_weight,
                                    model.nr_active_feat, **dev_scores.scores))
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None, cand_preproc=None):
    """Load the model in `model_dir`, score it on `gold_tuples`, return the Scorer.

    With `gold_preproc` the gold tokens of each sentence are used; otherwise
    the sentences are merged and the raw text is processed by the pipeline.
    `cand_preproc` is accepted but not used here.
    """
    print("Load parser", model_dir)
    nlp = Language(path=model_dir)
    if nlp.lang == 'de':
        nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            raw_text = None
        else:
            sents = merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is not None:
                doc = nlp(raw_text)
            else:
                doc = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(doc)
                nlp.parser(doc)
                nlp.entity(doc)
            gold_parse = GoldParse.from_annot_tuples(doc, annot_tuples)
            scorer.score(doc, gold_parse, verbose=verbose)
    return scorer
|
|
||||||
|
|
||||||
|
|
||||||
def write_parses(Language, dev_loc, model_dir, out_loc):
    """Parse the data in `dev_loc` and write the predictions to `out_loc`.

    One line per non-space token, as
    '<index>\\t<orth>\\t<tag>\\t<head orth>\\t<dep>', with a blank line after
    each predicted sentence.
    """
    # NOTE(review): evaluate() in this file loads with `path=model_dir`;
    # confirm that `data_dir=` is still accepted by Language.
    nlp = Language(data_dir=model_dir)
    gold_tuples = read_json_file(dev_loc)
    # Pass the encoding by keyword (the third positional parameter of io.open
    # is `buffering`) and close the file when done.
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        for raw_text, sents in gold_tuples:
            # The imported helper is `merge_sents`; `_merge_sents` is not
            # defined in this file.
            sents = merge_sents(sents)
            for annot_tuples, brackets in sents:
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text)
                for sent in tokens.sents:
                    for t in sent:
                        if not t.is_space:
                            out_file.write(
                                '%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
                            )
                    out_file.write('\n')
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    language=("The language to train", "positional", None, str, ['en','de', 'zh']),
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
    pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
    L1=("L1 regularization penalty", "option", "L", float),
)
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False,
         L1=1e-6):
    """Train (unless `eval_only`), optionally write parses, then print dev scores."""
    # Each config is a snapshot of the locals at that point, so the order of
    # these three statements determines what every dict contains.
    parser_cfg = dict(locals())
    tagger_cfg = dict(locals())
    entity_cfg = dict(locals())

    lang = spacy.util.get_lang_class(language)

    parser_cfg['features'] = lang.Defaults.parser_features
    entity_cfg['features'] = lang.Defaults.entity_features

    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        gold_dev = list(read_json_file(dev_loc))
        if n_sents > 0:
            gold_train = gold_train[:n_sents]
        train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
              n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
              n_iter=n_iter)
    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)
    scorer = evaluate(lang, list(read_json_file(dev_loc)),
                      model_dir, gold_preproc=gold_preproc, verbose=verbose)
    report = (
        ('TOK', 'token_acc'),
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    for label, attr in report:
        print(label, getattr(scorer, attr))


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -1,201 +0,0 @@
|
||||||
from __future__ import unicode_literals, print_function
|
|
||||||
import plac
|
|
||||||
import json
|
|
||||||
import random
|
|
||||||
import pathlib
|
|
||||||
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
from spacy.syntax.nonproj import PseudoProjectivity
|
|
||||||
from spacy.language import Language
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
from spacy.tagger import Tagger
|
|
||||||
from spacy.pipeline import DependencyParser, TokenVectorEncoder
|
|
||||||
from spacy.syntax.parser import get_templates
|
|
||||||
from spacy.syntax.arc_eager import ArcEager
|
|
||||||
from spacy.scorer import Scorer
|
|
||||||
from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
|
|
||||||
import spacy.attrs
|
|
||||||
import io
|
|
||||||
from thinc.neural.ops import CupyOps
|
|
||||||
from thinc.neural import Model
|
|
||||||
from spacy.es import Spanish
|
|
||||||
from spacy.attrs import POS
|
|
||||||
|
|
||||||
|
|
||||||
from thinc.neural import Model
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
|
||||||
import cupy
|
|
||||||
from thinc.neural.ops import CupyOps
|
|
||||||
except:
|
|
||||||
cupy = None
|
|
||||||
|
|
||||||
|
|
||||||
def read_conllx(loc, n=0):
    """Yield `(None, [[annotations, []]])` for each sentence of a CoNLL-X file.

    `annotations` is `[ids, words, tags, heads, deps, ner]`. Leading '#' lines
    of a sentence are dropped and rows whose id contains '-' or '.' are
    skipped. Ids and heads become 0-based, a head of '0' points at the token
    itself, and 'root' is renamed 'ROOT'. Tags are rebuilt as
    '<pos>__<dep>__<morph>'. Sentences with no token rows are skipped. With
    `n >= 1` at most `n` sentences are yielded.
    """
    # Read everything up front so the file is closed before anything is yielded.
    with io.open(loc, 'r', encoding='utf8') as file_:
        text = file_.read()
    i = 0
    for sent in text.strip().split('\n\n'):
        lines = sent.strip().split('\n')
        # Guard on `lines` so a comment-only sentence cannot raise IndexError.
        while lines and lines[0].startswith('#'):
            lines.pop(0)
        if not any(lines):
            # Empty input or a sentence holding nothing but comments.
            continue
        tokens = []
        for line in lines:
            id_, word, lemma, pos, tag, morph, head, dep, _1, \
                _2 = line.split('\t')
            if '-' in id_ or '.' in id_:
                continue
            id_ = int(id_) - 1
            head = (int(head) - 1) if head != '0' else id_
            dep = 'ROOT' if dep == 'root' else dep
            tag = pos+'__'+dep+'__'+morph
            # NOTE(review): the tag is always registered on Spanish.Defaults,
            # whatever language main() was asked to train -- confirm intent.
            Spanish.Defaults.tag_map[tag] = {POS: pos}
            tokens.append((id_, word, tag, head, dep, 'O'))
        if not tokens:
            continue
        tuples = [list(t) for t in zip(*tokens)]
        yield (None, [[tuples, []]])
        i += 1
        if n >= 1 and i >= n:
            break
|
|
||||||
|
|
||||||
|
|
||||||
def score_model(vocab, encoder, parser, Xs, ys, verbose=False):
    """Re-parse each doc in `Xs`, score it against `ys`, return the Scorer.

    A coarse-tag tally (text before the first '_') is also accumulated, but
    it is not returned or printed.
    """
    scorer = Scorer()
    correct = 0.
    total = 0.
    for gold_doc, gold in zip(Xs, ys):
        words = [w.text for w in gold_doc]
        doc = Doc(vocab, words=words)
        encoder(doc)
        parser(doc)
        PseudoProjectivity.deprojectivize(doc)
        scorer.score(doc, gold, verbose=verbose)
        for token, tag in zip(doc, gold.tags):
            guess_tag = token.tag_
            univ_guess = guess_tag.split('_', 1)[0] if '_' in guess_tag else ''
            # Unpacking is kept so a gold tag without '_' still raises.
            univ_truth, _ = tag.split('_', 1)
            correct += univ_guess == univ_truth
            total += 1
    return scorer
|
|
||||||
|
|
||||||
|
|
||||||
def organize_data(vocab, train_sents):
    """Turn gold tuples into parallel lists of Doc and GoldParse objects."""
    docs = []
    golds = []
    for _, doc_sents in train_sents:
        for annotations, _ in doc_sents:
            ids, words, tags, heads, deps, ner = annotations
            doc = Doc(vocab, words=words)
            docs.append(doc)
            golds.append(GoldParse(doc, tags=tags, heads=heads, deps=deps))
    return docs, golds
|
|
||||||
|
|
||||||
|
|
||||||
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    """Train a token-vector encoder and dependency parser on CoNLL-X data.

    Reads `train_loc`/`dev_loc`, prepares `model_dir` (creating 'deps', 'pos'
    and 'vocab' as needed and writing 'deps/config.json'), optionally loads
    clusters from `clusters_loc`, trains, and prints dev scores after each
    epoch and once more at the end.
    """
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    dev_sents = list(read_conllx(dev_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    if not (model_dir / 'pos').exists():
        (model_dir / 'pos').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    else:
        if (model_dir / 'vocab' / 'strings.json').exists():
            with (model_dir / 'vocab' / 'strings.json').open() as file_:
                vocab.strings.load(file_)
            if (model_dir / 'vocab' / 'lexemes.bin').exists():
                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    # Skip rows that do not have exactly three fields.
                    continue
                lex = vocab[word]
                lex.cluster = int(cluster[::-1], 2)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    vocab.morphology.tag_map[tag] = {POS: tag.split('__', 1)[0]}
    # Constructed but not referenced again below; kept in case construction
    # has side effects on `vocab`.
    tagger = Tagger(vocab)
    encoder = TokenVectorEncoder(vocab, width=64)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    Xs, ys = organize_data(vocab, train_sents)
    dev_Xs, dev_ys = organize_data(vocab, dev_sents)
    with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
        docs = list(Xs)
        for doc in docs:
            encoder(doc)
        nn_loss = [0.]

        def track_progress():
            with encoder.tagger.use_params(optimizer.averages):
                with parser.model.use_params(optimizer.averages):
                    scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
            itn = len(nn_loss)
            print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
            nn_loss.append(0.)

        track_progress()
        trainer.each_epoch.append(track_progress)
        trainer.batch_size = 24
        trainer.nb_epoch = 40
        for docs, golds in trainer.iterate(Xs, ys, progress_bar=True):
            docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
            tokvecs, upd_tokvecs = encoder.begin_update(docs)
            for doc, tokvec in zip(docs, tokvecs):
                doc.tensor = tokvec
            d_tokvecs = parser.update(docs, golds, sgd=optimizer)
            upd_tokvecs(d_tokvecs, sgd=optimizer)
            encoder.update(docs, golds, sgd=optimizer)
    # Unused below; kept as in the original.
    nlp = LangClass(vocab=vocab, parser=parser)
    # score_model needs both the docs and their gold parses; the old call
    # passed a single generator and so raised TypeError for the missing `ys`.
    scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
    # `itn` only ever existed inside track_progress(); derive it the same way.
    itn = len(nn_loss)
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    import cProfile
    import pstats

    # Flip to True to run under cProfile and print the hottest functions.
    PROFILE = False
    if not PROFILE:
        plac.call(main)
    else:
        cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
        s = pstats.Stats("Profile.prof")
        s.strip_dirs().sort_stats("time").print_stats()
    # The extra trailing plac.call(main) was dropped: it started a second
    # full training run after the first one finished.
|
|
|
@ -1,194 +0,0 @@
|
||||||
"""Convert OntoNotes into a json format.
|
|
||||||
|
|
||||||
doc: {
|
|
||||||
id: string,
|
|
||||||
paragraphs: [{
|
|
||||||
raw: string,
|
|
||||||
sents: [int],
|
|
||||||
tokens: [{
|
|
||||||
start: int,
|
|
||||||
tag: string,
|
|
||||||
head: int,
|
|
||||||
dep: string}],
|
|
||||||
ner: [{
|
|
||||||
start: int,
|
|
||||||
end: int,
|
|
||||||
label: string}],
|
|
||||||
brackets: [{
|
|
||||||
start: int,
|
|
||||||
end: int,
|
|
||||||
label: string}]}]}
|
|
||||||
|
|
||||||
Consumes output of spacy/munge/align_raw.py
|
|
||||||
"""
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
import plac
|
|
||||||
import json
|
|
||||||
from os import path
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import io
|
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
from spacy.munge import read_ptb
|
|
||||||
from spacy.munge import read_conll
|
|
||||||
from spacy.munge import read_ner
|
|
||||||
|
|
||||||
|
|
||||||
def _iter_raw_files(raw_loc):
|
|
||||||
files = json.load(open(raw_loc))
|
|
||||||
for f in files:
|
|
||||||
yield f
|
|
||||||
|
|
||||||
|
|
||||||
def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
    """Assemble the document dict for one file, or None on a sentence mismatch.

    Returns None when the PTB and dependency texts split into different
    numbers of sentences. Without `raw_paras` all sentences form a single
    paragraph with no raw text; otherwise each entry of `raw_paras` takes the
    next `len(raw_sents)` sentences, and paragraphs that end up with no
    sentences are dropped.
    """
    ptb_sents = read_ptb.split(ptb_text)
    dep_sents = read_conll.split(dep_text)
    if len(ptb_sents) != len(dep_sents):
        return None
    if ner_text is None:
        ner_sents = [None] * len(ptb_sents)
    else:
        ner_sents = read_ner.split(ner_text)

    doc = {'id': file_id}
    if raw_paras is None:
        doc['paragraphs'] = [format_para(None, ptb_sents, dep_sents, ner_sents)]
        return doc

    paragraphs = []
    doc['paragraphs'] = paragraphs
    start = 0
    for raw_sents in raw_paras:
        end = start + len(raw_sents)
        para = format_para(
            ' '.join(raw_sents).replace('<SEP>', ''),
            ptb_sents[start:end],
            dep_sents[start:end],
            ner_sents[start:end])
        if para['sentences']:
            paragraphs.append(para)
        start = end
    return doc
|
|
||||||
|
|
||||||
|
|
||||||
def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
    """Build {'raw': raw_text, 'sentences': [...]} from aligned sentences.

    The three sentence lists must have equal length. Sentences whose parsed
    dependency tokens include the tag 'VERB' are skipped.
    NOTE(review): presumably 'VERB' marks a sentence the converter could
    not tag properly -- confirm against read_conll.
    """
    assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
    sentences = []
    for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
        _, deps = read_conll.parse(dep_text, strip_bad_periods=True)
        if deps and 'VERB' in [tok['tag'] for tok in deps]:
            continue
        if ner_text is None:
            ner = ['-'] * len(deps)
        else:
            _, ner = read_ner.parse(ner_text, strip_bad_periods=True)
        _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
        # Necessary because the ClearNLP converter deletes EDITED words:
        # when the NER labels no longer line up, fall back to all '-'.
        if len(ner) != len(deps):
            ner = ['-'] * len(deps)
        sentences.append(format_sentence(deps, ner, brackets))
    return {'raw': raw_text, 'sentences': sentences}
|
|
||||||
|
|
||||||
|
|
||||||
def format_sentence(deps, ner, brackets):
    """Return {'tokens': [...], 'brackets': [...]} for one sentence.

    Tokens are numbered from 0 in the order given. Each (label, start, end)
    bracket with start != end becomes a dict with 'label', 'first' (start)
    and 'last' (end - 1).
    """
    tokens = [format_token(index, token, entity)
              for index, (token, entity) in enumerate(zip(deps, ner))]
    spans = [{'label': label, 'first': start, 'last': (end - 1)}
             for label, start, end in brackets
             if start != end]
    return {'tokens': tokens, 'brackets': spans}
|
|
||||||
|
|
||||||
|
|
||||||
def format_token(token_id, token, ner):
    """Convert one parsed token dict into the output token dict.

    The head is stored as an offset relative to token_id; a head of -1 is
    written as offset 0. token['id'] must equal token_id.
    """
    assert token_id == token['id']
    head_index = token['head']
    if head_index == -1:
        relative_head = 0
    else:
        relative_head = head_index - token_id
    return {
        'id': token_id,
        'orth': token['word'],
        'tag': token['tag'],
        'head': relative_head,
        'dep': token['dep'],
        'ner': ner}
|
|
||||||
|
|
||||||
|
|
||||||
def read_file(*pieces):
    """Return the stripped UTF-8 text of the file at path.join(*pieces).

    Returns None when no file exists at that location. The file is opened
    in a context manager so the handle is closed as soon as it is read.
    """
    loc = path.join(*pieces)
    if not path.exists(loc):
        return None
    with io.open(loc, 'r', encoding='utf8') as file_:
        return file_.read().strip()
|
|
||||||
|
|
||||||
|
|
||||||
def get_file_names(section_dir, subsection):
    """Return the sorted, de-duplicated names in section_dir/subsection.

    Each directory entry has its last '.'-separated extension removed
    first, so 'x.parse' and 'x.name' both contribute 'x'.
    """
    directory = path.join(section_dir, subsection)
    stems = {entry.rsplit('.', 1)[0] for entry in os.listdir(directory)}
    return sorted(stems)
|
|
||||||
|
|
||||||
|
|
||||||
def read_wsj_with_source(onto_dir, raw_dir):
    """Read the WSJ documents, pairing annotations with their raw text.

    For each section '00' to '24', raw paragraphs come from
    raw_dir/wsj<section>.json and the .parse, .parse.dep and .name files
    come from the section directory under
    onto_dir/data/english/annotations/nw/wsj. Returns a dict mapping
    filename to the format_doc() result; files missing either the .parse
    or the .parse.dep annotation are left out.
    """
    # Now do WSJ, with source alignment
    onto_dir = path.join(onto_dir, 'data', 'english', 'annotations', 'nw', 'wsj')
    docs = {}
    for i in range(25):
        section = '%02d' % i
        raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
        for filename, raw_paras in _iter_raw_files(raw_loc):
            # This one file in section 04 is deliberately excluded.
            if section == '04' and filename == '55':
                continue
            ptb = read_file(onto_dir, section, '%s.parse' % filename)
            dep = read_file(onto_dir, section, '%s.parse.dep' % filename)
            ner = read_file(onto_dir, section, '%s.name' % filename)
            if ptb is not None and dep is not None:
                docs[filename] = format_doc(filename, raw_paras, ptb, dep, ner)
    return docs
|
|
||||||
|
|
||||||
|
|
||||||
def get_doc(onto_dir, file_path, wsj_docs):
    """Return the formatted document for file_path, or None.

    The name after the last '/' in file_path is looked up in wsj_docs
    first. Otherwise the .parse, .parse.dep and .name files are read from
    under onto_dir and formatted without raw paragraph text. None is
    returned when the .parse or .parse.dep file is missing.
    """
    filename = file_path.rsplit('/', 1)[1]
    if filename in wsj_docs:
        return wsj_docs[filename]

    ptb = read_file(onto_dir, file_path + '.parse')
    dep = read_file(onto_dir, file_path + '.parse.dep')
    ner = read_file(onto_dir, file_path + '.name')
    if ptb is None or dep is None:
        return None
    return format_doc(filename, None, ptb, dep, ner)
|
|
||||||
|
|
||||||
|
|
||||||
def read_ids(loc):
    """Return the lines of the file at loc as a list of strings.

    Leading and trailing whitespace of the whole file is stripped before
    splitting on newlines. The file is closed before returning.
    """
    with open(loc) as file_:
        return file_.read().strip().split('\n')
|
|
||||||
|
|
||||||
|
|
||||||
def main(onto_dir, raw_dir, out_dir):
    # Convert each partition listed in onto_dir/<partition>.id and write
    # one JSON file per genre to out_dir/<partition>/<genre>.json.
    # (Kept as comments: plac uses a docstring as the CLI help text.)
    wsj_docs = read_wsj_with_source(onto_dir, raw_dir)

    for partition in ('train', 'test', 'development'):
        docs_by_genre = defaultdict(list)
        for file_path in read_ids(path.join(onto_dir, '%s.id' % partition)):
            doc = get_doc(onto_dir, file_path, wsj_docs)
            if doc is None:
                continue
            # The genre is the fourth '/'-separated component of the id.
            docs_by_genre[file_path.split('/')[3]].append(doc)

        part_dir = path.join(out_dir, partition)
        if not path.exists(part_dir):
            os.mkdir(part_dir)
        for genre in sorted(docs_by_genre):
            out_loc = path.join(part_dir, genre + '.json')
            with open(out_loc, 'w') as file_:
                json.dump(docs_by_genre[genre], file_, indent=4)


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -1,13 +0,0 @@
|
||||||
"""Read a vector file, and prepare it as binary data, for easy consumption"""
|
|
||||||
|
|
||||||
import plac
|
|
||||||
|
|
||||||
from spacy.vocab import write_binary_vectors
|
|
||||||
|
|
||||||
|
|
||||||
def main(in_loc, out_loc):
    # Thin command-line wrapper: hand both locations straight to
    # spacy.vocab.write_binary_vectors.
    # (Kept as comments: plac uses a docstring as the CLI help text.)
    write_binary_vectors(in_loc, out_loc)


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -1,175 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import os
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import codecs
|
|
||||||
import random
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import re
|
|
||||||
|
|
||||||
import spacy.util
|
|
||||||
from spacy.en import English
|
|
||||||
|
|
||||||
from spacy.tagger import Tagger
|
|
||||||
|
|
||||||
from spacy.syntax.util import Config
|
|
||||||
from spacy.gold import read_json_file
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
|
|
||||||
from spacy.scorer import Scorer
|
|
||||||
|
|
||||||
|
|
||||||
def score_model(scorer, nlp, raw_text, annot_tuples):
    """Tag one example with nlp and add the result to scorer.

    When raw_text is None the tokens are built from annot_tuples[1];
    otherwise raw_text is tokenized by nlp.tokenizer.
    """
    if raw_text is not None:
        tokens = nlp.tokenizer(raw_text)
    else:
        tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger(tokens)
    scorer.score(tokens, GoldParse(tokens, annot_tuples))
|
|
||||||
|
|
||||||
|
|
||||||
def _merge_sents(sents):
|
|
||||||
m_deps = [[], [], [], [], [], []]
|
|
||||||
m_brackets = []
|
|
||||||
i = 0
|
|
||||||
for (ids, words, tags, heads, labels, ner), brackets in sents:
|
|
||||||
m_deps[0].extend(id_ + i for id_ in ids)
|
|
||||||
m_deps[1].extend(words)
|
|
||||||
m_deps[2].extend(tags)
|
|
||||||
m_deps[3].extend(head + i for head in heads)
|
|
||||||
m_deps[4].extend(labels)
|
|
||||||
m_deps[5].extend(ner)
|
|
||||||
m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
|
|
||||||
i += len(ids)
|
|
||||||
return [(m_deps, m_brackets)]
|
|
||||||
|
|
||||||
|
|
||||||
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False):
    """Train a blank tagger on gold_tuples for n_iter passes.

    Each example is scored with the current model before it is used for
    an update; one line of scores is printed per pass. gold_tuples is
    shuffled in place after every pass. The parameters feat_set, seed,
    corruption_level, beam_width, verbose and use_orig_arc_eager are
    accepted but not used by this function.
    """
    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir, tagger=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if not gold_preproc:
                sents = _merge_sents(sents)
            else:
                raw_text = None
            for annot_tuples, _ in sents:
                # Score first, so the numbers reflect the model before
                # it has learned from this example.
                score_model(scorer, nlp, raw_text, annot_tuples)
                if raw_text is not None:
                    tokens = nlp.tokenizer(raw_text)
                else:
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                loss += nlp.tagger.train(tokens, annot_tuples[2])
        random.shuffle(gold_tuples)
        row = (itn, loss, scorer.uas, scorer.ents_f,
               scorer.tags_acc, scorer.token_acc)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % row)
    nlp.end_training(model_dir)
|
|
||||||
|
|
||||||
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None):
    """Run the model loaded from model_dir over gold_tuples and score it.

    With gold_preproc the gold tokens of each sentence are used and the
    tagger, entity recognizer and parser are applied one by one. Otherwise
    the sentences are merged and the raw text goes through the whole
    pipeline. Returns the Scorer holding the accumulated results.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if not gold_preproc:
            sents = _merge_sents(sents)
        else:
            raw_text = None
        for annot_tuples, _ in sents:
            if raw_text is not None:
                tokens = nlp(raw_text, merge_mwes=False)
            else:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                for component in (nlp.tagger, nlp.entity, nlp.parser):
                    component(tokens)
            scorer.score(tokens, GoldParse(tokens, annot_tuples),
                         verbose=verbose)
    return scorer
|
|
||||||
|
|
||||||
|
|
||||||
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
    """Parse the gold data at dev_loc and write one line per token.

    Each output line is 'orth<TAB>tag<TAB>head orth<TAB>dep', written to
    out_loc as UTF-8. The output file is opened in a context manager so it
    is flushed and closed even if parsing raises. Returns the Scorer
    accumulated over the parsed documents.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    with codecs.open(out_loc, 'w', 'utf8') as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, brackets in sents:
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text, merge_mwes=False)
                gold = GoldParse(tokens, annot_tuples)
                scorer.score(tokens, gold, verbose=False)
                for t in tokens:
                    out_file.write(
                        '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                    )
    return scorer
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
    # Train the English tagger unless eval_only is set, then evaluate on
    # dev_loc and print the scores.
    # (Kept as comments: plac uses a docstring as the CLI help text.)
    if not eval_only:
        feat_set = 'debug' if debug else 'basic'
        gold_train = list(read_json_file(train_loc))
        train(English, gold_train, model_dir,
              feat_set=feat_set,
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose)
    # NOTE: out_loc is accepted but unused; the write_parses() call that
    # consumed it is disabled:
    #if out_loc:
    #    write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
    gold_dev = list(read_json_file(dev_loc))
    scorer = evaluate(English, gold_dev, model_dir,
                      gold_preproc=gold_preproc, verbose=verbose)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)

    print('NER P', scorer.ents_p)
    print('NER R', scorer.ents_r)
    print('NER F', scorer.ents_f)


if __name__ == '__main__':
    plac.call(main)
|
|
|
@ -1,160 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import os
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import io
|
|
||||||
import random
|
|
||||||
import time
|
|
||||||
import gzip
|
|
||||||
import ujson
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import cProfile
|
|
||||||
import pstats
|
|
||||||
|
|
||||||
import spacy.util
|
|
||||||
from spacy.de import German
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
from spacy.tagger import Tagger
|
|
||||||
from spacy.scorer import PRFScore
|
|
||||||
|
|
||||||
from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags
|
|
||||||
from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags
|
|
||||||
from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags
|
|
||||||
from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags
|
|
||||||
from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS
|
|
||||||
|
|
||||||
|
|
||||||
def default_templates():
    """Return the feature templates from Tagger.default_templates()."""
    templates = spacy.tagger.Tagger.default_templates()
    return templates
|
|
||||||
|
|
||||||
def default_templates_without_clusters():
    """Return a tuple of feature templates that uses no *_cluster feature.

    Each template is a tuple of context-field constants imported from
    spacy.tagger. The groups below are concatenated in their original
    order.
    """
    word_and_context = (
        (W_orth,),
        (P1_lemma, P1_pos),
        (P2_lemma, P2_pos),
        (N1_orth,),
        (N2_orth,),
    )
    affixes = (
        (W_suffix,),
        (W_prefix,),
    )
    history = (
        (P1_pos,),
        (P2_pos,),
        (P1_pos, P2_pos),
        (P1_pos, W_orth),
        (P1_suffix,),
        (N1_suffix,),
    )
    shape = (
        (W_shape,),
    )
    flags = (
        (W_flags,),
        (N1_flags,),
        (N2_flags,),
        (P1_flags,),
        (P2_flags,),
    )
    return word_and_context + affixes + history + shape + flags
|
|
||||||
|
|
||||||
|
|
||||||
def make_tagger(vocab, templates):
    """Build a Tagger for vocab around a new TaggerModel(templates)."""
    return spacy.tagger.Tagger(vocab, spacy.tagger.TaggerModel(templates))
|
|
||||||
|
|
||||||
|
|
||||||
def read_conll(file_):
    """Read tab-separated CoNLL lines into a list of (words, tags) pairs.

    file_ is any iterable of lines. Blank lines separate sentences. The
    word is taken from column 1 and the tag from column 4 (0-based) of
    each non-blank line.
    """
    collected = []
    words, tags = [], []
    for raw_line in file_:
        line = raw_line.strip()
        if line:
            # Every third column starting at 1: columns 1 and 4 (CoNLL09).
            word, tag = line.split('\t')[1::3][:2]
            words.append(word)
            tags.append(tag)
        elif words:
            collected.append((words, tags))
            words, tags = [], []
    if words:
        # The last sentence may not be followed by a blank line.
        collected.append((words, tags))
    return collected
|
|
||||||
|
|
||||||
|
|
||||||
def score_model(score, nlp, words, gold_tags):
    """Tag words with nlp and record each prediction against its gold tag.

    For every token, score.score_set() is called with the predicted tag
    and the gold tag, each wrapped in a one-element set.
    """
    tokens = nlp.tokenizer.tokens_from_list(words)
    assert(len(tokens) == len(gold_tags))
    nlp.tagger(tokens)

    for index, gold_tag in enumerate(gold_tags):
        predicted = tokens[index].tag_
        score.score_set(set([predicted]), set([gold_tag]))
|
|
||||||
|
|
||||||
|
|
||||||
def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21):
    """Train a tagger on train_sents, reporting accuracy after each pass.

    Any existing model_dir/pos directory is deleted and recreated. After
    every pass the training accuracy and the precision on dev_sents are
    printed, and train_sents is shuffled in place.
    """
    # make shuffling deterministic
    random.seed(seed)

    # set up directory for model
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(pos_model_dir)

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = make_tagger(nlp.vocab, default_templates())

    print("Itn.\ttrain acc %\tdev acc %")
    for itn in range(n_iter):
        # train on train set
        n_correct = 0.
        n_words = 0.
        for words, gold_tags in train_sents:
            tokens = nlp.tokenizer.tokens_from_list(words)
            n_correct += nlp.tagger.train(tokens, gold_tags)
            n_words += len(words)
        train_acc = n_correct / n_words

        # test on dev set
        dev_acc = PRFScore()
        for words, gold_tags in dev_sents:
            score_model(dev_acc, nlp, words, gold_tags)

        random.shuffle(train_sents)
        print('%d:\t%6.2f\t%6.2f' % (itn, 100*train_acc, 100*dev_acc.precision))

    print('end training')
    nlp.end_training(model_dir)
    print('done')
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    train_loc=("Location of CoNLL 09 formatted training file"),
    dev_loc=("Location of CoNLL 09 formatted development file"),
    model_dir=("Location of output model directory"),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    n_iter=("Number of training iterations", "option", "i", int),
)
def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15):
    # Train the German tagger unless eval_only is set, then reload the
    # model from model_dir and print its precision on the development set.
    # (Kept as comments: plac uses a docstring as the CLI help text.)

    # training
    if not eval_only:
        with io.open(train_loc, 'r', encoding='utf8') as trainfile_, \
                io.open(dev_loc, 'r', encoding='utf8') as devfile_:
            train_sents = read_conll(trainfile_)
            dev_sents = read_conll(devfile_)
        train(German, train_sents, dev_sents, model_dir, n_iter=n_iter)

    # testing
    with io.open(dev_loc, 'r', encoding='utf8') as file_:
        dev_sents = read_conll(file_)
    nlp = German(data_dir=model_dir)

    dev_acc = PRFScore()
    for words, gold_tags in dev_sents:
        score_model(dev_acc, nlp, words, gold_tags)

    pos_precision = 100*dev_acc.precision
    print('POS: %6.2f %%' % (pos_precision))


if __name__ == '__main__':
    plac.call(main)
|
|
1
setup.py
1
setup.py
|
@ -24,7 +24,6 @@ MOD_NAMES = [
|
||||||
'spacy.vocab',
|
'spacy.vocab',
|
||||||
'spacy.attrs',
|
'spacy.attrs',
|
||||||
'spacy.morphology',
|
'spacy.morphology',
|
||||||
'spacy.tagger',
|
|
||||||
'spacy.pipeline',
|
'spacy.pipeline',
|
||||||
'spacy.syntax.stateclass',
|
'spacy.syntax.stateclass',
|
||||||
'spacy.syntax._state',
|
'spacy.syntax._state',
|
||||||
|
|
|
@ -3,8 +3,6 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from .cli.info import info as cli_info
|
from .cli.info import info as cli_info
|
||||||
from .glossary import explain
|
from .glossary import explain
|
||||||
from .deprecated import resolve_load_name
|
|
||||||
#from .about import __version__
|
|
||||||
from .about import __version__
|
from .about import __version__
|
||||||
from . import util
|
from . import util
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
# NB! This breaks in plac on Python 2!!
|
# NB! This breaks in plac on Python 2!!
|
||||||
#from __future__ import unicode_literals
|
# from __future__ import unicode_literals
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import plac
|
import plac
|
||||||
|
|
297
spacy/_ml.py
297
spacy/_ml.py
|
@ -1,47 +1,40 @@
|
||||||
import ujson
|
# coding: utf8
|
||||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import numpy
|
||||||
|
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
|
||||||
from thinc.i2v import HashEmbed, StaticVectors
|
from thinc.i2v import HashEmbed, StaticVectors
|
||||||
from thinc.t2t import ExtractWindow, ParametricAttention
|
from thinc.t2t import ExtractWindow, ParametricAttention
|
||||||
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
|
from thinc.t2v import Pooling, sum_pool
|
||||||
from thinc.misc import Residual
|
from thinc.misc import Residual
|
||||||
from thinc.misc import BatchNorm as BN
|
|
||||||
from thinc.misc import LayerNorm as LN
|
from thinc.misc import LayerNorm as LN
|
||||||
|
|
||||||
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
||||||
from thinc.api import FeatureExtracter, with_getitem
|
from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
|
||||||
from thinc.api import uniqued, wrap, flatten_add_lengths, noop
|
from thinc.api import uniqued, wrap, noop
|
||||||
|
|
||||||
from thinc.linear.linear import LinearModel
|
from thinc.linear.linear import LinearModel
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.ops import NumpyOps, CupyOps
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module
|
||||||
|
|
||||||
import random
|
|
||||||
import cytoolz
|
|
||||||
|
|
||||||
from thinc import describe
|
from thinc import describe
|
||||||
from thinc.describe import Dimension, Synapses, Biases, Gradient
|
from thinc.describe import Dimension, Synapses, Biases, Gradient
|
||||||
from thinc.neural._classes.affine import _set_dimensions_if_needed
|
from thinc.neural._classes.affine import _set_dimensions_if_needed
|
||||||
import thinc.extra.load_nlp
|
import thinc.extra.load_nlp
|
||||||
|
|
||||||
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
|
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
|
||||||
from .tokens.doc import Doc
|
|
||||||
from . import util
|
from . import util
|
||||||
|
|
||||||
import numpy
|
|
||||||
import io
|
|
||||||
|
|
||||||
# TODO: Unset this once we don't want to support models previous models.
|
|
||||||
import thinc.neural._classes.layernorm
|
|
||||||
thinc.neural._classes.layernorm.set_compat_six_eight(False)
|
|
||||||
|
|
||||||
VECTORS_KEY = 'spacy_pretrained_vectors'
|
VECTORS_KEY = 'spacy_pretrained_vectors'
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
||||||
ops = Model.ops
|
ops = Model.ops
|
||||||
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
|
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
|
||||||
|
|
||||||
def finish_update(d_X, sgd=None):
|
def finish_update(d_X, sgd=None):
|
||||||
return ops.unflatten(d_X, lengths, pad=pad)
|
return ops.unflatten(d_X, lengths, pad=pad)
|
||||||
|
|
||||||
X = ops.flatten(seqs, pad=pad)
|
X = ops.flatten(seqs, pad=pad)
|
||||||
return (X, lengths), finish_update
|
return (X, lengths), finish_update
|
||||||
|
|
||||||
|
@ -55,33 +48,14 @@ def _logistic(X, drop=0.):
|
||||||
X = xp.minimum(X, 10., X)
|
X = xp.minimum(X, 10., X)
|
||||||
X = xp.maximum(X, -10., X)
|
X = xp.maximum(X, -10., X)
|
||||||
Y = 1. / (1. + xp.exp(-X))
|
Y = 1. / (1. + xp.exp(-X))
|
||||||
|
|
||||||
def logistic_bwd(dY, sgd=None):
|
def logistic_bwd(dY, sgd=None):
|
||||||
dX = dY * (Y * (1-Y))
|
dX = dY * (Y * (1-Y))
|
||||||
return dX
|
return dX
|
||||||
|
|
||||||
return Y, logistic_bwd
|
return Y, logistic_bwd
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
|
||||||
def add_tuples(X, drop=0.):
|
|
||||||
"""Give inputs of sequence pairs, where each sequence is (vals, length),
|
|
||||||
sum the values, returning a single sequence.
|
|
||||||
|
|
||||||
If input is:
|
|
||||||
((vals1, length), (vals2, length)
|
|
||||||
Output is:
|
|
||||||
(vals1+vals2, length)
|
|
||||||
|
|
||||||
vals are a single tensor for the whole batch.
|
|
||||||
"""
|
|
||||||
(vals1, length1), (vals2, length2) = X
|
|
||||||
assert length1 == length2
|
|
||||||
|
|
||||||
def add_tuples_bwd(dY, sgd=None):
|
|
||||||
return (dY, dY)
|
|
||||||
|
|
||||||
return (vals1+vals2, length), add_tuples_bwd
|
|
||||||
|
|
||||||
|
|
||||||
def _zero_init(model):
|
def _zero_init(model):
|
||||||
def _zero_init_impl(self, X, y):
|
def _zero_init_impl(self, X, y):
|
||||||
self.W.fill(0)
|
self.W.fill(0)
|
||||||
|
@ -115,13 +89,12 @@ def _init_for_precomputed(W, ops):
|
||||||
nF=Dimension("Number of features"),
|
nF=Dimension("Number of features"),
|
||||||
nO=Dimension("Output size"),
|
nO=Dimension("Output size"),
|
||||||
W=Synapses("Weights matrix",
|
W=Synapses("Weights matrix",
|
||||||
lambda obj: (obj.nF, obj.nO, obj.nI),
|
lambda obj: (obj.nF, obj.nO, obj.nI),
|
||||||
lambda W, ops: _init_for_precomputed(W, ops)),
|
lambda W, ops: _init_for_precomputed(W, ops)),
|
||||||
b=Biases("Bias vector",
|
b=Biases("Bias vector",
|
||||||
lambda obj: (obj.nO,)),
|
lambda obj: (obj.nO,)),
|
||||||
d_W=Gradient("W"),
|
d_W=Gradient("W"),
|
||||||
d_b=Gradient("b")
|
d_b=Gradient("b"))
|
||||||
)
|
|
||||||
class PrecomputableAffine(Model):
|
class PrecomputableAffine(Model):
|
||||||
def __init__(self, nO=None, nI=None, nF=None, **kwargs):
|
def __init__(self, nO=None, nI=None, nF=None, **kwargs):
|
||||||
Model.__init__(self, **kwargs)
|
Model.__init__(self, **kwargs)
|
||||||
|
@ -134,18 +107,19 @@ class PrecomputableAffine(Model):
|
||||||
# Yf: (b, f, i)
|
# Yf: (b, f, i)
|
||||||
# dY: (b, o)
|
# dY: (b, o)
|
||||||
# dYf: (b, f, o)
|
# dYf: (b, f, o)
|
||||||
#Yf = numpy.einsum('bi,foi->bfo', X, self.W)
|
# Yf = numpy.einsum('bi,foi->bfo', X, self.W)
|
||||||
Yf = self.ops.xp.tensordot(
|
Yf = self.ops.xp.tensordot(
|
||||||
X, self.W, axes=[[1], [2]])
|
X, self.W, axes=[[1], [2]])
|
||||||
Yf += self.b
|
Yf += self.b
|
||||||
|
|
||||||
def backward(dY_ids, sgd=None):
|
def backward(dY_ids, sgd=None):
|
||||||
tensordot = self.ops.xp.tensordot
|
tensordot = self.ops.xp.tensordot
|
||||||
dY, ids = dY_ids
|
dY, ids = dY_ids
|
||||||
Xf = X[ids]
|
Xf = X[ids]
|
||||||
|
|
||||||
#dXf = numpy.einsum('bo,foi->bfi', dY, self.W)
|
# dXf = numpy.einsum('bo,foi->bfi', dY, self.W)
|
||||||
dXf = tensordot(dY, self.W, axes=[[1], [1]])
|
dXf = tensordot(dY, self.W, axes=[[1], [1]])
|
||||||
#dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
|
# dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
|
||||||
dW = tensordot(dY, Xf, axes=[[0], [0]])
|
dW = tensordot(dY, Xf, axes=[[0], [0]])
|
||||||
# ofi -> foi
|
# ofi -> foi
|
||||||
self.d_W += dW.transpose((1, 0, 2))
|
self.d_W += dW.transpose((1, 0, 2))
|
||||||
|
@ -154,6 +128,7 @@ class PrecomputableAffine(Model):
|
||||||
if sgd is not None:
|
if sgd is not None:
|
||||||
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
||||||
return dXf
|
return dXf
|
||||||
|
|
||||||
return Yf, backward
|
return Yf, backward
|
||||||
|
|
||||||
|
|
||||||
|
@ -164,13 +139,12 @@ class PrecomputableAffine(Model):
|
||||||
nP=Dimension("Number of pieces"),
|
nP=Dimension("Number of pieces"),
|
||||||
nO=Dimension("Output size"),
|
nO=Dimension("Output size"),
|
||||||
W=Synapses("Weights matrix",
|
W=Synapses("Weights matrix",
|
||||||
lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI),
|
lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI),
|
||||||
lambda W, ops: ops.xavier_uniform_init(W)),
|
lambda W, ops: ops.xavier_uniform_init(W)),
|
||||||
b=Biases("Bias vector",
|
b=Biases("Bias vector",
|
||||||
lambda obj: (obj.nO, obj.nP)),
|
lambda obj: (obj.nO, obj.nP)),
|
||||||
d_W=Gradient("W"),
|
d_W=Gradient("W"),
|
||||||
d_b=Gradient("b")
|
d_b=Gradient("b"))
|
||||||
)
|
|
||||||
class PrecomputableMaxouts(Model):
|
class PrecomputableMaxouts(Model):
|
||||||
def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs):
|
def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs):
|
||||||
Model.__init__(self, **kwargs)
|
Model.__init__(self, **kwargs)
|
||||||
|
@ -186,114 +160,26 @@ class PrecomputableMaxouts(Model):
|
||||||
# dYp: (b, o, p)
|
# dYp: (b, o, p)
|
||||||
# W: (f, o, p, i)
|
# W: (f, o, p, i)
|
||||||
# b: (o, p)
|
# b: (o, p)
|
||||||
|
|
||||||
# bi,opfi->bfop
|
# bi,opfi->bfop
|
||||||
# bop,fopi->bfi
|
# bop,fopi->bfi
|
||||||
# bop,fbi->opfi : fopi
|
# bop,fbi->opfi : fopi
|
||||||
|
|
||||||
tensordot = self.ops.xp.tensordot
|
tensordot = self.ops.xp.tensordot
|
||||||
ascontiguous = self.ops.xp.ascontiguousarray
|
|
||||||
|
|
||||||
Yfp = tensordot(X, self.W, axes=[[1], [3]])
|
Yfp = tensordot(X, self.W, axes=[[1], [3]])
|
||||||
Yfp += self.b
|
Yfp += self.b
|
||||||
|
|
||||||
def backward(dYp_ids, sgd=None):
|
def backward(dYp_ids, sgd=None):
|
||||||
dYp, ids = dYp_ids
|
dYp, ids = dYp_ids
|
||||||
Xf = X[ids]
|
Xf = X[ids]
|
||||||
|
dXf = tensordot(dYp, self.W, axes=[[1, 2], [1, 2]])
|
||||||
dXf = tensordot(dYp, self.W, axes=[[1, 2], [1,2]])
|
|
||||||
dW = tensordot(dYp, Xf, axes=[[0], [0]])
|
dW = tensordot(dYp, Xf, axes=[[0], [0]])
|
||||||
|
|
||||||
self.d_W += dW.transpose((2, 0, 1, 3))
|
self.d_W += dW.transpose((2, 0, 1, 3))
|
||||||
self.d_b += dYp.sum(axis=0)
|
self.d_b += dYp.sum(axis=0)
|
||||||
|
|
||||||
if sgd is not None:
|
if sgd is not None:
|
||||||
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
||||||
return dXf
|
return dXf
|
||||||
|
|
||||||
return Yfp, backward
|
return Yfp, backward
|
||||||
|
|
||||||
# Thinc's Embed class is a bit broken atm, so drop this here.
|
|
||||||
from thinc import describe
|
|
||||||
from thinc.neural._classes.embed import _uniform_init
|
|
||||||
|
|
||||||
|
|
||||||
@describe.attributes(
|
|
||||||
nV=describe.Dimension("Number of vectors"),
|
|
||||||
nO=describe.Dimension("Size of output"),
|
|
||||||
vectors=describe.Weights("Embedding table",
|
|
||||||
lambda obj: (obj.nV, obj.nO),
|
|
||||||
_uniform_init(-0.1, 0.1)
|
|
||||||
),
|
|
||||||
d_vectors=describe.Gradient("vectors")
|
|
||||||
)
|
|
||||||
class Embed(Model):
|
|
||||||
name = 'embed'
|
|
||||||
|
|
||||||
def __init__(self, nO, nV=None, **kwargs):
|
|
||||||
if nV is not None:
|
|
||||||
nV += 1
|
|
||||||
Model.__init__(self, **kwargs)
|
|
||||||
if 'name' in kwargs:
|
|
||||||
self.name = kwargs['name']
|
|
||||||
self.column = kwargs.get('column', 0)
|
|
||||||
self.nO = nO
|
|
||||||
self.nV = nV
|
|
||||||
|
|
||||||
def predict(self, ids):
|
|
||||||
if ids.ndim == 2:
|
|
||||||
ids = ids[:, self.column]
|
|
||||||
return self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f')
|
|
||||||
|
|
||||||
def begin_update(self, ids, drop=0.):
|
|
||||||
if ids.ndim == 2:
|
|
||||||
ids = ids[:, self.column]
|
|
||||||
vectors = self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f')
|
|
||||||
def backprop_embed(d_vectors, sgd=None):
|
|
||||||
n_vectors = d_vectors.shape[0]
|
|
||||||
self.ops.scatter_add(self.d_vectors, ids, d_vectors)
|
|
||||||
if sgd is not None:
|
|
||||||
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
|
||||||
return None
|
|
||||||
return vectors, backprop_embed
|
|
||||||
|
|
||||||
|
|
||||||
def HistoryFeatures(nr_class, hist_size=8, nr_dim=8):
|
|
||||||
'''Wrap a model, adding features representing action history.'''
|
|
||||||
if hist_size == 0:
|
|
||||||
return layerize(noop())
|
|
||||||
embed_tables = [Embed(nr_dim, nr_class, column=i, name='embed%d')
|
|
||||||
for i in range(hist_size)]
|
|
||||||
embed = chain(concatenate(*embed_tables),
|
|
||||||
LN(Maxout(hist_size*nr_dim, hist_size*nr_dim)))
|
|
||||||
ops = embed.ops
|
|
||||||
def add_history_fwd(vectors_hists, drop=0.):
|
|
||||||
vectors, hist_ids = vectors_hists
|
|
||||||
hist_feats, bp_hists = embed.begin_update(hist_ids, drop=drop)
|
|
||||||
outputs = ops.xp.hstack((vectors, hist_feats))
|
|
||||||
|
|
||||||
def add_history_bwd(d_outputs, sgd=None):
|
|
||||||
d_vectors = d_outputs[:, :vectors.shape[1]]
|
|
||||||
d_hists = d_outputs[:, vectors.shape[1]:]
|
|
||||||
bp_hists(d_hists, sgd=sgd)
|
|
||||||
return embed.ops.xp.ascontiguousarray(d_vectors)
|
|
||||||
return outputs, add_history_bwd
|
|
||||||
return wrap(add_history_fwd, embed)
|
|
||||||
|
|
||||||
|
|
||||||
def drop_layer(layer, factor=2.):
    """Wrap ``layer`` so that, during training, it is sometimes bypassed.

    layer: The layer to wrap.
    factor: Divisor applied to the random draw before it is compared
        against the dropout rate.
    RETURNS: The wrapped model. Its ``predict`` attribute is set to
        ``layer``.
    """
    def _identity_backprop(dX, sgd=None):
        return dX

    def drop_layer_fwd(X, drop=0.):
        if drop <= 0.:
            return layer.begin_update(X, drop=drop)
        draw = layer.ops.xp.random.random()
        keep_layer = (draw / factor) >= drop
        if keep_layer:
            return layer.begin_update(X, drop=drop)
        # Layer skipped: pass the input and its gradient straight through.
        return X, _identity_backprop

    wrapped = wrap(drop_layer_fwd, layer)
    wrapped.predict = layer
    return wrapped
|
|
||||||
|
|
||||||
def link_vectors_to_models(vocab):
|
def link_vectors_to_models(vocab):
|
||||||
vectors = vocab.vectors
|
vectors = vocab.vectors
|
||||||
|
@ -308,16 +194,21 @@ def link_vectors_to_models(vocab):
|
||||||
# (unideal, I know)
|
# (unideal, I know)
|
||||||
thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
|
thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
|
||||||
|
|
||||||
|
|
||||||
def Tok2Vec(width, embed_size, **kwargs):
|
def Tok2Vec(width, embed_size, **kwargs):
|
||||||
pretrained_dims = kwargs.get('pretrained_dims', 0)
|
pretrained_dims = kwargs.get('pretrained_dims', 0)
|
||||||
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
|
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
|
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone,
|
||||||
'*': reapply}):
|
'+': add, '*': reapply}):
|
||||||
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
|
norm = HashEmbed(width, embed_size, column=cols.index(NORM),
|
||||||
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
|
name='embed_norm')
|
||||||
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
|
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX),
|
||||||
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
|
name='embed_prefix')
|
||||||
|
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX),
|
||||||
|
name='embed_suffix')
|
||||||
|
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
|
||||||
|
name='embed_shape')
|
||||||
if pretrained_dims is not None and pretrained_dims >= 1:
|
if pretrained_dims is not None and pretrained_dims >= 1:
|
||||||
glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
|
glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
|
||||||
|
|
||||||
|
@ -329,7 +220,6 @@ def Tok2Vec(width, embed_size, **kwargs):
|
||||||
(norm | prefix | suffix | shape)
|
(norm | prefix | suffix | shape)
|
||||||
>> LN(Maxout(width, width*4, pieces=3)), column=5)
|
>> LN(Maxout(width, width*4, pieces=3)), column=5)
|
||||||
|
|
||||||
|
|
||||||
convolution = Residual(
|
convolution = Residual(
|
||||||
ExtractWindow(nW=1)
|
ExtractWindow(nW=1)
|
||||||
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
|
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
|
||||||
|
@ -354,6 +244,7 @@ def reapply(layer, n_times):
|
||||||
Y, backprop = layer.begin_update(X, drop=drop)
|
Y, backprop = layer.begin_update(X, drop=drop)
|
||||||
X = Y
|
X = Y
|
||||||
backprops.append(backprop)
|
backprops.append(backprop)
|
||||||
|
|
||||||
def reapply_bwd(dY, sgd=None):
|
def reapply_bwd(dY, sgd=None):
|
||||||
dX = None
|
dX = None
|
||||||
for backprop in reversed(backprops):
|
for backprop in reversed(backprops):
|
||||||
|
@ -363,39 +254,20 @@ def reapply(layer, n_times):
|
||||||
else:
|
else:
|
||||||
dX += dY
|
dX += dY
|
||||||
return dX
|
return dX
|
||||||
|
|
||||||
return Y, reapply_bwd
|
return Y, reapply_bwd
|
||||||
return wrap(reapply_fwd, layer)
|
return wrap(reapply_fwd, layer)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def asarray(ops, dtype):
    """Create a layer that converts its input with ``ops.asarray``.

    ops: Object providing the ``asarray`` method.
    dtype: The dtype passed through to ``ops.asarray``.
    RETURNS: The layerized forward function. It provides no backprop
        callback.
    """
    def forward(X, drop=0.):
        converted = ops.asarray(X, dtype=dtype)
        return converted, None

    return layerize(forward)
|
||||||
|
|
||||||
|
|
||||||
def foreach(layer):
    """Apply ``layer`` to each element of a sequence of inputs.

    layer: The layer to apply. It is also appended to the ``_layers`` of
        the returned model.
    RETURNS: A model mapping a sequence of inputs to a list of outputs.
    """
    def forward(Xs, drop=0.):
        outputs = []
        callbacks = []
        for item in Xs:
            output, callback = layer.begin_update(item, drop=drop)
            outputs.append(output)
            callbacks.append(callback)

        def backward(d_results, sgd=None):
            # Pair each gradient with the callback of the matching input.
            return [callback(d_output, sgd)
                    for d_output, callback in zip(d_results, callbacks)]

        return outputs, backward

    mapped = layerize(forward)
    mapped._layers.append(layer)
    return mapped
|
|
||||||
|
|
||||||
|
|
||||||
def rebatch(size, layer):
|
def rebatch(size, layer):
|
||||||
ops = layer.ops
|
ops = layer.ops
|
||||||
|
|
||||||
def forward(X, drop=0.):
|
def forward(X, drop=0.):
|
||||||
if X.shape[0] < size:
|
if X.shape[0] < size:
|
||||||
return layer.begin_update(X)
|
return layer.begin_update(X)
|
||||||
|
@ -403,6 +275,7 @@ def rebatch(size, layer):
|
||||||
results, bp_results = zip(*[layer.begin_update(p, drop=drop)
|
results, bp_results = zip(*[layer.begin_update(p, drop=drop)
|
||||||
for p in parts])
|
for p in parts])
|
||||||
y = ops.flatten(results)
|
y = ops.flatten(results)
|
||||||
|
|
||||||
def backward(dy, sgd=None):
|
def backward(dy, sgd=None):
|
||||||
d_parts = [bp(y, sgd=sgd) for bp, y in
|
d_parts = [bp(y, sgd=sgd) for bp, y in
|
||||||
zip(bp_results, _divide_array(dy, size))]
|
zip(bp_results, _divide_array(dy, size))]
|
||||||
|
@ -413,6 +286,7 @@ def rebatch(size, layer):
|
||||||
except ValueError:
|
except ValueError:
|
||||||
dX = None
|
dX = None
|
||||||
return dX
|
return dX
|
||||||
|
|
||||||
return y, backward
|
return y, backward
|
||||||
model = layerize(forward)
|
model = layerize(forward)
|
||||||
model._layers.append(layer)
|
model._layers.append(layer)
|
||||||
|
@ -423,13 +297,14 @@ def _divide_array(X, size):
|
||||||
parts = []
|
parts = []
|
||||||
index = 0
|
index = 0
|
||||||
while index < len(X):
|
while index < len(X):
|
||||||
parts.append(X[index : index + size])
|
parts.append(X[index:index + size])
|
||||||
index += size
|
index += size
|
||||||
return parts
|
return parts
|
||||||
|
|
||||||
|
|
||||||
def get_col(idx):
|
def get_col(idx):
|
||||||
assert idx >= 0, idx
|
assert idx >= 0, idx
|
||||||
|
|
||||||
def forward(X, drop=0.):
|
def forward(X, drop=0.):
|
||||||
assert idx >= 0, idx
|
assert idx >= 0, idx
|
||||||
if isinstance(X, numpy.ndarray):
|
if isinstance(X, numpy.ndarray):
|
||||||
|
@ -437,30 +312,28 @@ def get_col(idx):
|
||||||
else:
|
else:
|
||||||
ops = CupyOps()
|
ops = CupyOps()
|
||||||
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
|
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
|
||||||
|
|
||||||
def backward(y, sgd=None):
|
def backward(y, sgd=None):
|
||||||
assert idx >= 0, idx
|
assert idx >= 0, idx
|
||||||
dX = ops.allocate(X.shape)
|
dX = ops.allocate(X.shape)
|
||||||
dX[:, idx] += y
|
dX[:, idx] += y
|
||||||
return dX
|
return dX
|
||||||
|
|
||||||
return output, backward
|
return output, backward
|
||||||
|
|
||||||
return layerize(forward)
|
return layerize(forward)
|
||||||
|
|
||||||
|
|
||||||
def zero_init(model):
    """Register a data hook on ``model`` that fills its ``W`` with zeros.

    model: Object with an ``on_data_hooks`` list. The hook is appended to
        that list; when called, it zeroes the ``W`` attribute of the
        object it receives.
    RETURNS: The same ``model`` object.
    """
    def _fill_weights_with_zeros(self, X, y=None):
        self.W.fill(0)

    model.on_data_hooks.append(_fill_weights_with_zeros)
    return model
|
|
||||||
|
|
||||||
|
|
||||||
def doc2feats(cols=None):
    """Create a layer that converts each doc into an attribute array.

    cols: Attribute IDs passed to ``doc.to_array``. Defaults to
        ``[ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]``.
    RETURNS: The layerized model. The columns in use are stored on it as
        ``model.cols``. The layer provides no backprop callback.
    """
    columns = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] if cols is None else cols

    def forward(docs, drop=0.):
        return [doc.to_array(columns) for doc in docs], None

    model = layerize(forward)
    model.cols = columns
    return model
|
||||||
|
@ -474,28 +347,14 @@ def print_shape(prefix):
|
||||||
|
|
||||||
@layerize
def get_token_vectors(tokens_attrs_vectors, drop=0.):
    """Select the vectors from a ``(tokens, attrs, vectors)`` triple.

    tokens_attrs_vectors (tuple): A 3-tuple; only the first and last
        elements are used.
    RETURNS (tuple): ``(vectors, backward)``. ``backward`` pairs the
        forward-pass ``tokens`` with the incoming gradient and returns
        ``(tokens, d_output)``.
    """
    # The unused `ops = Model.ops` lookup has been dropped; `attrs` is
    # unpacked only to document the shape of the input triple.
    tokens, attrs, vectors = tokens_attrs_vectors

    def backward(d_output, sgd=None):
        return (tokens, d_output)

    return vectors, backward
|
||||||
|
|
||||||
|
|
||||||
@layerize
def flatten(seqs, drop=0.):
    """Stack a list of arrays into one array, remembering their lengths.

    seqs: A non-empty sequence of numpy or cupy arrays. The type of the
        first element decides which ops class is used.
    RETURNS (tuple): ``(X, finish_update)``. ``X`` is the ``vstack`` of
        ``seqs``; ``finish_update`` splits a gradient back into pieces
        with the original lengths via ``ops.unflatten``.
    RAISES (ValueError): If the first element is neither a numpy nor a
        cupy array.
    """
    if isinstance(seqs[0], numpy.ndarray):
        ops = NumpyOps()
    elif hasattr(CupyOps.xp, 'ndarray') and isinstance(seqs[0], CupyOps.xp.ndarray):
        ops = CupyOps()
    else:
        raise ValueError("Unable to flatten sequence of type %s" % type(seqs[0]))
    seq_lengths = [len(seq) for seq in seqs]

    def finish_update(d_X, sgd=None):
        return ops.unflatten(d_X, seq_lengths)

    stacked = ops.xp.vstack(seqs)
    return stacked, finish_update
|
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def logistic(X, drop=0.):
|
def logistic(X, drop=0.):
|
||||||
xp = get_array_module(X)
|
xp = get_array_module(X)
|
||||||
|
@ -505,9 +364,11 @@ def logistic(X, drop=0.):
|
||||||
X = xp.minimum(X, 10., X)
|
X = xp.minimum(X, 10., X)
|
||||||
X = xp.maximum(X, -10., X)
|
X = xp.maximum(X, -10., X)
|
||||||
Y = 1. / (1. + xp.exp(-X))
|
Y = 1. / (1. + xp.exp(-X))
|
||||||
|
|
||||||
def logistic_bwd(dY, sgd=None):
|
def logistic_bwd(dY, sgd=None):
|
||||||
dX = dY * (Y * (1-Y))
|
dX = dY * (Y * (1-Y))
|
||||||
return dX
|
return dX
|
||||||
|
|
||||||
return Y, logistic_bwd
|
return Y, logistic_bwd
|
||||||
|
|
||||||
|
|
||||||
|
@ -517,6 +378,7 @@ def zero_init(model):
|
||||||
model.on_data_hooks.append(_zero_init_impl)
|
model.on_data_hooks.append(_zero_init_impl)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def preprocess_doc(docs, drop=0.):
|
def preprocess_doc(docs, drop=0.):
|
||||||
keys = [doc.to_array([LOWER]) for doc in docs]
|
keys = [doc.to_array([LOWER]) for doc in docs]
|
||||||
|
@ -526,11 +388,13 @@ def preprocess_doc(docs, drop=0.):
|
||||||
vals = ops.allocate(keys.shape[0]) + 1
|
vals = ops.allocate(keys.shape[0]) + 1
|
||||||
return (keys, vals, lengths), None
|
return (keys, vals, lengths), None
|
||||||
|
|
||||||
|
|
||||||
def getitem(i):
    """Create a layer that returns item ``i`` of its input.

    i: The index or key used to subscript the input.
    RETURNS: The layerized forward function. It provides no backprop
        callback.
    """
    def getitem_fwd(X, drop=0.):
        selected = X[i]
        return selected, None

    return layerize(getitem_fwd)
|
||||||
|
|
||||||
|
|
||||||
def build_tagger_model(nr_class, **cfg):
|
def build_tagger_model(nr_class, **cfg):
|
||||||
embed_size = util.env_opt('embed_size', 7000)
|
embed_size = util.env_opt('embed_size', 7000)
|
||||||
if 'token_vector_width' in cfg:
|
if 'token_vector_width' in cfg:
|
||||||
|
@ -555,8 +419,6 @@ def build_tagger_model(nr_class, **cfg):
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def SpacyVectors(docs, drop=0.):
|
def SpacyVectors(docs, drop=0.):
|
||||||
xp = get_array_module(docs[0].vocab.vectors.data)
|
|
||||||
width = docs[0].vocab.vectors.data.shape[1]
|
|
||||||
batch = []
|
batch = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
indices = numpy.zeros((len(doc),), dtype='i')
|
indices = numpy.zeros((len(doc),), dtype='i')
|
||||||
|
@ -570,29 +432,6 @@ def SpacyVectors(docs, drop=0.):
|
||||||
return batch, None
|
return batch, None
|
||||||
|
|
||||||
|
|
||||||
def foreach(layer, drop_factor=1.0):
    """Map a layer across elements in a list.

    layer: The layer applied to every element.
    drop_factor: Multiplier applied to the dropout rate before it is
        passed to ``layer.begin_update``.
    RETURNS: The wrapped model. Its forward pass returns a list of outputs
        and a callback that returns a list of input gradients, with None
        where no gradient or no backprop callback is available.
    """
    def foreach_fwd(Xs, drop=0.):
        drop *= drop_factor
        ys = []
        backprops = []
        for X in Xs:
            y, bp_y = layer.begin_update(X, drop=drop)
            ys.append(y)
            backprops.append(bp_y)

        def foreach_bwd(d_ys, sgd=None):
            d_Xs = []
            for d_y, bp_y in zip(d_ys, backprops):
                # Both the gradient and the callback must be present.
                # (The original tested `bp_y` twice and then called
                # `list.append` with a keyword argument, which raises
                # TypeError instead of backpropagating.)
                if d_y is not None and bp_y is not None:
                    d_Xs.append(bp_y(d_y, sgd=sgd))
                else:
                    d_Xs.append(None)
            return d_Xs

        return ys, foreach_bwd

    model = wrap(foreach_fwd, layer)
    return model
|
|
||||||
|
|
||||||
|
|
||||||
def build_text_classifier(nr_class, width=64, **cfg):
|
def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
nr_vector = cfg.get('nr_vector', 5000)
|
nr_vector = cfg.get('nr_vector', 5000)
|
||||||
pretrained_dims = cfg.get('pretrained_dims', 0)
|
pretrained_dims = cfg.get('pretrained_dims', 0)
|
||||||
|
@ -602,9 +441,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
model = (
|
model = (
|
||||||
SpacyVectors
|
SpacyVectors
|
||||||
>> flatten_add_lengths
|
>> flatten_add_lengths
|
||||||
>> with_getitem(0,
|
>> with_getitem(0, Affine(width, pretrained_dims))
|
||||||
Affine(width, pretrained_dims)
|
|
||||||
)
|
|
||||||
>> ParametricAttention(width)
|
>> ParametricAttention(width)
|
||||||
>> Pooling(sum_pool)
|
>> Pooling(sum_pool)
|
||||||
>> Residual(ReLu(width, width)) ** 2
|
>> Residual(ReLu(width, width)) ** 2
|
||||||
|
@ -613,7 +450,6 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
)
|
)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
lower = HashEmbed(width, nr_vector, column=1)
|
lower = HashEmbed(width, nr_vector, column=1)
|
||||||
prefix = HashEmbed(width//2, nr_vector, column=2)
|
prefix = HashEmbed(width//2, nr_vector, column=2)
|
||||||
suffix = HashEmbed(width//2, nr_vector, column=3)
|
suffix = HashEmbed(width//2, nr_vector, column=3)
|
||||||
|
@ -671,33 +507,40 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
model.lsuv = False
|
model.lsuv = False
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
@layerize
def flatten(seqs, drop=0.):
    """Flatten a list of sequences with ``Model.ops.flatten``.

    seqs: The sequences to flatten.
    RETURNS (tuple): ``(X, finish_update)``. ``finish_update`` restores the
        per-sequence structure of a gradient with ``ops.unflatten``, using
        the lengths recorded during the forward pass.
    """
    ops = Model.ops
    seq_lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')

    def finish_update(d_X, sgd=None):
        return ops.unflatten(d_X, seq_lengths, pad=0)

    flat = ops.flatten(seqs, pad=0)
    return flat, finish_update
|
||||||
|
|
||||||
|
|
||||||
def concatenate_lists(*layers, **kwargs):  # pragma: no cover
    """Compose two or more models `f`, `g`, etc, such that their outputs are
    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`

    Each layer is chained with `flatten` before concatenation, and the
    concatenated output is split back up using the lengths of the inputs.
    The optional `drop_factor` keyword argument (default 1.0) scales the
    dropout rate. With no layers, `noop()` is returned.
    """
    if not layers:
        return noop()
    drop_factor = kwargs.get('drop_factor', 1.0)
    ops = layers[0].ops
    flattened_layers = [chain(layer, flatten) for layer in layers]
    concat = concatenate(*flattened_layers)

    def concatenate_lists_fwd(Xs, drop=0.):
        drop *= drop_factor
        input_lengths = ops.asarray([len(X) for X in Xs], dtype='i')
        flat_output, backprop_flat = concat.begin_update(Xs, drop=drop)
        outputs = ops.unflatten(flat_output, input_lengths)

        def concatenate_lists_bwd(d_ys, sgd=None):
            flat_gradient = ops.flatten(d_ys)
            return backprop_flat(flat_gradient, sgd=sgd)

        return outputs, concatenate_lists_bwd

    return wrap(concatenate_lists_fwd, concat)
|
||||||
|
|
|
@ -101,17 +101,12 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
"""
|
"""
|
||||||
Normalize a dictionary of attributes, converting them to ints.
|
Normalize a dictionary of attributes, converting them to ints.
|
||||||
|
|
||||||
Arguments:
|
stringy_attrs (dict): Dictionary keyed by attribute string names. Values
|
||||||
stringy_attrs (dict):
|
can be ints or strings.
|
||||||
Dictionary keyed by attribute string names. Values can be ints or strings.
|
strings_map (StringStore): Defaults to None. If provided, encodes string
|
||||||
|
values into ints.
|
||||||
strings_map (StringStore):
|
RETURNS (dict): Attributes dictionary with keys and optionally values
|
||||||
Defaults to None. If provided, encodes string values into ints.
|
converted to ints.
|
||||||
|
|
||||||
Returns:
|
|
||||||
inty_attrs (dict):
|
|
||||||
Attributes dictionary with keys and optionally values converted to
|
|
||||||
ints.
|
|
||||||
"""
|
"""
|
||||||
inty_attrs = {}
|
inty_attrs = {}
|
||||||
if _do_deprecated:
|
if _do_deprecated:
|
||||||
|
|
|
@ -7,10 +7,9 @@ from pathlib import Path
|
||||||
from .converters import conllu2json, iob2json, conll_ner2json
|
from .converters import conllu2json, iob2json, conll_ner2json
|
||||||
from ..util import prints
|
from ..util import prints
|
||||||
|
|
||||||
# Converters are matched by file extension. To add a converter, add a new entry
|
# Converters are matched by file extension. To add a converter, add a new
|
||||||
# to this dict with the file extension mapped to the converter function imported
|
# entry to this dict with the file extension mapped to the converter function
|
||||||
# from /converters.
|
# imported from /converters.
|
||||||
|
|
||||||
CONVERTERS = {
|
CONVERTERS = {
|
||||||
'conllu': conllu2json,
|
'conllu': conllu2json,
|
||||||
'conll': conllu2json,
|
'conll': conllu2json,
|
||||||
|
@ -24,8 +23,7 @@ CONVERTERS = {
|
||||||
output_dir=("output directory for converted file", "positional", None, str),
|
output_dir=("output directory for converted file", "positional", None, str),
|
||||||
n_sents=("Number of sentences per doc", "option", "n", int),
|
n_sents=("Number of sentences per doc", "option", "n", int),
|
||||||
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
|
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
|
||||||
morphology=("Enable appending morphology to tags", "flag", "m", bool)
|
morphology=("Enable appending morphology to tags", "flag", "m", bool))
|
||||||
)
|
|
||||||
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
|
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
|
||||||
converter='auto'):
|
converter='auto'):
|
||||||
"""
|
"""
|
||||||
|
@ -40,7 +38,7 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
|
||||||
prints(output_path, title="Output directory not found", exits=1)
|
prints(output_path, title="Output directory not found", exits=1)
|
||||||
if converter == 'auto':
|
if converter == 'auto':
|
||||||
converter = input_path.suffix[1:]
|
converter = input_path.suffix[1:]
|
||||||
if not converter in CONVERTERS:
|
if converter not in CONVERTERS:
|
||||||
prints("Can't find converter for %s" % converter,
|
prints("Can't find converter for %s" % converter,
|
||||||
title="Unknown format", exits=1)
|
title="Unknown format", exits=1)
|
||||||
func = CONVERTERS[converter]
|
func = CONVERTERS[converter]
|
||||||
|
|
|
@ -8,7 +8,8 @@ from ...gold import iob_to_biluo
|
||||||
|
|
||||||
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
|
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
|
||||||
"""
|
"""
|
||||||
Convert files in the CoNLL-2003 NER format into JSON format for use with train cli.
|
Convert files in the CoNLL-2003 NER format into JSON format for use with
|
||||||
|
train cli.
|
||||||
"""
|
"""
|
||||||
docs = read_conll_ner(input_path)
|
docs = read_conll_ner(input_path)
|
||||||
|
|
||||||
|
|
|
@ -13,10 +13,9 @@ from .. import about
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model=("model to download (shortcut or model name)", "positional", None, str),
|
model=("model to download, shortcut or name)", "positional", None, str),
|
||||||
direct=("force direct download. Needs model name with version and won't "
|
direct=("force direct download. Needs model name with version and won't "
|
||||||
"perform compatibility check", "flag", "d", bool)
|
"perform compatibility check", "flag", "d", bool))
|
||||||
)
|
|
||||||
def download(cmd, model, direct=False):
|
def download(cmd, model, direct=False):
|
||||||
"""
|
"""
|
||||||
Download compatible model from default download path using pip. Model
|
Download compatible model from default download path using pip. Model
|
||||||
|
@ -30,21 +29,25 @@ def download(cmd, model, direct=False):
|
||||||
model_name = shortcuts.get(model, model)
|
model_name = shortcuts.get(model, model)
|
||||||
compatibility = get_compatibility()
|
compatibility = get_compatibility()
|
||||||
version = get_version(model_name, compatibility)
|
version = get_version(model_name, compatibility)
|
||||||
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name,
|
||||||
|
v=version))
|
||||||
if dl == 0:
|
if dl == 0:
|
||||||
try:
|
try:
|
||||||
# Get package path here because link uses
|
# Get package path here because link uses
|
||||||
# pip.get_installed_distributions() to check if model is a package,
|
# pip.get_installed_distributions() to check if model is a
|
||||||
# which fails if model was just installed via subprocess
|
# package, which fails if model was just installed via
|
||||||
|
# subprocess
|
||||||
package_path = get_package_path(model_name)
|
package_path = get_package_path(model_name)
|
||||||
link(None, model_name, model, force=True, model_path=package_path)
|
link(None, model_name, model, force=True,
|
||||||
|
model_path=package_path)
|
||||||
except:
|
except:
|
||||||
# Dirty, but since spacy.download and the auto-linking is mostly
|
# Dirty, but since spacy.download and the auto-linking is
|
||||||
# a convenience wrapper, it's best to show a success message and
|
# mostly a convenience wrapper, it's best to show a success
|
||||||
# loading instructions, even if linking fails.
|
# message and loading instructions, even if linking fails.
|
||||||
prints("Creating a shortcut link for 'en' didn't work (maybe you "
|
prints(
|
||||||
"don't have admin permissions?), but you can still load "
|
"Creating a shortcut link for 'en' didn't work (maybe "
|
||||||
"the model via its full package name:",
|
"you don't have admin permissions?), but you can still "
|
||||||
|
"load the model via its full package name:",
|
||||||
"nlp = spacy.load('%s')" % model_name,
|
"nlp = spacy.load('%s')" % model_name,
|
||||||
title="Download successful")
|
title="Download successful")
|
||||||
|
|
||||||
|
@ -52,9 +55,10 @@ def download(cmd, model, direct=False):
|
||||||
def get_json(url, desc):
    """Fetch ``url`` and return the decoded JSON body.

    url: The address to request.
    desc: Short description of the resource, used in the error message.
    RETURNS: The result of calling ``.json()`` on the response.

    When the status code is not 200, an error is reported through
    ``prints`` with ``exits=1``.
    """
    response = requests.get(url)
    if response.status_code != 200:
        template = ("Couldn't fetch %s. Please find a model for your spaCy "
                    "installation (v%s), and download it manually.")
        error_title = "Server error (%d)" % response.status_code
        prints(template % (desc, about.__version__), about.__docs_models__,
               title=error_title, exits=1)
    return response.json()
|
||||||
|
|
||||||
|
|
||||||
|
@ -71,13 +75,13 @@ def get_compatibility():
|
||||||
def get_version(model, comp):
    """Return the first version listed for ``model`` in ``comp``.

    model: The model name to look up.
    comp: Mapping of model names to sequences of versions.
    RETURNS: ``comp[model][0]``.

    When ``model`` is not in ``comp``, an error is reported through
    ``prints`` with ``exits=1`` before the lookup is attempted.
    """
    if model not in comp:
        spacy_version = about.__version__
        template = "No compatible model found for '%s' (spaCy v%s)."
        prints(template % (model, spacy_version),
               title="Compatibility error", exits=1)
    compatible_versions = comp[model]
    return compatible_versions[0]
|
||||||
|
|
||||||
|
|
||||||
def download_model(filename):
    """Install a model archive with pip in a subprocess.

    filename: Path of the archive, appended to ``about.__download_url__``.
    RETURNS: The value returned by ``subprocess.call`` for the pip command.
    """
    download_url = about.__download_url__ + '/' + filename
    # Run pip through the current interpreter, with a copy of the current
    # environment.
    pip_command = [sys.executable, '-m', 'pip', 'install', '--no-cache-dir',
                   download_url]
    return subprocess.call(pip_command, env=os.environ.copy())
|
||||||
|
|
|
@ -2,27 +2,15 @@
|
||||||
from __future__ import unicode_literals, division, print_function
|
from __future__ import unicode_literals, division, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import json
|
|
||||||
from collections import defaultdict
|
|
||||||
import cytoolz
|
|
||||||
from pathlib import Path
|
|
||||||
import dill
|
|
||||||
import tqdm
|
|
||||||
from thinc.neural._classes.model import Model
|
|
||||||
from thinc.neural.optimizers import linear_decay
|
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
import random
|
import random
|
||||||
import numpy.random
|
import numpy.random
|
||||||
|
|
||||||
from ..tokens.doc import Doc
|
from ..gold import GoldCorpus
|
||||||
from ..scorer import Scorer
|
|
||||||
from ..gold import GoldParse, merge_sents
|
|
||||||
from ..gold import GoldCorpus, minibatch
|
|
||||||
from ..util import prints
|
from ..util import prints
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
|
||||||
from .. import displacy
|
from .. import displacy
|
||||||
from ..compat import json_dumps
|
|
||||||
|
|
||||||
random.seed(0)
|
random.seed(0)
|
||||||
numpy.random.seed(0)
|
numpy.random.seed(0)
|
||||||
|
@ -30,17 +18,18 @@ numpy.random.seed(0)
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model=("Model name or path", "positional", None, str),
|
model=("Model name or path", "positional", None, str),
|
||||||
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
|
data_path=("Location of JSON-formatted evaluation data", "positional",
|
||||||
|
None, str),
|
||||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||||
gpu_id=("Use GPU", "option", "g", int),
|
gpu_id=("Use GPU", "option", "g", int),
|
||||||
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
|
displacy_path=("Directory to output rendered parses as HTML", "option",
|
||||||
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)
|
"dp", str),
|
||||||
)
|
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int))
|
||||||
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
||||||
displacy_path=None, displacy_limit=25):
|
displacy_path=None, displacy_limit=25):
|
||||||
"""
|
"""
|
||||||
Evaluate a model. To render a sample of parses in a HTML file, set an output
|
Evaluate a model. To render a sample of parses in a HTML file, set an
|
||||||
directory as the displacy_path argument.
|
output directory as the displacy_path argument.
|
||||||
"""
|
"""
|
||||||
if gpu_id >= 0:
|
if gpu_id >= 0:
|
||||||
util.use_gpu(gpu_id)
|
util.use_gpu(gpu_id)
|
||||||
|
@ -50,7 +39,8 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
||||||
if not data_path.exists():
|
if not data_path.exists():
|
||||||
prints(data_path, title="Evaluation data not found", exits=1)
|
prints(data_path, title="Evaluation data not found", exits=1)
|
||||||
if displacy_path and not displacy_path.exists():
|
if displacy_path and not displacy_path.exists():
|
||||||
prints(displacy_path, title="Visualization output directory not found", exits=1)
|
prints(displacy_path, title="Visualization output directory not found",
|
||||||
|
exits=1)
|
||||||
corpus = GoldCorpus(data_path, data_path)
|
corpus = GoldCorpus(data_path, data_path)
|
||||||
nlp = util.load_model(model)
|
nlp = util.load_model(model)
|
||||||
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
||||||
|
@ -64,12 +54,14 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
||||||
docs, golds = zip(*dev_docs)
|
docs, golds = zip(*dev_docs)
|
||||||
render_deps = 'parser' in nlp.meta.get('pipeline', [])
|
render_deps = 'parser' in nlp.meta.get('pipeline', [])
|
||||||
render_ents = 'ner' in nlp.meta.get('pipeline', [])
|
render_ents = 'ner' in nlp.meta.get('pipeline', [])
|
||||||
render_parses(docs, displacy_path, model_name=model, limit=displacy_limit,
|
render_parses(docs, displacy_path, model_name=model,
|
||||||
deps=render_deps, ents=render_ents)
|
limit=displacy_limit, deps=render_deps, ents=render_ents)
|
||||||
prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit)
|
msg = "Generated %s parses as HTML" % displacy_limit
|
||||||
|
prints(displacy_path, title=msg)
|
||||||
|
|
||||||
|
|
||||||
def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True):
|
def render_parses(docs, output_path, model_name='', limit=250, deps=True,
|
||||||
|
ents=True):
|
||||||
docs[0].user_data['title'] = model_name
|
docs[0].user_data['title'] = model_name
|
||||||
if ents:
|
if ents:
|
||||||
with (output_path / 'entities.html').open('w') as file_:
|
with (output_path / 'entities.html').open('w') as file_:
|
||||||
|
@ -77,7 +69,8 @@ def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=T
|
||||||
file_.write(html)
|
file_.write(html)
|
||||||
if deps:
|
if deps:
|
||||||
with (output_path / 'parses.html').open('w') as file_:
|
with (output_path / 'parses.html').open('w') as file_:
|
||||||
html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True})
|
html = displacy.render(docs[:limit], style='dep', page=True,
|
||||||
|
options={'compact': True})
|
||||||
file_.write(html)
|
file_.write(html)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -12,8 +12,7 @@ from .. import util
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model=("optional: shortcut link of model", "positional", None, str),
|
model=("optional: shortcut link of model", "positional", None, str),
|
||||||
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
|
markdown=("generate Markdown for GitHub issues", "flag", "md", str))
|
||||||
)
|
|
||||||
def info(cmd, model=None, markdown=False):
|
def info(cmd, model=None, markdown=False):
|
||||||
"""Print info about spaCy installation. If a model shortcut link is
|
"""Print info about spaCy installation. If a model shortcut link is
|
||||||
specified as an argument, print model information. Flag --markdown
|
specified as an argument, print model information. Flag --markdown
|
||||||
|
|
|
@ -12,8 +12,7 @@ from .. import util
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
origin=("package name or local path to model", "positional", None, str),
|
origin=("package name or local path to model", "positional", None, str),
|
||||||
link_name=("name of shortuct link to create", "positional", None, str),
|
link_name=("name of shortuct link to create", "positional", None, str),
|
||||||
force=("force overwriting of existing link", "flag", "f", bool)
|
force=("force overwriting of existing link", "flag", "f", bool))
|
||||||
)
|
|
||||||
def link(cmd, origin, link_name, force=False, model_path=None):
|
def link(cmd, origin, link_name, force=False, model_path=None):
|
||||||
"""
|
"""
|
||||||
Create a symlink for models within the spacy/data directory. Accepts
|
Create a symlink for models within the spacy/data directory. Accepts
|
||||||
|
@ -46,8 +45,9 @@ def link(cmd, origin, link_name, force=False, model_path=None):
|
||||||
# This is quite dirty, but just making sure other errors are caught.
|
# This is quite dirty, but just making sure other errors are caught.
|
||||||
prints("Creating a symlink in spacy/data failed. Make sure you have "
|
prints("Creating a symlink in spacy/data failed. Make sure you have "
|
||||||
"the required permissions and try re-running the command as "
|
"the required permissions and try re-running the command as "
|
||||||
"admin, or use a virtualenv. You can still import the model as a "
|
"admin, or use a virtualenv. You can still import the model as "
|
||||||
"module and call its load() method, or create the symlink manually.",
|
"a module and call its load() method, or create the symlink "
|
||||||
|
"manually.",
|
||||||
"%s --> %s" % (path2str(model_path), path2str(link_path)),
|
"%s --> %s" % (path2str(model_path), path2str(link_path)),
|
||||||
title="Error: Couldn't link model to '%s'" % link_name)
|
title="Error: Couldn't link model to '%s'" % link_name)
|
||||||
raise
|
raise
|
||||||
|
|
|
@ -16,10 +16,12 @@ from .. import about
|
||||||
input_dir=("directory with model data", "positional", None, str),
|
input_dir=("directory with model data", "positional", None, str),
|
||||||
output_dir=("output parent directory", "positional", None, str),
|
output_dir=("output parent directory", "positional", None, str),
|
||||||
meta_path=("path to meta.json", "option", "m", str),
|
meta_path=("path to meta.json", "option", "m", str),
|
||||||
create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
|
create_meta=("create meta.json, even if one exists in directory", "flag",
|
||||||
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
|
"c", bool),
|
||||||
)
|
force=("force overwriting of existing folder in output directory", "flag",
|
||||||
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
|
"f", bool))
|
||||||
|
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
|
||||||
|
force=False):
|
||||||
"""
|
"""
|
||||||
Generate Python package for model data, including meta and required
|
Generate Python package for model data, including meta and required
|
||||||
installation files. A new directory will be created in the specified
|
installation files. A new directory will be created in the specified
|
||||||
|
@ -52,13 +54,15 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force
|
||||||
package_path = main_path / model_name
|
package_path = main_path / model_name
|
||||||
|
|
||||||
create_dirs(package_path, force)
|
create_dirs(package_path, force)
|
||||||
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
shutil.copytree(path2str(input_path),
|
||||||
|
path2str(package_path / model_name_v))
|
||||||
create_file(main_path / 'meta.json', json_dumps(meta))
|
create_file(main_path / 'meta.json', json_dumps(meta))
|
||||||
create_file(main_path / 'setup.py', template_setup)
|
create_file(main_path / 'setup.py', template_setup)
|
||||||
create_file(main_path / 'MANIFEST.in', template_manifest)
|
create_file(main_path / 'MANIFEST.in', template_manifest)
|
||||||
create_file(package_path / '__init__.py', template_init)
|
create_file(package_path / '__init__.py', template_init)
|
||||||
prints(main_path, "To build the package, run `python setup.py sdist` in this "
|
prints(main_path, "To build the package, run `python setup.py sdist` in "
|
||||||
"directory.", title="Successfully created package '%s'" % model_name_v)
|
"this directory.",
|
||||||
|
title="Successfully created package '%s'" % model_name_v)
|
||||||
|
|
||||||
|
|
||||||
def create_dirs(package_path, force):
|
def create_dirs(package_path, force):
|
||||||
|
@ -66,9 +70,10 @@ def create_dirs(package_path, force):
|
||||||
if force:
|
if force:
|
||||||
shutil.rmtree(path2str(package_path))
|
shutil.rmtree(path2str(package_path))
|
||||||
else:
|
else:
|
||||||
prints(package_path, "Please delete the directory and try again, or "
|
prints(package_path, "Please delete the directory and try again, "
|
||||||
"use the --force flag to overwrite existing directories.",
|
"or use the --force flag to overwrite existing "
|
||||||
title="Package directory already exists", exits=1)
|
"directories.", title="Package directory already exists",
|
||||||
|
exits=1)
|
||||||
Path.mkdir(package_path, parents=True)
|
Path.mkdir(package_path, parents=True)
|
||||||
|
|
||||||
|
|
||||||
|
@ -82,7 +87,8 @@ def generate_meta(model_path):
|
||||||
settings = [('lang', 'Model language', 'en'),
|
settings = [('lang', 'Model language', 'en'),
|
||||||
('name', 'Model name', 'model'),
|
('name', 'Model name', 'model'),
|
||||||
('version', 'Model version', '0.0.0'),
|
('version', 'Model version', '0.0.0'),
|
||||||
('spacy_version', 'Required spaCy version', '>=%s,<3.0.0' % about.__version__),
|
('spacy_version', 'Required spaCy version',
|
||||||
|
'>=%s,<3.0.0' % about.__version__),
|
||||||
('description', 'Model description', False),
|
('description', 'Model description', False),
|
||||||
('author', 'Author', False),
|
('author', 'Author', False),
|
||||||
('email', 'Author email', False),
|
('email', 'Author email', False),
|
||||||
|
|
|
@ -27,15 +27,15 @@ def read_inputs(loc):
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
lang=("model/language", "positional", None, str),
|
lang=("model/language", "positional", None, str),
|
||||||
inputs=("Location of input file", "positional", None, read_inputs)
|
inputs=("Location of input file", "positional", None, read_inputs))
|
||||||
)
|
|
||||||
def profile(cmd, lang, inputs=None):
|
def profile(cmd, lang, inputs=None):
|
||||||
"""
|
"""
|
||||||
Profile a spaCy pipeline, to find out which functions take the most time.
|
Profile a spaCy pipeline, to find out which functions take the most time.
|
||||||
"""
|
"""
|
||||||
nlp = spacy.load(lang)
|
nlp = spacy.load(lang)
|
||||||
texts = list(cytoolz.take(10000, inputs))
|
texts = list(cytoolz.take(10000, inputs))
|
||||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
|
||||||
|
"Profile.prof")
|
||||||
s = pstats.Stats("Profile.prof")
|
s = pstats.Stats("Profile.prof")
|
||||||
s.strip_dirs().sort_stats("time").print_stats()
|
s.strip_dirs().sort_stats("time").print_stats()
|
||||||
|
|
||||||
|
|
|
@ -2,21 +2,14 @@
|
||||||
from __future__ import unicode_literals, division, print_function
|
from __future__ import unicode_literals, division, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import json
|
|
||||||
from collections import defaultdict
|
|
||||||
import cytoolz
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import dill
|
import dill
|
||||||
import tqdm
|
import tqdm
|
||||||
from thinc.neural._classes.model import Model
|
from thinc.neural._classes.model import Model
|
||||||
from thinc.neural.optimizers import linear_decay
|
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
import random
|
import random
|
||||||
import numpy.random
|
import numpy.random
|
||||||
|
|
||||||
from ..tokens.doc import Doc
|
|
||||||
from ..scorer import Scorer
|
|
||||||
from ..gold import GoldParse, merge_sents
|
|
||||||
from ..gold import GoldCorpus, minibatch
|
from ..gold import GoldCorpus, minibatch
|
||||||
from ..util import prints
|
from ..util import prints
|
||||||
from .. import util
|
from .. import util
|
||||||
|
@ -31,8 +24,10 @@ numpy.random.seed(0)
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
lang=("model language", "positional", None, str),
|
lang=("model language", "positional", None, str),
|
||||||
output_dir=("output directory to store model in", "positional", None, str),
|
output_dir=("output directory to store model in", "positional", None, str),
|
||||||
train_data=("location of JSON-formatted training data", "positional", None, str),
|
train_data=("location of JSON-formatted training data", "positional",
|
||||||
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
|
None, str),
|
||||||
|
dev_data=("location of JSON-formatted development data (optional)",
|
||||||
|
"positional", None, str),
|
||||||
n_iter=("number of iterations", "option", "n", int),
|
n_iter=("number of iterations", "option", "n", int),
|
||||||
n_sents=("number of sentences", "option", "ns", int),
|
n_sents=("number of sentences", "option", "ns", int),
|
||||||
use_gpu=("Use GPU", "option", "g", int),
|
use_gpu=("Use GPU", "option", "g", int),
|
||||||
|
@ -42,11 +37,12 @@ numpy.random.seed(0)
|
||||||
no_entities=("Don't train NER", "flag", "N", bool),
|
no_entities=("Don't train NER", "flag", "N", bool),
|
||||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||||
version=("Model version", "option", "V", str),
|
version=("Model version", "option", "V", str),
|
||||||
meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
|
meta_path=("Optional path to meta.json. All relevant properties will be "
|
||||||
)
|
"overwritten.", "option", "m", Path))
|
||||||
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
|
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False,
|
||||||
gold_preproc=False, version="0.0.0", meta_path=None):
|
no_entities=False, gold_preproc=False, version="0.0.0",
|
||||||
|
meta_path=None):
|
||||||
"""
|
"""
|
||||||
Train a model. Expects data in spaCy's JSON format.
|
Train a model. Expects data in spaCy's JSON format.
|
||||||
"""
|
"""
|
||||||
|
@ -72,9 +68,12 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
meta.setdefault('name', 'unnamed')
|
meta.setdefault('name', 'unnamed')
|
||||||
|
|
||||||
pipeline = ['tagger', 'parser', 'ner']
|
pipeline = ['tagger', 'parser', 'ner']
|
||||||
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
|
if no_tagger and 'tagger' in pipeline:
|
||||||
if no_parser and 'parser' in pipeline: pipeline.remove('parser')
|
pipeline.remove('tagger')
|
||||||
if no_entities and 'ner' in pipeline: pipeline.remove('ner')
|
if no_parser and 'parser' in pipeline:
|
||||||
|
pipeline.remove('parser')
|
||||||
|
if no_entities and 'ner' in pipeline:
|
||||||
|
pipeline.remove('ner')
|
||||||
|
|
||||||
# Take dropout and batch size as generators of values -- dropout
|
# Take dropout and batch size as generators of values -- dropout
|
||||||
# starts high and decays sharply, to force the optimizer to explore.
|
# starts high and decays sharply, to force the optimizer to explore.
|
||||||
|
@ -139,7 +138,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
scorer = nlp_loaded.evaluate(dev_docs)
|
scorer = nlp_loaded.evaluate(dev_docs)
|
||||||
end_time = timer()
|
end_time = timer()
|
||||||
cpu_wps = nwords/(end_time-start_time)
|
cpu_wps = nwords/(end_time-start_time)
|
||||||
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
|
acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
|
||||||
with acc_loc.open('w') as file_:
|
with acc_loc.open('w') as file_:
|
||||||
file_.write(json_dumps(scorer.scores))
|
file_.write(json_dumps(scorer.scores))
|
||||||
meta_loc = output_path / ('model%d' % i) / 'meta.json'
|
meta_loc = output_path / ('model%d' % i) / 'meta.json'
|
||||||
|
@ -157,7 +156,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
with meta_loc.open('w') as file_:
|
with meta_loc.open('w') as file_:
|
||||||
file_.write(json_dumps(meta))
|
file_.write(json_dumps(meta))
|
||||||
util.set_env_log(True)
|
util.set_env_log(True)
|
||||||
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
|
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
|
||||||
|
gpu_wps=gpu_wps)
|
||||||
finally:
|
finally:
|
||||||
print("Saving model...")
|
print("Saving model...")
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
|
@ -29,8 +29,10 @@ def validate(cmd):
|
||||||
model_links = get_model_links(current_compat)
|
model_links = get_model_links(current_compat)
|
||||||
model_pkgs = get_model_pkgs(current_compat, all_models)
|
model_pkgs = get_model_pkgs(current_compat, all_models)
|
||||||
incompat_links = {l for l, d in model_links.items() if not d['compat']}
|
incompat_links = {l for l, d in model_links.items() if not d['compat']}
|
||||||
incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']}
|
incompat_models = {d['name'] for _, d in model_pkgs.items()
|
||||||
incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']])
|
if not d['compat']}
|
||||||
|
incompat_models.update([d['name'] for _, d in model_links.items()
|
||||||
|
if not d['compat']])
|
||||||
na_models = [m for m in incompat_models if m not in current_compat]
|
na_models = [m for m in incompat_models if m not in current_compat]
|
||||||
update_models = [m for m in incompat_models if m in current_compat]
|
update_models = [m for m in incompat_models if m in current_compat]
|
||||||
|
|
||||||
|
@ -90,7 +92,6 @@ def get_model_pkgs(compat, all_models):
|
||||||
|
|
||||||
|
|
||||||
def get_model_row(compat, name, data, type='package'):
|
def get_model_row(compat, name, data, type='package'):
|
||||||
tpl_row = ' {:<10}' + (' {:<20}' * 4)
|
|
||||||
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
|
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
|
||||||
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
|
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
|
||||||
if data['compat']:
|
if data['compat']:
|
||||||
|
@ -110,7 +111,8 @@ def get_row(*args):
|
||||||
def is_model_path(model_path):
|
def is_model_path(model_path):
|
||||||
exclude = ['cache', 'pycache', '__pycache__']
|
exclude = ['cache', 'pycache', '__pycache__']
|
||||||
name = model_path.parts[-1]
|
name = model_path.parts[-1]
|
||||||
return model_path.is_dir() and name not in exclude and not name.startswith('.')
|
return (model_path.is_dir() and name not in exclude
|
||||||
|
and not name.startswith('.'))
|
||||||
|
|
||||||
|
|
||||||
def is_compat(compat, name, version):
|
def is_compat(compat, name, version):
|
||||||
|
@ -118,6 +120,7 @@ def is_compat(compat, name, version):
|
||||||
|
|
||||||
|
|
||||||
def reformat_version(version):
|
def reformat_version(version):
|
||||||
|
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
|
||||||
if version.endswith('-alpha'):
|
if version.endswith('-alpha'):
|
||||||
return version.replace('-alpha', 'a0')
|
return version.replace('-alpha', 'a0')
|
||||||
return version.replace('-alpha', 'a')
|
return version.replace('-alpha', 'a')
|
||||||
|
|
|
@ -87,15 +87,15 @@ def symlink_to(orig, dest):
|
||||||
|
|
||||||
|
|
||||||
def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
|
def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
|
||||||
return ((python2 == None or python2 == is_python2) and
|
return ((python2 is None or python2 == is_python2) and
|
||||||
(python3 == None or python3 == is_python3) and
|
(python3 is None or python3 == is_python3) and
|
||||||
(windows == None or windows == is_windows) and
|
(windows is None or windows == is_windows) and
|
||||||
(linux == None or linux == is_linux) and
|
(linux is None or linux == is_linux) and
|
||||||
(osx == None or osx == is_osx))
|
(osx is None or osx == is_osx))
|
||||||
|
|
||||||
|
|
||||||
def normalize_string_keys(old):
|
def normalize_string_keys(old):
|
||||||
'''Given a dictionary, make sure keys are unicode strings, not bytes.'''
|
"""Given a dictionary, make sure keys are unicode strings, not bytes."""
|
||||||
new = {}
|
new = {}
|
||||||
for key, value in old.items():
|
for key, value in old.items():
|
||||||
if isinstance(key, bytes_):
|
if isinstance(key, bytes_):
|
||||||
|
|
|
@ -24,7 +24,7 @@ def depr_model_download(lang):
|
||||||
|
|
||||||
|
|
||||||
def resolve_load_name(name, **overrides):
|
def resolve_load_name(name, **overrides):
|
||||||
"""Resolve model loading if deprecated path kwarg is specified in overrides.
|
"""Resolve model loading if deprecated path kwarg in overrides.
|
||||||
|
|
||||||
name (unicode): Name of model to load.
|
name (unicode): Name of model to load.
|
||||||
**overrides: Overrides specified in spacy.load().
|
**overrides: Overrides specified in spacy.load().
|
||||||
|
@ -32,8 +32,9 @@ def resolve_load_name(name, **overrides):
|
||||||
"""
|
"""
|
||||||
if overrides.get('path') not in (None, False, True):
|
if overrides.get('path') not in (None, False, True):
|
||||||
name = overrides.get('path')
|
name = overrides.get('path')
|
||||||
prints("To load a model from a path, you can now use the first argument. "
|
prints("To load a model from a path, you can now use the first "
|
||||||
"The model meta is used to load the required Language class.",
|
"argument. The model meta is used to load the Language class.",
|
||||||
"OLD: spacy.load('en', path='/some/path')", "NEW: spacy.load('/some/path')",
|
"OLD: spacy.load('en', path='/some/path')",
|
||||||
|
"NEW: spacy.load('/some/path')",
|
||||||
title="Warning: deprecated argument 'path'")
|
title="Warning: deprecated argument 'path'")
|
||||||
return name
|
return name
|
||||||
|
|
|
@ -12,7 +12,7 @@ IS_JUPYTER = is_in_jupyter()
|
||||||
|
|
||||||
|
|
||||||
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
||||||
options={}, manual=False):
|
options={}, manual=False):
|
||||||
"""Render displaCy visualisation.
|
"""Render displaCy visualisation.
|
||||||
|
|
||||||
docs (list or Doc): Document(s) to visualise.
|
docs (list or Doc): Document(s) to visualise.
|
||||||
|
@ -21,7 +21,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
|
jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
|
||||||
options (dict): Visualiser-specific options, e.g. colors.
|
options (dict): Visualiser-specific options, e.g. colors.
|
||||||
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
|
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||||
RETURNS (unicode): Rendered HTML markup.
|
RETURNS (unicode): Rendered HTML markup.
|
||||||
"""
|
"""
|
||||||
factories = {'dep': (DependencyRenderer, parse_deps),
|
factories = {'dep': (DependencyRenderer, parse_deps),
|
||||||
|
@ -35,7 +35,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
||||||
parsed = [converter(doc, options) for doc in docs] if not manual else docs
|
parsed = [converter(doc, options) for doc in docs] if not manual else docs
|
||||||
_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
|
_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
|
||||||
html = _html['parsed']
|
html = _html['parsed']
|
||||||
if jupyter: # return HTML rendered by IPython display()
|
if jupyter: # return HTML rendered by IPython display()
|
||||||
from IPython.core.display import display, HTML
|
from IPython.core.display import display, HTML
|
||||||
return display(HTML(html))
|
return display(HTML(html))
|
||||||
return html
|
return html
|
||||||
|
@ -50,13 +50,15 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
||||||
page (bool): Render markup as full HTML page.
|
page (bool): Render markup as full HTML page.
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
options (dict): Visualiser-specific options, e.g. colors.
|
options (dict): Visualiser-specific options, e.g. colors.
|
||||||
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
|
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||||
port (int): Port to serve visualisation.
|
port (int): Port to serve visualisation.
|
||||||
"""
|
"""
|
||||||
from wsgiref import simple_server
|
from wsgiref import simple_server
|
||||||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
render(docs, style=style, page=page, minify=minify, options=options,
|
||||||
|
manual=manual)
|
||||||
httpd = simple_server.make_server('0.0.0.0', port, app)
|
httpd = simple_server.make_server('0.0.0.0', port, app)
|
||||||
prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
|
prints("Using the '%s' visualizer" % style,
|
||||||
|
title="Serving on port %d..." % port)
|
||||||
try:
|
try:
|
||||||
httpd.serve_forever()
|
httpd.serve_forever()
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
|
@ -67,7 +69,8 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
||||||
|
|
||||||
def app(environ, start_response):
|
def app(environ, start_response):
|
||||||
# headers and status need to be bytes in Python 2, see #1227
|
# headers and status need to be bytes in Python 2, see #1227
|
||||||
headers = [(b_to_str(b'Content-type'), b_to_str(b'text/html; charset=utf-8'))]
|
headers = [(b_to_str(b'Content-type'),
|
||||||
|
b_to_str(b'text/html; charset=utf-8'))]
|
||||||
start_response(b_to_str(b'200 OK'), headers)
|
start_response(b_to_str(b'200 OK'), headers)
|
||||||
res = _html['parsed'].encode(encoding='utf-8')
|
res = _html['parsed'].encode(encoding='utf-8')
|
||||||
return [res]
|
return [res]
|
||||||
|
@ -89,9 +92,9 @@ def parse_deps(orig_doc, options={}):
|
||||||
end = word.i + 1
|
end = word.i + 1
|
||||||
while end < len(doc) and doc[end].is_punct:
|
while end < len(doc) and doc[end].is_punct:
|
||||||
end += 1
|
end += 1
|
||||||
span = doc[start : end]
|
span = doc[start:end]
|
||||||
spans.append((span.start_char, span.end_char, word.tag_,
|
spans.append((span.start_char, span.end_char, word.tag_,
|
||||||
word.lemma_, word.ent_type_))
|
word.lemma_, word.ent_type_))
|
||||||
for span_props in spans:
|
for span_props in spans:
|
||||||
doc.merge(*span_props)
|
doc.merge(*span_props)
|
||||||
words = [{'text': w.text, 'tag': w.tag_} for w in doc]
|
words = [{'text': w.text, 'tag': w.tag_} for w in doc]
|
||||||
|
@ -113,6 +116,7 @@ def parse_ents(doc, options={}):
|
||||||
RETURNS (dict): Generated entities keyed by text (original text) and ents.
|
RETURNS (dict): Generated entities keyed by text (original text) and ents.
|
||||||
"""
|
"""
|
||||||
ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
|
ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
|
||||||
for ent in doc.ents]
|
for ent in doc.ents]
|
||||||
title = doc.user_data.get('title', None) if hasattr(doc, 'user_data') else None
|
title = (doc.user_data.get('title', None)
|
||||||
|
if hasattr(doc, 'user_data') else None)
|
||||||
return {'text': doc.text, 'ents': ents, 'title': title}
|
return {'text': doc.text, 'ents': ents, 'title': title}
|
||||||
|
|
|
@ -14,13 +14,15 @@ class DependencyRenderer(object):
|
||||||
"""Initialise dependency renderer.
|
"""Initialise dependency renderer.
|
||||||
|
|
||||||
options (dict): Visualiser-specific options (compact, word_spacing,
|
options (dict): Visualiser-specific options (compact, word_spacing,
|
||||||
arrow_spacing, arrow_width, arrow_stroke, distance,
|
arrow_spacing, arrow_width, arrow_stroke, distance, offset_x,
|
||||||
offset_x, color, bg, font)
|
color, bg, font)
|
||||||
"""
|
"""
|
||||||
self.compact = options.get('compact', False)
|
self.compact = options.get('compact', False)
|
||||||
self.word_spacing = options.get('word_spacing', 45)
|
self.word_spacing = options.get('word_spacing', 45)
|
||||||
self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
|
self.arrow_spacing = options.get('arrow_spacing',
|
||||||
self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
|
12 if self.compact else 20)
|
||||||
|
self.arrow_width = options.get('arrow_width',
|
||||||
|
6 if self.compact else 10)
|
||||||
self.arrow_stroke = options.get('arrow_stroke', 2)
|
self.arrow_stroke = options.get('arrow_stroke', 2)
|
||||||
self.distance = options.get('distance', 150 if self.compact else 175)
|
self.distance = options.get('distance', 150 if self.compact else 175)
|
||||||
self.offset_x = options.get('offset_x', 50)
|
self.offset_x = options.get('offset_x', 50)
|
||||||
|
@ -39,7 +41,8 @@ class DependencyRenderer(object):
|
||||||
rendered = [self.render_svg(i, p['words'], p['arcs'])
|
rendered = [self.render_svg(i, p['words'], p['arcs'])
|
||||||
for i, p in enumerate(parsed)]
|
for i, p in enumerate(parsed)]
|
||||||
if page:
|
if page:
|
||||||
content = ''.join([TPL_FIGURE.format(content=svg) for svg in rendered])
|
content = ''.join([TPL_FIGURE.format(content=svg)
|
||||||
|
for svg in rendered])
|
||||||
markup = TPL_PAGE.format(content=content)
|
markup = TPL_PAGE.format(content=content)
|
||||||
else:
|
else:
|
||||||
markup = ''.join(rendered)
|
markup = ''.join(rendered)
|
||||||
|
@ -63,12 +66,13 @@ class DependencyRenderer(object):
|
||||||
self.id = render_id
|
self.id = render_id
|
||||||
words = [self.render_word(w['text'], w['tag'], i)
|
words = [self.render_word(w['text'], w['tag'], i)
|
||||||
for i, w in enumerate(words)]
|
for i, w in enumerate(words)]
|
||||||
arcs = [self.render_arrow(a['label'], a['start'], a['end'], a['dir'], i)
|
arcs = [self.render_arrow(a['label'], a['start'],
|
||||||
|
a['end'], a['dir'], i)
|
||||||
for i, a in enumerate(arcs)]
|
for i, a in enumerate(arcs)]
|
||||||
content = ''.join(words) + ''.join(arcs)
|
content = ''.join(words) + ''.join(arcs)
|
||||||
return TPL_DEP_SVG.format(id=self.id, width=self.width, height=self.height,
|
return TPL_DEP_SVG.format(id=self.id, width=self.width,
|
||||||
color=self.color, bg=self.bg, font=self.font,
|
height=self.height, color=self.color,
|
||||||
content=content)
|
bg=self.bg, font=self.font, content=content)
|
||||||
|
|
||||||
def render_word(self, text, tag, i):
|
def render_word(self, text, tag, i):
|
||||||
"""Render individual word.
|
"""Render individual word.
|
||||||
|
@ -96,7 +100,7 @@ class DependencyRenderer(object):
|
||||||
x_start = self.offset_x+start*self.distance+self.arrow_spacing
|
x_start = self.offset_x+start*self.distance+self.arrow_spacing
|
||||||
y = self.offset_y
|
y = self.offset_y
|
||||||
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
|
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
|
||||||
-self.arrow_spacing*(self.highest_level-level)/4)
|
- self.arrow_spacing*(self.highest_level-level)/4)
|
||||||
y_curve = self.offset_y-level*self.distance/2
|
y_curve = self.offset_y-level*self.distance/2
|
||||||
if self.compact:
|
if self.compact:
|
||||||
y_curve = self.offset_y-level*self.distance/6
|
y_curve = self.offset_y-level*self.distance/6
|
||||||
|
@ -133,8 +137,10 @@ class DependencyRenderer(object):
|
||||||
if direction is 'left':
|
if direction is 'left':
|
||||||
pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2)
|
pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2)
|
||||||
else:
|
else:
|
||||||
pos1, pos2, pos3 = (end, end+self.arrow_width-2, end-self.arrow_width+2)
|
pos1, pos2, pos3 = (end, end+self.arrow_width-2,
|
||||||
arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3, y-self.arrow_width)
|
end-self.arrow_width+2)
|
||||||
|
arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3,
|
||||||
|
y-self.arrow_width)
|
||||||
return "M{},{} L{},{} {},{}".format(*arrowhead)
|
return "M{},{} L{},{} {},{}".format(*arrowhead)
|
||||||
|
|
||||||
def get_levels(self, arcs):
|
def get_levels(self, arcs):
|
||||||
|
@ -159,9 +165,10 @@ class EntityRenderer(object):
|
||||||
"""
|
"""
|
||||||
colors = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74',
|
colors = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74',
|
||||||
'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb',
|
'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb',
|
||||||
'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LANGUAGE': '#ff8197',
|
'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LAW': '#ff8197',
|
||||||
'WORK_OF_ART': '#f0d0ff', 'DATE': '#bfe1d9', 'TIME': '#bfe1d9',
|
'LANGUAGE': '#ff8197', 'WORK_OF_ART': '#f0d0ff',
|
||||||
'MONEY': '#e4e7d2', 'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2',
|
'DATE': '#bfe1d9', 'TIME': '#bfe1d9', 'MONEY': '#e4e7d2',
|
||||||
|
'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2',
|
||||||
'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'}
|
'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'}
|
||||||
colors.update(options.get('colors', {}))
|
colors.update(options.get('colors', {}))
|
||||||
self.default_color = '#ddd'
|
self.default_color = '#ddd'
|
||||||
|
@ -176,9 +183,11 @@ class EntityRenderer(object):
|
||||||
minify (bool): Minify HTML markup.
|
minify (bool): Minify HTML markup.
|
||||||
RETURNS (unicode): Rendered HTML markup.
|
RETURNS (unicode): Rendered HTML markup.
|
||||||
"""
|
"""
|
||||||
rendered = [self.render_ents(p['text'], p['ents'], p.get('title', None)) for p in parsed]
|
rendered = [self.render_ents(p['text'], p['ents'],
|
||||||
|
p.get('title', None)) for p in parsed]
|
||||||
if page:
|
if page:
|
||||||
docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered])
|
docs = ''.join([TPL_FIGURE.format(content=doc)
|
||||||
|
for doc in rendered])
|
||||||
markup = TPL_PAGE.format(content=docs)
|
markup = TPL_PAGE.format(content=docs)
|
||||||
else:
|
else:
|
||||||
markup = ''.join(rendered)
|
markup = ''.join(rendered)
|
||||||
|
|
|
@ -264,7 +264,6 @@ GLOSSARY = {
|
||||||
'nk': 'noun kernel element',
|
'nk': 'noun kernel element',
|
||||||
'nmc': 'numerical component',
|
'nmc': 'numerical component',
|
||||||
'oa': 'accusative object',
|
'oa': 'accusative object',
|
||||||
'oa': 'second accusative object',
|
|
||||||
'oc': 'clausal object',
|
'oc': 'clausal object',
|
||||||
'og': 'genitive object',
|
'og': 'genitive object',
|
||||||
'op': 'prepositional object',
|
'op': 'prepositional object',
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import io
|
|
||||||
import re
|
import re
|
||||||
import ujson
|
import ujson
|
||||||
import random
|
import random
|
||||||
|
@ -10,9 +9,8 @@ import cytoolz
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .util import ensure_path
|
|
||||||
from . import util
|
|
||||||
from .tokens import Doc
|
from .tokens import Doc
|
||||||
|
from . import util
|
||||||
|
|
||||||
|
|
||||||
def tags_to_entities(tags):
|
def tags_to_entities(tags):
|
||||||
|
@ -54,7 +52,8 @@ def merge_sents(sents):
|
||||||
m_deps[3].extend(head + i for head in heads)
|
m_deps[3].extend(head + i for head in heads)
|
||||||
m_deps[4].extend(labels)
|
m_deps[4].extend(labels)
|
||||||
m_deps[5].extend(ner)
|
m_deps[5].extend(ner)
|
||||||
m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
|
m_brackets.extend((b['first'] + i, b['last'] + i, b['label'])
|
||||||
|
for b in brackets)
|
||||||
i += len(ids)
|
i += len(ids)
|
||||||
return [(m_deps, m_brackets)]
|
return [(m_deps, m_brackets)]
|
||||||
|
|
||||||
|
@ -80,6 +79,8 @@ def align(cand_words, gold_words):
|
||||||
|
|
||||||
|
|
||||||
punct_re = re.compile(r'\W')
|
punct_re = re.compile(r'\W')
|
||||||
|
|
||||||
|
|
||||||
def _min_edit_path(cand_words, gold_words):
|
def _min_edit_path(cand_words, gold_words):
|
||||||
cdef:
|
cdef:
|
||||||
Pool mem
|
Pool mem
|
||||||
|
@ -98,9 +99,9 @@ def _min_edit_path(cand_words, gold_words):
|
||||||
mem = Pool()
|
mem = Pool()
|
||||||
n_cand = len(cand_words)
|
n_cand = len(cand_words)
|
||||||
n_gold = len(gold_words)
|
n_gold = len(gold_words)
|
||||||
# Levenshtein distance, except we need the history, and we may want different
|
# Levenshtein distance, except we need the history, and we may want
|
||||||
# costs.
|
# different costs. Mark operations with a string, and score the history
|
||||||
# Mark operations with a string, and score the history using _edit_cost.
|
# using _edit_cost.
|
||||||
previous_row = []
|
previous_row = []
|
||||||
prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
||||||
curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
||||||
|
@ -144,9 +145,9 @@ def _min_edit_path(cand_words, gold_words):
|
||||||
|
|
||||||
|
|
||||||
def minibatch(items, size=8):
|
def minibatch(items, size=8):
|
||||||
'''Iterate over batches of items. `size` may be an iterator,
|
"""Iterate over batches of items. `size` may be an iterator,
|
||||||
so that batch-size can vary on each step.
|
so that batch-size can vary on each step.
|
||||||
'''
|
"""
|
||||||
if isinstance(size, int):
|
if isinstance(size, int):
|
||||||
size_ = itertools.repeat(8)
|
size_ = itertools.repeat(8)
|
||||||
else:
|
else:
|
||||||
|
@ -168,6 +169,7 @@ class GoldCorpus(object):
|
||||||
|
|
||||||
train_path (unicode or Path): File or directory of training data.
|
train_path (unicode or Path): File or directory of training data.
|
||||||
dev_path (unicode or Path): File or directory of development data.
|
dev_path (unicode or Path): File or directory of development data.
|
||||||
|
RETURNS (GoldCorpus): The newly created object.
|
||||||
"""
|
"""
|
||||||
self.train_path = util.ensure_path(train_path)
|
self.train_path = util.ensure_path(train_path)
|
||||||
self.dev_path = util.ensure_path(dev_path)
|
self.dev_path = util.ensure_path(dev_path)
|
||||||
|
@ -213,7 +215,7 @@ class GoldCorpus(object):
|
||||||
train_tuples = self.train_tuples
|
train_tuples = self.train_tuples
|
||||||
if projectivize:
|
if projectivize:
|
||||||
train_tuples = nonproj.preprocess_training_data(
|
train_tuples = nonproj.preprocess_training_data(
|
||||||
self.train_tuples, label_freq_cutoff=100)
|
self.train_tuples, label_freq_cutoff=100)
|
||||||
random.shuffle(train_tuples)
|
random.shuffle(train_tuples)
|
||||||
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
||||||
max_length=max_length,
|
max_length=max_length,
|
||||||
|
@ -222,7 +224,6 @@ class GoldCorpus(object):
|
||||||
|
|
||||||
def dev_docs(self, nlp, gold_preproc=False):
|
def dev_docs(self, nlp, gold_preproc=False):
|
||||||
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
|
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
|
||||||
#gold_docs = nlp.preprocess_gold(gold_docs)
|
|
||||||
yield from gold_docs
|
yield from gold_docs
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -233,7 +234,6 @@ class GoldCorpus(object):
|
||||||
raw_text = None
|
raw_text = None
|
||||||
else:
|
else:
|
||||||
paragraph_tuples = merge_sents(paragraph_tuples)
|
paragraph_tuples = merge_sents(paragraph_tuples)
|
||||||
|
|
||||||
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
||||||
gold_preproc, noise_level=noise_level)
|
gold_preproc, noise_level=noise_level)
|
||||||
golds = cls._make_golds(docs, paragraph_tuples)
|
golds = cls._make_golds(docs, paragraph_tuples)
|
||||||
|
@ -248,17 +248,20 @@ class GoldCorpus(object):
|
||||||
raw_text = add_noise(raw_text, noise_level)
|
raw_text = add_noise(raw_text, noise_level)
|
||||||
return [nlp.make_doc(raw_text)]
|
return [nlp.make_doc(raw_text)]
|
||||||
else:
|
else:
|
||||||
return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
|
return [Doc(nlp.vocab,
|
||||||
for (sent_tuples, brackets) in paragraph_tuples]
|
words=add_noise(sent_tuples[1], noise_level))
|
||||||
|
for (sent_tuples, brackets) in paragraph_tuples]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _make_golds(cls, docs, paragraph_tuples):
|
def _make_golds(cls, docs, paragraph_tuples):
|
||||||
assert len(docs) == len(paragraph_tuples)
|
assert len(docs) == len(paragraph_tuples)
|
||||||
if len(docs) == 1:
|
if len(docs) == 1:
|
||||||
return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
|
return [GoldParse.from_annot_tuples(docs[0],
|
||||||
|
paragraph_tuples[0][0])]
|
||||||
else:
|
else:
|
||||||
return [GoldParse.from_annot_tuples(doc, sent_tuples)
|
return [GoldParse.from_annot_tuples(doc, sent_tuples)
|
||||||
for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]
|
for doc, (sent_tuples, brackets)
|
||||||
|
in zip(docs, paragraph_tuples)]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def walk_corpus(path):
|
def walk_corpus(path):
|
||||||
|
@ -305,7 +308,7 @@ def _corrupt(c, noise_level):
|
||||||
|
|
||||||
|
|
||||||
def read_json_file(loc, docs_filter=None, limit=None):
|
def read_json_file(loc, docs_filter=None, limit=None):
|
||||||
loc = ensure_path(loc)
|
loc = util.ensure_path(loc)
|
||||||
if loc.is_dir():
|
if loc.is_dir():
|
||||||
for filename in loc.iterdir():
|
for filename in loc.iterdir():
|
||||||
yield from read_json_file(loc / filename, limit=limit)
|
yield from read_json_file(loc / filename, limit=limit)
|
||||||
|
@ -330,16 +333,16 @@ def read_json_file(loc, docs_filter=None, limit=None):
|
||||||
for i, token in enumerate(sent['tokens']):
|
for i, token in enumerate(sent['tokens']):
|
||||||
words.append(token['orth'])
|
words.append(token['orth'])
|
||||||
ids.append(i)
|
ids.append(i)
|
||||||
tags.append(token.get('tag','-'))
|
tags.append(token.get('tag', '-'))
|
||||||
heads.append(token.get('head',0) + i)
|
heads.append(token.get('head', 0) + i)
|
||||||
labels.append(token.get('dep',''))
|
labels.append(token.get('dep', ''))
|
||||||
# Ensure ROOT label is case-insensitive
|
# Ensure ROOT label is case-insensitive
|
||||||
if labels[-1].lower() == 'root':
|
if labels[-1].lower() == 'root':
|
||||||
labels[-1] = 'ROOT'
|
labels[-1] = 'ROOT'
|
||||||
ner.append(token.get('ner', '-'))
|
ner.append(token.get('ner', '-'))
|
||||||
sents.append([
|
sents.append([
|
||||||
[ids, words, tags, heads, labels, ner],
|
[ids, words, tags, heads, labels, ner],
|
||||||
sent.get('brackets', [])])
|
sent.get('brackets', [])])
|
||||||
if sents:
|
if sents:
|
||||||
yield [paragraph.get('raw', None), sents]
|
yield [paragraph.get('raw', None), sents]
|
||||||
|
|
||||||
|
@ -382,19 +385,21 @@ cdef class GoldParse:
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
|
def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
|
||||||
_, words, tags, heads, deps, entities = annot_tuples
|
_, words, tags, heads, deps, entities = annot_tuples
|
||||||
return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities,
|
return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
|
||||||
make_projective=make_projective)
|
entities=entities, make_projective=make_projective)
|
||||||
|
|
||||||
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
|
def __init__(self, doc, annot_tuples=None, words=None, tags=None,
|
||||||
deps=None, entities=None, make_projective=False,
|
heads=None, deps=None, entities=None, make_projective=False,
|
||||||
cats=None):
|
cats=None):
|
||||||
"""Create a GoldParse.
|
"""Create a GoldParse.
|
||||||
|
|
||||||
doc (Doc): The document the annotations refer to.
|
doc (Doc): The document the annotations refer to.
|
||||||
words (iterable): A sequence of unicode word strings.
|
words (iterable): A sequence of unicode word strings.
|
||||||
tags (iterable): A sequence of strings, representing tag annotations.
|
tags (iterable): A sequence of strings, representing tag annotations.
|
||||||
heads (iterable): A sequence of integers, representing syntactic head offsets.
|
heads (iterable): A sequence of integers, representing syntactic
|
||||||
deps (iterable): A sequence of strings, representing the syntactic relation types.
|
head offsets.
|
||||||
|
deps (iterable): A sequence of strings, representing the syntactic
|
||||||
|
relation types.
|
||||||
entities (iterable): A sequence of named entity annotations, either as
|
entities (iterable): A sequence of named entity annotations, either as
|
||||||
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
|
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
|
||||||
representing the entity positions.
|
representing the entity positions.
|
||||||
|
@ -404,9 +409,10 @@ cdef class GoldParse:
|
||||||
document (usually a sentence). Unlike entity annotations, label
|
document (usually a sentence). Unlike entity annotations, label
|
||||||
annotations can overlap, i.e. a single word can be covered by
|
annotations can overlap, i.e. a single word can be covered by
|
||||||
multiple labelled spans. The TextCategorizer component expects
|
multiple labelled spans. The TextCategorizer component expects
|
||||||
true examples of a label to have the value 1.0, and negative examples
|
true examples of a label to have the value 1.0, and negative
|
||||||
of a label to have the value 0.0. Labels not in the dictionary are
|
examples of a label to have the value 0.0. Labels not in the
|
||||||
treated as missing -- the gradient for those labels will be zero.
|
dictionary are treated as missing - the gradient for those labels
|
||||||
|
will be zero.
|
||||||
RETURNS (GoldParse): The newly constructed object.
|
RETURNS (GoldParse): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
if words is None:
|
if words is None:
|
||||||
|
@ -470,11 +476,11 @@ cdef class GoldParse:
|
||||||
self.ner[i] = entities[gold_i]
|
self.ner[i] = entities[gold_i]
|
||||||
|
|
||||||
cycle = nonproj.contains_cycle(self.heads)
|
cycle = nonproj.contains_cycle(self.heads)
|
||||||
if cycle != None:
|
if cycle is not None:
|
||||||
raise Exception("Cycle found: %s" % cycle)
|
raise Exception("Cycle found: %s" % cycle)
|
||||||
|
|
||||||
if make_projective:
|
if make_projective:
|
||||||
proj_heads,_ = nonproj.projectivize(self.heads, self.labels)
|
proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
|
||||||
self.heads = proj_heads
|
self.heads = proj_heads
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
|
@ -497,20 +503,19 @@ cdef class GoldParse:
|
||||||
|
|
||||||
|
|
||||||
def biluo_tags_from_offsets(doc, entities, missing='O'):
|
def biluo_tags_from_offsets(doc, entities, missing='O'):
|
||||||
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
"""Encode labelled spans into per-token tags, using the
|
||||||
scheme (BILUO).
|
Begin/In/Last/Unit/Out scheme (BILUO).
|
||||||
|
|
||||||
doc (Doc): The document that the entity offsets refer to. The output tags
|
doc (Doc): The document that the entity offsets refer to. The output tags
|
||||||
will refer to the token boundaries within the document.
|
will refer to the token boundaries within the document.
|
||||||
entities (iterable): A sequence of `(start, end, label)` triples. `start` and
|
entities (iterable): A sequence of `(start, end, label)` triples. `start`
|
||||||
`end` should be character-offset integers denoting the slice into the
|
and `end` should be character-offset integers denoting the slice into
|
||||||
original string.
|
the original string.
|
||||||
|
|
||||||
RETURNS (list): A list of unicode strings, describing the tags. Each tag
|
RETURNS (list): A list of unicode strings, describing the tags. Each tag
|
||||||
string will be of the form either "", "O" or "{action}-{label}", where
|
string will be of the form either "", "O" or "{action}-{label}", where
|
||||||
action is one of "B", "I", "L", "U". The string "-" is used where the
|
action is one of "B", "I", "L", "U". The string "-" is used where the
|
||||||
entity offsets don't align with the tokenization in the `Doc` object. The
|
entity offsets don't align with the tokenization in the `Doc` object.
|
||||||
training algorithm will view these as missing values. "O" denotes a
|
The training algorithm will view these as missing values. "O" denotes a
|
||||||
non-entity token. "B" denotes the beginning of a multi-token entity,
|
non-entity token. "B" denotes the beginning of a multi-token entity,
|
||||||
"I" the inside of an entity of three or more tokens, and "L" the end
|
"I" the inside of an entity of three or more tokens, and "L" the end
|
||||||
of an entity of two or more tokens. "U" denotes a single-token entity.
|
of an entity of two or more tokens. "U" denotes a single-token entity.
|
||||||
|
|
|
@ -1,31 +1,28 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import absolute_import, unicode_literals
|
from __future__ import absolute_import, unicode_literals
|
||||||
from contextlib import contextmanager
|
|
||||||
import copy
|
|
||||||
|
|
||||||
from thinc.neural import Model
|
|
||||||
from thinc.neural.optimizers import Adam
|
|
||||||
import random
|
import random
|
||||||
import ujson
|
import ujson
|
||||||
from collections import OrderedDict
|
|
||||||
import itertools
|
import itertools
|
||||||
import weakref
|
import weakref
|
||||||
import functools
|
import functools
|
||||||
import tqdm
|
from collections import OrderedDict
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from copy import copy
|
||||||
|
from thinc.neural import Model
|
||||||
|
from thinc.neural.optimizers import Adam
|
||||||
|
|
||||||
from .tokenizer import Tokenizer
|
from .tokenizer import Tokenizer
|
||||||
from .vocab import Vocab
|
from .vocab import Vocab
|
||||||
from .tagger import Tagger
|
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
|
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
|
||||||
from .pipeline import DependencyParser, Tensorizer, Tagger
|
from .pipeline import SimilarityHook, TextCategorizer
|
||||||
from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
|
from .compat import json_dumps, izip
|
||||||
|
|
||||||
from .compat import json_dumps, izip, copy_reg
|
|
||||||
from .scorer import Scorer
|
from .scorer import Scorer
|
||||||
from ._ml import link_vectors_to_models
|
from ._ml import link_vectors_to_models
|
||||||
from .attrs import IS_STOP
|
from .attrs import IS_STOP
|
||||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
|
from .lang.punctuation import TOKENIZER_INFIXES
|
||||||
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
||||||
from .lang.tag_map import TAG_MAP
|
from .lang.tag_map import TAG_MAP
|
||||||
from .lang.lex_attrs import LEX_ATTRS, is_stop
|
from .lang.lex_attrs import LEX_ATTRS, is_stop
|
||||||
|
@ -57,16 +54,18 @@ class BaseDefaults(object):
|
||||||
def create_tokenizer(cls, nlp=None):
|
def create_tokenizer(cls, nlp=None):
|
||||||
rules = cls.tokenizer_exceptions
|
rules = cls.tokenizer_exceptions
|
||||||
token_match = cls.token_match
|
token_match = cls.token_match
|
||||||
prefix_search = util.compile_prefix_regex(cls.prefixes).search \
|
prefix_search = (util.compile_prefix_regex(cls.prefixes).search
|
||||||
if cls.prefixes else None
|
if cls.prefixes else None)
|
||||||
suffix_search = util.compile_suffix_regex(cls.suffixes).search \
|
suffix_search = (util.compile_suffix_regex(cls.suffixes).search
|
||||||
if cls.suffixes else None
|
if cls.suffixes else None)
|
||||||
infix_finditer = util.compile_infix_regex(cls.infixes).finditer \
|
infix_finditer = (util.compile_infix_regex(cls.infixes).finditer
|
||||||
if cls.infixes else None
|
if cls.infixes else None)
|
||||||
vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
||||||
return Tokenizer(vocab, rules=rules,
|
return Tokenizer(vocab, rules=rules,
|
||||||
prefix_search=prefix_search, suffix_search=suffix_search,
|
prefix_search=prefix_search,
|
||||||
infix_finditer=infix_finditer, token_match=token_match)
|
suffix_search=suffix_search,
|
||||||
|
infix_finditer=infix_finditer,
|
||||||
|
token_match=token_match)
|
||||||
|
|
||||||
pipe_names = ['tensorizer', 'tagger', 'parser', 'ner']
|
pipe_names = ['tensorizer', 'tagger', 'parser', 'ner']
|
||||||
token_match = TOKEN_MATCH
|
token_match = TOKEN_MATCH
|
||||||
|
@ -98,7 +97,7 @@ class Language(object):
|
||||||
|
|
||||||
factories = {
|
factories = {
|
||||||
'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
|
'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
|
||||||
'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
|
'tensorizer': lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
|
||||||
'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
|
'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
|
||||||
'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
|
'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
|
||||||
'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
|
'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
|
||||||
|
@ -218,14 +217,14 @@ class Language(object):
|
||||||
def add_pipe(self, component, name=None, before=None, after=None,
|
def add_pipe(self, component, name=None, before=None, after=None,
|
||||||
first=None, last=None):
|
first=None, last=None):
|
||||||
"""Add a component to the processing pipeline. Valid components are
|
"""Add a component to the processing pipeline. Valid components are
|
||||||
callables that take a `Doc` object, modify it and return it. Only one of
|
callables that take a `Doc` object, modify it and return it. Only one
|
||||||
before, after, first or last can be set. Default behaviour is "last".
|
of before/after/first/last can be set. Default behaviour is "last".
|
||||||
|
|
||||||
component (callable): The pipeline component.
|
component (callable): The pipeline component.
|
||||||
name (unicode): Name of pipeline component. Overwrites existing
|
name (unicode): Name of pipeline component. Overwrites existing
|
||||||
component.name attribute if available. If no name is set and
|
component.name attribute if available. If no name is set and
|
||||||
the component exposes no name attribute, component.__name__ is
|
the component exposes no name attribute, component.__name__ is
|
||||||
used. An error is raised if the name already exists in the pipeline.
|
used. An error is raised if a name already exists in the pipeline.
|
||||||
before (unicode): Component name to insert component directly before.
|
before (unicode): Component name to insert component directly before.
|
||||||
after (unicode): Component name to insert component directly after.
|
after (unicode): Component name to insert component directly after.
|
||||||
first (bool): Insert component first / not first in the pipeline.
|
first (bool): Insert component first / not first in the pipeline.
|
||||||
|
@ -240,7 +239,8 @@ class Language(object):
|
||||||
name = component.name
|
name = component.name
|
||||||
elif hasattr(component, '__name__'):
|
elif hasattr(component, '__name__'):
|
||||||
name = component.__name__
|
name = component.__name__
|
||||||
elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'):
|
elif (hasattr(component, '__class__') and
|
||||||
|
hasattr(component.__class__, '__name__')):
|
||||||
name = component.__class__.__name__
|
name = component.__class__.__name__
|
||||||
else:
|
else:
|
||||||
name = repr(component)
|
name = repr(component)
|
||||||
|
@ -269,7 +269,7 @@ class Language(object):
|
||||||
`name in nlp.pipe_names`.
|
`name in nlp.pipe_names`.
|
||||||
|
|
||||||
name (unicode): Name of the component.
|
name (unicode): Name of the component.
|
||||||
RETURNS (bool): Whether a component of that name exists in the pipeline.
|
RETURNS (bool): Whether a component of the name exists in the pipeline.
|
||||||
"""
|
"""
|
||||||
return name in self.pipe_names
|
return name in self.pipe_names
|
||||||
|
|
||||||
|
@ -332,15 +332,12 @@ class Language(object):
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def disable_pipes(self, *names):
|
def disable_pipes(self, *names):
|
||||||
'''Disable one or more pipeline components.
|
"""Disable one or more pipeline components. If used as a context
|
||||||
|
manager, the pipeline will be restored to the initial state at the end
|
||||||
If used as a context manager, the pipeline will be restored to the initial
|
of the block. Otherwise, a DisabledPipes object is returned, that has
|
||||||
state at the end of the block. Otherwise, a DisabledPipes object is
|
a `.restore()` method you can use to undo your changes.
|
||||||
returned, that has a `.restore()` method you can use to undo your
|
|
||||||
changes.
|
|
||||||
|
|
||||||
EXAMPLE:
|
EXAMPLE:
|
||||||
|
|
||||||
>>> nlp.add_pipe('parser')
|
>>> nlp.add_pipe('parser')
|
||||||
>>> nlp.add_pipe('tagger')
|
>>> nlp.add_pipe('tagger')
|
||||||
>>> with nlp.disable_pipes('parser', 'tagger'):
|
>>> with nlp.disable_pipes('parser', 'tagger'):
|
||||||
|
@ -351,7 +348,7 @@ class Language(object):
|
||||||
>>> assert not nlp.has_pipe('parser')
|
>>> assert not nlp.has_pipe('parser')
|
||||||
>>> disabled.restore()
|
>>> disabled.restore()
|
||||||
>>> assert nlp.has_pipe('parser')
|
>>> assert nlp.has_pipe('parser')
|
||||||
'''
|
"""
|
||||||
return DisabledPipes(self, *names)
|
return DisabledPipes(self, *names)
|
||||||
|
|
||||||
def make_doc(self, text):
|
def make_doc(self, text):
|
||||||
|
@ -367,14 +364,14 @@ class Language(object):
|
||||||
RETURNS (dict): Results from the update.
|
RETURNS (dict): Results from the update.
|
||||||
|
|
||||||
EXAMPLE:
|
EXAMPLE:
|
||||||
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
>>> with nlp.begin_training(gold) as (trainer, optimizer):
|
||||||
>>> for epoch in trainer.epochs(gold):
|
>>> for epoch in trainer.epochs(gold):
|
||||||
>>> for docs, golds in epoch:
|
>>> for docs, golds in epoch:
|
||||||
>>> state = nlp.update(docs, golds, sgd=optimizer)
|
>>> state = nlp.update(docs, golds, sgd=optimizer)
|
||||||
"""
|
"""
|
||||||
if len(docs) != len(golds):
|
if len(docs) != len(golds):
|
||||||
raise IndexError("Update expects same number of docs and golds "
|
raise IndexError("Update expects same number of docs and golds "
|
||||||
"Got: %d, %d" % (len(docs), len(golds)))
|
"Got: %d, %d" % (len(docs), len(golds)))
|
||||||
if len(docs) == 0:
|
if len(docs) == 0:
|
||||||
return
|
return
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
|
@ -382,8 +379,10 @@ class Language(object):
|
||||||
self._optimizer = Adam(Model.ops, 0.001)
|
self._optimizer = Adam(Model.ops, 0.001)
|
||||||
sgd = self._optimizer
|
sgd = self._optimizer
|
||||||
grads = {}
|
grads = {}
|
||||||
|
|
||||||
def get_grads(W, dW, key=None):
|
def get_grads(W, dW, key=None):
|
||||||
grads[key] = (W, dW)
|
grads[key] = (W, dW)
|
||||||
|
|
||||||
pipes = list(self.pipeline)
|
pipes = list(self.pipeline)
|
||||||
random.shuffle(pipes)
|
random.shuffle(pipes)
|
||||||
for name, proc in pipes:
|
for name, proc in pipes:
|
||||||
|
@ -421,7 +420,7 @@ class Language(object):
|
||||||
L2 = util.env_opt('L2_penalty', 1e-6)
|
L2 = util.env_opt('L2_penalty', 1e-6)
|
||||||
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
||||||
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
||||||
beta2=beta2, eps=eps)
|
beta2=beta2, eps=eps)
|
||||||
self._optimizer.max_grad_norm = max_grad_norm
|
self._optimizer.max_grad_norm = max_grad_norm
|
||||||
self._optimizer.device = device
|
self._optimizer.device = device
|
||||||
return self._optimizer
|
return self._optimizer
|
||||||
|
@ -461,7 +460,7 @@ class Language(object):
|
||||||
L2 = util.env_opt('L2_penalty', 1e-6)
|
L2 = util.env_opt('L2_penalty', 1e-6)
|
||||||
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
||||||
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
||||||
beta2=beta2, eps=eps)
|
beta2=beta2, eps=eps)
|
||||||
self._optimizer.max_grad_norm = max_grad_norm
|
self._optimizer.max_grad_norm = max_grad_norm
|
||||||
self._optimizer.device = device
|
self._optimizer.device = device
|
||||||
return self._optimizer
|
return self._optimizer
|
||||||
|
@ -512,17 +511,17 @@ class Language(object):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
|
def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
|
||||||
disable=[]):
|
disable=[]):
|
||||||
"""Process texts as a stream, and yield `Doc` objects in order. Supports
|
"""Process texts as a stream, and yield `Doc` objects in order.
|
||||||
GIL-free multi-threading.
|
Supports GIL-free multi-threading.
|
||||||
|
|
||||||
texts (iterator): A sequence of texts to process.
|
texts (iterator): A sequence of texts to process.
|
||||||
as_tuples (bool):
|
as_tuples (bool):
|
||||||
If set to True, inputs should be a sequence of
|
If set to True, inputs should be a sequence of
|
||||||
(text, context) tuples. Output will then be a sequence of
|
(text, context) tuples. Output will then be a sequence of
|
||||||
(doc, context) tuples. Defaults to False.
|
(doc, context) tuples. Defaults to False.
|
||||||
n_threads (int): The number of worker threads to use. If -1, OpenMP will
|
n_threads (int): The number of worker threads to use. If -1, OpenMP
|
||||||
decide how many to use at run time. Default is 2.
|
will decide how many to use at run time. Default is 2.
|
||||||
batch_size (int): The number of texts to buffer.
|
batch_size (int): The number of texts to buffer.
|
||||||
disable (list): Names of the pipeline components to disable.
|
disable (list): Names of the pipeline components to disable.
|
||||||
YIELDS (Doc): Documents in the order of the original text.
|
YIELDS (Doc): Documents in the order of the original text.
|
||||||
|
@ -546,7 +545,8 @@ class Language(object):
|
||||||
if name in disable:
|
if name in disable:
|
||||||
continue
|
continue
|
||||||
if hasattr(proc, 'pipe'):
|
if hasattr(proc, 'pipe'):
|
||||||
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
|
docs = proc.pipe(docs, n_threads=n_threads,
|
||||||
|
batch_size=batch_size)
|
||||||
else:
|
else:
|
||||||
# Apply the function, but yield the doc
|
# Apply the function, but yield the doc
|
||||||
docs = _pipe(proc, docs)
|
docs = _pipe(proc, docs)
|
||||||
|
@ -583,7 +583,7 @@ class Language(object):
|
||||||
will include the model.
|
will include the model.
|
||||||
|
|
||||||
path (unicode or Path): A path to a directory, which will be created if
|
path (unicode or Path): A path to a directory, which will be created if
|
||||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
it doesn't exist. Paths may be strings or `Path`-like objects.
|
||||||
disable (list): Names of pipeline components to disable and prevent
|
disable (list): Names of pipeline components to disable and prevent
|
||||||
from being saved.
|
from being saved.
|
||||||
|
|
||||||
|
@ -649,7 +649,7 @@ class Language(object):
|
||||||
serializers = OrderedDict((
|
serializers = OrderedDict((
|
||||||
('vocab', lambda: self.vocab.to_bytes()),
|
('vocab', lambda: self.vocab.to_bytes()),
|
||||||
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
|
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
|
||||||
('meta', lambda: ujson.dumps(self.meta))
|
('meta', lambda: json_dumps(self.meta))
|
||||||
))
|
))
|
||||||
for i, (name, proc) in enumerate(self.pipeline):
|
for i, (name, proc) in enumerate(self.pipeline):
|
||||||
if name in disable:
|
if name in disable:
|
||||||
|
@ -682,14 +682,14 @@ class Language(object):
|
||||||
|
|
||||||
|
|
||||||
class DisabledPipes(list):
|
class DisabledPipes(list):
|
||||||
'''Manager for temporary pipeline disabling.'''
|
"""Manager for temporary pipeline disabling."""
|
||||||
def __init__(self, nlp, *names):
|
def __init__(self, nlp, *names):
|
||||||
self.nlp = nlp
|
self.nlp = nlp
|
||||||
self.names = names
|
self.names = names
|
||||||
# Important! Not deep copy -- we just want the container (but we also
|
# Important! Not deep copy -- we just want the container (but we also
|
||||||
# want to support people providing arbitrarily typed nlp.pipeline
|
# want to support people providing arbitrarily typed nlp.pipeline
|
||||||
# objects.)
|
# objects.)
|
||||||
self.original_pipeline = copy.copy(nlp.pipeline)
|
self.original_pipeline = copy(nlp.pipeline)
|
||||||
list.__init__(self)
|
list.__init__(self)
|
||||||
self.extend(nlp.remove_pipe(name) for name in names)
|
self.extend(nlp.remove_pipe(name) for name in names)
|
||||||
|
|
||||||
|
@ -702,7 +702,8 @@ class DisabledPipes(list):
|
||||||
def restore(self):
|
def restore(self):
|
||||||
'''Restore the pipeline to its state when DisabledPipes was created.'''
|
'''Restore the pipeline to its state when DisabledPipes was created.'''
|
||||||
current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline
|
current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline
|
||||||
unexpected = [name for name, pipe in current if not self.nlp.has_pipe(name)]
|
unexpected = [name for name, pipe in current
|
||||||
|
if not self.nlp.has_pipe(name)]
|
||||||
if unexpected:
|
if unexpected:
|
||||||
# Don't change the pipeline if we're raising an error.
|
# Don't change the pipeline if we're raising an error.
|
||||||
self.nlp.pipeline = current
|
self.nlp.pipeline = current
|
||||||
|
|
|
@ -43,16 +43,15 @@ class Lemmatizer(object):
|
||||||
morphology = {} if morphology is None else morphology
|
morphology = {} if morphology is None else morphology
|
||||||
others = [key for key in morphology
|
others = [key for key in morphology
|
||||||
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
|
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
|
||||||
true_morph_key = morphology.get('morph', 0)
|
|
||||||
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
|
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
|
||||||
return True
|
return True
|
||||||
elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
|
elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
|
||||||
return True
|
return True
|
||||||
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
|
||||||
# morphology
|
# morphology
|
||||||
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
|
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and
|
||||||
morphology.get('Tense') == 'pres' and \
|
morphology.get('Tense') == 'pres' and
|
||||||
morphology.get('Number') is None and \
|
morphology.get('Number') is None and
|
||||||
not others):
|
not others):
|
||||||
return True
|
return True
|
||||||
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
|
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
|
||||||
|
@ -89,9 +88,6 @@ class Lemmatizer(object):
|
||||||
def lemmatize(string, index, exceptions, rules):
|
def lemmatize(string, index, exceptions, rules):
|
||||||
string = string.lower()
|
string = string.lower()
|
||||||
forms = []
|
forms = []
|
||||||
# TODO: Is this correct? See discussion in Issue #435.
|
|
||||||
#if string in index:
|
|
||||||
# forms.append(string)
|
|
||||||
forms.extend(exceptions.get(string, []))
|
forms.extend(exceptions.get(string, []))
|
||||||
oov_forms = []
|
oov_forms = []
|
||||||
if not forms:
|
if not forms:
|
||||||
|
|
344
spacy/lexeme.pyx
344
spacy/lexeme.pyx
|
@ -2,27 +2,17 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
from libc.math cimport sqrt
|
|
||||||
from cpython.ref cimport Py_INCREF
|
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
from murmurhash.mrmr cimport hash64
|
|
||||||
|
|
||||||
# Compiler crashes on memory view coercion without this. Should report bug.
|
# Compiler crashes on memory view coercion without this. Should report bug.
|
||||||
from cython.view cimport array as cvarray
|
from cython.view cimport array as cvarray
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
np.import_array()
|
np.import_array()
|
||||||
|
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from .typedefs cimport attr_t, flags_t
|
from .typedefs cimport attr_t, flags_t
|
||||||
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||||
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||||
from .attrs cimport IS_BRACKET
|
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
|
||||||
from .attrs cimport IS_QUOTE
|
|
||||||
from .attrs cimport IS_LEFT_PUNCT
|
|
||||||
from .attrs cimport IS_RIGHT_PUNCT
|
|
||||||
from .attrs cimport IS_OOV
|
|
||||||
from . import about
|
from . import about
|
||||||
|
|
||||||
|
|
||||||
|
@ -32,8 +22,8 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||||
cdef class Lexeme:
|
cdef class Lexeme:
|
||||||
"""An entry in the vocabulary. A `Lexeme` has no string context – it's a
|
"""An entry in the vocabulary. A `Lexeme` has no string context – it's a
|
||||||
word-type, as opposed to a word token. It therefore has no part-of-speech
|
word-type, as opposed to a word token. It therefore has no part-of-speech
|
||||||
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
|
tag, dependency parse, or lemma (lemmatization depends on the
|
||||||
tag).
|
part-of-speech tag).
|
||||||
"""
|
"""
|
||||||
def __init__(self, Vocab vocab, attr_t orth):
|
def __init__(self, Vocab vocab, attr_t orth):
|
||||||
"""Create a Lexeme object.
|
"""Create a Lexeme object.
|
||||||
|
@ -60,17 +50,17 @@ cdef class Lexeme:
|
||||||
else:
|
else:
|
||||||
a = 0
|
a = 0
|
||||||
b = 1
|
b = 1
|
||||||
if op == 2: # ==
|
if op == 2: # ==
|
||||||
return a == b
|
return a == b
|
||||||
elif op == 3: # !=
|
elif op == 3: # !=
|
||||||
return a != b
|
return a != b
|
||||||
elif op == 0: # <
|
elif op == 0: # <
|
||||||
return a < b
|
return a < b
|
||||||
elif op == 1: # <=
|
elif op == 1: # <=
|
||||||
return a <= b
|
return a <= b
|
||||||
elif op == 4: # >
|
elif op == 4: # >
|
||||||
return a > b
|
return a > b
|
||||||
elif op == 5: # >=
|
elif op == 5: # >=
|
||||||
return a >= b
|
return a >= b
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError(op)
|
raise NotImplementedError(op)
|
||||||
|
@ -104,7 +94,8 @@ cdef class Lexeme:
|
||||||
"""
|
"""
|
||||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return (numpy.dot(self.vector, other.vector) /
|
||||||
|
(self.vector_norm * other.vector_norm))
|
||||||
|
|
||||||
def to_bytes(self):
|
def to_bytes(self):
|
||||||
lex_data = Lexeme.c_to_bytes(self.c)
|
lex_data = Lexeme.c_to_bytes(self.c)
|
||||||
|
@ -130,19 +121,13 @@ cdef class Lexeme:
|
||||||
self.orth = self.c.orth
|
self.orth = self.c.orth
|
||||||
|
|
||||||
property has_vector:
|
property has_vector:
|
||||||
"""A boolean value indicating whether a word vector is associated with
|
"""RETURNS (bool): Whether a word vector is associated with the object.
|
||||||
the object.
|
|
||||||
|
|
||||||
RETURNS (bool): Whether a word vector is associated with the object.
|
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.has_vector(self.c.orth)
|
return self.vocab.has_vector(self.c.orth)
|
||||||
|
|
||||||
property vector_norm:
|
property vector_norm:
|
||||||
"""The L2 norm of the lexeme's vector representation.
|
"""RETURNS (float): The L2 norm of the vector representation."""
|
||||||
|
|
||||||
RETURNS (float): The L2 norm of the vector representation.
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
vector = self.vector
|
vector = self.vector
|
||||||
return numpy.sqrt((vector**2).sum())
|
return numpy.sqrt((vector**2).sum())
|
||||||
|
@ -169,149 +154,320 @@ cdef class Lexeme:
|
||||||
self.vocab.set_vector(self.c.orth, vector)
|
self.vocab.set_vector(self.c.orth, vector)
|
||||||
|
|
||||||
property rank:
|
property rank:
|
||||||
|
"""RETURNS (int): Sequential ID of the lexeme's lexical type, used
|
||||||
|
to index into tables, e.g. for word vectors."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.id
|
return self.c.id
|
||||||
|
|
||||||
def __set__(self, value):
|
def __set__(self, value):
|
||||||
self.c.id = value
|
self.c.id = value
|
||||||
|
|
||||||
property sentiment:
|
property sentiment:
|
||||||
|
"""RETURNS (float): A scalar value indicating the positivity or
|
||||||
|
negativity of the lexeme."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.sentiment
|
return self.c.sentiment
|
||||||
|
|
||||||
def __set__(self, float sentiment):
|
def __set__(self, float sentiment):
|
||||||
self.c.sentiment = sentiment
|
self.c.sentiment = sentiment
|
||||||
|
|
||||||
property orth_:
|
property orth_:
|
||||||
|
"""RETURNS (unicode): The original verbatim text of the lexeme
|
||||||
|
(identical to `Lexeme.text`). Exists mostly for consistency with
|
||||||
|
the other attributes."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.orth]
|
return self.vocab.strings[self.c.orth]
|
||||||
|
|
||||||
property text:
|
property text:
|
||||||
"""A unicode representation of the token text.
|
"""RETURNS (unicode): The original verbatim text of the lexeme."""
|
||||||
|
|
||||||
RETURNS (unicode): The original verbatim text of the token.
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.orth_
|
return self.orth_
|
||||||
|
|
||||||
property lower:
|
property lower:
|
||||||
def __get__(self): return self.c.lower
|
"""RETURNS (unicode): Lowercase form of the lexeme."""
|
||||||
def __set__(self, attr_t x): self.c.lower = x
|
def __get__(self):
|
||||||
|
return self.c.lower
|
||||||
|
|
||||||
|
def __set__(self, attr_t x):
|
||||||
|
self.c.lower = x
|
||||||
|
|
||||||
property norm:
|
property norm:
|
||||||
def __get__(self): return self.c.norm
|
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
|
||||||
def __set__(self, attr_t x): self.c.norm = x
|
lexeme text.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return self.c.norm
|
||||||
|
|
||||||
|
def __set__(self, attr_t x):
|
||||||
|
self.c.norm = x
|
||||||
|
|
||||||
property shape:
|
property shape:
|
||||||
def __get__(self): return self.c.shape
|
"""RETURNS (uint64): Transform of the word's string, to show
|
||||||
def __set__(self, attr_t x): self.c.shape = x
|
orthographic features.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return self.c.shape
|
||||||
|
|
||||||
|
def __set__(self, attr_t x):
|
||||||
|
self.c.shape = x
|
||||||
|
|
||||||
property prefix:
|
property prefix:
|
||||||
def __get__(self): return self.c.prefix
|
"""RETURNS (uint64): Length-N substring from the start of the word.
|
||||||
def __set__(self, attr_t x): self.c.prefix = x
|
Defaults to `N=1`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return self.c.prefix
|
||||||
|
|
||||||
|
def __set__(self, attr_t x):
|
||||||
|
self.c.prefix = x
|
||||||
|
|
||||||
property suffix:
|
property suffix:
|
||||||
def __get__(self): return self.c.suffix
|
"""RETURNS (uint64): Length-N substring from the end of the word.
|
||||||
def __set__(self, attr_t x): self.c.suffix = x
|
Defaults to `N=3`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return self.c.suffix
|
||||||
|
|
||||||
|
def __set__(self, attr_t x):
|
||||||
|
self.c.suffix = x
|
||||||
|
|
||||||
property cluster:
|
property cluster:
|
||||||
def __get__(self): return self.c.cluster
|
"""RETURNS (int): Brown cluster ID."""
|
||||||
def __set__(self, attr_t x): self.c.cluster = x
|
def __get__(self):
|
||||||
|
return self.c.cluster
|
||||||
|
|
||||||
|
def __set__(self, attr_t x):
|
||||||
|
self.c.cluster = x
|
||||||
|
|
||||||
property lang:
|
property lang:
|
||||||
def __get__(self): return self.c.lang
|
"""RETURNS (uint64): Language of the parent vocabulary."""
|
||||||
def __set__(self, attr_t x): self.c.lang = x
|
def __get__(self):
|
||||||
|
return self.c.lang
|
||||||
|
|
||||||
|
def __set__(self, attr_t x):
|
||||||
|
self.c.lang = x
|
||||||
|
|
||||||
property prob:
|
property prob:
|
||||||
def __get__(self): return self.c.prob
|
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
|
||||||
def __set__(self, float x): self.c.prob = x
|
type."""
|
||||||
|
def __get__(self):
|
||||||
|
return self.c.prob
|
||||||
|
|
||||||
|
def __set__(self, float x):
|
||||||
|
self.c.prob = x
|
||||||
|
|
||||||
property lower_:
|
property lower_:
|
||||||
def __get__(self): return self.vocab.strings[self.c.lower]
|
"""RETURNS (unicode): Lowercase form of the word."""
|
||||||
def __set__(self, unicode x): self.c.lower = self.vocab.strings.add(x)
|
def __get__(self):
|
||||||
|
return self.vocab.strings[self.c.lower]
|
||||||
|
|
||||||
|
def __set__(self, unicode x):
|
||||||
|
self.c.lower = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property norm_:
|
property norm_:
|
||||||
def __get__(self): return self.vocab.strings[self.c.norm]
|
"""RETURNS (unicode): The lexeme's norm, i.e. a normalised form of the
|
||||||
def __set__(self, unicode x): self.c.norm = self.vocab.strings.add(x)
|
lexeme text.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return self.vocab.strings[self.c.norm]
|
||||||
|
|
||||||
|
def __set__(self, unicode x):
|
||||||
|
self.c.norm = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property shape_:
|
property shape_:
|
||||||
def __get__(self): return self.vocab.strings[self.c.shape]
|
"""RETURNS (unicode): Transform of the word's string, to show
|
||||||
def __set__(self, unicode x): self.c.shape = self.vocab.strings.add(x)
|
orthographic features.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return self.vocab.strings[self.c.shape]
|
||||||
|
|
||||||
|
def __set__(self, unicode x):
|
||||||
|
self.c.shape = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property prefix_:
|
property prefix_:
|
||||||
def __get__(self): return self.vocab.strings[self.c.prefix]
|
"""RETURNS (unicode): Length-N substring from the start of the word.
|
||||||
def __set__(self, unicode x): self.c.prefix = self.vocab.strings.add(x)
|
Defaults to `N=1`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return self.vocab.strings[self.c.prefix]
|
||||||
|
|
||||||
|
def __set__(self, unicode x):
|
||||||
|
self.c.prefix = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property suffix_:
|
property suffix_:
|
||||||
def __get__(self): return self.vocab.strings[self.c.suffix]
|
"""RETURNS (unicode): Length-N substring from the end of the word.
|
||||||
def __set__(self, unicode x): self.c.suffix = self.vocab.strings.add(x)
|
Defaults to `N=3`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return self.vocab.strings[self.c.suffix]
|
||||||
|
|
||||||
|
def __set__(self, unicode x):
|
||||||
|
self.c.suffix = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property lang_:
|
property lang_:
|
||||||
def __get__(self): return self.vocab.strings[self.c.lang]
|
"""RETURNS (unicode): Language of the parent vocabulary."""
|
||||||
def __set__(self, unicode x): self.c.lang = self.vocab.strings.add(x)
|
def __get__(self):
|
||||||
|
return self.vocab.strings[self.c.lang]
|
||||||
|
|
||||||
|
def __set__(self, unicode x):
|
||||||
|
self.c.lang = self.vocab.strings.add(x)
|
||||||
|
|
||||||
property flags:
|
property flags:
|
||||||
def __get__(self): return self.c.flags
|
"""RETURNS (uint64): Container of the lexeme's binary flags."""
|
||||||
def __set__(self, flags_t x): self.c.flags = x
|
def __get__(self):
|
||||||
|
return self.c.flags
|
||||||
|
|
||||||
|
def __set__(self, flags_t x):
|
||||||
|
self.c.flags = x
|
||||||
|
|
||||||
property is_oov:
|
property is_oov:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)
|
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
|
||||||
def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_OOV)
|
||||||
|
|
||||||
|
def __set__(self, attr_t x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_OOV, x)
|
||||||
|
|
||||||
property is_stop:
|
property is_stop:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)
|
"""RETURNS (bool): Whether the lexeme is a stop word."""
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_STOP, x)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_STOP)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_STOP, x)
|
||||||
|
|
||||||
property is_alpha:
|
property is_alpha:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_ALPHA)
|
"""RETURNS (bool): Whether the lexeme consists of alphabetic
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ALPHA, x)
|
characters. Equivalent to `lexeme.text.isalpha()`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_ALPHA)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
|
||||||
|
|
||||||
property is_ascii:
|
property is_ascii:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_ASCII)
|
"""RETURNS (bool): Whether the lexeme consists of ASCII characters.
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ASCII, x)
|
Equivalent to `not any(ord(c) >= 128 for c in lexeme.text)`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_ASCII)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_ASCII, x)
|
||||||
|
|
||||||
property is_digit:
|
property is_digit:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_DIGIT)
|
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_DIGIT, x)
|
to `lexeme.text.isdigit()`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_DIGIT)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
|
||||||
|
|
||||||
property is_lower:
|
property is_lower:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_LOWER)
|
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LOWER, x)
|
`lexeme.text.islower()`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_LOWER)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_LOWER, x)
|
||||||
|
|
||||||
|
property is_upper:
|
||||||
|
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
|
||||||
|
`lexeme.text.isupper()`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_UPPER)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_UPPER, x)
|
||||||
|
|
||||||
property is_title:
|
property is_title:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_TITLE)
|
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_TITLE, x)
|
`lexeme.text.istitle()`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_TITLE)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_TITLE, x)
|
||||||
|
|
||||||
property is_punct:
|
property is_punct:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_PUNCT)
|
"""RETURNS (bool): Whether the lexeme is punctuation."""
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_PUNCT, x)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_PUNCT)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
|
||||||
|
|
||||||
property is_space:
|
property is_space:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)
|
"""RETURNS (bool): Whether the lexeme consists of whitespace characters.
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)
|
Equivalent to `lexeme.text.isspace()`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_SPACE)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_SPACE, x)
|
||||||
|
|
||||||
property is_bracket:
|
property is_bracket:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET)
|
"""RETURNS (bool): Whether the lexeme is a bracket."""
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_BRACKET)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
|
||||||
|
|
||||||
property is_quote:
|
property is_quote:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE)
|
"""RETURNS (bool): Whether the lexeme is a quotation mark."""
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_QUOTE)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
|
||||||
|
|
||||||
property is_left_punct:
|
property is_left_punct:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
|
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
|
||||||
|
|
||||||
property is_right_punct:
|
property is_right_punct:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
||||||
|
|
||||||
property like_url:
|
property like_url:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
|
"""RETURNS (bool): Whether the lexeme resembles a URL."""
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, LIKE_URL)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
||||||
|
|
||||||
property like_num:
|
property like_num:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_NUM)
|
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_NUM, x)
|
"10", "ten", etc.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, LIKE_NUM)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
|
||||||
|
|
||||||
property like_email:
|
property like_email:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
|
"""RETURNS (bool): Whether the lexeme resembles an email address."""
|
||||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
|
||||||
|
|
||||||
|
def __set__(self, bint x):
|
||||||
|
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
|
||||||
|
|
|
@ -4,12 +4,6 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import ujson
|
import ujson
|
||||||
|
|
||||||
from .typedefs cimport attr_t
|
|
||||||
from .typedefs cimport hash_t
|
|
||||||
from .attrs cimport attr_id_t
|
|
||||||
from .structs cimport TokenC
|
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
@ -17,14 +11,15 @@ from libcpp.pair cimport pair
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
|
|
||||||
from .attrs cimport ID, NULL_ATTR, ENT_TYPE
|
from .typedefs cimport attr_t
|
||||||
from . import attrs
|
from .typedefs cimport hash_t
|
||||||
from .tokens.doc cimport get_token_attr
|
from .structs cimport TokenC
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc, get_token_attr
|
||||||
from .vocab cimport Vocab
|
from .vocab cimport Vocab
|
||||||
|
|
||||||
|
from .attrs import IDS
|
||||||
|
from .attrs cimport attr_id_t, ID, NULL_ATTR
|
||||||
from .attrs import FLAG61 as U_ENT
|
from .attrs import FLAG61 as U_ENT
|
||||||
|
|
||||||
from .attrs import FLAG60 as B2_ENT
|
from .attrs import FLAG60 as B2_ENT
|
||||||
from .attrs import FLAG59 as B3_ENT
|
from .attrs import FLAG59 as B3_ENT
|
||||||
from .attrs import FLAG58 as B4_ENT
|
from .attrs import FLAG58 as B4_ENT
|
||||||
|
@ -34,7 +29,6 @@ from .attrs import FLAG55 as B7_ENT
|
||||||
from .attrs import FLAG54 as B8_ENT
|
from .attrs import FLAG54 as B8_ENT
|
||||||
from .attrs import FLAG53 as B9_ENT
|
from .attrs import FLAG53 as B9_ENT
|
||||||
from .attrs import FLAG52 as B10_ENT
|
from .attrs import FLAG52 as B10_ENT
|
||||||
|
|
||||||
from .attrs import FLAG51 as I3_ENT
|
from .attrs import FLAG51 as I3_ENT
|
||||||
from .attrs import FLAG50 as I4_ENT
|
from .attrs import FLAG50 as I4_ENT
|
||||||
from .attrs import FLAG49 as I5_ENT
|
from .attrs import FLAG49 as I5_ENT
|
||||||
|
@ -43,7 +37,6 @@ from .attrs import FLAG47 as I7_ENT
|
||||||
from .attrs import FLAG46 as I8_ENT
|
from .attrs import FLAG46 as I8_ENT
|
||||||
from .attrs import FLAG45 as I9_ENT
|
from .attrs import FLAG45 as I9_ENT
|
||||||
from .attrs import FLAG44 as I10_ENT
|
from .attrs import FLAG44 as I10_ENT
|
||||||
|
|
||||||
from .attrs import FLAG43 as L2_ENT
|
from .attrs import FLAG43 as L2_ENT
|
||||||
from .attrs import FLAG42 as L3_ENT
|
from .attrs import FLAG42 as L3_ENT
|
||||||
from .attrs import FLAG41 as L4_ENT
|
from .attrs import FLAG41 as L4_ENT
|
||||||
|
@ -153,7 +146,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
|
||||||
def _convert_strings(token_specs, string_store):
|
def _convert_strings(token_specs, string_store):
|
||||||
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
|
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
|
||||||
operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
|
operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
|
||||||
'?': (ZERO_ONE,), '1': (ONE,)}
|
'?': (ZERO_ONE,), '1': (ONE,)}
|
||||||
tokens = []
|
tokens = []
|
||||||
op = ONE
|
op = ONE
|
||||||
for spec in token_specs:
|
for spec in token_specs:
|
||||||
|
@ -168,10 +161,10 @@ def _convert_strings(token_specs, string_store):
|
||||||
if value in operators:
|
if value in operators:
|
||||||
ops = operators[value]
|
ops = operators[value]
|
||||||
else:
|
else:
|
||||||
raise KeyError(
|
msg = "Unknown operator '%s'. Options: %s"
|
||||||
"Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys())))
|
raise KeyError(msg % (value, ', '.join(operators.keys())))
|
||||||
if isinstance(attr, basestring):
|
if isinstance(attr, basestring):
|
||||||
attr = attrs.IDS.get(attr.upper())
|
attr = IDS.get(attr.upper())
|
||||||
if isinstance(value, basestring):
|
if isinstance(value, basestring):
|
||||||
value = string_store.add(value)
|
value = string_store.add(value)
|
||||||
if isinstance(value, bool):
|
if isinstance(value, bool):
|
||||||
|
@ -186,7 +179,7 @@ def _convert_strings(token_specs, string_store):
|
||||||
def merge_phrase(matcher, doc, i, matches):
|
def merge_phrase(matcher, doc, i, matches):
|
||||||
"""Callback to merge a phrase on match."""
|
"""Callback to merge a phrase on match."""
|
||||||
ent_id, label, start, end = matches[i]
|
ent_id, label, start, end = matches[i]
|
||||||
span = doc[start : end]
|
span = doc[start:end]
|
||||||
span.merge(ent_type=label, ent_id=ent_id)
|
span.merge(ent_type=label, ent_id=ent_id)
|
||||||
|
|
||||||
|
|
||||||
|
@ -233,13 +226,13 @@ cdef class Matcher:
|
||||||
return self._normalize_key(key) in self._patterns
|
return self._normalize_key(key) in self._patterns
|
||||||
|
|
||||||
def add(self, key, on_match, *patterns):
|
def add(self, key, on_match, *patterns):
|
||||||
"""Add a match-rule to the matcher. A match-rule consists of: an ID key,
|
"""Add a match-rule to the matcher. A match-rule consists of: an ID
|
||||||
an on_match callback, and one or more patterns.
|
key, an on_match callback, and one or more patterns.
|
||||||
|
|
||||||
If the key exists, the patterns are appended to the previous ones, and
|
If the key exists, the patterns are appended to the previous ones, and
|
||||||
the previous on_match callback is replaced. The `on_match` callback will
|
the previous on_match callback is replaced. The `on_match` callback
|
||||||
receive the arguments `(matcher, doc, i, matches)`. You can also set
|
will receive the arguments `(matcher, doc, i, matches)`. You can also
|
||||||
`on_match` to `None` to not perform any actions.
|
set `on_match` to `None` to not perform any actions.
|
||||||
|
|
||||||
A pattern consists of one or more `token_specs`, where a `token_spec`
|
A pattern consists of one or more `token_specs`, where a `token_spec`
|
||||||
is a dictionary mapping attribute IDs to values, and optionally a
|
is a dictionary mapping attribute IDs to values, and optionally a
|
||||||
|
@ -253,8 +246,8 @@ cdef class Matcher:
|
||||||
The + and * operators are usually interpreted "greedily", i.e. longer
|
The + and * operators are usually interpreted "greedily", i.e. longer
|
||||||
matches are returned where possible. However, if you specify two '+'
|
matches are returned where possible. However, if you specify two '+'
|
||||||
and '*' patterns in a row and their matches overlap, the first
|
and '*' patterns in a row and their matches overlap, the first
|
||||||
operator will behave non-greedily. This quirk in the semantics
|
operator will behave non-greedily. This quirk in the semantics makes
|
||||||
makes the matcher more efficient, by avoiding the need for back-tracking.
|
the matcher more efficient, by avoiding the need for back-tracking.
|
||||||
|
|
||||||
key (unicode): The match ID.
|
key (unicode): The match ID.
|
||||||
on_match (callable): Callback executed on match.
|
on_match (callable): Callback executed on match.
|
||||||
|
@ -268,7 +261,6 @@ cdef class Matcher:
|
||||||
key = self._normalize_key(key)
|
key = self._normalize_key(key)
|
||||||
self._patterns.setdefault(key, [])
|
self._patterns.setdefault(key, [])
|
||||||
self._callbacks[key] = on_match
|
self._callbacks[key] = on_match
|
||||||
|
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
specs = _convert_strings(pattern, self.vocab.strings)
|
specs = _convert_strings(pattern, self.vocab.strings)
|
||||||
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
||||||
|
@ -315,9 +307,9 @@ cdef class Matcher:
|
||||||
"""Match a stream of documents, yielding them in turn.
|
"""Match a stream of documents, yielding them in turn.
|
||||||
|
|
||||||
docs (iterable): A stream of documents.
|
docs (iterable): A stream of documents.
|
||||||
batch_size (int): The number of documents to accumulate into a working set.
|
batch_size (int): Number of documents to accumulate into a working set.
|
||||||
n_threads (int): The number of threads with which to work on the buffer
|
n_threads (int): The number of threads with which to work on the buffer
|
||||||
in parallel, if the `Matcher` implementation supports multi-threading.
|
in parallel, if the implementation supports multi-threading.
|
||||||
YIELDS (Doc): Documents, in order.
|
YIELDS (Doc): Documents, in order.
|
||||||
"""
|
"""
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
|
@ -325,7 +317,7 @@ cdef class Matcher:
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
def __call__(self, Doc doc):
|
def __call__(self, Doc doc):
|
||||||
"""Find all token sequences matching the supplied patterns on the `Doc`.
|
"""Find all token sequences matching the supplied pattern.
|
||||||
|
|
||||||
doc (Doc): The document to match over.
|
doc (Doc): The document to match over.
|
||||||
RETURNS (list): A list of `(key, start, end)` tuples,
|
RETURNS (list): A list of `(key, start, end)` tuples,
|
||||||
|
@ -342,8 +334,8 @@ cdef class Matcher:
|
||||||
for token_i in range(doc.length):
|
for token_i in range(doc.length):
|
||||||
token = &doc.c[token_i]
|
token = &doc.c[token_i]
|
||||||
q = 0
|
q = 0
|
||||||
# Go over the open matches, extending or finalizing if able. Otherwise,
|
# Go over the open matches, extending or finalizing if able.
|
||||||
# we over-write them (q doesn't advance)
|
# Otherwise, we over-write them (q doesn't advance)
|
||||||
for state in partials:
|
for state in partials:
|
||||||
action = get_action(state.second, token)
|
action = get_action(state.second, token)
|
||||||
if action == PANIC:
|
if action == PANIC:
|
||||||
|
@ -356,8 +348,8 @@ cdef class Matcher:
|
||||||
|
|
||||||
if action == REPEAT:
|
if action == REPEAT:
|
||||||
# Leave the state in the queue, and advance to next slot
|
# Leave the state in the queue, and advance to next slot
|
||||||
# (i.e. we don't overwrite -- we want to greedily match more
|
# (i.e. we don't overwrite -- we want to greedily match
|
||||||
# pattern.
|
# more pattern.
|
||||||
q += 1
|
q += 1
|
||||||
elif action == REJECT:
|
elif action == REJECT:
|
||||||
pass
|
pass
|
||||||
|
@ -366,8 +358,8 @@ cdef class Matcher:
|
||||||
partials[q].second += 1
|
partials[q].second += 1
|
||||||
q += 1
|
q += 1
|
||||||
elif action in (ACCEPT, ACCEPT_PREV):
|
elif action in (ACCEPT, ACCEPT_PREV):
|
||||||
# TODO: What to do about patterns starting with ZERO? Need to
|
# TODO: What to do about patterns starting with ZERO? Need
|
||||||
# adjust the start position.
|
# to adjust the start position.
|
||||||
start = state.first
|
start = state.first
|
||||||
end = token_i+1 if action == ACCEPT else token_i
|
end = token_i+1 if action == ACCEPT else token_i
|
||||||
ent_id = state.second[1].attrs[0].value
|
ent_id = state.second[1].attrs[0].value
|
||||||
|
@ -388,8 +380,8 @@ cdef class Matcher:
|
||||||
state.second = pattern
|
state.second = pattern
|
||||||
partials.push_back(state)
|
partials.push_back(state)
|
||||||
elif action == ADVANCE:
|
elif action == ADVANCE:
|
||||||
# TODO: What to do about patterns starting with ZERO? Need to
|
# TODO: What to do about patterns starting with ZERO? Need
|
||||||
# adjust the start position.
|
# to adjust the start position.
|
||||||
state.first = token_i
|
state.first = token_i
|
||||||
state.second = pattern + 1
|
state.second = pattern + 1
|
||||||
partials.push_back(state)
|
partials.push_back(state)
|
||||||
|
@ -413,7 +405,6 @@ cdef class Matcher:
|
||||||
on_match = self._callbacks.get(ent_id)
|
on_match = self._callbacks.get(ent_id)
|
||||||
if on_match is not None:
|
if on_match is not None:
|
||||||
on_match(self, doc, i, matches)
|
on_match(self, doc, i, matches)
|
||||||
# TODO: only return (match_id, start, end)
|
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
def _normalize_key(self, key):
|
def _normalize_key(self, key):
|
||||||
|
@ -441,7 +432,8 @@ def get_bilou(length):
|
||||||
elif length == 8:
|
elif length == 8:
|
||||||
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
|
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
|
||||||
elif length == 9:
|
elif length == 9:
|
||||||
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT]
|
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
|
||||||
|
L9_ENT]
|
||||||
elif length == 10:
|
elif length == 10:
|
||||||
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
|
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
|
||||||
I10_ENT, I10_ENT, L10_ENT]
|
I10_ENT, I10_ENT, L10_ENT]
|
||||||
|
@ -454,10 +446,8 @@ cdef class PhraseMatcher:
|
||||||
cdef Vocab vocab
|
cdef Vocab vocab
|
||||||
cdef Matcher matcher
|
cdef Matcher matcher
|
||||||
cdef PreshMap phrase_ids
|
cdef PreshMap phrase_ids
|
||||||
|
|
||||||
cdef int max_length
|
cdef int max_length
|
||||||
cdef attr_t* _phrase_key
|
cdef attr_t* _phrase_key
|
||||||
|
|
||||||
cdef public object _callbacks
|
cdef public object _callbacks
|
||||||
cdef public object _patterns
|
cdef public object _patterns
|
||||||
|
|
||||||
|
@ -470,7 +460,8 @@ cdef class PhraseMatcher:
|
||||||
self.phrase_ids = PreshMap()
|
self.phrase_ids = PreshMap()
|
||||||
abstract_patterns = []
|
abstract_patterns = []
|
||||||
for length in range(1, max_length):
|
for length in range(1, max_length):
|
||||||
abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
|
abstract_patterns.append([{tag: True}
|
||||||
|
for tag in get_bilou(length)])
|
||||||
self.matcher.add('Candidate', None, *abstract_patterns)
|
self.matcher.add('Candidate', None, *abstract_patterns)
|
||||||
self._callbacks = {}
|
self._callbacks = {}
|
||||||
|
|
||||||
|
@ -496,8 +487,8 @@ cdef class PhraseMatcher:
|
||||||
return (self.__class__, (self.vocab,), None, None)
|
return (self.__class__, (self.vocab,), None, None)
|
||||||
|
|
||||||
def add(self, key, on_match, *docs):
|
def add(self, key, on_match, *docs):
|
||||||
"""Add a match-rule to the matcher. A match-rule consists of: an ID key,
|
"""Add a match-rule to the matcher. A match-rule consists of: an ID
|
||||||
an on_match callback, and one or more patterns.
|
key, an on_match callback, and one or more patterns.
|
||||||
|
|
||||||
key (unicode): The match ID.
|
key (unicode): The match ID.
|
||||||
on_match (callable): Callback executed on match.
|
on_match (callable): Callback executed on match.
|
||||||
|
@ -513,7 +504,6 @@ cdef class PhraseMatcher:
|
||||||
raise ValueError(msg % (len(doc), self.max_length))
|
raise ValueError(msg % (len(doc), self.max_length))
|
||||||
cdef hash_t ent_id = self.matcher._normalize_key(key)
|
cdef hash_t ent_id = self.matcher._normalize_key(key)
|
||||||
self._callbacks[ent_id] = on_match
|
self._callbacks[ent_id] = on_match
|
||||||
|
|
||||||
cdef int length
|
cdef int length
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef hash_t phrase_hash
|
cdef hash_t phrase_hash
|
||||||
|
@ -553,9 +543,9 @@ cdef class PhraseMatcher:
|
||||||
"""Match a stream of documents, yielding them in turn.
|
"""Match a stream of documents, yielding them in turn.
|
||||||
|
|
||||||
docs (iterable): A stream of documents.
|
docs (iterable): A stream of documents.
|
||||||
batch_size (int): The number of documents to accumulate into a working set.
|
batch_size (int): Number of documents to accumulate into a working set.
|
||||||
n_threads (int): The number of threads with which to work on the buffer
|
n_threads (int): The number of threads with which to work on the buffer
|
||||||
in parallel, if the `Matcher` implementation supports multi-threading.
|
in parallel, if the implementation supports multi-threading.
|
||||||
YIELDS (Doc): Documents, in order.
|
YIELDS (Doc): Documents, in order.
|
||||||
"""
|
"""
|
||||||
for doc in stream:
|
for doc in stream:
|
||||||
|
@ -569,7 +559,8 @@ cdef class PhraseMatcher:
|
||||||
self._phrase_key[i] = 0
|
self._phrase_key[i] = 0
|
||||||
for i, j in enumerate(range(start, end)):
|
for i, j in enumerate(range(start, end)):
|
||||||
self._phrase_key[i] = doc.c[j].lex.orth
|
self._phrase_key[i] = doc.c[j].lex.orth
|
||||||
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
|
cdef hash_t key = hash64(self._phrase_key,
|
||||||
|
self.max_length * sizeof(attr_t), 0)
|
||||||
ent_id = <hash_t>self.phrase_ids.get(key)
|
ent_id = <hash_t>self.phrase_ids.get(key)
|
||||||
if ent_id == 0:
|
if ent_id == 0:
|
||||||
return None
|
return None
|
||||||
|
|
|
@ -4,17 +4,15 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
|
|
||||||
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
|
|
||||||
from .attrs cimport POS, IS_SPACE
|
from .attrs cimport POS, IS_SPACE
|
||||||
|
from .attrs import LEMMA, intify_attrs
|
||||||
|
from .parts_of_speech cimport SPACE
|
||||||
from .parts_of_speech import IDS as POS_IDS
|
from .parts_of_speech import IDS as POS_IDS
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
from .attrs import LEMMA, intify_attrs
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize_props(props):
|
def _normalize_props(props):
|
||||||
"""
|
"""Transform deprecated string keys to correct names."""
|
||||||
Transform deprecated string keys to correct names.
|
|
||||||
"""
|
|
||||||
out = {}
|
out = {}
|
||||||
for key, value in props.items():
|
for key, value in props.items():
|
||||||
if key == POS:
|
if key == POS:
|
||||||
|
@ -77,7 +75,8 @@ cdef class Morphology:
|
||||||
cdef int assign_untagged(self, TokenC* token) except -1:
|
cdef int assign_untagged(self, TokenC* token) except -1:
|
||||||
"""Set morphological attributes on a token without a POS tag. Uses
|
"""Set morphological attributes on a token without a POS tag. Uses
|
||||||
the lemmatizer's lookup() method, which looks up the string in the
|
the lemmatizer's lookup() method, which looks up the string in the
|
||||||
table provided by the language data as lemma_lookup (if available)."""
|
table provided by the language data as lemma_lookup (if available).
|
||||||
|
"""
|
||||||
if token.lemma == 0:
|
if token.lemma == 0:
|
||||||
orth_str = self.strings[token.lex.orth]
|
orth_str = self.strings[token.lex.orth]
|
||||||
lemma = self.lemmatizer.lookup(orth_str)
|
lemma = self.lemmatizer.lookup(orth_str)
|
||||||
|
@ -95,11 +94,10 @@ cdef class Morphology:
|
||||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||||
if tag_id > self.n_tags:
|
if tag_id > self.n_tags:
|
||||||
raise ValueError("Unknown tag ID: %s" % tag_id)
|
raise ValueError("Unknown tag ID: %s" % tag_id)
|
||||||
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
|
# TODO: It's pretty arbitrary to put this logic here. I guess the
|
||||||
# is that this is where the specific word and the tag interact. Still,
|
# justification is that this is where the specific word and the tag
|
||||||
# we should have a better way to enforce this rule, or figure out why
|
# interact. Still, we should have a better way to enforce this rule, or
|
||||||
# the statistical model fails.
|
# figure out why the statistical model fails. Related to Issue #220
|
||||||
# Related to Issue #220
|
|
||||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||||
tag_id = self.reverse_index[self.strings.add('_SP')]
|
tag_id = self.reverse_index[self.strings.add('_SP')]
|
||||||
rich_tag = self.rich_tags[tag_id]
|
rich_tag = self.rich_tags[tag_id]
|
||||||
|
@ -123,14 +121,13 @@ cdef class Morphology:
|
||||||
else:
|
else:
|
||||||
flags[0] &= ~(one << flag_id)
|
flags[0] &= ~(one << flag_id)
|
||||||
|
|
||||||
def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
|
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
|
||||||
"""
|
force=False):
|
||||||
Add a special-case rule to the morphological analyser. Tokens whose
|
"""Add a special-case rule to the morphological analyser. Tokens whose
|
||||||
tag and orth match the rule will receive the specified properties.
|
tag and orth match the rule will receive the specified properties.
|
||||||
|
|
||||||
Arguments:
|
tag (unicode): The part-of-speech tag to key the exception.
|
||||||
tag (unicode): The part-of-speech tag to key the exception.
|
orth (unicode): The word-form to key the exception.
|
||||||
orth (unicode): The word-form to key the exception.
|
|
||||||
"""
|
"""
|
||||||
self.exc[(tag_str, orth_str)] = dict(attrs)
|
self.exc[(tag_str, orth_str)] = dict(attrs)
|
||||||
tag = self.strings.add(tag_str)
|
tag = self.strings.add(tag_str)
|
||||||
|
@ -144,10 +141,9 @@ cdef class Morphology:
|
||||||
elif force:
|
elif force:
|
||||||
memset(cached, 0, sizeof(cached[0]))
|
memset(cached, 0, sizeof(cached[0]))
|
||||||
else:
|
else:
|
||||||
msg = ("Conflicting morphology exception for (%s, %s). Use force=True "
|
raise ValueError(
|
||||||
"to overwrite.")
|
"Conflicting morphology exception for (%s, %s). Use "
|
||||||
msg = msg % (tag_str, orth_str)
|
"force=True to overwrite." % (tag_str, orth_str))
|
||||||
raise ValueError(msg)
|
|
||||||
|
|
||||||
cached.tag = rich_tag
|
cached.tag = rich_tag
|
||||||
# TODO: Refactor this to take arbitrary attributes.
|
# TODO: Refactor this to take arbitrary attributes.
|
||||||
|
@ -218,7 +214,7 @@ IDS = {
|
||||||
"Definite_two": Definite_two,
|
"Definite_two": Definite_two,
|
||||||
"Definite_def": Definite_def,
|
"Definite_def": Definite_def,
|
||||||
"Definite_red": Definite_red,
|
"Definite_red": Definite_red,
|
||||||
"Definite_cons": Definite_cons, # U20
|
"Definite_cons": Definite_cons, # U20
|
||||||
"Definite_ind": Definite_ind,
|
"Definite_ind": Definite_ind,
|
||||||
"Degree_cmp": Degree_cmp,
|
"Degree_cmp": Degree_cmp,
|
||||||
"Degree_comp": Degree_comp,
|
"Degree_comp": Degree_comp,
|
||||||
|
@ -227,7 +223,7 @@ IDS = {
|
||||||
"Degree_sup": Degree_sup,
|
"Degree_sup": Degree_sup,
|
||||||
"Degree_abs": Degree_abs,
|
"Degree_abs": Degree_abs,
|
||||||
"Degree_com": Degree_com,
|
"Degree_com": Degree_com,
|
||||||
"Degree_dim ": Degree_dim, # du
|
"Degree_dim ": Degree_dim, # du
|
||||||
"Gender_com": Gender_com,
|
"Gender_com": Gender_com,
|
||||||
"Gender_fem": Gender_fem,
|
"Gender_fem": Gender_fem,
|
||||||
"Gender_masc": Gender_masc,
|
"Gender_masc": Gender_masc,
|
||||||
|
@ -242,15 +238,15 @@ IDS = {
|
||||||
"Negative_neg": Negative_neg,
|
"Negative_neg": Negative_neg,
|
||||||
"Negative_pos": Negative_pos,
|
"Negative_pos": Negative_pos,
|
||||||
"Negative_yes": Negative_yes,
|
"Negative_yes": Negative_yes,
|
||||||
"Polarity_neg": Polarity_neg, # U20
|
"Polarity_neg": Polarity_neg, # U20
|
||||||
"Polarity_pos": Polarity_pos, # U20
|
"Polarity_pos": Polarity_pos, # U20
|
||||||
"Number_com": Number_com,
|
"Number_com": Number_com,
|
||||||
"Number_dual": Number_dual,
|
"Number_dual": Number_dual,
|
||||||
"Number_none": Number_none,
|
"Number_none": Number_none,
|
||||||
"Number_plur": Number_plur,
|
"Number_plur": Number_plur,
|
||||||
"Number_sing": Number_sing,
|
"Number_sing": Number_sing,
|
||||||
"Number_ptan ": Number_ptan, # bg
|
"Number_ptan ": Number_ptan, # bg
|
||||||
"Number_count ": Number_count, # bg
|
"Number_count ": Number_count, # bg
|
||||||
"NumType_card": NumType_card,
|
"NumType_card": NumType_card,
|
||||||
"NumType_dist": NumType_dist,
|
"NumType_dist": NumType_dist,
|
||||||
"NumType_frac": NumType_frac,
|
"NumType_frac": NumType_frac,
|
||||||
|
@ -276,7 +272,7 @@ IDS = {
|
||||||
"PronType_rel": PronType_rel,
|
"PronType_rel": PronType_rel,
|
||||||
"PronType_tot": PronType_tot,
|
"PronType_tot": PronType_tot,
|
||||||
"PronType_clit": PronType_clit,
|
"PronType_clit": PronType_clit,
|
||||||
"PronType_exc ": PronType_exc, # es, ca, it, fa,
|
"PronType_exc ": PronType_exc, # es, ca, it, fa,
|
||||||
"Reflex_yes": Reflex_yes,
|
"Reflex_yes": Reflex_yes,
|
||||||
"Tense_fut": Tense_fut,
|
"Tense_fut": Tense_fut,
|
||||||
"Tense_imp": Tense_imp,
|
"Tense_imp": Tense_imp,
|
||||||
|
@ -292,19 +288,19 @@ IDS = {
|
||||||
"VerbForm_partPres": VerbForm_partPres,
|
"VerbForm_partPres": VerbForm_partPres,
|
||||||
"VerbForm_sup": VerbForm_sup,
|
"VerbForm_sup": VerbForm_sup,
|
||||||
"VerbForm_trans": VerbForm_trans,
|
"VerbForm_trans": VerbForm_trans,
|
||||||
"VerbForm_conv": VerbForm_conv, # U20
|
"VerbForm_conv": VerbForm_conv, # U20
|
||||||
"VerbForm_gdv ": VerbForm_gdv, # la,
|
"VerbForm_gdv ": VerbForm_gdv, # la,
|
||||||
"Voice_act": Voice_act,
|
"Voice_act": Voice_act,
|
||||||
"Voice_cau": Voice_cau,
|
"Voice_cau": Voice_cau,
|
||||||
"Voice_pass": Voice_pass,
|
"Voice_pass": Voice_pass,
|
||||||
"Voice_mid ": Voice_mid, # gkc,
|
"Voice_mid ": Voice_mid, # gkc,
|
||||||
"Voice_int ": Voice_int, # hb,
|
"Voice_int ": Voice_int, # hb,
|
||||||
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
|
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
|
||||||
"AdpType_prep ": AdpType_prep, # cz, U,
|
"AdpType_prep ": AdpType_prep, # cz, U,
|
||||||
"AdpType_post ": AdpType_post, # U,
|
"AdpType_post ": AdpType_post, # U,
|
||||||
"AdpType_voc ": AdpType_voc, # cz,
|
"AdpType_voc ": AdpType_voc, # cz,
|
||||||
"AdpType_comprep ": AdpType_comprep, # cz,
|
"AdpType_comprep ": AdpType_comprep, # cz,
|
||||||
"AdpType_circ ": AdpType_circ, # U,
|
"AdpType_circ ": AdpType_circ, # U,
|
||||||
"AdvType_man": AdvType_man,
|
"AdvType_man": AdvType_man,
|
||||||
"AdvType_loc": AdvType_loc,
|
"AdvType_loc": AdvType_loc,
|
||||||
"AdvType_tim": AdvType_tim,
|
"AdvType_tim": AdvType_tim,
|
||||||
|
@ -314,122 +310,122 @@ IDS = {
|
||||||
"AdvType_sta": AdvType_sta,
|
"AdvType_sta": AdvType_sta,
|
||||||
"AdvType_ex": AdvType_ex,
|
"AdvType_ex": AdvType_ex,
|
||||||
"AdvType_adadj": AdvType_adadj,
|
"AdvType_adadj": AdvType_adadj,
|
||||||
"ConjType_oper ": ConjType_oper, # cz, U,
|
"ConjType_oper ": ConjType_oper, # cz, U,
|
||||||
"ConjType_comp ": ConjType_comp, # cz, U,
|
"ConjType_comp ": ConjType_comp, # cz, U,
|
||||||
"Connegative_yes ": Connegative_yes, # fi,
|
"Connegative_yes ": Connegative_yes, # fi,
|
||||||
"Derivation_minen ": Derivation_minen, # fi,
|
"Derivation_minen ": Derivation_minen, # fi,
|
||||||
"Derivation_sti ": Derivation_sti, # fi,
|
"Derivation_sti ": Derivation_sti, # fi,
|
||||||
"Derivation_inen ": Derivation_inen, # fi,
|
"Derivation_inen ": Derivation_inen, # fi,
|
||||||
"Derivation_lainen ": Derivation_lainen, # fi,
|
"Derivation_lainen ": Derivation_lainen, # fi,
|
||||||
"Derivation_ja ": Derivation_ja, # fi,
|
"Derivation_ja ": Derivation_ja, # fi,
|
||||||
"Derivation_ton ": Derivation_ton, # fi,
|
"Derivation_ton ": Derivation_ton, # fi,
|
||||||
"Derivation_vs ": Derivation_vs, # fi,
|
"Derivation_vs ": Derivation_vs, # fi,
|
||||||
"Derivation_ttain ": Derivation_ttain, # fi,
|
"Derivation_ttain ": Derivation_ttain, # fi,
|
||||||
"Derivation_ttaa ": Derivation_ttaa, # fi,
|
"Derivation_ttaa ": Derivation_ttaa, # fi,
|
||||||
"Echo_rdp ": Echo_rdp, # U,
|
"Echo_rdp ": Echo_rdp, # U,
|
||||||
"Echo_ech ": Echo_ech, # U,
|
"Echo_ech ": Echo_ech, # U,
|
||||||
"Foreign_foreign ": Foreign_foreign, # cz, fi, U,
|
"Foreign_foreign ": Foreign_foreign, # cz, fi, U,
|
||||||
"Foreign_fscript ": Foreign_fscript, # cz, fi, U,
|
"Foreign_fscript ": Foreign_fscript, # cz, fi, U,
|
||||||
"Foreign_tscript ": Foreign_tscript, # cz, U,
|
"Foreign_tscript ": Foreign_tscript, # cz, U,
|
||||||
"Foreign_yes ": Foreign_yes, # sl,
|
"Foreign_yes ": Foreign_yes, # sl,
|
||||||
"Gender_dat_masc ": Gender_dat_masc, # bq, U,
|
"Gender_dat_masc ": Gender_dat_masc, # bq, U,
|
||||||
"Gender_dat_fem ": Gender_dat_fem, # bq, U,
|
"Gender_dat_fem ": Gender_dat_fem, # bq, U,
|
||||||
"Gender_erg_masc ": Gender_erg_masc, # bq,
|
"Gender_erg_masc ": Gender_erg_masc, # bq,
|
||||||
"Gender_erg_fem ": Gender_erg_fem, # bq,
|
"Gender_erg_fem ": Gender_erg_fem, # bq,
|
||||||
"Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
|
"Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
|
||||||
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
|
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
|
||||||
"Gender_psor_neut ": Gender_psor_neut, # sl,
|
"Gender_psor_neut ": Gender_psor_neut, # sl,
|
||||||
"Hyph_yes ": Hyph_yes, # cz, U,
|
"Hyph_yes ": Hyph_yes, # cz, U,
|
||||||
"InfForm_one ": InfForm_one, # fi,
|
"InfForm_one ": InfForm_one, # fi,
|
||||||
"InfForm_two ": InfForm_two, # fi,
|
"InfForm_two ": InfForm_two, # fi,
|
||||||
"InfForm_three ": InfForm_three, # fi,
|
"InfForm_three ": InfForm_three, # fi,
|
||||||
"NameType_geo ": NameType_geo, # U, cz,
|
"NameType_geo ": NameType_geo, # U, cz,
|
||||||
"NameType_prs ": NameType_prs, # U, cz,
|
"NameType_prs ": NameType_prs, # U, cz,
|
||||||
"NameType_giv ": NameType_giv, # U, cz,
|
"NameType_giv ": NameType_giv, # U, cz,
|
||||||
"NameType_sur ": NameType_sur, # U, cz,
|
"NameType_sur ": NameType_sur, # U, cz,
|
||||||
"NameType_nat ": NameType_nat, # U, cz,
|
"NameType_nat ": NameType_nat, # U, cz,
|
||||||
"NameType_com ": NameType_com, # U, cz,
|
"NameType_com ": NameType_com, # U, cz,
|
||||||
"NameType_pro ": NameType_pro, # U, cz,
|
"NameType_pro ": NameType_pro, # U, cz,
|
||||||
"NameType_oth ": NameType_oth, # U, cz,
|
"NameType_oth ": NameType_oth, # U, cz,
|
||||||
"NounType_com ": NounType_com, # U,
|
"NounType_com ": NounType_com, # U,
|
||||||
"NounType_prop ": NounType_prop, # U,
|
"NounType_prop ": NounType_prop, # U,
|
||||||
"NounType_class ": NounType_class, # U,
|
"NounType_class ": NounType_class, # U,
|
||||||
"Number_abs_sing ": Number_abs_sing, # bq, U,
|
"Number_abs_sing ": Number_abs_sing, # bq, U,
|
||||||
"Number_abs_plur ": Number_abs_plur, # bq, U,
|
"Number_abs_plur ": Number_abs_plur, # bq, U,
|
||||||
"Number_dat_sing ": Number_dat_sing, # bq, U,
|
"Number_dat_sing ": Number_dat_sing, # bq, U,
|
||||||
"Number_dat_plur ": Number_dat_plur, # bq, U,
|
"Number_dat_plur ": Number_dat_plur, # bq, U,
|
||||||
"Number_erg_sing ": Number_erg_sing, # bq, U,
|
"Number_erg_sing ": Number_erg_sing, # bq, U,
|
||||||
"Number_erg_plur ": Number_erg_plur, # bq, U,
|
"Number_erg_plur ": Number_erg_plur, # bq, U,
|
||||||
"Number_psee_sing ": Number_psee_sing, # U,
|
"Number_psee_sing ": Number_psee_sing, # U,
|
||||||
"Number_psee_plur ": Number_psee_plur, # U,
|
"Number_psee_plur ": Number_psee_plur, # U,
|
||||||
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
|
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
|
||||||
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
|
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
|
||||||
"NumForm_digit ": NumForm_digit, # cz, sl, U,
|
"NumForm_digit ": NumForm_digit, # cz, sl, U,
|
||||||
"NumForm_roman ": NumForm_roman, # cz, sl, U,
|
"NumForm_roman ": NumForm_roman, # cz, sl, U,
|
||||||
"NumForm_word ": NumForm_word, # cz, sl, U,
|
"NumForm_word ": NumForm_word, # cz, sl, U,
|
||||||
"NumValue_one ": NumValue_one, # cz, U,
|
"NumValue_one ": NumValue_one, # cz, U,
|
||||||
"NumValue_two ": NumValue_two, # cz, U,
|
"NumValue_two ": NumValue_two, # cz, U,
|
||||||
"NumValue_three ": NumValue_three, # cz, U,
|
"NumValue_three ": NumValue_three, # cz, U,
|
||||||
"PartForm_pres ": PartForm_pres, # fi,
|
"PartForm_pres ": PartForm_pres, # fi,
|
||||||
"PartForm_past ": PartForm_past, # fi,
|
"PartForm_past ": PartForm_past, # fi,
|
||||||
"PartForm_agt ": PartForm_agt, # fi,
|
"PartForm_agt ": PartForm_agt, # fi,
|
||||||
"PartForm_neg ": PartForm_neg, # fi,
|
"PartForm_neg ": PartForm_neg, # fi,
|
||||||
"PartType_mod ": PartType_mod, # U,
|
"PartType_mod ": PartType_mod, # U,
|
||||||
"PartType_emp ": PartType_emp, # U,
|
"PartType_emp ": PartType_emp, # U,
|
||||||
"PartType_res ": PartType_res, # U,
|
"PartType_res ": PartType_res, # U,
|
||||||
"PartType_inf ": PartType_inf, # U,
|
"PartType_inf ": PartType_inf, # U,
|
||||||
"PartType_vbp ": PartType_vbp, # U,
|
"PartType_vbp ": PartType_vbp, # U,
|
||||||
"Person_abs_one ": Person_abs_one, # bq, U,
|
"Person_abs_one ": Person_abs_one, # bq, U,
|
||||||
"Person_abs_two ": Person_abs_two, # bq, U,
|
"Person_abs_two ": Person_abs_two, # bq, U,
|
||||||
"Person_abs_three ": Person_abs_three, # bq, U,
|
"Person_abs_three ": Person_abs_three, # bq, U,
|
||||||
"Person_dat_one ": Person_dat_one, # bq, U,
|
"Person_dat_one ": Person_dat_one, # bq, U,
|
||||||
"Person_dat_two ": Person_dat_two, # bq, U,
|
"Person_dat_two ": Person_dat_two, # bq, U,
|
||||||
"Person_dat_three ": Person_dat_three, # bq, U,
|
"Person_dat_three ": Person_dat_three, # bq, U,
|
||||||
"Person_erg_one ": Person_erg_one, # bq, U,
|
"Person_erg_one ": Person_erg_one, # bq, U,
|
||||||
"Person_erg_two ": Person_erg_two, # bq, U,
|
"Person_erg_two ": Person_erg_two, # bq, U,
|
||||||
"Person_erg_three ": Person_erg_three, # bq, U,
|
"Person_erg_three ": Person_erg_three, # bq, U,
|
||||||
"Person_psor_one ": Person_psor_one, # fi, U,
|
"Person_psor_one ": Person_psor_one, # fi, U,
|
||||||
"Person_psor_two ": Person_psor_two, # fi, U,
|
"Person_psor_two ": Person_psor_two, # fi, U,
|
||||||
"Person_psor_three ": Person_psor_three, # fi, U,
|
"Person_psor_three ": Person_psor_three, # fi, U,
|
||||||
"Polite_inf ": Polite_inf, # bq, U,
|
"Polite_inf ": Polite_inf, # bq, U,
|
||||||
"Polite_pol ": Polite_pol, # bq, U,
|
"Polite_pol ": Polite_pol, # bq, U,
|
||||||
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
|
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
|
||||||
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
|
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
|
||||||
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
|
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
|
||||||
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
|
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
|
||||||
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
|
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
|
||||||
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
|
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
|
||||||
"Prefix_yes ": Prefix_yes, # U,
|
"Prefix_yes ": Prefix_yes, # U,
|
||||||
"PrepCase_npr ": PrepCase_npr, # cz,
|
"PrepCase_npr ": PrepCase_npr, # cz,
|
||||||
"PrepCase_pre ": PrepCase_pre, # U,
|
"PrepCase_pre ": PrepCase_pre, # U,
|
||||||
"PunctSide_ini ": PunctSide_ini, # U,
|
"PunctSide_ini ": PunctSide_ini, # U,
|
||||||
"PunctSide_fin ": PunctSide_fin, # U,
|
"PunctSide_fin ": PunctSide_fin, # U,
|
||||||
"PunctType_peri ": PunctType_peri, # U,
|
"PunctType_peri ": PunctType_peri, # U,
|
||||||
"PunctType_qest ": PunctType_qest, # U,
|
"PunctType_qest ": PunctType_qest, # U,
|
||||||
"PunctType_excl ": PunctType_excl, # U,
|
"PunctType_excl ": PunctType_excl, # U,
|
||||||
"PunctType_quot ": PunctType_quot, # U,
|
"PunctType_quot ": PunctType_quot, # U,
|
||||||
"PunctType_brck ": PunctType_brck, # U,
|
"PunctType_brck ": PunctType_brck, # U,
|
||||||
"PunctType_comm ": PunctType_comm, # U,
|
"PunctType_comm ": PunctType_comm, # U,
|
||||||
"PunctType_colo ": PunctType_colo, # U,
|
"PunctType_colo ": PunctType_colo, # U,
|
||||||
"PunctType_semi ": PunctType_semi, # U,
|
"PunctType_semi ": PunctType_semi, # U,
|
||||||
"PunctType_dash ": PunctType_dash, # U,
|
"PunctType_dash ": PunctType_dash, # U,
|
||||||
"Style_arch ": Style_arch, # cz, fi, U,
|
"Style_arch ": Style_arch, # cz, fi, U,
|
||||||
"Style_rare ": Style_rare, # cz, fi, U,
|
"Style_rare ": Style_rare, # cz, fi, U,
|
||||||
"Style_poet ": Style_poet, # cz, U,
|
"Style_poet ": Style_poet, # cz, U,
|
||||||
"Style_norm ": Style_norm, # cz, U,
|
"Style_norm ": Style_norm, # cz, U,
|
||||||
"Style_coll ": Style_coll, # cz, U,
|
"Style_coll ": Style_coll, # cz, U,
|
||||||
"Style_vrnc ": Style_vrnc, # cz, U,
|
"Style_vrnc ": Style_vrnc, # cz, U,
|
||||||
"Style_sing ": Style_sing, # cz, U,
|
"Style_sing ": Style_sing, # cz, U,
|
||||||
"Style_expr ": Style_expr, # cz, U,
|
"Style_expr ": Style_expr, # cz, U,
|
||||||
"Style_derg ": Style_derg, # cz, U,
|
"Style_derg ": Style_derg, # cz, U,
|
||||||
"Style_vulg ": Style_vulg, # cz, U,
|
"Style_vulg ": Style_vulg, # cz, U,
|
||||||
"Style_yes ": Style_yes, # fi, U,
|
"Style_yes ": Style_yes, # fi, U,
|
||||||
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
|
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
|
||||||
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
|
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
|
||||||
"VerbType_aux ": VerbType_aux, # U,
|
"VerbType_aux ": VerbType_aux, # U,
|
||||||
"VerbType_cop ": VerbType_cop, # U,
|
"VerbType_cop ": VerbType_cop, # U,
|
||||||
"VerbType_mod ": VerbType_mod, # U,
|
"VerbType_mod ": VerbType_mod, # U,
|
||||||
"VerbType_light ": VerbType_light, # U,
|
"VerbType_light ": VerbType_light, # U,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ IDS = {
|
||||||
"ADP": ADP,
|
"ADP": ADP,
|
||||||
"ADV": ADV,
|
"ADV": ADV,
|
||||||
"AUX": AUX,
|
"AUX": AUX,
|
||||||
"CONJ": CONJ, # U20
|
"CONJ": CONJ, # U20
|
||||||
"CCONJ": CCONJ,
|
"CCONJ": CCONJ,
|
||||||
"DET": DET,
|
"DET": DET,
|
||||||
"INTJ": INTJ,
|
"INTJ": INTJ,
|
||||||
|
|
|
@ -3,26 +3,17 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from thinc.api import chain, layerize, with_getitem
|
|
||||||
import numpy
|
import numpy
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
import cytoolz
|
import cytoolz
|
||||||
import util
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
import ujson
|
import ujson
|
||||||
import msgpack
|
import msgpack
|
||||||
|
|
||||||
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
from thinc.api import chain
|
||||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
|
from thinc.v2v import Softmax
|
||||||
from thinc.i2v import HashEmbed
|
from thinc.t2v import Pooling, max_pool, mean_pool
|
||||||
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
|
|
||||||
from thinc.t2t import ExtractWindow, ParametricAttention
|
|
||||||
from thinc.misc import Residual
|
|
||||||
from thinc.misc import BatchNorm as BN
|
|
||||||
from thinc.misc import LayerNorm as LN
|
|
||||||
|
|
||||||
from thinc.neural.util import to_categorical
|
from thinc.neural.util import to_categorical
|
||||||
|
|
||||||
from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
||||||
|
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
|
@ -30,29 +21,23 @@ from .syntax.nn_parser cimport Parser
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .syntax.ner cimport BiluoPushDown
|
from .syntax.ner cimport BiluoPushDown
|
||||||
from .syntax.arc_eager cimport ArcEager
|
from .syntax.arc_eager cimport ArcEager
|
||||||
from .tagger import Tagger
|
|
||||||
from .syntax.stateclass cimport StateClass
|
|
||||||
from .gold cimport GoldParse
|
|
||||||
from .morphology cimport Morphology
|
from .morphology cimport Morphology
|
||||||
from .vocab cimport Vocab
|
from .vocab cimport Vocab
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .compat import json_dumps
|
from .compat import json_dumps
|
||||||
|
|
||||||
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
|
from .attrs import POS
|
||||||
from ._ml import rebatch, Tok2Vec, flatten
|
|
||||||
from ._ml import build_text_classifier, build_tagger_model
|
|
||||||
from ._ml import link_vectors_to_models
|
|
||||||
from .parts_of_speech import X
|
from .parts_of_speech import X
|
||||||
|
from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
|
||||||
|
from ._ml import link_vectors_to_models
|
||||||
|
from . import util
|
||||||
|
|
||||||
|
|
||||||
class SentenceSegmenter(object):
|
class SentenceSegmenter(object):
|
||||||
"""A simple spaCy hook, to allow custom sentence boundary detection logic
|
"""A simple spaCy hook, to allow custom sentence boundary detection logic
|
||||||
(that doesn't require the dependency parse).
|
(that doesn't require the dependency parse). To change the sentence
|
||||||
|
boundary detection strategy, pass a generator function `strategy` on
|
||||||
To change the sentence boundary detection strategy, pass a generator
|
initialization, or assign a new strategy to the .strategy attribute.
|
||||||
function `strategy` on initialization, or assign a new strategy to
|
|
||||||
the .strategy attribute.
|
|
||||||
|
|
||||||
Sentence detection strategies should be generators that take `Doc` objects
|
Sentence detection strategies should be generators that take `Doc` objects
|
||||||
and yield `Span` objects for each sentence.
|
and yield `Span` objects for each sentence.
|
||||||
"""
|
"""
|
||||||
|
@ -74,16 +59,20 @@ class SentenceSegmenter(object):
|
||||||
seen_period = False
|
seen_period = False
|
||||||
for i, word in enumerate(doc):
|
for i, word in enumerate(doc):
|
||||||
if seen_period and not word.is_punct:
|
if seen_period and not word.is_punct:
|
||||||
yield doc[start : word.i]
|
yield doc[start:word.i]
|
||||||
start = word.i
|
start = word.i
|
||||||
seen_period = False
|
seen_period = False
|
||||||
elif word.text in ['.', '!', '?']:
|
elif word.text in ['.', '!', '?']:
|
||||||
seen_period = True
|
seen_period = True
|
||||||
if start < len(doc):
|
if start < len(doc):
|
||||||
yield doc[start : len(doc)]
|
yield doc[start:len(doc)]
|
||||||
|
|
||||||
|
|
||||||
class Pipe(object):
|
class Pipe(object):
|
||||||
|
"""This class is not instantiated directly. Components inherit from it, and
|
||||||
|
it defines the interface that components should follow to function as
|
||||||
|
components in a spaCy analysis pipeline.
|
||||||
|
"""
|
||||||
name = None
|
name = None
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -149,8 +138,7 @@ class Pipe(object):
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
||||||
def use_params(self, params):
|
def use_params(self, params):
|
||||||
"""Modify the pipe's model, to use the given parameter values.
|
"""Modify the pipe's model, to use the given parameter values."""
|
||||||
"""
|
|
||||||
with self.model.use_params(params):
|
with self.model.use_params(params):
|
||||||
yield
|
yield
|
||||||
|
|
||||||
|
@ -235,8 +223,8 @@ class Tensorizer(Pipe):
|
||||||
"""Construct a new statistical model. Weights are not allocated on
|
"""Construct a new statistical model. Weights are not allocated on
|
||||||
initialisation.
|
initialisation.
|
||||||
|
|
||||||
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
|
vocab (Vocab): A `Vocab` instance. The model must share the same
|
||||||
instance with the `Doc` objects it will process.
|
`Vocab` instance with the `Doc` objects it will process.
|
||||||
model (Model): A `Model` instance or `True` to allocate one later.
|
model (Model): A `Model` instance or `True` to allocate one later.
|
||||||
**cfg: Config parameters.
|
**cfg: Config parameters.
|
||||||
|
|
||||||
|
@ -280,7 +268,7 @@ class Tensorizer(Pipe):
|
||||||
"""Return a single tensor for a batch of documents.
|
"""Return a single tensor for a batch of documents.
|
||||||
|
|
||||||
docs (iterable): A sequence of `Doc` objects.
|
docs (iterable): A sequence of `Doc` objects.
|
||||||
RETURNS (object): Vector representations for each token in the documents.
|
RETURNS (object): Vector representations for each token in the docs.
|
||||||
"""
|
"""
|
||||||
tokvecs = self.model(docs)
|
tokvecs = self.model(docs)
|
||||||
return tokvecs
|
return tokvecs
|
||||||
|
@ -289,7 +277,7 @@ class Tensorizer(Pipe):
|
||||||
"""Set the tensor attribute for a batch of documents.
|
"""Set the tensor attribute for a batch of documents.
|
||||||
|
|
||||||
docs (iterable): A sequence of `Doc` objects.
|
docs (iterable): A sequence of `Doc` objects.
|
||||||
tokvecs (object): Vector representation for each token in the documents.
|
tokvecs (object): Vector representation for each token in the docs.
|
||||||
"""
|
"""
|
||||||
for doc, tokvecs in zip(docs, tokvecses):
|
for doc, tokvecs in zip(docs, tokvecses):
|
||||||
assert tokvecs.shape[0] == len(doc)
|
assert tokvecs.shape[0] == len(doc)
|
||||||
|
@ -328,12 +316,14 @@ class Tensorizer(Pipe):
|
||||||
|
|
||||||
class Tagger(Pipe):
|
class Tagger(Pipe):
|
||||||
name = 'tagger'
|
name = 'tagger'
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
||||||
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
|
self.cfg.setdefault('pretrained_dims',
|
||||||
|
self.vocab.vectors.data.shape[1])
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
tags = self.predict([doc])
|
tags = self.predict([doc])
|
||||||
|
@ -353,8 +343,7 @@ class Tagger(Pipe):
|
||||||
guesses = scores.argmax(axis=1)
|
guesses = scores.argmax(axis=1)
|
||||||
if not isinstance(guesses, numpy.ndarray):
|
if not isinstance(guesses, numpy.ndarray):
|
||||||
guesses = guesses.get()
|
guesses = guesses.get()
|
||||||
guesses = self.model.ops.unflatten(guesses,
|
guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs])
|
||||||
[len(d) for d in docs])
|
|
||||||
return guesses
|
return guesses
|
||||||
|
|
||||||
def set_annotations(self, docs, batch_tag_ids):
|
def set_annotations(self, docs, batch_tag_ids):
|
||||||
|
@ -387,8 +376,8 @@ class Tagger(Pipe):
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
scores = self.model.ops.flatten(scores)
|
scores = self.model.ops.flatten(scores)
|
||||||
tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)}
|
tag_index = {tag: i
|
||||||
|
for i, tag in enumerate(self.vocab.morphology.tag_names)}
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
correct = numpy.zeros((scores.shape[0],), dtype='i')
|
correct = numpy.zeros((scores.shape[0],), dtype='i')
|
||||||
guesses = scores.argmax(axis=1)
|
guesses = scores.argmax(axis=1)
|
||||||
|
@ -443,17 +432,18 @@ class Tagger(Pipe):
|
||||||
serialize['model'] = self.model.to_bytes
|
serialize['model'] = self.model.to_bytes
|
||||||
serialize['vocab'] = self.vocab.to_bytes
|
serialize['vocab'] = self.vocab.to_bytes
|
||||||
|
|
||||||
serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map,
|
serialize['tag_map'] = lambda: msgpack.dumps(
|
||||||
use_bin_type=True,
|
self.vocab.morphology.tag_map, use_bin_type=True, encoding='utf8')
|
||||||
encoding='utf8')
|
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
token_vector_width = util.env_opt('token_vector_width',
|
token_vector_width = util.env_opt(
|
||||||
self.cfg.get('token_vector_width', 128))
|
'token_vector_width',
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
self.cfg.get('token_vector_width', 128))
|
||||||
|
self.model = self.Model(self.vocab.morphology.n_tags,
|
||||||
|
**self.cfg)
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
|
||||||
def load_tag_map(b):
|
def load_tag_map(b):
|
||||||
|
@ -509,11 +499,11 @@ class Tagger(Pipe):
|
||||||
|
|
||||||
|
|
||||||
class MultitaskObjective(Tagger):
|
class MultitaskObjective(Tagger):
|
||||||
'''Assist training of a parser or tagger, by training a side-objective.
|
"""Experimental: Assist training of a parser or tagger, by training a
|
||||||
|
side-objective.
|
||||||
Experimental
|
"""
|
||||||
'''
|
|
||||||
name = 'nn_labeller'
|
name = 'nn_labeller'
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
|
def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
@ -530,12 +520,12 @@ class MultitaskObjective(Tagger):
|
||||||
elif hasattr(target, '__call__'):
|
elif hasattr(target, '__call__'):
|
||||||
self.make_label = target
|
self.make_label = target
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError("MultitaskObjective target should be function or "
|
||||||
"MultitaskObjective target should be function or one of "
|
"one of: dep, tag, ent, dep_tag_offset, ent_tag.")
|
||||||
"['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
|
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
||||||
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
|
self.cfg.setdefault('pretrained_dims',
|
||||||
|
self.vocab.vectors.data.shape[1])
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def labels(self):
|
def labels(self):
|
||||||
|
@ -623,20 +613,19 @@ class MultitaskObjective(Tagger):
|
||||||
|
|
||||||
class SimilarityHook(Pipe):
|
class SimilarityHook(Pipe):
|
||||||
"""
|
"""
|
||||||
Experimental
|
Experimental: A pipeline component to install a hook for supervised
|
||||||
|
similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
|
||||||
|
documents. The similarity model can be any object obeying the Thinc `Model`
|
||||||
|
interface. By default, the model concatenates the elementwise mean and
|
||||||
|
elementwise max of the two tensors, and compares them using the
|
||||||
|
Cauchy-like similarity function from Chen (2013):
|
||||||
|
|
||||||
A pipeline component to install a hook for supervised similarity into
|
>>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
|
||||||
Doc objects. Requires a Tensorizer to pre-process documents. The similarity
|
|
||||||
model can be any object obeying the Thinc Model interface. By default,
|
|
||||||
the model concatenates the elementwise mean and elementwise max of the two
|
|
||||||
tensors, and compares them using the Cauchy-like similarity function
|
|
||||||
from Chen (2013):
|
|
||||||
|
|
||||||
similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
|
|
||||||
|
|
||||||
Where W is a vector of dimension weights, initialized to 1.
|
Where W is a vector of dimension weights, initialized to 1.
|
||||||
"""
|
"""
|
||||||
name = 'similarity'
|
name = 'similarity'
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.model = model
|
self.model = model
|
||||||
|
@ -662,8 +651,7 @@ class SimilarityHook(Pipe):
|
||||||
sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
|
sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
|
||||||
|
|
||||||
def begin_training(self, _=tuple(), pipeline=None):
|
def begin_training(self, _=tuple(), pipeline=None):
|
||||||
"""
|
"""Allocate model, using width from tensorizer in pipeline.
|
||||||
Allocate model, using width from tensorizer in pipeline.
|
|
||||||
|
|
||||||
gold_tuples (iterable): Gold-standard training data.
|
gold_tuples (iterable): Gold-standard training data.
|
||||||
pipeline (list): The pipeline the model is part of.
|
pipeline (list): The pipeline the model is part of.
|
||||||
|
@ -763,12 +751,14 @@ cdef class DependencyParser(Parser):
|
||||||
for target in []:
|
for target in []:
|
||||||
labeller = MultitaskObjective(self.vocab, target=target)
|
labeller = MultitaskObjective(self.vocab, target=target)
|
||||||
tok2vec = self.model[0]
|
tok2vec = self.model[0]
|
||||||
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
|
labeller.begin_training(gold_tuples, pipeline=pipeline,
|
||||||
|
tok2vec=tok2vec)
|
||||||
pipeline.append(labeller)
|
pipeline.append(labeller)
|
||||||
self._multitasks.append(labeller)
|
self._multitasks.append(labeller)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (DependencyParser, (self.vocab, self.moves, self.model), None, None)
|
return (DependencyParser, (self.vocab, self.moves, self.model),
|
||||||
|
None, None)
|
||||||
|
|
||||||
|
|
||||||
cdef class EntityRecognizer(Parser):
|
cdef class EntityRecognizer(Parser):
|
||||||
|
@ -781,12 +771,14 @@ cdef class EntityRecognizer(Parser):
|
||||||
for target in []:
|
for target in []:
|
||||||
labeller = MultitaskObjective(self.vocab, target=target)
|
labeller = MultitaskObjective(self.vocab, target=target)
|
||||||
tok2vec = self.model[0]
|
tok2vec = self.model[0]
|
||||||
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
|
labeller.begin_training(gold_tuples, pipeline=pipeline,
|
||||||
|
tok2vec=tok2vec)
|
||||||
pipeline.append(labeller)
|
pipeline.append(labeller)
|
||||||
self._multitasks.append(labeller)
|
self._multitasks.append(labeller)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (EntityRecognizer, (self.vocab, self.moves, self.model), None, None)
|
return (EntityRecognizer, (self.vocab, self.moves, self.model),
|
||||||
|
None, None)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']
|
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']
|
||||||
|
|
|
@ -74,18 +74,21 @@ class Scorer(object):
|
||||||
@property
|
@property
|
||||||
def scores(self):
|
def scores(self):
|
||||||
return {
|
return {
|
||||||
'uas': self.uas, 'las': self.las,
|
'uas': self.uas,
|
||||||
'ents_p': self.ents_p, 'ents_r': self.ents_r, 'ents_f': self.ents_f,
|
'las': self.las,
|
||||||
|
'ents_p': self.ents_p,
|
||||||
|
'ents_r': self.ents_r,
|
||||||
|
'ents_f': self.ents_f,
|
||||||
'tags_acc': self.tags_acc,
|
'tags_acc': self.tags_acc,
|
||||||
'token_acc': self.token_acc
|
'token_acc': self.token_acc
|
||||||
}
|
}
|
||||||
|
|
||||||
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
|
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
|
||||||
assert len(tokens) == len(gold)
|
assert len(tokens) == len(gold)
|
||||||
|
|
||||||
gold_deps = set()
|
gold_deps = set()
|
||||||
gold_tags = set()
|
gold_tags = set()
|
||||||
gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
|
gold_ents = set(tags_to_entities([annot[-1]
|
||||||
|
for annot in gold.orig_annot]))
|
||||||
for id_, word, tag, head, dep, ner in gold.orig_annot:
|
for id_, word, tag, head, dep, ner in gold.orig_annot:
|
||||||
gold_tags.add((id_, tag))
|
gold_tags.add((id_, tag))
|
||||||
if dep not in (None, "") and dep.lower() not in punct_labels:
|
if dep not in (None, "") and dep.lower() not in punct_labels:
|
||||||
|
|
|
@ -4,19 +4,15 @@ from __future__ import unicode_literals, absolute_import
|
||||||
|
|
||||||
cimport cython
|
cimport cython
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
from libc.stdint cimport uint64_t, uint32_t
|
|
||||||
from murmurhash.mrmr cimport hash64, hash32
|
|
||||||
from preshed.maps cimport map_iter, key_t
|
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
|
from murmurhash.mrmr cimport hash64, hash32
|
||||||
import ujson
|
import ujson
|
||||||
import dill
|
|
||||||
|
|
||||||
from .symbols import IDS as SYMBOLS_BY_STR
|
from .symbols import IDS as SYMBOLS_BY_STR
|
||||||
from .symbols import NAMES as SYMBOLS_BY_INT
|
from .symbols import NAMES as SYMBOLS_BY_INT
|
||||||
|
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
from . import util
|
|
||||||
from .compat import json_dumps
|
from .compat import json_dumps
|
||||||
|
from . import util
|
||||||
|
|
||||||
|
|
||||||
cpdef hash_t hash_string(unicode string) except 0:
|
cpdef hash_t hash_string(unicode string) except 0:
|
||||||
|
@ -195,7 +191,7 @@ cdef class StringStore:
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
path (unicode or Path): A path to a directory, which will be created if
|
path (unicode or Path): A path to a directory, which will be created if
|
||||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
strings = list(self)
|
strings = list(self)
|
||||||
|
@ -225,7 +221,7 @@ cdef class StringStore:
|
||||||
**exclude: Named attributes to prevent from being serialized.
|
**exclude: Named attributes to prevent from being serialized.
|
||||||
RETURNS (bytes): The serialized form of the `StringStore` object.
|
RETURNS (bytes): The serialized form of the `StringStore` object.
|
||||||
"""
|
"""
|
||||||
return ujson.dumps(list(self))
|
return json_dumps(list(self))
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
"""Load state from a binary string.
|
"""Load state from a binary string.
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
#cython: optimize.unpack_method_calls=False
|
#cython: optimize.unpack_method_calls=False
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
IDS = {
|
IDS = {
|
||||||
"": NIL,
|
"": NIL,
|
||||||
"IS_ALPHA": IS_ALPHA,
|
"IS_ALPHA": IS_ALPHA,
|
||||||
|
@ -464,9 +464,11 @@ IDS = {
|
||||||
"LAW": LAW
|
"LAW": LAW
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def sort_nums(x):
|
def sort_nums(x):
|
||||||
return x[1]
|
return x[1]
|
||||||
|
|
||||||
|
|
||||||
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
|
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
|
||||||
# Unfortunate hack here, to work around problem with long cpdef enum
|
# Unfortunate hack here, to work around problem with long cpdef enum
|
||||||
# (which is generating an enormous amount of C++ in Cython 0.24+)
|
# (which is generating an enormous amount of C++ in Cython 0.24+)
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
import numpy
|
import numpy
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
from cpython.ref cimport PyObject, Py_XDECREF
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
from thinc.extra.search import MaxViolation
|
from thinc.extra.search import MaxViolation
|
||||||
from thinc.typedefs cimport hash_t, class_t
|
from thinc.typedefs cimport hash_t, class_t
|
||||||
|
@ -11,7 +11,6 @@ from thinc.extra.search cimport MaxViolation
|
||||||
from .transition_system cimport TransitionSystem, Transition
|
from .transition_system cimport TransitionSystem, Transition
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..tokens.doc cimport Doc
|
|
||||||
|
|
||||||
|
|
||||||
# These are passed as callbacks to thinc.search.Beam
|
# These are passed as callbacks to thinc.search.Beam
|
||||||
|
@ -50,7 +49,7 @@ cdef class ParserBeam(object):
|
||||||
cdef public object dones
|
cdef public object dones
|
||||||
|
|
||||||
def __init__(self, TransitionSystem moves, states, golds,
|
def __init__(self, TransitionSystem moves, states, golds,
|
||||||
int width, float density):
|
int width, float density):
|
||||||
self.moves = moves
|
self.moves = moves
|
||||||
self.states = states
|
self.states = states
|
||||||
self.golds = golds
|
self.golds = golds
|
||||||
|
@ -59,7 +58,8 @@ cdef class ParserBeam(object):
|
||||||
cdef StateClass state, st
|
cdef StateClass state, st
|
||||||
for state in states:
|
for state in states:
|
||||||
beam = Beam(self.moves.n_moves, width, density)
|
beam = Beam(self.moves.n_moves, width, density)
|
||||||
beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
|
beam.initialize(self.moves.init_beam_state, state.c.length,
|
||||||
|
state.c._sent)
|
||||||
for i in range(beam.width):
|
for i in range(beam.width):
|
||||||
st = <StateClass>beam.at(i)
|
st = <StateClass>beam.at(i)
|
||||||
st.c.offset = state.c.offset
|
st.c.offset = state.c.offset
|
||||||
|
@ -74,7 +74,8 @@ cdef class ParserBeam(object):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_done(self):
|
def is_done(self):
|
||||||
return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
|
return all(b.is_done or self.dones[i]
|
||||||
|
for i, b in enumerate(self.beams))
|
||||||
|
|
||||||
def __getitem__(self, i):
|
def __getitem__(self, i):
|
||||||
return self.beams[i]
|
return self.beams[i]
|
||||||
|
@ -126,7 +127,8 @@ cdef class ParserBeam(object):
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
state = <StateClass>beam.at(i)
|
state = <StateClass>beam.at(i)
|
||||||
if not state.c.is_final():
|
if not state.c.is_final():
|
||||||
self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
|
self.moves.set_costs(beam.is_valid[i], beam.costs[i],
|
||||||
|
state, gold)
|
||||||
if follow_gold:
|
if follow_gold:
|
||||||
for j in range(beam.nr_class):
|
for j in range(beam.nr_class):
|
||||||
if beam.costs[i][j] >= 1:
|
if beam.costs[i][j] >= 1:
|
||||||
|
@ -146,7 +148,10 @@ def get_token_ids(states, int n_tokens):
|
||||||
c_ids += ids.shape[1]
|
c_ids += ids.shape[1]
|
||||||
return ids
|
return ids
|
||||||
|
|
||||||
|
|
||||||
nr_update = 0
|
nr_update = 0
|
||||||
|
|
||||||
|
|
||||||
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
||||||
states, golds,
|
states, golds,
|
||||||
state2vec, vec2scores,
|
state2vec, vec2scores,
|
||||||
|
@ -167,23 +172,27 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
||||||
if pbeam.is_done and gbeam.is_done:
|
if pbeam.is_done and gbeam.is_done:
|
||||||
break
|
break
|
||||||
# The beam maps let us find the right row in the flattened scores
|
# The beam maps let us find the right row in the flattened scores
|
||||||
# arrays for each state. States are identified by (example id, history).
|
# arrays for each state. States are identified by (example id,
|
||||||
# We keep a different beam map for each step (since we'll have a flat
|
# history). We keep a different beam map for each step (since we'll
|
||||||
# scores array for each step). The beam map will let us take the per-state
|
# have a flat scores array for each step). The beam map will let us
|
||||||
# losses, and compute the gradient for each (step, state, class).
|
# take the per-state losses, and compute the gradient for each (step,
|
||||||
|
# state, class).
|
||||||
beam_maps.append({})
|
beam_maps.append({})
|
||||||
# Gather all states from the two beams in a list. Some states may occur
|
# Gather all states from the two beams in a list. Some states may occur
|
||||||
# in both beams. To figure out which beam each state belonged to,
|
# in both beams. To figure out which beam each state belonged to,
|
||||||
# we keep two lists of indices, p_indices and g_indices
|
# we keep two lists of indices, p_indices and g_indices
|
||||||
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
|
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1],
|
||||||
|
nr_update)
|
||||||
if not states:
|
if not states:
|
||||||
break
|
break
|
||||||
# Now that we have our flat list of states, feed them through the model
|
# Now that we have our flat list of states, feed them through the model
|
||||||
token_ids = get_token_ids(states, nr_feature)
|
token_ids = get_token_ids(states, nr_feature)
|
||||||
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
|
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
|
||||||
if hist_feats:
|
if hist_feats:
|
||||||
hists = numpy.asarray([st.history[:hist_feats] for st in states], dtype='i')
|
hists = numpy.asarray([st.history[:hist_feats] for st in states],
|
||||||
scores, bp_scores = vec2scores.begin_update((vectors, hists), drop=drop)
|
dtype='i')
|
||||||
|
scores, bp_scores = vec2scores.begin_update((vectors, hists),
|
||||||
|
drop=drop)
|
||||||
else:
|
else:
|
||||||
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
|
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
|
||||||
|
|
||||||
|
@ -192,8 +201,10 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
||||||
|
|
||||||
# Unpack the flat scores into lists for the two beams. The indices arrays
|
# Unpack the flat scores into lists for the two beams. The indices arrays
|
||||||
# tell us which example and state the scores-row refers to.
|
# tell us which example and state the scores-row refers to.
|
||||||
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
|
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
|
||||||
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
|
for indices in p_indices]
|
||||||
|
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
|
||||||
|
for indices in g_indices]
|
||||||
# Now advance the states in the beams. The gold beam is constrained to
|
# Now advance the states in the beams. The gold beam is constrained to
|
||||||
# follow only gold analyses.
|
# follow only gold analyses.
|
||||||
pbeam.advance(p_scores)
|
pbeam.advance(p_scores)
|
||||||
|
@ -249,8 +260,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
|
||||||
|
|
||||||
|
|
||||||
def get_gradient(nr_class, beam_maps, histories, losses):
|
def get_gradient(nr_class, beam_maps, histories, losses):
|
||||||
"""
|
"""The global model assigns a loss to each parse. The beam scores
|
||||||
The global model assigns a loss to each parse. The beam scores
|
|
||||||
are additive, so the same gradient is applied to each action
|
are additive, so the same gradient is applied to each action
|
||||||
in the history. This gives the gradient of a single *action*
|
in the history. This gives the gradient of a single *action*
|
||||||
for a beam state -- so we have "the gradient of loss for taking
|
for a beam state -- so we have "the gradient of loss for taking
|
||||||
|
@ -270,7 +280,8 @@ def get_gradient(nr_class, beam_maps, histories, losses):
|
||||||
if loss != 0.0 and not numpy.isnan(loss):
|
if loss != 0.0 and not numpy.isnan(loss):
|
||||||
nr_step = max(nr_step, len(hist))
|
nr_step = max(nr_step, len(hist))
|
||||||
for i in range(nr_step):
|
for i in range(nr_step):
|
||||||
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
|
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
|
||||||
|
dtype='f'))
|
||||||
assert len(histories) == len(losses)
|
assert len(histories) == len(losses)
|
||||||
for eg_id, hists in enumerate(histories):
|
for eg_id, hists in enumerate(histories):
|
||||||
for loss, hist in zip(losses[eg_id], hists):
|
for loss, hist in zip(losses[eg_id], hists):
|
||||||
|
@ -287,5 +298,3 @@ def get_gradient(nr_class, beam_maps, histories, losses):
|
||||||
grads[j][i, clas] += loss
|
grads[j][i, clas] += loss
|
||||||
key = key + tuple([clas])
|
key = key + tuple([clas])
|
||||||
return grads
|
return grads
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
# test
|
|
|
@ -4,24 +4,16 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
from cpython.ref cimport Py_INCREF
|
||||||
import ctypes
|
|
||||||
from libc.stdint cimport uint32_t
|
|
||||||
from libc.string cimport memcpy
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
import numpy
|
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC, is_space_token
|
from ._state cimport StateC
|
||||||
from .nonproj import is_nonproj_tree
|
from .nonproj import is_nonproj_tree
|
||||||
from .transition_system cimport do_func_t, get_cost_func_t
|
|
||||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse, GoldParseC
|
||||||
from ..gold cimport GoldParseC
|
|
||||||
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
|
|
||||||
from ..lexeme cimport Lexeme
|
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
|
|
||||||
|
|
||||||
|
@ -316,14 +308,13 @@ cdef class ArcEager(TransitionSystem):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_actions(cls, **kwargs):
|
def get_actions(cls, **kwargs):
|
||||||
actions = kwargs.get('actions',
|
actions = kwargs.get('actions', OrderedDict((
|
||||||
OrderedDict((
|
(SHIFT, ['']),
|
||||||
(SHIFT, ['']),
|
(REDUCE, ['']),
|
||||||
(REDUCE, ['']),
|
(RIGHT, []),
|
||||||
(RIGHT, []),
|
(LEFT, []),
|
||||||
(LEFT, []),
|
(BREAK, ['ROOT']))
|
||||||
(BREAK, ['ROOT'])
|
))
|
||||||
)))
|
|
||||||
seen_actions = set()
|
seen_actions = set()
|
||||||
for label in kwargs.get('left_labels', []):
|
for label in kwargs.get('left_labels', []):
|
||||||
if label.upper() != 'ROOT':
|
if label.upper() != 'ROOT':
|
||||||
|
@ -363,7 +354,8 @@ cdef class ArcEager(TransitionSystem):
|
||||||
if gold.cand_to_gold[i] is None:
|
if gold.cand_to_gold[i] is None:
|
||||||
continue
|
continue
|
||||||
if state.safe_get(i).dep:
|
if state.safe_get(i).dep:
|
||||||
predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
|
predicted.add((i, state.H(i),
|
||||||
|
self.strings[state.safe_get(i).dep]))
|
||||||
else:
|
else:
|
||||||
predicted.add((i, state.H(i), 'ROOT'))
|
predicted.add((i, state.H(i), 'ROOT'))
|
||||||
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
|
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
|
||||||
|
@ -381,7 +373,8 @@ cdef class ArcEager(TransitionSystem):
|
||||||
if not self.has_gold(gold):
|
if not self.has_gold(gold):
|
||||||
return None
|
return None
|
||||||
for i in range(gold.length):
|
for i in range(gold.length):
|
||||||
if gold.heads[i] is None or gold.labels[i] is None: # Missing values
|
# Missing values
|
||||||
|
if gold.heads[i] is None or gold.labels[i] is None:
|
||||||
gold.c.heads[i] = i
|
gold.c.heads[i] = i
|
||||||
gold.c.has_dep[i] = False
|
gold.c.has_dep[i] = False
|
||||||
else:
|
else:
|
||||||
|
@ -517,14 +510,15 @@ cdef class ArcEager(TransitionSystem):
|
||||||
# Check projectivity --- leading cause
|
# Check projectivity --- leading cause
|
||||||
if is_nonproj_tree(gold.heads):
|
if is_nonproj_tree(gold.heads):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Could not find a gold-standard action to supervise the dependency "
|
"Could not find a gold-standard action to supervise the "
|
||||||
"parser.\n"
|
"dependency parser. Likely cause: the tree is "
|
||||||
"Likely cause: the tree is non-projective (i.e. it has crossing "
|
"non-projective (i.e. it has crossing arcs -- see "
|
||||||
"arcs -- see spacy/syntax/nonproj.pyx for definitions)\n"
|
"spacy/syntax/nonproj.pyx for definitions). The ArcEager "
|
||||||
"The ArcEager transition system only supports projective trees.\n"
|
"transition system only supports projective trees. To "
|
||||||
"To learn non-projective representations, transform the data "
|
"learn non-projective representations, transform the data "
|
||||||
"before training and after parsing. Either pass make_projective=True "
|
"before training and after parsing. Either pass "
|
||||||
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
|
"make_projective=True to the GoldParse class, or use "
|
||||||
|
"spacy.syntax.nonproj.preprocess_training_data.")
|
||||||
else:
|
else:
|
||||||
print(gold.orig_annot)
|
print(gold.orig_annot)
|
||||||
print(gold.words)
|
print(gold.words)
|
||||||
|
@ -532,12 +526,10 @@ cdef class ArcEager(TransitionSystem):
|
||||||
print(gold.labels)
|
print(gold.labels)
|
||||||
print(gold.sent_starts)
|
print(gold.sent_starts)
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Could not find a gold-standard action to supervise the dependency "
|
"Could not find a gold-standard action to supervise the"
|
||||||
"parser.\n"
|
"dependency parser. The GoldParse was projective. The "
|
||||||
"The GoldParse was projective.\n"
|
"transition system has %d actions. State at failure: %s"
|
||||||
"The transition system has %d actions.\n"
|
% (self.n_moves, stcls.print_state(gold.words)))
|
||||||
"State at failure:\n"
|
|
||||||
"%s" % (self.n_moves, stcls.print_state(gold.words)))
|
|
||||||
assert n_gold >= 1
|
assert n_gold >= 1
|
||||||
|
|
||||||
def get_beam_annot(self, Beam beam):
|
def get_beam_annot(self, Beam beam):
|
||||||
|
@ -558,4 +550,3 @@ cdef class ArcEager(TransitionSystem):
|
||||||
deps[j].setdefault(dep, 0.0)
|
deps[j].setdefault(dep, 0.0)
|
||||||
deps[j][dep] += prob
|
deps[j][dep] += prob
|
||||||
return heads, deps
|
return heads, deps
|
||||||
|
|
||||||
|
|
|
@ -1,144 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX
|
|
||||||
|
|
||||||
|
|
||||||
def english_noun_chunks(obj):
    """Detect base noun phrases from a dependency parse.

    Works on both Doc and Span.

    obj (Doc or Span): The parsed object to extract chunks from.
    YIELDS (tuple): `(start, end, label)` triples, where `start` and `end`
        are token indices (end exclusive) and `label` is the ID of 'NP'.
    """
    doc = obj.doc  # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    np_deps = [strings.add(label)
               for label in ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
                             'attr', 'ROOT']]
    conj = strings.add('conj')
    np_label = strings.add('NP')
    seen = set()
    for word in obj:
        # Only nominal tokens can head a chunk, and a token that is already
        # inside a chunk must not start a nested one.
        if word.pos not in (NOUN, PROPN, PRON) or word.i in seen:
            continue
        if word.dep in np_deps:
            heads_chunk = True
        elif word.dep == conj:
            # Walk leftwards up the chain of conjuncts. If the first
            # conjunct is an NP, and we're coordinated to it, we're an NP.
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            heads_chunk = head.dep in np_deps
        else:
            heads_chunk = False
        if not heads_chunk:
            continue
        if any(w.i in seen for w in word.subtree):
            continue
        start = word.left_edge.i
        end = word.i + 1
        seen.update(range(start, end))
        yield start, end, np_label
|
|
||||||
|
|
||||||
|
|
||||||
def german_noun_chunks(obj):
    """Extract spans headed by NOUNs from a dependency parse.

    Each span runs from the left-most syntactic dependent up to the NOUN
    itself. For close apposition and measurement constructions, the span is
    sometimes extended to the right of the NOUN. Example: "eine Tasse Tee"
    (a cup (of) tea) returns "eine Tasse Tee" and not just "eine Tasse";
    same for "das Thema Familie".

    obj (Doc or Span): The parsed object to extract chunks from.
    YIELDS (tuple): `(start, end, label)` triples, where `start` and `end`
        are token indices (end exclusive) and `label` is the ID of 'NP'.
    """
    doc = obj.doc  # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    np_label = strings.add('NP')
    np_deps = set(strings.add(label)
                  for label in ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT',
                                'root', 'cj', 'pd', 'og', 'app'])
    close_app = strings.add('nk')

    rbracket = 0
    for position, token in enumerate(obj):
        # Skip everything already covered by the previous chunk.
        if position < rbracket:
            continue
        if token.pos not in (NOUN, PROPN, PRON):
            continue
        if token.dep not in np_deps:
            continue
        rbracket = token.i + 1
        # Try to extend the span to the right to capture close
        # apposition/measurement constructions.
        for right_child in doc[token.i].rights:
            if right_child.pos in (NOUN, PROPN) \
                    and right_child.dep == close_app:
                rbracket = right_child.i + 1
        yield token.left_edge.i, rbracket, np_label
|
|
||||||
|
|
||||||
|
|
||||||
def es_noun_chunks(obj):
    """Detect base noun phrases from a dependency parse (Spanish).

    A chunk is built around each nominal token (PROPN, NOUN or PRON). It is
    extended to the left over determiner-like dependents, and to the right
    over flat/fixed/compound dependents, as long as no verb or punctuation
    falls inside the extended span.

    obj (Doc or Span): The parsed object; only its `.doc` is used.
    YIELDS (tuple): `(start, end, label)` triples, where `start` and `end`
        are token indices (end exclusive) and `label` is the ID of 'NP'.

    NOTE(review): iteration starts at `doc[0]` and walks the whole underlying
    document, so passing a Span does not appear to restrict the output to
    that span -- confirm whether this is intended.
    """
    doc = obj.doc
    if len(doc) == 0:
        # Nothing to chunk; `doc[0]` below would raise on an empty document.
        return
    strings = doc.vocab.strings
    # Use .add() like the other chunkers in this module, so that the label
    # is interned in the StringStore and not just hashed.
    np_label = strings.add('NP')
    left_labels = ['det', 'fixed', 'neg']
    right_labels = ['flat', 'fixed', 'compound', 'neg']
    stop_labels = ['punct']
    np_left_deps = [strings[label] for label in left_labels]
    np_right_deps = [strings[label] for label in right_labels]
    stop_deps = [strings[label] for label in stop_labels]

    def next_token(token):
        # Return the token following `token`, or None at the end of the doc.
        try:
            return token.nbor()
        except IndexError:
            return None

    def is_verb_token(token):
        return token.pos in [VERB, AUX]

    def noun_bounds(root):
        # Return the (left-most, right-most) tokens of the chunk headed by
        # `root`.
        left_bound = root
        # Scan from the closest left child outwards; the last match (the
        # left-most qualifying child) wins.
        for token in reversed(list(root.lefts)):
            if token.dep in np_left_deps:
                left_bound = token
        right_bound = root
        for token in root.rights:
            if token.dep in np_right_deps:
                _, right = noun_bounds(token)
                # Stop extending as soon as the candidate span would
                # contain a verb or a stop dependency (punctuation).
                if any(is_verb_token(t) or t.dep in stop_deps
                       for t in doc[left_bound.i: right.i]):
                    break
                right_bound = right
        return left_bound, right_bound

    token = doc[0]
    # Compare against None explicitly: the truth value of a token object
    # should not decide when the walk over the document ends.
    while token is not None and token.i < len(doc):
        if token.pos in [PROPN, NOUN, PRON]:
            left, right = noun_bounds(token)
            yield left.i, right.i+1, np_label
            # Resume after the end of the chunk just produced.
            token = right
        token = next_token(token)
|
|
||||||
|
|
||||||
|
|
||||||
def french_noun_chunks(obj):
    """Detect base noun phrases from a dependency parse (French).

    Unlike the English chunker in this module, each chunk covers the whole
    extent of the head's subtree, from its left edge to its right edge.

    obj (Doc or Span): The parsed object to extract chunks from.
    YIELDS (tuple): `(start, end, label)` triples, where `start` and `end`
        are token indices (end exclusive) and `label` is the ID of 'NP'.
    """
    doc = obj.doc  # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    np_deps = [strings[label]
               for label in ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT',
                             'appos', 'nmod', 'nmod:poss']]
    conj = strings.add('conj')
    np_label = strings.add('NP')
    seen = set()
    for word in obj:
        # Only nominal tokens can head a chunk, and a token that is already
        # inside a chunk must not start a nested one.
        if word.pos not in (NOUN, PROPN, PRON) or word.i in seen:
            continue
        if word.dep in np_deps:
            heads_chunk = True
        elif word.dep == conj:
            # Walk leftwards up the chain of conjuncts. If the first
            # conjunct is an NP, and we're coordinated to it, we're an NP.
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            heads_chunk = head.dep in np_deps
        else:
            heads_chunk = False
        if not heads_chunk:
            continue
        if any(w.i in seen for w in word.subtree):
            continue
        start = word.left_edge.i
        end = word.right_edge.i + 1
        seen.update(range(start, end))
        yield start, end, np_label
|
|
||||||
|
|
||||||
|
|
||||||
# Maps a language code to the noun chunk iterator for that language.
CHUNKERS = {
    'en': english_noun_chunks,
    'de': german_noun_chunks,
    'es': es_noun_chunks,
    'fr': french_noun_chunks,
}
|
|
|
@ -4,17 +4,12 @@ from __future__ import unicode_literals
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
import numpy
|
|
||||||
from thinc.neural.ops import NumpyOps
|
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
from .transition_system cimport Transition
|
from .transition_system cimport Transition
|
||||||
from .transition_system cimport do_func_t
|
from .transition_system cimport do_func_t
|
||||||
from ..structs cimport TokenC, Entity
|
from ..gold cimport GoldParseC, GoldParse
|
||||||
from ..gold cimport GoldParseC
|
|
||||||
from ..gold cimport GoldParse
|
|
||||||
from ..attrs cimport ENT_TYPE, ENT_IOB
|
|
||||||
|
|
||||||
|
|
||||||
cdef enum:
|
cdef enum:
|
||||||
|
@ -69,15 +64,14 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_actions(cls, **kwargs):
|
def get_actions(cls, **kwargs):
|
||||||
actions = kwargs.get('actions',
|
actions = kwargs.get('actions', OrderedDict((
|
||||||
OrderedDict((
|
(MISSING, ['']),
|
||||||
(MISSING, ['']),
|
(BEGIN, []),
|
||||||
(BEGIN, []),
|
(IN, []),
|
||||||
(IN, []),
|
(LAST, []),
|
||||||
(LAST, []),
|
(UNIT, []),
|
||||||
(UNIT, []),
|
(OUT, [''])
|
||||||
(OUT, [''])
|
)))
|
||||||
)))
|
|
||||||
seen_entities = set()
|
seen_entities = set()
|
||||||
for entity_type in kwargs.get('entity_types', []):
|
for entity_type in kwargs.get('entity_types', []):
|
||||||
if entity_type in seen_entities:
|
if entity_type in seen_entities:
|
||||||
|
@ -160,7 +154,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
|
|
||||||
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
cdef attr_t label
|
cdef attr_t label
|
||||||
if name == '-' or name == None:
|
if name == '-' or name is None:
|
||||||
return Transition(clas=0, move=MISSING, label=0, score=0)
|
return Transition(clas=0, move=MISSING, label=0, score=0)
|
||||||
elif name == '!O':
|
elif name == '!O':
|
||||||
return Transition(clas=0, move=ISNT, label=0, score=0)
|
return Transition(clas=0, move=ISNT, label=0, score=0)
|
||||||
|
@ -328,8 +322,8 @@ cdef class In:
|
||||||
return False
|
return False
|
||||||
elif preset_ent_iob == 3:
|
elif preset_ent_iob == 3:
|
||||||
return False
|
return False
|
||||||
# TODO: Is this quite right?
|
# TODO: Is this quite right? I think it's supposed to be ensuring the
|
||||||
# I think it's supposed to be ensuring the gazetteer matches are maintained
|
# gazetteer matches are maintained
|
||||||
elif st.B_(1).ent_iob != preset_ent_iob:
|
elif st.B_(1).ent_iob != preset_ent_iob:
|
||||||
return False
|
return False
|
||||||
# Don't allow entities to extend across sentence boundaries
|
# Don't allow entities to extend across sentence boundaries
|
||||||
|
@ -354,10 +348,12 @@ cdef class In:
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
return 0
|
return 0
|
||||||
elif g_act == BEGIN:
|
elif g_act == BEGIN:
|
||||||
# I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
|
# I, Gold B --> True
|
||||||
|
# (P of bad open entity sunk, R of this entity sunk)
|
||||||
return 0
|
return 0
|
||||||
elif g_act == IN:
|
elif g_act == IN:
|
||||||
# I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
|
# I, Gold I --> True
|
||||||
|
# (label forced by prev, if mismatch, P and R both sunk)
|
||||||
return 0
|
return 0
|
||||||
elif g_act == LAST:
|
elif g_act == LAST:
|
||||||
# I, Gold L --> True iff this entity sunk and next tag == O
|
# I, Gold L --> True iff this entity sunk and next tag == O
|
||||||
|
@ -505,11 +501,3 @@ cdef class Out:
|
||||||
return 1
|
return 1
|
||||||
else:
|
else:
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
class OracleError(Exception):
    """Plain Exception subclass; adds no attributes or behaviour."""
|
|
||||||
|
|
||||||
|
|
||||||
class UnknownMove(Exception):
    """Plain Exception subclass; adds no attributes or behaviour."""
|
|
||||||
|
|
|
@ -5,79 +5,55 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
from collections import Counter, OrderedDict
|
from collections import OrderedDict
|
||||||
import ujson
|
import ujson
|
||||||
import json
|
import json
|
||||||
import contextlib
|
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from libc.math cimport exp
|
|
||||||
cimport cython
|
|
||||||
cimport cython.parallel
|
cimport cython.parallel
|
||||||
import cytoolz
|
import cytoolz
|
||||||
import dill
|
|
||||||
|
|
||||||
import numpy.random
|
import numpy.random
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
|
from cpython.ref cimport PyObject, Py_XDECREF
|
||||||
from libcpp.vector cimport vector
|
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
|
||||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
||||||
from libc.stdint cimport uint32_t, uint64_t
|
from libc.math cimport exp
|
||||||
from libc.string cimport memset, memcpy
|
from libcpp.vector cimport vector
|
||||||
from libc.stdlib cimport malloc, calloc, free
|
from libc.string cimport memset
|
||||||
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
from libc.stdlib cimport calloc, free
|
||||||
from thinc.linear.avgtron cimport AveragedPerceptron
|
from cymem.cymem cimport Pool
|
||||||
from thinc.linalg cimport Vec, VecVec
|
from thinc.typedefs cimport weight_t, class_t, hash_t
|
||||||
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
|
|
||||||
from thinc.extra.eg cimport Example
|
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
|
from thinc.api import chain, clone
|
||||||
from cymem.cymem cimport Pool, Address
|
from thinc.v2v import Model, Maxout, Affine
|
||||||
from murmurhash.mrmr cimport hash64
|
|
||||||
from preshed.maps cimport MapStruct
|
|
||||||
from preshed.maps cimport map_get
|
|
||||||
|
|
||||||
from thinc.api import layerize, chain, clone, with_flatten
|
|
||||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
|
|
||||||
from thinc.misc import LayerNorm
|
from thinc.misc import LayerNorm
|
||||||
|
from thinc.neural.ops import CupyOps
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module
|
||||||
|
|
||||||
from .. import util
|
from .._ml import zero_init, PrecomputableMaxouts, Tok2Vec, flatten
|
||||||
from ..util import get_async, get_cuda_stream
|
|
||||||
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
|
||||||
from .._ml import Tok2Vec, doc2feats, rebatch
|
|
||||||
from .._ml import Residual, drop_layer, flatten
|
|
||||||
from .._ml import link_vectors_to_models
|
from .._ml import link_vectors_to_models
|
||||||
from .._ml import HistoryFeatures
|
|
||||||
from ..compat import json_dumps, copy_array
|
from ..compat import json_dumps, copy_array
|
||||||
|
from ..tokens.doc cimport Doc
|
||||||
|
from ..gold cimport GoldParse
|
||||||
|
from .. import util
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
from . import nonproj
|
from .transition_system cimport Transition
|
||||||
from .transition_system import OracleError
|
from . import _beam_utils, nonproj
|
||||||
from .transition_system cimport TransitionSystem, Transition
|
|
||||||
from ..structs cimport TokenC
|
|
||||||
from ..tokens.doc cimport Doc
|
|
||||||
from ..strings cimport StringStore
|
|
||||||
from ..gold cimport GoldParse
|
|
||||||
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
|
|
||||||
from . import _beam_utils
|
|
||||||
|
|
||||||
|
|
||||||
def get_templates(*args, **kwargs):
    """Return an empty list; all arguments are accepted and ignored."""
    templates = []
    return templates
|
||||||
|
|
||||||
|
|
||||||
DEBUG = False
|
DEBUG = False
|
||||||
|
|
||||||
|
|
||||||
def set_debug(val):
    """Set the module-level DEBUG flag to `val`."""
    global DEBUG
    DEBUG = val
|
||||||
|
|
||||||
|
|
||||||
cdef class precompute_hiddens:
|
cdef class precompute_hiddens:
|
||||||
'''Allow a model to be "primed" by pre-computing input features in bulk.
|
"""Allow a model to be "primed" by pre-computing input features in bulk.
|
||||||
|
|
||||||
This is used for the parser, where we want to take a batch of documents,
|
This is used for the parser, where we want to take a batch of documents,
|
||||||
and compute vectors for each (token, position) pair. These vectors can then
|
and compute vectors for each (token, position) pair. These vectors can then
|
||||||
|
@ -92,7 +68,7 @@ cdef class precompute_hiddens:
|
||||||
so we can save the factor k. This also gives a nice CPU/GPU division:
|
so we can save the factor k. This also gives a nice CPU/GPU division:
|
||||||
we can do all our hard maths up front, packed into large multiplications,
|
we can do all our hard maths up front, packed into large multiplications,
|
||||||
and do the hard-to-program parsing on the CPU.
|
and do the hard-to-program parsing on the CPU.
|
||||||
'''
|
"""
|
||||||
cdef int nF, nO, nP
|
cdef int nF, nO, nP
|
||||||
cdef bint _is_synchronized
|
cdef bint _is_synchronized
|
||||||
cdef public object ops
|
cdef public object ops
|
||||||
|
@ -101,7 +77,8 @@ cdef class precompute_hiddens:
|
||||||
cdef object _cuda_stream
|
cdef object _cuda_stream
|
||||||
cdef object _bp_hiddens
|
cdef object _bp_hiddens
|
||||||
|
|
||||||
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, drop=0.):
|
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
|
||||||
|
drop=0.):
|
||||||
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
|
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
|
||||||
cdef np.ndarray cached
|
cdef np.ndarray cached
|
||||||
if not isinstance(gpu_cached, numpy.ndarray):
|
if not isinstance(gpu_cached, numpy.ndarray):
|
||||||
|
@ -121,8 +98,7 @@ cdef class precompute_hiddens:
|
||||||
self._bp_hiddens = bp_features
|
self._bp_hiddens = bp_features
|
||||||
|
|
||||||
cdef const float* get_feat_weights(self) except NULL:
|
cdef const float* get_feat_weights(self) except NULL:
|
||||||
if not self._is_synchronized \
|
if not self._is_synchronized and self._cuda_stream is not None:
|
||||||
and self._cuda_stream is not None:
|
|
||||||
self._cuda_stream.synchronize()
|
self._cuda_stream.synchronize()
|
||||||
self._is_synchronized = True
|
self._is_synchronized = True
|
||||||
return <float*>self._cached.data
|
return <float*>self._cached.data
|
||||||
|
@ -131,7 +107,8 @@ cdef class precompute_hiddens:
|
||||||
return self.begin_update(X)[0]
|
return self.begin_update(X)[0]
|
||||||
|
|
||||||
def begin_update(self, token_ids, drop=0.):
|
def begin_update(self, token_ids, drop=0.):
|
||||||
cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
|
cdef np.ndarray state_vector = numpy.zeros(
|
||||||
|
(token_ids.shape[0], self.nO*self.nP), dtype='f')
|
||||||
# This is tricky, but (assuming GPU available);
|
# This is tricky, but (assuming GPU available);
|
||||||
# - Input to forward on CPU
|
# - Input to forward on CPU
|
||||||
# - Output from forward on CPU
|
# - Output from forward on CPU
|
||||||
|
@ -142,8 +119,8 @@ cdef class precompute_hiddens:
|
||||||
feat_weights = self.get_feat_weights()
|
feat_weights = self.get_feat_weights()
|
||||||
cdef int[:, ::1] ids = token_ids
|
cdef int[:, ::1] ids = token_ids
|
||||||
sum_state_features(<float*>state_vector.data,
|
sum_state_features(<float*>state_vector.data,
|
||||||
feat_weights, &ids[0,0],
|
feat_weights, &ids[0, 0],
|
||||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
token_ids.shape[0], self.nF, self.nO*self.nP)
|
||||||
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
||||||
|
|
||||||
def backward(d_state_vector, sgd=None):
|
def backward(d_state_vector, sgd=None):
|
||||||
|
@ -162,10 +139,11 @@ cdef class precompute_hiddens:
|
||||||
state_vector = state_vector.reshape(
|
state_vector = state_vector.reshape(
|
||||||
(state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
|
(state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
|
||||||
best, which = self.ops.maxout(state_vector)
|
best, which = self.ops.maxout(state_vector)
|
||||||
|
|
||||||
def backprop(d_best, sgd=None):
|
def backprop(d_best, sgd=None):
|
||||||
return self.ops.backprop_maxout(d_best, which, self.nP)
|
return self.ops.backprop_maxout(d_best, which, self.nP)
|
||||||
return best, backprop
|
|
||||||
|
|
||||||
|
return best, backprop
|
||||||
|
|
||||||
|
|
||||||
cdef void sum_state_features(float* output,
|
cdef void sum_state_features(float* output,
|
||||||
|
@ -240,11 +218,15 @@ cdef class Parser:
|
||||||
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
|
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
|
||||||
if depth != 1:
|
if depth != 1:
|
||||||
raise ValueError("Currently parser depth is hard-coded to 1.")
|
raise ValueError("Currently parser depth is hard-coded to 1.")
|
||||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
|
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
|
||||||
|
cfg.get('maxout_pieces', 2))
|
||||||
if parser_maxout_pieces != 2:
|
if parser_maxout_pieces != 2:
|
||||||
raise ValueError("Currently parser_maxout_pieces is hard-coded to 2")
|
raise ValueError("Currently parser_maxout_pieces is hard-coded "
|
||||||
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
|
"to 2")
|
||||||
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
|
token_vector_width = util.env_opt('token_vector_width',
|
||||||
|
cfg.get('token_vector_width', 128))
|
||||||
|
hidden_width = util.env_opt('hidden_width',
|
||||||
|
cfg.get('hidden_width', 200))
|
||||||
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
|
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
|
||||||
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
|
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
|
||||||
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
|
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
|
||||||
|
@ -280,23 +262,19 @@ cdef class Parser:
|
||||||
return (tok2vec, lower, upper), cfg
|
return (tok2vec, lower, upper), cfg
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
||||||
"""
|
"""Create a Parser.
|
||||||
Create a Parser.
|
|
||||||
|
|
||||||
Arguments:
|
vocab (Vocab): The vocabulary object. Must be shared with documents
|
||||||
vocab (Vocab):
|
to be processed. The value is set to the `.vocab` attribute.
|
||||||
The vocabulary object. Must be shared with documents to be processed.
|
moves (TransitionSystem): Defines how the parse-state is created,
|
||||||
The value is set to the .vocab attribute.
|
updated and evaluated. The value is set to the .moves attribute
|
||||||
moves (TransitionSystem):
|
unless True (default), in which case a new instance is created with
|
||||||
Defines how the parse-state is created, updated and evaluated.
|
`Parser.Moves()`.
|
||||||
The value is set to the .moves attribute unless True (default),
|
model (object): Defines how the parse-state is created, updated and
|
||||||
in which case a new instance is created with Parser.Moves().
|
evaluated. The value is set to the .model attribute unless True
|
||||||
model (object):
|
(default), in which case a new instance is created with
|
||||||
Defines how the parse-state is created, updated and evaluated.
|
`Parser.Model()`.
|
||||||
The value is set to the .model attribute unless True (default),
|
**cfg: Arbitrary configuration parameters. Set to the `.cfg` attribute
|
||||||
in which case a new instance is created with Parser.Model().
|
|
||||||
**cfg:
|
|
||||||
Arbitrary configuration parameters. Set to the .cfg attribute
|
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
if moves is True:
|
if moves is True:
|
||||||
|
@ -322,13 +300,10 @@ cdef class Parser:
|
||||||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
|
||||||
def __call__(self, Doc doc, beam_width=None, beam_density=None):
|
def __call__(self, Doc doc, beam_width=None, beam_density=None):
|
||||||
"""
|
"""Apply the parser or entity recognizer, setting the annotations onto
|
||||||
Apply the parser or entity recognizer, setting the annotations onto the Doc object.
|
the `Doc` object.
|
||||||
|
|
||||||
Arguments:
|
doc (Doc): The document to be processed.
|
||||||
doc (Doc): The document to be processed.
|
|
||||||
Returns:
|
|
||||||
None
|
|
||||||
"""
|
"""
|
||||||
if beam_width is None:
|
if beam_width is None:
|
||||||
beam_width = self.cfg.get('beam_width', 1)
|
beam_width = self.cfg.get('beam_width', 1)
|
||||||
|
@ -350,16 +325,13 @@ cdef class Parser:
|
||||||
|
|
||||||
def pipe(self, docs, int batch_size=256, int n_threads=2,
|
def pipe(self, docs, int batch_size=256, int n_threads=2,
|
||||||
beam_width=None, beam_density=None):
|
beam_width=None, beam_density=None):
|
||||||
"""
|
"""Process a stream of documents.
|
||||||
Process a stream of documents.
|
|
||||||
|
|
||||||
Arguments:
|
stream: The sequence of documents to process.
|
||||||
stream: The sequence of documents to process.
|
batch_size (int): Number of documents to accumulate into a working set.
|
||||||
batch_size (int):
|
n_threads (int): The number of threads with which to work on the buffer
|
||||||
The number of documents to accumulate into a working set.
|
in parallel.
|
||||||
n_threads (int):
|
YIELDS (Doc): Documents, in order.
|
||||||
The number of threads with which to work on the buffer in parallel.
|
|
||||||
Yields (Doc): Documents, in order.
|
|
||||||
"""
|
"""
|
||||||
if beam_width is None:
|
if beam_width is None:
|
||||||
beam_width = self.cfg.get('beam_width', 1)
|
beam_width = self.cfg.get('beam_width', 1)
|
||||||
|
@ -376,8 +348,8 @@ cdef class Parser:
|
||||||
parse_states = self.parse_batch(subbatch)
|
parse_states = self.parse_batch(subbatch)
|
||||||
beams = []
|
beams = []
|
||||||
else:
|
else:
|
||||||
beams = self.beam_parse(subbatch,
|
beams = self.beam_parse(subbatch, beam_width=beam_width,
|
||||||
beam_width=beam_width, beam_density=beam_density)
|
beam_density=beam_density)
|
||||||
parse_states = []
|
parse_states = []
|
||||||
for beam in beams:
|
for beam in beams:
|
||||||
parse_states.append(<StateClass>beam.at(0))
|
parse_states.append(<StateClass>beam.at(0))
|
||||||
|
@ -397,9 +369,9 @@ cdef class Parser:
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
|
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = util.get_cuda_stream()
|
||||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||||
0.0)
|
docs, cuda_stream, 0.0)
|
||||||
nr_state = len(docs)
|
nr_state = len(docs)
|
||||||
nr_class = self.moves.n_moves
|
nr_class = self.moves.n_moves
|
||||||
nr_dim = tokvecs.shape[1]
|
nr_dim = tokvecs.shape[1]
|
||||||
|
@ -413,7 +385,8 @@ cdef class Parser:
|
||||||
|
|
||||||
feat_weights = state2vec.get_feat_weights()
|
feat_weights = state2vec.get_feat_weights()
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(vec2scores._layers[-1].W.T)
|
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(
|
||||||
|
vec2scores._layers[-1].W.T)
|
||||||
cdef np.ndarray hidden_bias = vec2scores._layers[-1].b
|
cdef np.ndarray hidden_bias = vec2scores._layers[-1].b
|
||||||
|
|
||||||
hW = <float*>hidden_weights.data
|
hW = <float*>hidden_weights.data
|
||||||
|
@ -473,9 +446,9 @@ cdef class Parser:
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef int nr_class = self.moves.n_moves
|
cdef int nr_class = self.moves.n_moves
|
||||||
cdef StateClass stcls, output
|
cdef StateClass stcls, output
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = util.get_cuda_stream()
|
||||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||||
0.0)
|
docs, cuda_stream, 0.0)
|
||||||
beams = []
|
beams = []
|
||||||
cdef int offset = 0
|
cdef int offset = 0
|
||||||
cdef int j = 0
|
cdef int j = 0
|
||||||
|
@ -530,9 +503,7 @@ cdef class Parser:
|
||||||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
golds = [golds]
|
golds = [golds]
|
||||||
|
cuda_stream = util.get_cuda_stream()
|
||||||
cuda_stream = get_cuda_stream()
|
|
||||||
|
|
||||||
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
||||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||||
drop)
|
drop)
|
||||||
|
@ -547,7 +518,6 @@ cdef class Parser:
|
||||||
n_steps = 0
|
n_steps = 0
|
||||||
while todo:
|
while todo:
|
||||||
states, golds = zip(*todo)
|
states, golds = zip(*todo)
|
||||||
|
|
||||||
token_ids = self.get_token_ids(states)
|
token_ids = self.get_token_ids(states)
|
||||||
vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
|
vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
|
||||||
if drop != 0:
|
if drop != 0:
|
||||||
|
@ -569,8 +539,8 @@ cdef class Parser:
|
||||||
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
||||||
# Move token_ids and d_vector to GPU, asynchronously
|
# Move token_ids and d_vector to GPU, asynchronously
|
||||||
backprops.append((
|
backprops.append((
|
||||||
get_async(cuda_stream, token_ids),
|
util.get_async(cuda_stream, token_ids),
|
||||||
get_async(cuda_stream, d_vector),
|
util.get_async(cuda_stream, d_vector),
|
||||||
bp_vector
|
bp_vector
|
||||||
))
|
))
|
||||||
else:
|
else:
|
||||||
|
@ -603,15 +573,13 @@ cdef class Parser:
|
||||||
states = self.moves.init_batch(docs)
|
states = self.moves.init_batch(docs)
|
||||||
for gold in golds:
|
for gold in golds:
|
||||||
self.moves.preprocess_gold(gold)
|
self.moves.preprocess_gold(gold)
|
||||||
|
cuda_stream = util.get_cuda_stream()
|
||||||
cuda_stream = get_cuda_stream()
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
|
docs, cuda_stream, drop)
|
||||||
|
states_d_scores, backprops = _beam_utils.update_beam(
|
||||||
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
|
self.moves, self.nr_feature, 500, states, golds, state2vec,
|
||||||
states, golds,
|
vec2scores, width, density, self.cfg.get('hist_size', 0),
|
||||||
state2vec, vec2scores,
|
drop=drop, losses=losses)
|
||||||
width, density, self.cfg.get('hist_size', 0),
|
|
||||||
drop=drop, losses=losses)
|
|
||||||
backprop_lower = []
|
backprop_lower = []
|
||||||
cdef float batch_size = len(docs)
|
cdef float batch_size = len(docs)
|
||||||
for i, d_scores in enumerate(states_d_scores):
|
for i, d_scores in enumerate(states_d_scores):
|
||||||
|
@ -623,13 +591,14 @@ cdef class Parser:
|
||||||
if isinstance(self.model[0].ops, CupyOps) \
|
if isinstance(self.model[0].ops, CupyOps) \
|
||||||
and not isinstance(ids, state2vec.ops.xp.ndarray):
|
and not isinstance(ids, state2vec.ops.xp.ndarray):
|
||||||
backprop_lower.append((
|
backprop_lower.append((
|
||||||
get_async(cuda_stream, ids),
|
util.get_async(cuda_stream, ids),
|
||||||
get_async(cuda_stream, d_vector),
|
util.get_async(cuda_stream, d_vector),
|
||||||
bp_vectors))
|
bp_vectors))
|
||||||
else:
|
else:
|
||||||
backprop_lower.append((ids, d_vector, bp_vectors))
|
backprop_lower.append((ids, d_vector, bp_vectors))
|
||||||
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
||||||
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)
|
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd,
|
||||||
|
cuda_stream)
|
||||||
|
|
||||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||||
"""Make a square batch, of length equal to the shortest doc. A long
|
"""Make a square batch, of length equal to the shortest doc. A long
|
||||||
|
@ -779,7 +748,8 @@ cdef class Parser:
|
||||||
def begin_training(self, gold_tuples, pipeline=None, **cfg):
|
def begin_training(self, gold_tuples, pipeline=None, **cfg):
|
||||||
if 'model' in cfg:
|
if 'model' in cfg:
|
||||||
self.model = cfg['model']
|
self.model = cfg['model']
|
||||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
|
gold_tuples = nonproj.preprocess_training_data(gold_tuples,
|
||||||
|
label_freq_cutoff=100)
|
||||||
actions = self.moves.get_actions(gold_parses=gold_tuples)
|
actions = self.moves.get_actions(gold_parses=gold_tuples)
|
||||||
for action, labels in actions.items():
|
for action, labels in actions.items():
|
||||||
for label in labels:
|
for label in labels:
|
||||||
|
|
|
@ -1,39 +1,37 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
"""
|
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
|
||||||
Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
|
|
||||||
for doing pseudo-projective parsing implementation uses the HEAD decoration
|
for doing pseudo-projective parsing implementation uses the HEAD decoration
|
||||||
scheme.
|
scheme.
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
|
||||||
from ..attrs import DEP, HEAD
|
|
||||||
|
|
||||||
DELIMITER = '||'
|
DELIMITER = '||'
|
||||||
|
|
||||||
|
|
||||||
def ancestors(tokenid, heads):
|
def ancestors(tokenid, heads):
|
||||||
# returns all words going from the word up the path to the root
|
# Returns all words going from the word up the path to the root. The path
|
||||||
# the path to root cannot be longer than the number of words in the sentence
|
# to root cannot be longer than the number of words in the sentence. This
|
||||||
# this function ends after at most len(heads) steps
|
# function ends after at most len(heads) steps, because it would otherwise
|
||||||
# because it would otherwise loop indefinitely on cycles
|
# loop indefinitely on cycles.
|
||||||
head = tokenid
|
head = tokenid
|
||||||
cnt = 0
|
cnt = 0
|
||||||
while heads[head] != head and cnt < len(heads):
|
while heads[head] != head and cnt < len(heads):
|
||||||
head = heads[head]
|
head = heads[head]
|
||||||
cnt += 1
|
cnt += 1
|
||||||
yield head
|
yield head
|
||||||
if head == None:
|
if head is None:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
def contains_cycle(heads):
|
def contains_cycle(heads):
|
||||||
# in an acyclic tree, the path from each word following
|
# in an acyclic tree, the path from each word following the head relation
|
||||||
# the head relation upwards always ends at the root node
|
# upwards always ends at the root node
|
||||||
for tokenid in range(len(heads)):
|
for tokenid in range(len(heads)):
|
||||||
seen = set([tokenid])
|
seen = set([tokenid])
|
||||||
for ancestor in ancestors(tokenid,heads):
|
for ancestor in ancestors(tokenid, heads):
|
||||||
if ancestor in seen:
|
if ancestor in seen:
|
||||||
return seen
|
return seen
|
||||||
seen.add(ancestor)
|
seen.add(ancestor)
|
||||||
|
@ -45,26 +43,26 @@ def is_nonproj_arc(tokenid, heads):
|
||||||
# if there is a token k, h < k < d such that h is not
|
# if there is a token k, h < k < d such that h is not
|
||||||
# an ancestor of k. Same for h -> d, h > d
|
# an ancestor of k. Same for h -> d, h > d
|
||||||
head = heads[tokenid]
|
head = heads[tokenid]
|
||||||
if head == tokenid: # root arcs cannot be non-projective
|
if head == tokenid: # root arcs cannot be non-projective
|
||||||
return False
|
return False
|
||||||
elif head == None: # unattached tokens cannot be non-projective
|
elif head is None: # unattached tokens cannot be non-projective
|
||||||
return False
|
return False
|
||||||
|
|
||||||
start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
|
start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
|
||||||
for k in range(start,end):
|
for k in range(start, end):
|
||||||
for ancestor in ancestors(k,heads):
|
for ancestor in ancestors(k, heads):
|
||||||
if ancestor == None: # for unattached tokens/subtrees
|
if ancestor is None: # for unattached tokens/subtrees
|
||||||
break
|
break
|
||||||
elif ancestor == head: # normal case: k dominated by h
|
elif ancestor == head: # normal case: k dominated by h
|
||||||
break
|
break
|
||||||
else: # head not in ancestors: d -> h is non-projective
|
else: # head not in ancestors: d -> h is non-projective
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def is_nonproj_tree(heads):
|
def is_nonproj_tree(heads):
|
||||||
# a tree is non-projective if at least one arc is non-projective
|
# a tree is non-projective if at least one arc is non-projective
|
||||||
return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
|
return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
|
||||||
|
|
||||||
|
|
||||||
def decompose(label):
|
def decompose(label):
|
||||||
|
@ -81,32 +79,32 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
|
||||||
for raw_text, sents in gold_tuples:
|
for raw_text, sents in gold_tuples:
|
||||||
prepro_sents = []
|
prepro_sents = []
|
||||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||||
proj_heads,deco_labels = projectivize(heads,labels)
|
proj_heads, deco_labels = projectivize(heads, labels)
|
||||||
# set the label to ROOT for each root dependent
|
# set the label to ROOT for each root dependent
|
||||||
deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
|
deco_labels = ['ROOT' if head == i else deco_labels[i]
|
||||||
|
for i, head in enumerate(proj_heads)]
|
||||||
# count label frequencies
|
# count label frequencies
|
||||||
if label_freq_cutoff > 0:
|
if label_freq_cutoff > 0:
|
||||||
for label in deco_labels:
|
for label in deco_labels:
|
||||||
if is_decorated(label):
|
if is_decorated(label):
|
||||||
freqs[label] = freqs.get(label,0) + 1
|
freqs[label] = freqs.get(label, 0) + 1
|
||||||
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
|
prepro_sents.append(
|
||||||
|
((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
|
||||||
preprocessed.append((raw_text, prepro_sents))
|
preprocessed.append((raw_text, prepro_sents))
|
||||||
|
|
||||||
if label_freq_cutoff > 0:
|
if label_freq_cutoff > 0:
|
||||||
return _filter_labels(preprocessed,label_freq_cutoff,freqs)
|
return _filter_labels(preprocessed, label_freq_cutoff, freqs)
|
||||||
return preprocessed
|
return preprocessed
|
||||||
|
|
||||||
|
|
||||||
def projectivize(heads, labels):
|
def projectivize(heads, labels):
|
||||||
# use the algorithm by Nivre & Nilsson 2005
|
# Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
|
||||||
# assumes heads to be a proper tree, i.e. connected and cycle-free
|
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
|
||||||
# returns a new pair (heads,labels) which encode
|
# which encode a projective and decorated tree.
|
||||||
# a projective and decorated tree
|
|
||||||
proj_heads = copy(heads)
|
proj_heads = copy(heads)
|
||||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
||||||
if smallest_np_arc == None: # this sentence is already projective
|
if smallest_np_arc is None: # this sentence is already projective
|
||||||
return proj_heads, copy(labels)
|
return proj_heads, copy(labels)
|
||||||
while smallest_np_arc != None:
|
while smallest_np_arc is not None:
|
||||||
_lift(smallest_np_arc, proj_heads)
|
_lift(smallest_np_arc, proj_heads)
|
||||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
||||||
deco_labels = _decorate(heads, proj_heads, labels)
|
deco_labels = _decorate(heads, proj_heads, labels)
|
||||||
|
@ -114,24 +112,26 @@ def projectivize(heads, labels):
|
||||||
|
|
||||||
|
|
||||||
def deprojectivize(tokens):
|
def deprojectivize(tokens):
|
||||||
# reattach arcs with decorated labels (following HEAD scheme)
|
# Reattach arcs with decorated labels (following HEAD scheme). For each
|
||||||
# for each decorated arc X||Y, search top-down, left-to-right,
|
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
|
||||||
# breadth-first until hitting a Y then make this the new head
|
# hitting a Y then make this the new head.
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
if is_decorated(token.dep_):
|
if is_decorated(token.dep_):
|
||||||
newlabel,headlabel = decompose(token.dep_)
|
newlabel, headlabel = decompose(token.dep_)
|
||||||
newhead = _find_new_head(token,headlabel)
|
newhead = _find_new_head(token, headlabel)
|
||||||
token.head = newhead
|
token.head = newhead
|
||||||
token.dep_ = newlabel
|
token.dep_ = newlabel
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
|
||||||
def _decorate(heads, proj_heads, labels):
|
def _decorate(heads, proj_heads, labels):
|
||||||
# uses decoration scheme HEAD from Nivre & Nilsson 2005
|
# uses decoration scheme HEAD from Nivre & Nilsson 2005
|
||||||
assert(len(heads) == len(proj_heads) == len(labels))
|
assert(len(heads) == len(proj_heads) == len(labels))
|
||||||
deco_labels = []
|
deco_labels = []
|
||||||
for tokenid,head in enumerate(heads):
|
for tokenid, head in enumerate(heads):
|
||||||
if head != proj_heads[tokenid]:
|
if head != proj_heads[tokenid]:
|
||||||
deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
|
deco_labels.append(
|
||||||
|
'%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
|
||||||
else:
|
else:
|
||||||
deco_labels.append(labels[tokenid])
|
deco_labels.append(labels[tokenid])
|
||||||
return deco_labels
|
return deco_labels
|
||||||
|
@ -143,9 +143,9 @@ def _get_smallest_nonproj_arc(heads):
|
||||||
# and ties are broken left to right
|
# and ties are broken left to right
|
||||||
smallest_size = float('inf')
|
smallest_size = float('inf')
|
||||||
smallest_np_arc = None
|
smallest_np_arc = None
|
||||||
for tokenid,head in enumerate(heads):
|
for tokenid, head in enumerate(heads):
|
||||||
size = abs(tokenid-head)
|
size = abs(tokenid-head)
|
||||||
if size < smallest_size and is_nonproj_arc(tokenid,heads):
|
if size < smallest_size and is_nonproj_arc(tokenid, heads):
|
||||||
smallest_size = size
|
smallest_size = size
|
||||||
smallest_np_arc = tokenid
|
smallest_np_arc = tokenid
|
||||||
return smallest_np_arc
|
return smallest_np_arc
|
||||||
|
@ -168,8 +168,10 @@ def _find_new_head(token, headlabel):
|
||||||
next_queue = []
|
next_queue = []
|
||||||
for qtoken in queue:
|
for qtoken in queue:
|
||||||
for child in qtoken.children:
|
for child in qtoken.children:
|
||||||
if child.is_space: continue
|
if child.is_space:
|
||||||
if child == token: continue
|
continue
|
||||||
|
if child == token:
|
||||||
|
continue
|
||||||
if child.dep_ == headlabel:
|
if child.dep_ == headlabel:
|
||||||
return child
|
return child
|
||||||
next_queue.append(child)
|
next_queue.append(child)
|
||||||
|
@ -184,7 +186,10 @@ def _filter_labels(gold_tuples, cutoff, freqs):
|
||||||
for raw_text, sents in gold_tuples:
|
for raw_text, sents in gold_tuples:
|
||||||
filtered_sents = []
|
filtered_sents = []
|
||||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||||
filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
|
filtered_labels = [decompose(label)[0]
|
||||||
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
|
if freqs.get(label, cutoff) < cutoff
|
||||||
|
else label for label in labels]
|
||||||
|
filtered_sents.append(
|
||||||
|
((ids, words, tags, heads, filtered_labels, iob), ctnts))
|
||||||
filtered.append((raw_text, filtered_sents))
|
filtered.append((raw_text, filtered_sents))
|
||||||
return filtered
|
return filtered
|
||||||
|
|
|
@ -2,17 +2,8 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from libc.string cimport memcpy, memset
|
|
||||||
from libc.stdint cimport uint32_t, uint64_t
|
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from ..vocab cimport EMPTY_LEXEME
|
|
||||||
from ..structs cimport Entity
|
|
||||||
from ..lexeme cimport Lexeme
|
|
||||||
from ..symbols cimport punct
|
|
||||||
from ..attrs cimport IS_SPACE
|
|
||||||
from ..attrs cimport attr_id_t
|
|
||||||
from ..tokens.token cimport Token
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,17 +2,17 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
from cpython.ref cimport Py_INCREF
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
from collections import defaultdict, OrderedDict
|
from collections import OrderedDict
|
||||||
import ujson
|
import ujson
|
||||||
|
|
||||||
from .. import util
|
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
|
||||||
from ..typedefs cimport attr_t
|
from ..typedefs cimport attr_t
|
||||||
|
from ..compat import json_dumps
|
||||||
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
cdef weight_t MIN_SCORE = -90000
|
cdef weight_t MIN_SCORE = -90000
|
||||||
|
@ -136,11 +136,12 @@ cdef class TransitionSystem:
|
||||||
print([gold.c.ner[i].clas for i in range(gold.length)])
|
print([gold.c.ner[i].clas for i in range(gold.length)])
|
||||||
print([gold.c.ner[i].move for i in range(gold.length)])
|
print([gold.c.ner[i].move for i in range(gold.length)])
|
||||||
print([gold.c.ner[i].label for i in range(gold.length)])
|
print([gold.c.ner[i].label for i in range(gold.length)])
|
||||||
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
|
print("Self labels",
|
||||||
|
[self.c[i].label for i in range(self.n_moves)])
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Could not find a gold-standard action to supervise "
|
"Could not find a gold-standard action to supervise "
|
||||||
"the entity recognizer\n"
|
"the entity recognizer. The transition system has "
|
||||||
"The transition system has %d actions." % (self.n_moves))
|
"%d actions." % (self.n_moves))
|
||||||
|
|
||||||
def get_class_name(self, int clas):
|
def get_class_name(self, int clas):
|
||||||
act = self.c[clas]
|
act = self.c[clas]
|
||||||
|
@ -149,7 +150,7 @@ cdef class TransitionSystem:
|
||||||
def add_action(self, int action, label_name):
|
def add_action(self, int action, label_name):
|
||||||
cdef attr_t label_id
|
cdef attr_t label_id
|
||||||
if not isinstance(label_name, int) and \
|
if not isinstance(label_name, int) and \
|
||||||
not isinstance(label_name, long):
|
not isinstance(label_name, long):
|
||||||
label_id = self.strings.add(label_name)
|
label_id = self.strings.add(label_name)
|
||||||
else:
|
else:
|
||||||
label_id = label_name
|
label_id = label_name
|
||||||
|
@ -186,7 +187,7 @@ cdef class TransitionSystem:
|
||||||
'name': self.move_name(trans.move, trans.label)
|
'name': self.move_name(trans.move, trans.label)
|
||||||
})
|
})
|
||||||
serializers = {
|
serializers = {
|
||||||
'transitions': lambda: ujson.dumps(transitions),
|
'transitions': lambda: json_dumps(transitions),
|
||||||
'strings': lambda: self.strings.to_bytes()
|
'strings': lambda: self.strings.to_bytes()
|
||||||
}
|
}
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
|
@ -1,17 +0,0 @@
|
||||||
from thinc.linear.avgtron cimport AveragedPerceptron
|
|
||||||
from thinc.extra.eg cimport Example
|
|
||||||
from thinc.structs cimport ExampleC
|
|
||||||
|
|
||||||
from .structs cimport TokenC
|
|
||||||
from .vocab cimport Vocab
|
|
||||||
|
|
||||||
|
|
||||||
cdef class TaggerModel(AveragedPerceptron):
|
|
||||||
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Tagger:
|
|
||||||
cdef readonly Vocab vocab
|
|
||||||
cdef readonly TaggerModel model
|
|
||||||
cdef public dict freqs
|
|
||||||
cdef public object cfg
|
|
253
spacy/tagger.pyx
253
spacy/tagger.pyx
|
@ -1,253 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
from thinc.typedefs cimport atom_t
|
|
||||||
from thinc.extra.eg cimport Example
|
|
||||||
from thinc.structs cimport ExampleC
|
|
||||||
from thinc.linear.avgtron cimport AveragedPerceptron
|
|
||||||
from thinc.linalg cimport VecVec
|
|
||||||
|
|
||||||
from .tokens.doc cimport Doc
|
|
||||||
from .attrs cimport TAG
|
|
||||||
from .gold cimport GoldParse
|
|
||||||
from .attrs cimport *
|
|
||||||
|
|
||||||
|
|
||||||
cpdef enum:
|
|
||||||
P2_orth
|
|
||||||
P2_cluster
|
|
||||||
P2_shape
|
|
||||||
P2_prefix
|
|
||||||
P2_suffix
|
|
||||||
P2_pos
|
|
||||||
P2_lemma
|
|
||||||
P2_flags
|
|
||||||
|
|
||||||
P1_orth
|
|
||||||
P1_cluster
|
|
||||||
P1_shape
|
|
||||||
P1_prefix
|
|
||||||
P1_suffix
|
|
||||||
P1_pos
|
|
||||||
P1_lemma
|
|
||||||
P1_flags
|
|
||||||
|
|
||||||
W_orth
|
|
||||||
W_cluster
|
|
||||||
W_shape
|
|
||||||
W_prefix
|
|
||||||
W_suffix
|
|
||||||
W_pos
|
|
||||||
W_lemma
|
|
||||||
W_flags
|
|
||||||
|
|
||||||
N1_orth
|
|
||||||
N1_cluster
|
|
||||||
N1_shape
|
|
||||||
N1_prefix
|
|
||||||
N1_suffix
|
|
||||||
N1_pos
|
|
||||||
N1_lemma
|
|
||||||
N1_flags
|
|
||||||
|
|
||||||
N2_orth
|
|
||||||
N2_cluster
|
|
||||||
N2_shape
|
|
||||||
N2_prefix
|
|
||||||
N2_suffix
|
|
||||||
N2_pos
|
|
||||||
N2_lemma
|
|
||||||
N2_flags
|
|
||||||
|
|
||||||
N_CONTEXT_FIELDS
|
|
||||||
|
|
||||||
|
|
||||||
cdef class TaggerModel(AveragedPerceptron):
|
|
||||||
def update(self, Example eg):
|
|
||||||
self.time += 1
|
|
||||||
guess = eg.guess
|
|
||||||
best = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class)
|
|
||||||
if guess != best:
|
|
||||||
for feat in eg.c.features[:eg.c.nr_feat]:
|
|
||||||
self.update_weight(feat.key, best, -feat.value)
|
|
||||||
self.update_weight(feat.key, guess, feat.value)
|
|
||||||
|
|
||||||
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
|
|
||||||
_fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
|
|
||||||
_fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
|
|
||||||
_fill_from_token(&eg.atoms[W_orth], &tokens[i])
|
|
||||||
_fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])
|
|
||||||
_fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])
|
|
||||||
|
|
||||||
eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
|
||||||
context[0] = t.lex.lower
|
|
||||||
context[1] = t.lex.cluster
|
|
||||||
context[2] = t.lex.shape
|
|
||||||
context[3] = t.lex.prefix
|
|
||||||
context[4] = t.lex.suffix
|
|
||||||
context[5] = t.tag
|
|
||||||
context[6] = t.lemma
|
|
||||||
if t.lex.flags & (1 << IS_ALPHA):
|
|
||||||
context[7] = 1
|
|
||||||
elif t.lex.flags & (1 << IS_PUNCT):
|
|
||||||
context[7] = 2
|
|
||||||
elif t.lex.flags & (1 << LIKE_URL):
|
|
||||||
context[7] = 3
|
|
||||||
elif t.lex.flags & (1 << LIKE_NUM):
|
|
||||||
context[7] = 4
|
|
||||||
else:
|
|
||||||
context[7] = 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Tagger:
|
|
||||||
"""Annotate part-of-speech tags on Doc objects."""
|
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
|
|
||||||
"""Create a Tagger.
|
|
||||||
|
|
||||||
vocab (Vocab): The vocabulary object. Must be shared with documents to
|
|
||||||
be processed.
|
|
||||||
model (thinc.linear.AveragedPerceptron): The statistical model.
|
|
||||||
RETURNS (Tagger): The newly constructed object.
|
|
||||||
"""
|
|
||||||
if model is None:
|
|
||||||
model = TaggerModel(cfg.get('features', self.feature_templates),
|
|
||||||
L1=0.0)
|
|
||||||
self.vocab = vocab
|
|
||||||
self.model = model
|
|
||||||
self.model.l1_penalty = 0.0
|
|
||||||
# TODO: Move this to tag map
|
|
||||||
self.freqs = {TAG: defaultdict(int)}
|
|
||||||
for tag in self.tag_names:
|
|
||||||
self.freqs[TAG][self.vocab.strings[tag]] = 1
|
|
||||||
self.freqs[TAG][0] = 1
|
|
||||||
self.cfg = cfg
|
|
||||||
|
|
||||||
@property
|
|
||||||
def tag_names(self):
|
|
||||||
return self.vocab.morphology.tag_names
|
|
||||||
|
|
||||||
def __reduce__(self):
|
|
||||||
return (self.__class__, (self.vocab, self.model), None, None)
|
|
||||||
|
|
||||||
def tag_from_strings(self, Doc tokens, object tag_strs):
|
|
||||||
cdef int i
|
|
||||||
for i in range(tokens.length):
|
|
||||||
self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i])
|
|
||||||
tokens.is_tagged = True
|
|
||||||
tokens._py_tokens = [None] * tokens.length
|
|
||||||
|
|
||||||
def __call__(self, Doc tokens):
|
|
||||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
|
||||||
|
|
||||||
doc (Doc): The tokens to be tagged.
|
|
||||||
"""
|
|
||||||
if tokens.length == 0:
|
|
||||||
return 0
|
|
||||||
|
|
||||||
cdef Pool mem = Pool()
|
|
||||||
|
|
||||||
cdef int i, tag
|
|
||||||
cdef Example eg = Example(nr_atom=N_CONTEXT_FIELDS,
|
|
||||||
nr_class=self.vocab.morphology.n_tags,
|
|
||||||
nr_feat=self.model.nr_feat)
|
|
||||||
for i in range(tokens.length):
|
|
||||||
if tokens.c[i].pos == 0:
|
|
||||||
self.model.set_featuresC(&eg.c, tokens.c, i)
|
|
||||||
self.model.set_scoresC(eg.c.scores,
|
|
||||||
eg.c.features, eg.c.nr_feat)
|
|
||||||
guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
|
|
||||||
self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
|
|
||||||
eg.fill_scores(0, eg.c.nr_class)
|
|
||||||
tokens.is_tagged = True
|
|
||||||
tokens._py_tokens = [None] * tokens.length
|
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=1000, n_threads=2):
|
|
||||||
"""Tag a stream of documents.
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
stream: The sequence of documents to tag.
|
|
||||||
batch_size (int): The number of documents to accumulate into a working set.
|
|
||||||
n_threads (int): The number of threads with which to work on the buffer
|
|
||||||
in parallel, if the Matcher implementation supports multi-threading.
|
|
||||||
YIELDS (Doc): Documents, in order.
|
|
||||||
"""
|
|
||||||
for doc in stream:
|
|
||||||
self(doc)
|
|
||||||
yield doc
|
|
||||||
|
|
||||||
def update(self, Doc tokens, GoldParse gold, itn=0):
|
|
||||||
"""Update the statistical model, with tags supplied for the given document.
|
|
||||||
|
|
||||||
doc (Doc): The document to update on.
|
|
||||||
gold (GoldParse): Manager for the gold-standard tags.
|
|
||||||
RETURNS (int): Number of tags predicted correctly.
|
|
||||||
"""
|
|
||||||
gold_tag_strs = gold.tags
|
|
||||||
assert len(tokens) == len(gold_tag_strs)
|
|
||||||
for tag in gold_tag_strs:
|
|
||||||
if tag != None and tag not in self.tag_names:
|
|
||||||
msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "
|
|
||||||
"gold tags, to maintain coarse-grained mapping.")
|
|
||||||
raise ValueError(msg % tag)
|
|
||||||
golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
|
|
||||||
cdef int correct = 0
|
|
||||||
cdef Pool mem = Pool()
|
|
||||||
cdef Example eg = Example(
|
|
||||||
nr_atom=N_CONTEXT_FIELDS,
|
|
||||||
nr_class=self.vocab.morphology.n_tags,
|
|
||||||
nr_feat=self.model.nr_feat)
|
|
||||||
for i in range(tokens.length):
|
|
||||||
self.model.set_featuresC(&eg.c, tokens.c, i)
|
|
||||||
eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]
|
|
||||||
self.model.set_scoresC(eg.c.scores,
|
|
||||||
eg.c.features, eg.c.nr_feat)
|
|
||||||
self.model.update(eg)
|
|
||||||
|
|
||||||
self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)
|
|
||||||
|
|
||||||
correct += eg.cost == 0
|
|
||||||
self.freqs[TAG][tokens.c[i].tag] += 1
|
|
||||||
eg.fill_scores(0, eg.c.nr_class)
|
|
||||||
eg.fill_costs(0, eg.c.nr_class)
|
|
||||||
tokens.is_tagged = True
|
|
||||||
tokens._py_tokens = [None] * tokens.length
|
|
||||||
return correct
|
|
||||||
|
|
||||||
|
|
||||||
feature_templates = (
|
|
||||||
(W_orth,),
|
|
||||||
(P1_lemma, P1_pos),
|
|
||||||
(P2_lemma, P2_pos),
|
|
||||||
(N1_orth,),
|
|
||||||
(N2_orth,),
|
|
||||||
|
|
||||||
(W_suffix,),
|
|
||||||
(W_prefix,),
|
|
||||||
|
|
||||||
(P1_pos,),
|
|
||||||
(P2_pos,),
|
|
||||||
(P1_pos, P2_pos),
|
|
||||||
(P1_pos, W_orth),
|
|
||||||
(P1_suffix,),
|
|
||||||
(N1_suffix,),
|
|
||||||
|
|
||||||
(W_shape,),
|
|
||||||
(W_cluster,),
|
|
||||||
(N1_cluster,),
|
|
||||||
(N2_cluster,),
|
|
||||||
(P1_cluster,),
|
|
||||||
(P2_cluster,),
|
|
||||||
|
|
||||||
(W_flags,),
|
|
||||||
(N1_flags,),
|
|
||||||
(N2_flags,),
|
|
||||||
(P1_flags,),
|
|
||||||
(P2_flags,),
|
|
||||||
)
|
|
|
@ -8,12 +8,11 @@ from cython.operator cimport preincrement as preinc
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
import regex as re
|
import regex as re
|
||||||
|
|
||||||
from .strings cimport hash_string
|
|
||||||
from . import util
|
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
|
from .strings cimport hash_string
|
||||||
|
from . import util
|
||||||
|
|
||||||
|
|
||||||
cdef class Tokenizer:
|
cdef class Tokenizer:
|
||||||
|
@ -21,7 +20,7 @@ cdef class Tokenizer:
|
||||||
boundaries.
|
boundaries.
|
||||||
"""
|
"""
|
||||||
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
|
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
|
||||||
suffix_search=None, infix_finditer=None, token_match=None):
|
suffix_search=None, infix_finditer=None, token_match=None):
|
||||||
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
|
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
|
||||||
|
|
||||||
vocab (Vocab): A storage container for lexical types.
|
vocab (Vocab): A storage container for lexical types.
|
||||||
|
@ -74,9 +73,8 @@ cdef class Tokenizer:
|
||||||
RETURNS (Doc): A container for linguistic annotations.
|
RETURNS (Doc): A container for linguistic annotations.
|
||||||
"""
|
"""
|
||||||
if len(string) >= (2 ** 30):
|
if len(string) >= (2 ** 30):
|
||||||
raise ValueError(
|
msg = "String is too long: %d characters. Max is 2**30."
|
||||||
"String is too long: %d characters. Max is 2**30." % len(string)
|
raise ValueError(msg % len(string))
|
||||||
)
|
|
||||||
cdef int length = len(string)
|
cdef int length = len(string)
|
||||||
cdef Doc doc = Doc(self.vocab)
|
cdef Doc doc = Doc(self.vocab)
|
||||||
if length == 0:
|
if length == 0:
|
||||||
|
@ -122,8 +120,8 @@ cdef class Tokenizer:
|
||||||
"""Tokenize a stream of texts.
|
"""Tokenize a stream of texts.
|
||||||
|
|
||||||
texts: A sequence of unicode texts.
|
texts: A sequence of unicode texts.
|
||||||
batch_size (int): The number of texts to accumulate in an internal buffer.
|
batch_size (int): Number of texts to accumulate in an internal buffer.
|
||||||
n_threads (int): The number of threads to use, if the implementation
|
n_threads (int): Number of threads to use, if the implementation
|
||||||
supports multi-threading. The default tokenizer is single-threaded.
|
supports multi-threading. The default tokenizer is single-threaded.
|
||||||
YIELDS (Doc): A sequence of Doc objects, in order.
|
YIELDS (Doc): A sequence of Doc objects, in order.
|
||||||
"""
|
"""
|
||||||
|
@ -232,8 +230,8 @@ cdef class Tokenizer:
|
||||||
if not matches:
|
if not matches:
|
||||||
tokens.push_back(self.vocab.get(tokens.mem, string), False)
|
tokens.push_back(self.vocab.get(tokens.mem, string), False)
|
||||||
else:
|
else:
|
||||||
# let's say we have dyn-o-mite-dave
|
# let's say we have dyn-o-mite-dave - the regex finds the
|
||||||
# the regex finds the start and end positions of the hyphens
|
# start and end positions of the hyphens
|
||||||
start = 0
|
start = 0
|
||||||
for match in matches:
|
for match in matches:
|
||||||
infix_start = match.start()
|
infix_start = match.start()
|
||||||
|
@ -293,8 +291,8 @@ cdef class Tokenizer:
|
||||||
return list(self.infix_finditer(string))
|
return list(self.infix_finditer(string))
|
||||||
|
|
||||||
def find_prefix(self, unicode string):
|
def find_prefix(self, unicode string):
|
||||||
"""Find the length of a prefix that should be segmented from the string,
|
"""Find the length of a prefix that should be segmented from the
|
||||||
or None if no prefix rules match.
|
string, or None if no prefix rules match.
|
||||||
|
|
||||||
string (unicode): The string to segment.
|
string (unicode): The string to segment.
|
||||||
RETURNS (int): The length of the prefix if present, otherwise `None`.
|
RETURNS (int): The length of the prefix if present, otherwise `None`.
|
||||||
|
@ -305,8 +303,8 @@ cdef class Tokenizer:
|
||||||
return (match.end() - match.start()) if match is not None else 0
|
return (match.end() - match.start()) if match is not None else 0
|
||||||
|
|
||||||
def find_suffix(self, unicode string):
|
def find_suffix(self, unicode string):
|
||||||
"""Find the length of a suffix that should be segmented from the string,
|
"""Find the length of a suffix that should be segmented from the
|
||||||
or None if no suffix rules match.
|
string, or None if no suffix rules match.
|
||||||
|
|
||||||
string (unicode): The string to segment.
|
string (unicode): The string to segment.
|
||||||
Returns (int): The length of the suffix if present, otherwise `None`.
|
Returns (int): The length of the suffix if present, otherwise `None`.
|
||||||
|
@ -326,8 +324,8 @@ cdef class Tokenizer:
|
||||||
|
|
||||||
string (unicode): The string to specially tokenize.
|
string (unicode): The string to specially tokenize.
|
||||||
token_attrs (iterable): A sequence of dicts, where each dict describes
|
token_attrs (iterable): A sequence of dicts, where each dict describes
|
||||||
a token and its attributes. The `ORTH` fields of the attributes must
|
a token and its attributes. The `ORTH` fields of the attributes
|
||||||
exactly match the string when they are concatenated.
|
must exactly match the string when they are concatenated.
|
||||||
"""
|
"""
|
||||||
substrings = list(substrings)
|
substrings = list(substrings)
|
||||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||||
|
@ -343,7 +341,7 @@ cdef class Tokenizer:
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
path (unicode or Path): A path to a directory, which will be created if
|
path (unicode or Path): A path to a directory, which will be created if
|
||||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||||
"""
|
"""
|
||||||
with path.open('wb') as file_:
|
with path.open('wb') as file_:
|
||||||
file_.write(self.to_bytes(**exclude))
|
file_.write(self.to_bytes(**exclude))
|
||||||
|
|
|
@ -2,4 +2,4 @@ from .doc import Doc
|
||||||
from .token import Token
|
from .token import Token
|
||||||
from .span import Span
|
from .span import Span
|
||||||
|
|
||||||
__all__ = [Doc, Token, Span]
|
__all__ = ['Doc', 'Token', 'Span']
|
||||||
|
|
|
@ -1,21 +0,0 @@
|
||||||
cdef class Binder:
|
|
||||||
def __init__(self, *docs):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def __reduce__(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def to_bytes(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def from_bytes(cls, data):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def to_disk(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def from_disk(self, path):
|
|
||||||
pass
|
|
|
@ -23,9 +23,9 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||||
from ..typedefs cimport attr_t, flags_t
|
from ..typedefs cimport attr_t, flags_t
|
||||||
from ..attrs import intify_attrs, IDS
|
from ..attrs import intify_attrs, IDS
|
||||||
from ..attrs cimport attr_id_t
|
from ..attrs cimport attr_id_t
|
||||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
|
||||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
|
||||||
from ..attrs cimport SENT_START
|
from ..attrs cimport ENT_TYPE, SENT_START
|
||||||
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
||||||
from ..util import normalize_slice
|
from ..util import normalize_slice
|
||||||
from ..compat import is_config, copy_reg, pickle
|
from ..compat import is_config, copy_reg, pickle
|
||||||
|
@ -78,17 +78,18 @@ def _get_chunker(lang):
|
||||||
|
|
||||||
cdef class Doc:
|
cdef class Doc:
|
||||||
"""A sequence of Token objects. Access sentences and named entities, export
|
"""A sequence of Token objects. Access sentences and named entities, export
|
||||||
annotations to numpy arrays, losslessly serialize to compressed binary strings.
|
annotations to numpy arrays, losslessly serialize to compressed binary
|
||||||
The `Doc` object holds an array of `TokenC` structs. The Python-level
|
strings. The `Doc` object holds an array of `TokenC` structs. The
|
||||||
`Token` and `Span` objects are views of this array, i.e. they don't own
|
Python-level `Token` and `Span` objects are views of this array, i.e.
|
||||||
the data themselves.
|
they don't own the data themselves.
|
||||||
|
|
||||||
EXAMPLE: Construction 1
|
EXAMPLE: Construction 1
|
||||||
>>> doc = nlp(u'Some text')
|
>>> doc = nlp(u'Some text')
|
||||||
|
|
||||||
Construction 2
|
Construction 2
|
||||||
>>> from spacy.tokens import Doc
|
>>> from spacy.tokens import Doc
|
||||||
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
|
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
|
||||||
|
spaces=[True, False, False])
|
||||||
"""
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def set_extension(cls, name, default=None, method=None,
|
def set_extension(cls, name, default=None, method=None,
|
||||||
|
@ -109,15 +110,14 @@ cdef class Doc:
|
||||||
orths_and_spaces=None):
|
orths_and_spaces=None):
|
||||||
"""Create a Doc object.
|
"""Create a Doc object.
|
||||||
|
|
||||||
vocab (Vocab): A vocabulary object, which must match any models you want
|
vocab (Vocab): A vocabulary object, which must match any models you
|
||||||
to use (e.g. tokenizer, parser, entity recognizer).
|
want to use (e.g. tokenizer, parser, entity recognizer).
|
||||||
words (list or None): A list of unicode strings to add to the document
|
words (list or None): A list of unicode strings to add to the document
|
||||||
as words. If `None`, defaults to empty list.
|
as words. If `None`, defaults to empty list.
|
||||||
spaces (list or None): A list of boolean values, of the same length as
|
spaces (list or None): A list of boolean values, of the same length as
|
||||||
words. True means that the word is followed by a space, False means
|
words. True means that the word is followed by a space, False means
|
||||||
it is not. If `None`, defaults to `[True]*len(words)`
|
it is not. If `None`, defaults to `[True]*len(words)`
|
||||||
user_data (dict or None): Optional extra data to attach to the Doc.
|
user_data (dict or None): Optional extra data to attach to the Doc.
|
||||||
|
|
||||||
RETURNS (Doc): The newly constructed object.
|
RETURNS (Doc): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
|
@ -153,10 +153,10 @@ cdef class Doc:
|
||||||
spaces = [True] * len(words)
|
spaces = [True] * len(words)
|
||||||
elif len(spaces) != len(words):
|
elif len(spaces) != len(words):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Arguments 'words' and 'spaces' should be sequences of the "
|
"Arguments 'words' and 'spaces' should be sequences of "
|
||||||
"same length, or 'spaces' should be left default at None. "
|
"the same length, or 'spaces' should be left default at "
|
||||||
"spaces should be a sequence of booleans, with True meaning "
|
"None. spaces should be a sequence of booleans, with True "
|
||||||
"that the word owns a ' ' character following it.")
|
"meaning that the word owns a ' ' character following it.")
|
||||||
orths_and_spaces = zip(words, spaces)
|
orths_and_spaces = zip(words, spaces)
|
||||||
if orths_and_spaces is not None:
|
if orths_and_spaces is not None:
|
||||||
for orth_space in orths_and_spaces:
|
for orth_space in orths_and_spaces:
|
||||||
|
@ -166,7 +166,8 @@ cdef class Doc:
|
||||||
elif isinstance(orth_space, bytes):
|
elif isinstance(orth_space, bytes):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"orths_and_spaces expects either List(unicode) or "
|
"orths_and_spaces expects either List(unicode) or "
|
||||||
"List((unicode, bool)). Got bytes instance: %s" % (str(orth_space)))
|
"List((unicode, bool)). "
|
||||||
|
"Got bytes instance: %s" % (str(orth_space)))
|
||||||
else:
|
else:
|
||||||
orth, has_space = orth_space
|
orth, has_space = orth_space
|
||||||
# Note that we pass self.mem here --- we have ownership, if LexemeC
|
# Note that we pass self.mem here --- we have ownership, if LexemeC
|
||||||
|
@ -186,7 +187,8 @@ cdef class Doc:
|
||||||
def __getitem__(self, object i):
|
def __getitem__(self, object i):
|
||||||
"""Get a `Token` or `Span` object.
|
"""Get a `Token` or `Span` object.
|
||||||
|
|
||||||
i (int or tuple) The index of the token, or the slice of the document to get.
|
i (int or tuple) The index of the token, or the slice of the document
|
||||||
|
to get.
|
||||||
RETURNS (Token or Span): The token at `doc[i]]`, or the span at
|
RETURNS (Token or Span): The token at `doc[i]]`, or the span at
|
||||||
`doc[start : end]`.
|
`doc[start : end]`.
|
||||||
|
|
||||||
|
@ -199,11 +201,11 @@ cdef class Doc:
|
||||||
>>> doc[start : end]]
|
>>> doc[start : end]]
|
||||||
Get a `Span` object, starting at position `start` and ending at
|
Get a `Span` object, starting at position `start` and ending at
|
||||||
position `end`, where `start` and `end` are token indices. For
|
position `end`, where `start` and `end` are token indices. For
|
||||||
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
|
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and
|
||||||
Stepped slices (e.g. `doc[start : end : step]`) are not supported,
|
4. Stepped slices (e.g. `doc[start : end : step]`) are not
|
||||||
as `Span` objects must be contiguous (cannot have gaps). You can use
|
supported, as `Span` objects must be contiguous (cannot have gaps).
|
||||||
negative indices and open-ended ranges, which have their normal
|
You can use negative indices and open-ended ranges, which have
|
||||||
Python semantics.
|
their normal Python semantics.
|
||||||
"""
|
"""
|
||||||
if isinstance(i, slice):
|
if isinstance(i, slice):
|
||||||
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
|
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
|
||||||
|
@ -262,8 +264,10 @@ cdef class Doc:
|
||||||
doc (Doc): The parent document.
|
doc (Doc): The parent document.
|
||||||
start (int): The index of the first character of the span.
|
start (int): The index of the first character of the span.
|
||||||
end (int): The index of the first character after the span.
|
end (int): The index of the first character after the span.
|
||||||
label (uint64 or string): A label to attach to the Span, e.g. for named entities.
|
label (uint64 or string): A label to attach to the Span, e.g. for
|
||||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
named entities.
|
||||||
|
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
||||||
|
the span.
|
||||||
RETURNS (Span): The newly constructed object.
|
RETURNS (Span): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
if not isinstance(label, int):
|
if not isinstance(label, int):
|
||||||
|
@ -322,7 +326,8 @@ cdef class Doc:
|
||||||
if self._vector is not None:
|
if self._vector is not None:
|
||||||
return self._vector
|
return self._vector
|
||||||
elif not len(self):
|
elif not len(self):
|
||||||
self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
|
self._vector = numpy.zeros((self.vocab.vectors_length,),
|
||||||
|
dtype='f')
|
||||||
return self._vector
|
return self._vector
|
||||||
elif self.has_vector:
|
elif self.has_vector:
|
||||||
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
|
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
|
||||||
|
@ -334,7 +339,8 @@ cdef class Doc:
|
||||||
self._vector = self.tensor.mean(axis=0)
|
self._vector = self.tensor.mean(axis=0)
|
||||||
return self._vector
|
return self._vector
|
||||||
else:
|
else:
|
||||||
return numpy.zeros((self.vocab.vectors_length,), dtype='float32')
|
return numpy.zeros((self.vocab.vectors_length,),
|
||||||
|
dtype='float32')
|
||||||
|
|
||||||
def __set__(self, value):
|
def __set__(self, value):
|
||||||
self._vector = value
|
self._vector = value
|
||||||
|
@ -377,13 +383,14 @@ cdef class Doc:
|
||||||
return self.text
|
return self.text
|
||||||
|
|
||||||
property ents:
|
property ents:
|
||||||
"""Iterate over the entities in the document. Yields named-entity `Span`
|
"""Iterate over the entities in the document. Yields named-entity
|
||||||
objects, if the entity recognizer has been applied to the document.
|
`Span` objects, if the entity recognizer has been applied to the
|
||||||
|
document.
|
||||||
|
|
||||||
YIELDS (Span): Entities in the document.
|
YIELDS (Span): Entities in the document.
|
||||||
|
|
||||||
EXAMPLE: Iterate over the span to get individual Token objects, or access
|
EXAMPLE: Iterate over the span to get individual Token objects,
|
||||||
the label:
|
or access the label:
|
||||||
|
|
||||||
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
||||||
>>> ents = list(tokens.ents)
|
>>> ents = list(tokens.ents)
|
||||||
|
@ -419,7 +426,8 @@ cdef class Doc:
|
||||||
def __set__(self, ents):
|
def __set__(self, ents):
|
||||||
# TODO:
|
# TODO:
|
||||||
# 1. Allow negative matches
|
# 1. Allow negative matches
|
||||||
# 2. Ensure pre-set NERs are not over-written during statistical prediction
|
# 2. Ensure pre-set NERs are not over-written during statistical
|
||||||
|
# prediction
|
||||||
# 3. Test basic data-driven ORTH gazetteer
|
# 3. Test basic data-driven ORTH gazetteer
|
||||||
# 4. Test more nuanced date and currency regex
|
# 4. Test more nuanced date and currency regex
|
||||||
cdef int i
|
cdef int i
|
||||||
|
@ -428,7 +436,7 @@ cdef class Doc:
|
||||||
# At this point we don't know whether the NER has run over the
|
# At this point we don't know whether the NER has run over the
|
||||||
# Doc. If the ent_iob is missing, leave it missing.
|
# Doc. If the ent_iob is missing, leave it missing.
|
||||||
if self.c[i].ent_iob != 0:
|
if self.c[i].ent_iob != 0:
|
||||||
self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
|
self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
|
||||||
cdef attr_t ent_type
|
cdef attr_t ent_type
|
||||||
cdef int start, end
|
cdef int start, end
|
||||||
for ent_info in ents:
|
for ent_info in ents:
|
||||||
|
@ -456,10 +464,11 @@ cdef class Doc:
|
||||||
|
|
||||||
property noun_chunks:
|
property noun_chunks:
|
||||||
"""Iterate over the base noun phrases in the document. Yields base
|
"""Iterate over the base noun phrases in the document. Yields base
|
||||||
noun-phrase #[code Span] objects, if the document has been syntactically
|
noun-phrase #[code Span] objects, if the document has been
|
||||||
parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
|
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
|
||||||
not permit other NPs to be nested within it – so no NP-level
|
phrase that does not permit other NPs to be nested within it – so no
|
||||||
coordination, no prepositional phrases, and no relative clauses.
|
NP-level coordination, no prepositional phrases, and no relative
|
||||||
|
clauses.
|
||||||
|
|
||||||
YIELDS (Span): Noun chunks in the document.
|
YIELDS (Span): Noun chunks in the document.
|
||||||
"""
|
"""
|
||||||
|
@ -467,12 +476,14 @@ cdef class Doc:
|
||||||
if not self.is_parsed:
|
if not self.is_parsed:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"noun_chunks requires the dependency parse, which "
|
"noun_chunks requires the dependency parse, which "
|
||||||
"requires data to be installed. For more info, see the "
|
"requires a statistical model to be installed and loaded. "
|
||||||
|
"For more info, see the "
|
||||||
"documentation: \n%s\n" % about.__docs_models__)
|
"documentation: \n%s\n" % about.__docs_models__)
|
||||||
# Accumulate the result before beginning to iterate over it. This prevents
|
# Accumulate the result before beginning to iterate over it. This
|
||||||
# the tokenisation from being changed out from under us during the iteration.
|
# prevents the tokenisation from being changed out from under us
|
||||||
# The tricky thing here is that Span accepts its tokenisation changing,
|
# during the iteration. The tricky thing here is that Span accepts
|
||||||
# so it's okay once we have the Span objects. See Issue #375
|
# its tokenisation changing, so it's okay once we have the Span
|
||||||
|
# objects. See Issue #375.
|
||||||
spans = []
|
spans = []
|
||||||
for start, end, label in self.noun_chunks_iterator(self):
|
for start, end, label in self.noun_chunks_iterator(self):
|
||||||
spans.append(Span(self, start, end, label=label))
|
spans.append(Span(self, start, end, label=label))
|
||||||
|
@ -497,8 +508,9 @@ cdef class Doc:
|
||||||
|
|
||||||
if not self.is_parsed:
|
if not self.is_parsed:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"sentence boundary detection requires the dependency parse, which "
|
"Sentence boundary detection requires the dependency "
|
||||||
"requires data to be installed. For more info, see the "
|
"parse, which requires a statistical model to be "
|
||||||
|
"installed and loaded. For more info, see the "
|
||||||
"documentation: \n%s\n" % about.__docs_models__)
|
"documentation: \n%s\n" % about.__docs_models__)
|
||||||
cdef int i
|
cdef int i
|
||||||
start = 0
|
start = 0
|
||||||
|
@ -537,12 +549,11 @@ cdef class Doc:
|
||||||
@cython.boundscheck(False)
|
@cython.boundscheck(False)
|
||||||
cpdef np.ndarray to_array(self, object py_attr_ids):
|
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||||
"""Export given token attributes to a numpy `ndarray`.
|
"""Export given token attributes to a numpy `ndarray`.
|
||||||
|
If `attr_ids` is a sequence of M attributes, the output array will be
|
||||||
If `attr_ids` is a sequence of M attributes, the output array will
|
of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If
|
||||||
be of shape `(N, M)`, where N is the length of the `Doc`
|
`attr_ids` is a single attribute, the output shape will be (N,). You
|
||||||
(in tokens). If `attr_ids` is a single attribute, the output shape will
|
can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
|
||||||
be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
|
string name (e.g. 'LEMMA' or 'lemma').
|
||||||
or string name (e.g. 'LEMMA' or 'lemma').
|
|
||||||
|
|
||||||
attr_ids (list[]): A list of attributes (int IDs or string names).
|
attr_ids (list[]): A list of attributes (int IDs or string names).
|
||||||
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
|
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
|
||||||
|
@ -566,18 +577,19 @@ cdef class Doc:
|
||||||
# Allow strings, e.g. 'lemma' or 'LEMMA'
|
# Allow strings, e.g. 'lemma' or 'LEMMA'
|
||||||
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_)
|
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_)
|
||||||
for id_ in py_attr_ids]
|
for id_ in py_attr_ids]
|
||||||
# Make an array from the attributes --- otherwise our inner loop is Python
|
# Make an array from the attributes --- otherwise our inner loop is
|
||||||
# dict iteration.
|
# Python dict iteration.
|
||||||
attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
|
attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
|
||||||
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
|
output = numpy.ndarray(shape=(self.length, len(attr_ids)),
|
||||||
|
dtype=numpy.uint64)
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
for j, feature in enumerate(attr_ids):
|
for j, feature in enumerate(attr_ids):
|
||||||
output[i, j] = get_token_attr(&self.c[i], feature)
|
output[i, j] = get_token_attr(&self.c[i], feature)
|
||||||
# Handle 1d case
|
# Handle 1d case
|
||||||
return output if len(attr_ids) >= 2 else output.reshape((self.length,))
|
return output if len(attr_ids) >= 2 else output.reshape((self.length,))
|
||||||
|
|
||||||
|
def count_by(self, attr_id_t attr_id, exclude=None,
|
||||||
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
|
PreshCounter counts=None):
|
||||||
"""Count the frequencies of a given attribute. Produces a dict of
|
"""Count the frequencies of a given attribute. Produces a dict of
|
||||||
`{attribute (int): count (ints)}` frequencies, keyed by the values of
|
`{attribute (int): count (ints)}` frequencies, keyed by the values of
|
||||||
the given attribute ID.
|
the given attribute ID.
|
||||||
|
@ -641,13 +653,12 @@ cdef class Doc:
|
||||||
def from_array(self, attrs, array):
|
def from_array(self, attrs, array):
|
||||||
if SENT_START in attrs and HEAD in attrs:
|
if SENT_START in attrs and HEAD in attrs:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Conflicting attributes specified in doc.from_array():\n"
|
"Conflicting attributes specified in doc.from_array(): "
|
||||||
"(HEAD, SENT_START)\n"
|
"(HEAD, SENT_START)\n"
|
||||||
"The HEAD attribute currently sets sentence boundaries implicitly,\n"
|
"The HEAD attribute currently sets sentence boundaries "
|
||||||
"based on the tree structure. This means the HEAD attribute would "
|
"implicitly, based on the tree structure. This means the HEAD "
|
||||||
"potentially override the sentence boundaries set by SENT_START.\n"
|
"attribute would potentially override the sentence boundaries "
|
||||||
"See https://github.com/spacy-io/spaCy/issues/235 for details and "
|
"set by SENT_START.")
|
||||||
"workarounds, and to propose solutions.")
|
|
||||||
cdef int i, col
|
cdef int i, col
|
||||||
cdef attr_id_t attr_id
|
cdef attr_id_t attr_id
|
||||||
cdef TokenC* tokens = self.c
|
cdef TokenC* tokens = self.c
|
||||||
|
@ -675,18 +686,14 @@ cdef class Doc:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def get_lca_matrix(self):
|
def get_lca_matrix(self):
|
||||||
'''
|
"""Calculates the lowest common ancestor matrix for a given `Doc`.
|
||||||
Calculates the lowest common ancestor matrix
|
Returns LCA matrix containing the integer index of the ancestor, or -1
|
||||||
for a given Spacy doc.
|
if no common ancestor is found (ex if span excludes a necessary
|
||||||
Returns LCA matrix containing the integer index
|
ancestor). Apologies about the recursion, but the impact on
|
||||||
of the ancestor, or -1 if no common ancestor is
|
performance is negligible given the natural limitations on the depth
|
||||||
found (ex if span excludes a necessary ancestor).
|
of a typical human sentence.
|
||||||
Apologies about the recursion, but the
|
"""
|
||||||
impact on performance is negligible given
|
|
||||||
the natural limitations on the depth of a typical human sentence.
|
|
||||||
'''
|
|
||||||
# Efficiency notes:
|
# Efficiency notes:
|
||||||
#
|
|
||||||
# We can easily improve the performance here by iterating in Cython.
|
# We can easily improve the performance here by iterating in Cython.
|
||||||
# To loop over the tokens in Cython, the easiest way is:
|
# To loop over the tokens in Cython, the easiest way is:
|
||||||
# for token in doc.c[:doc.c.length]:
|
# for token in doc.c[:doc.c.length]:
|
||||||
|
@ -705,7 +712,8 @@ cdef class Doc:
|
||||||
elif (token_j.head == token_j) and (token_k.head == token_k):
|
elif (token_j.head == token_j) and (token_k.head == token_k):
|
||||||
lca_index = -1
|
lca_index = -1
|
||||||
else:
|
else:
|
||||||
lca_index = __pairwise_lca(token_j.head, token_k.head, lca_matrix)
|
lca_index = __pairwise_lca(token_j.head, token_k.head,
|
||||||
|
lca_matrix)
|
||||||
lca_matrix[token_j.i][token_k.i] = lca_index
|
lca_matrix[token_j.i][token_k.i] = lca_index
|
||||||
lca_matrix[token_k.i][token_j.i] = lca_index
|
lca_matrix[token_k.i][token_j.i] = lca_index
|
||||||
|
|
||||||
|
@ -719,14 +727,13 @@ cdef class Doc:
|
||||||
token_k = self[k]
|
token_k = self[k]
|
||||||
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix)
|
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix)
|
||||||
lca_matrix[k][j] = lca_matrix[j][k]
|
lca_matrix[k][j] = lca_matrix[j][k]
|
||||||
|
|
||||||
return lca_matrix
|
return lca_matrix
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, **exclude):
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
path (unicode or Path): A path to a directory, which will be created if
|
path (unicode or Path): A path to a directory, which will be created if
|
||||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||||
"""
|
"""
|
||||||
with path.open('wb') as file_:
|
with path.open('wb') as file_:
|
||||||
file_.write(self.to_bytes(**exclude))
|
file_.write(self.to_bytes(**exclude))
|
||||||
|
@ -749,7 +756,7 @@ cdef class Doc:
|
||||||
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
|
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
|
||||||
all annotations.
|
all annotations.
|
||||||
"""
|
"""
|
||||||
array_head = [LENGTH,SPACY,TAG,LEMMA,HEAD,DEP,ENT_IOB,ENT_TYPE]
|
array_head = [LENGTH, SPACY, TAG, LEMMA, HEAD, DEP, ENT_IOB, ENT_TYPE]
|
||||||
# Msgpack doesn't distinguish between lists and tuples, which is
|
# Msgpack doesn't distinguish between lists and tuples, which is
|
||||||
# vexing for user data. As a best guess, we *know* that within
|
# vexing for user data. As a best guess, we *know* that within
|
||||||
# keys, we must have tuples. In values we just have to hope
|
# keys, we must have tuples. In values we just have to hope
|
||||||
|
@ -792,7 +799,8 @@ cdef class Doc:
|
||||||
# keys, we must have tuples. In values we just have to hope
|
# keys, we must have tuples. In values we just have to hope
|
||||||
# users don't mind getting a list instead of a tuple.
|
# users don't mind getting a list instead of a tuple.
|
||||||
if 'user_data' not in exclude and 'user_data_keys' in msg:
|
if 'user_data' not in exclude and 'user_data_keys' in msg:
|
||||||
user_data_keys = msgpack.loads(msg['user_data_keys'], use_list=False)
|
user_data_keys = msgpack.loads(msg['user_data_keys'],
|
||||||
|
use_list=False)
|
||||||
user_data_values = msgpack.loads(msg['user_data_values'])
|
user_data_values = msgpack.loads(msg['user_data_values'])
|
||||||
for key, value in zip(user_data_keys, user_data_values):
|
for key, value in zip(user_data_keys, user_data_values):
|
||||||
self.user_data[key] = value
|
self.user_data[key] = value
|
||||||
|
@ -819,14 +827,15 @@ cdef class Doc:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def merge(self, int start_idx, int end_idx, *args, **attributes):
|
def merge(self, int start_idx, int end_idx, *args, **attributes):
|
||||||
"""Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
|
"""Retokenize the document, such that the span at
|
||||||
is merged into a single token. If `start_idx` and `end_idx `do not mark
|
`doc.text[start_idx : end_idx]` is merged into a single token. If
|
||||||
start and end token boundaries, the document remains unchanged.
|
`start_idx` and `end_idx `do not mark start and end token boundaries,
|
||||||
|
the document remains unchanged.
|
||||||
|
|
||||||
start_idx (int): The character index of the start of the slice to merge.
|
start_idx (int): Character index of the start of the slice to merge.
|
||||||
end_idx (int): The character index after the end of the slice to merge.
|
end_idx (int): Character index after the end of the slice to merge.
|
||||||
**attributes: Attributes to assign to the merged token. By default,
|
**attributes: Attributes to assign to the merged token. By default,
|
||||||
attributes are inherited from the syntactic root token of the span.
|
attributes are inherited from the syntactic root of the span.
|
||||||
RETURNS (Token): The newly merged token, or `None` if the start and end
|
RETURNS (Token): The newly merged token, or `None` if the start and end
|
||||||
indices did not fall at token boundaries.
|
indices did not fall at token boundaries.
|
||||||
"""
|
"""
|
||||||
|
@ -847,10 +856,11 @@ cdef class Doc:
|
||||||
attributes[ENT_TYPE] = attributes['ent_type']
|
attributes[ENT_TYPE] = attributes['ent_type']
|
||||||
elif args:
|
elif args:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Doc.merge received %d non-keyword arguments. "
|
"Doc.merge received %d non-keyword arguments. Expected either "
|
||||||
"Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
|
"3 arguments (deprecated), or 0 (use keyword arguments). "
|
||||||
"Arguments supplied:\n%s\n"
|
"Arguments supplied:\n%s\n"
|
||||||
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
|
"Keyword arguments: %s\n" % (len(args), repr(args),
|
||||||
|
repr(attributes)))
|
||||||
|
|
||||||
# More deprecated attribute handling =/
|
# More deprecated attribute handling =/
|
||||||
if 'label' in attributes:
|
if 'label' in attributes:
|
||||||
|
@ -882,8 +892,9 @@ cdef class Doc:
|
||||||
Token.set_struct_attr(token, attr_name, attr_value)
|
Token.set_struct_attr(token, attr_name, attr_value)
|
||||||
# Begin by setting all the head indices to absolute token positions
|
# Begin by setting all the head indices to absolute token positions
|
||||||
# This is easier to work with for now than the offsets
|
# This is easier to work with for now than the offsets
|
||||||
# Before thinking of something simpler, beware the case where a dependency
|
# Before thinking of something simpler, beware the case where a
|
||||||
# bridges over the entity. Here the alignment of the tokens changes.
|
# dependency bridges over the entity. Here the alignment of the
|
||||||
|
# tokens changes.
|
||||||
span_root = span.root.i
|
span_root = span.root.i
|
||||||
token.dep = span.root.dep
|
token.dep = span.root.dep
|
||||||
# We update token.lex after keeping span root and dep, since
|
# We update token.lex after keeping span root and dep, since
|
||||||
|
@ -932,8 +943,9 @@ cdef class Doc:
|
||||||
>>> trees = doc.print_tree()
|
>>> trees = doc.print_tree()
|
||||||
>>> trees[1]
|
>>> trees[1]
|
||||||
{'modifiers': [
|
{'modifiers': [
|
||||||
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
|
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
|
||||||
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
|
'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
|
||||||
|
'lemma': 'Alice'},
|
||||||
{'modifiers': [
|
{'modifiers': [
|
||||||
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
|
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
|
||||||
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
|
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
|
||||||
|
@ -1018,4 +1030,3 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data):
|
||||||
|
|
||||||
|
|
||||||
copy_reg.pickle(Doc, pickle_doc, unpickle_doc)
|
copy_reg.pickle(Doc, pickle_doc, unpickle_doc)
|
||||||
|
|
||||||
|
|
|
@ -43,8 +43,8 @@ def POS_tree(root, light=False, flat=False):
|
||||||
|
|
||||||
|
|
||||||
def parse_tree(doc, light=False, flat=False):
|
def parse_tree(doc, light=False, flat=False):
|
||||||
"""Makes a copy of the doc, then construct a syntactic parse tree, similar to
|
"""Make a copy of the doc and construct a syntactic parse tree similar to
|
||||||
the one used in displaCy. Generates the POS tree for all sentences in a doc.
|
displaCy. Generates the POS tree for all sentences in a doc.
|
||||||
|
|
||||||
doc (Doc): The doc for parsing.
|
doc (Doc): The doc for parsing.
|
||||||
RETURNS (dict): The parse tree.
|
RETURNS (dict): The parse tree.
|
||||||
|
@ -66,8 +66,9 @@ def parse_tree(doc, light=False, flat=False):
|
||||||
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
|
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
|
||||||
'POS_fine': 'VBD', 'lemma': 'eat'}
|
'POS_fine': 'VBD', 'lemma': 'eat'}
|
||||||
"""
|
"""
|
||||||
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
|
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
|
||||||
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
|
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
|
||||||
doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]))
|
doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]))
|
||||||
merge_ents(doc_clone) # merge the entities into single tokens first
|
merge_ents(doc_clone) # merge the entities into single tokens first
|
||||||
return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents]
|
return [POS_tree(sent.root, light=light, flat=flat)
|
||||||
|
for sent in doc_clone.sents]
|
||||||
|
|
|
@ -35,15 +35,16 @@ cdef class Span:
|
||||||
def has_extension(cls, name):
|
def has_extension(cls, name):
|
||||||
return name in Underscore.span_extensions
|
return name in Underscore.span_extensions
|
||||||
|
|
||||||
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
|
def __cinit__(self, Doc doc, int start, int end, attr_t label=0,
|
||||||
vector_norm=None):
|
vector=None, vector_norm=None):
|
||||||
"""Create a `Span` object from the slice `doc[start : end]`.
|
"""Create a `Span` object from the slice `doc[start : end]`.
|
||||||
|
|
||||||
doc (Doc): The parent document.
|
doc (Doc): The parent document.
|
||||||
start (int): The index of the first token of the span.
|
start (int): The index of the first token of the span.
|
||||||
end (int): The index of the first token after the span.
|
end (int): The index of the first token after the span.
|
||||||
label (uint64): A label to attach to the Span, e.g. for named entities.
|
label (uint64): A label to attach to the Span, e.g. for named entities.
|
||||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
vector (ndarray[ndim=1, dtype='float32']): A meaning representation
|
||||||
|
of the span.
|
||||||
RETURNS (Span): The newly constructed object.
|
RETURNS (Span): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
if not (0 <= start <= end <= len(doc)):
|
if not (0 <= start <= end <= len(doc)):
|
||||||
|
@ -127,14 +128,17 @@ cdef class Span:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def _(self):
|
def _(self):
|
||||||
|
"""User space for adding custom attribute extensions."""
|
||||||
return Underscore(Underscore.span_extensions, self,
|
return Underscore(Underscore.span_extensions, self,
|
||||||
start=self.start_char, end=self.end_char)
|
start=self.start_char, end=self.end_char)
|
||||||
|
|
||||||
def as_doc(self):
|
def as_doc(self):
|
||||||
'''Create a Doc object view of the Span's data.
|
# TODO: fix
|
||||||
|
"""Create a `Doc` object view of the Span's data. This is mostly
|
||||||
|
useful for C-typed interfaces.
|
||||||
|
|
||||||
This is mostly useful for C-typed interfaces.
|
RETURNS (Doc): The `Doc` view of the span.
|
||||||
'''
|
"""
|
||||||
cdef Doc doc = Doc(self.doc.vocab)
|
cdef Doc doc = Doc(self.doc.vocab)
|
||||||
doc.length = self.end-self.start
|
doc.length = self.end-self.start
|
||||||
doc.c = &self.doc.c[self.start]
|
doc.c = &self.doc.c[self.start]
|
||||||
|
@ -162,7 +166,8 @@ cdef class Span:
|
||||||
attributes are inherited from the syntactic root token of the span.
|
attributes are inherited from the syntactic root token of the span.
|
||||||
RETURNS (Token): The newly merged token.
|
RETURNS (Token): The newly merged token.
|
||||||
"""
|
"""
|
||||||
return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
|
return self.doc.merge(self.start_char, self.end_char, *args,
|
||||||
|
**attributes)
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
"""Make a semantic similarity estimate. The default estimate is cosine
|
"""Make a semantic similarity estimate. The default estimate is cosine
|
||||||
|
@ -179,24 +184,19 @@ cdef class Span:
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
||||||
def get_lca_matrix(self):
|
def get_lca_matrix(self):
|
||||||
'''
|
"""Calculates the lowest common ancestor matrix for a given `Span`.
|
||||||
Calculates the lowest common ancestor matrix
|
Returns LCA matrix containing the integer index of the ancestor, or -1
|
||||||
for a given Spacy span.
|
if no common ancestor is found (ex if span excludes a necessary
|
||||||
Returns LCA matrix containing the integer index
|
ancestor). Apologies about the recursion, but the impact on
|
||||||
of the ancestor, or -1 if no common ancestor is
|
performance is negligible given the natural limitations on the depth
|
||||||
found (ex if span excludes a necessary ancestor).
|
of a typical human sentence.
|
||||||
Apologies about the recursion, but the
|
"""
|
||||||
impact on performance is negligible given
|
|
||||||
the natural limitations on the depth of a typical human sentence.
|
|
||||||
'''
|
|
||||||
|
|
||||||
def __pairwise_lca(token_j, token_k, lca_matrix, margins):
|
def __pairwise_lca(token_j, token_k, lca_matrix, margins):
|
||||||
offset = margins[0]
|
offset = margins[0]
|
||||||
token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k
|
token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k
|
||||||
token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j
|
token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j
|
||||||
token_j_i = token_j.i - offset
|
token_j_i = token_j.i - offset
|
||||||
token_k_i = token_k.i - offset
|
token_k_i = token_k.i - offset
|
||||||
|
|
||||||
if lca_matrix[token_j_i][token_k_i] != -2:
|
if lca_matrix[token_j_i][token_k_i] != -2:
|
||||||
return lca_matrix[token_j_i][token_k_i]
|
return lca_matrix[token_j_i][token_k_i]
|
||||||
elif token_j == token_k:
|
elif token_j == token_k:
|
||||||
|
@ -209,23 +209,19 @@ cdef class Span:
|
||||||
lca_index = -1
|
lca_index = -1
|
||||||
else:
|
else:
|
||||||
lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins)
|
lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins)
|
||||||
|
|
||||||
lca_matrix[token_j_i][token_k_i] = lca_index
|
lca_matrix[token_j_i][token_k_i] = lca_index
|
||||||
lca_matrix[token_k_i][token_j_i] = lca_index
|
lca_matrix[token_k_i][token_j_i] = lca_index
|
||||||
|
|
||||||
return lca_index
|
return lca_index
|
||||||
|
|
||||||
lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32)
|
lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32)
|
||||||
lca_matrix.fill(-2)
|
lca_matrix.fill(-2)
|
||||||
margins = [self.start, self.end]
|
margins = [self.start, self.end]
|
||||||
|
|
||||||
for j in range(len(self)):
|
for j in range(len(self)):
|
||||||
token_j = self[j]
|
token_j = self[j]
|
||||||
for k in range(len(self)):
|
for k in range(len(self)):
|
||||||
token_k = self[k]
|
token_k = self[k]
|
||||||
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins)
|
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins)
|
||||||
lca_matrix[k][j] = lca_matrix[j][k]
|
lca_matrix[k][j] = lca_matrix[j][k]
|
||||||
|
|
||||||
return lca_matrix
|
return lca_matrix
|
||||||
|
|
||||||
cpdef np.ndarray to_array(self, object py_attr_ids):
|
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||||
|
@ -266,10 +262,7 @@ cdef class Span:
|
||||||
self.end = end + 1
|
self.end = end + 1
|
||||||
|
|
||||||
property sent:
|
property sent:
|
||||||
"""The sentence span that this span is a part of.
|
"""RETURNS (Span): The sentence span that the span is a part of."""
|
||||||
|
|
||||||
RETURNS (Span): The sentence span that the span is a part of.
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'sent' in self.doc.user_span_hooks:
|
if 'sent' in self.doc.user_span_hooks:
|
||||||
return self.doc.user_span_hooks['sent'](self)
|
return self.doc.user_span_hooks['sent'](self)
|
||||||
|
@ -282,13 +275,10 @@ cdef class Span:
|
||||||
n += 1
|
n += 1
|
||||||
if n >= self.doc.length:
|
if n >= self.doc.length:
|
||||||
raise RuntimeError
|
raise RuntimeError
|
||||||
return self.doc[root.l_edge : root.r_edge + 1]
|
return self.doc[root.l_edge:root.r_edge + 1]
|
||||||
|
|
||||||
property has_vector:
|
property has_vector:
|
||||||
"""A boolean value indicating whether a word vector is associated with
|
"""RETURNS (bool): Whether a word vector is associated with the object.
|
||||||
the object.
|
|
||||||
|
|
||||||
RETURNS (bool): Whether a word vector is associated with the object.
|
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'has_vector' in self.doc.user_span_hooks:
|
if 'has_vector' in self.doc.user_span_hooks:
|
||||||
|
@ -310,10 +300,7 @@ cdef class Span:
|
||||||
return self._vector
|
return self._vector
|
||||||
|
|
||||||
property vector_norm:
|
property vector_norm:
|
||||||
"""The L2 norm of the document's vector representation.
|
"""RETURNS (float): The L2 norm of the vector representation."""
|
||||||
|
|
||||||
RETURNS (float): The L2 norm of the vector representation.
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'vector_norm' in self.doc.user_span_hooks:
|
if 'vector_norm' in self.doc.user_span_hooks:
|
||||||
return self.doc.user_span_hooks['vector'](self)
|
return self.doc.user_span_hooks['vector'](self)
|
||||||
|
@ -327,7 +314,9 @@ cdef class Span:
|
||||||
return self._vector_norm
|
return self._vector_norm
|
||||||
|
|
||||||
property sentiment:
|
property sentiment:
|
||||||
# TODO: docstring
|
"""RETURNS (float): A scalar value indicating the positivity or
|
||||||
|
negativity of the span.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'sentiment' in self.doc.user_span_hooks:
|
if 'sentiment' in self.doc.user_span_hooks:
|
||||||
return self.doc.user_span_hooks['sentiment'](self)
|
return self.doc.user_span_hooks['sentiment'](self)
|
||||||
|
@ -335,10 +324,7 @@ cdef class Span:
|
||||||
return sum([token.sentiment for token in self]) / len(self)
|
return sum([token.sentiment for token in self]) / len(self)
|
||||||
|
|
||||||
property text:
|
property text:
|
||||||
"""A unicode representation of the span text.
|
"""RETURNS (unicode): The original verbatim text of the span."""
|
||||||
|
|
||||||
RETURNS (unicode): The original verbatim text of the span.
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
text = self.text_with_ws
|
text = self.text_with_ws
|
||||||
if self[-1].whitespace_:
|
if self[-1].whitespace_:
|
||||||
|
@ -349,7 +335,8 @@ cdef class Span:
|
||||||
"""The text content of the span with a trailing whitespace character if
|
"""The text content of the span with a trailing whitespace character if
|
||||||
the last token has one.
|
the last token has one.
|
||||||
|
|
||||||
RETURNS (unicode): The text content of the span (with trailing whitespace).
|
RETURNS (unicode): The text content of the span (with trailing
|
||||||
|
whitespace).
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return u''.join([t.text_with_ws for t in self])
|
return u''.join([t.text_with_ws for t in self])
|
||||||
|
@ -358,7 +345,8 @@ cdef class Span:
|
||||||
"""Yields base noun-phrase `Span` objects, if the document has been
|
"""Yields base noun-phrase `Span` objects, if the document has been
|
||||||
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
|
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
|
||||||
phrase that does not permit other NPs to be nested within it – so no
|
phrase that does not permit other NPs to be nested within it – so no
|
||||||
NP-level coordination, no prepositional phrases, and no relative clauses.
|
NP-level coordination, no prepositional phrases, and no relative
|
||||||
|
clauses.
|
||||||
|
|
||||||
YIELDS (Span): Base noun-phrase `Span` objects
|
YIELDS (Span): Base noun-phrase `Span` objects
|
||||||
"""
|
"""
|
||||||
|
@ -366,12 +354,14 @@ cdef class Span:
|
||||||
if not self.doc.is_parsed:
|
if not self.doc.is_parsed:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"noun_chunks requires the dependency parse, which "
|
"noun_chunks requires the dependency parse, which "
|
||||||
"requires data to be installed. For more info, see the "
|
"requires a statistical model to be installed and loaded. "
|
||||||
|
"For more info, see the "
|
||||||
"documentation: \n%s\n" % about.__docs_models__)
|
"documentation: \n%s\n" % about.__docs_models__)
|
||||||
# Accumulate the result before beginning to iterate over it. This prevents
|
# Accumulate the result before beginning to iterate over it. This
|
||||||
# the tokenisation from being changed out from under us during the iteration.
|
# prevents the tokenisation from being changed out from under us
|
||||||
# The tricky thing here is that Span accepts its tokenisation changing,
|
# during the iteration. The tricky thing here is that Span accepts
|
||||||
# so it's okay once we have the Span objects. See Issue #375
|
# its tokenisation changing, so it's okay once we have the Span
|
||||||
|
# objects. See Issue #375
|
||||||
spans = []
|
spans = []
|
||||||
cdef attr_t label
|
cdef attr_t label
|
||||||
for start, end, label in self.doc.noun_chunks_iterator(self):
|
for start, end, label in self.doc.noun_chunks_iterator(self):
|
||||||
|
@ -385,9 +375,9 @@ cdef class Span:
|
||||||
|
|
||||||
RETURNS (Token): The root token.
|
RETURNS (Token): The root token.
|
||||||
|
|
||||||
EXAMPLE: The root token has the shortest path to the root of the sentence
|
EXAMPLE: The root token has the shortest path to the root of the
|
||||||
(or is the root itself). If multiple words are equally high in the
|
sentence (or is the root itself). If multiple words are equally
|
||||||
tree, the first word is taken. For example:
|
high in the tree, the first word is taken. For example:
|
||||||
|
|
||||||
>>> toks = nlp(u'I like New York in Autumn.')
|
>>> toks = nlp(u'I like New York in Autumn.')
|
||||||
|
|
||||||
|
@ -437,11 +427,11 @@ cdef class Span:
|
||||||
if self.doc.c[i].head == 0:
|
if self.doc.c[i].head == 0:
|
||||||
return self.doc[i]
|
return self.doc[i]
|
||||||
# If we don't have a sentence root, we do something that's not so
|
# If we don't have a sentence root, we do something that's not so
|
||||||
# algorithmically clever, but I think should be quite fast, especially
|
# algorithmically clever, but I think should be quite fast,
|
||||||
# for short spans.
|
# especially for short spans.
|
||||||
# For each word, we count the path length, and arg min this measure.
|
# For each word, we count the path length, and arg min this measure.
|
||||||
# We could use better tree logic to save steps here...But I think this
|
# We could use better tree logic to save steps here...But I
|
||||||
# should be okay.
|
# think this should be okay.
|
||||||
cdef int current_best = self.doc.length
|
cdef int current_best = self.doc.length
|
||||||
cdef int root = -1
|
cdef int root = -1
|
||||||
for i in range(self.start, self.end):
|
for i in range(self.start, self.end):
|
||||||
|
@ -463,7 +453,7 @@ cdef class Span:
|
||||||
YIELDS (Token):A left-child of a token of the span.
|
YIELDS (Token):A left-child of a token of the span.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
for token in reversed(self): # Reverse, so we get the tokens in order
|
for token in reversed(self): # Reverse, so we get tokens in order
|
||||||
for left in token.lefts:
|
for left in token.lefts:
|
||||||
if left.i < self.start:
|
if left.i < self.start:
|
||||||
yield left
|
yield left
|
||||||
|
@ -480,6 +470,22 @@ cdef class Span:
|
||||||
if right.i >= self.end:
|
if right.i >= self.end:
|
||||||
yield right
|
yield right
|
||||||
|
|
||||||
|
property n_lefts:
|
||||||
|
"""RETURNS (int): The number of leftward immediate children of the
|
||||||
|
span, in the syntactic dependency parse.
|
||||||
|
"""
|
||||||
|
# TODO: implement
|
||||||
|
def __get__(self):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
property n_rights:
|
||||||
|
"""RETURNS (int): The number of rightward immediate children of the
|
||||||
|
span, in the syntactic dependency parse.
|
||||||
|
"""
|
||||||
|
# TODO: implement
|
||||||
|
def __get__(self):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
property subtree:
|
property subtree:
|
||||||
"""Tokens that descend from tokens in the span, but fall outside it.
|
"""Tokens that descend from tokens in the span, but fall outside it.
|
||||||
|
|
||||||
|
@ -493,66 +499,55 @@ cdef class Span:
|
||||||
yield from word.subtree
|
yield from word.subtree
|
||||||
|
|
||||||
property ent_id:
|
property ent_id:
|
||||||
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
|
"""RETURNS (uint64): The entity ID."""
|
||||||
|
|
||||||
RETURNS (uint64): The entity ID.
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.root.ent_id
|
return self.root.ent_id
|
||||||
|
|
||||||
def __set__(self, hash_t key):
|
def __set__(self, hash_t key):
|
||||||
# TODO
|
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Can't yet set ent_id from Span. Vote for this feature on the issue "
|
"Can't yet set ent_id from Span. Vote for this feature on "
|
||||||
"tracker: http://github.com/explosion/spaCy/issues")
|
"the issue tracker: http://github.com/explosion/spaCy/issues")
|
||||||
|
|
||||||
property ent_id_:
|
property ent_id_:
|
||||||
"""A (string) entity ID. Usually assigned by patterns in the `Matcher`.
|
"""RETURNS (unicode): The (string) entity ID."""
|
||||||
|
|
||||||
RETURNS (unicode): The entity ID.
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.root.ent_id_
|
return self.root.ent_id_
|
||||||
|
|
||||||
def __set__(self, hash_t key):
|
def __set__(self, hash_t key):
|
||||||
# TODO
|
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Can't yet set ent_id_ from Span. Vote for this feature on the issue "
|
"Can't yet set ent_id_ from Span. Vote for this feature on the "
|
||||||
"tracker: http://github.com/explosion/spaCy/issues")
|
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||||
|
|
||||||
property orth_:
|
property orth_:
|
||||||
# TODO: docstring
|
"""Verbatim text content (identical to Span.text). Exists mostly for
|
||||||
|
consistency with other attributes.
|
||||||
|
|
||||||
|
RETURNS (unicode): The span's text."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return ''.join([t.string for t in self]).strip()
|
return ''.join([t.orth_ for t in self]).strip()
|
||||||
|
|
||||||
property lemma_:
|
property lemma_:
|
||||||
"""The span's lemma.
|
"""RETURNS (unicode): The span's lemma."""
|
||||||
|
|
||||||
RETURNS (unicode): The span's lemma.
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return ' '.join([t.lemma_ for t in self]).strip()
|
return ' '.join([t.lemma_ for t in self]).strip()
|
||||||
|
|
||||||
property upper_:
|
property upper_:
|
||||||
# TODO: docstring
|
"""Deprecated. Use Span.text.upper() instead."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return ''.join([t.string.upper() for t in self]).strip()
|
return ''.join([t.text_with_ws.upper() for t in self]).strip()
|
||||||
|
|
||||||
property lower_:
|
property lower_:
|
||||||
# TODO: docstring
|
"""Deprecated. Use Span.text.lower() instead."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return ''.join([t.string.lower() for t in self]).strip()
|
return ''.join([t.text_with_ws.lower() for t in self]).strip()
|
||||||
|
|
||||||
property string:
|
property string:
|
||||||
# TODO: docstring
|
"""Deprecated: Use Span.text_with_ws instead."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return ''.join([t.string for t in self])
|
return ''.join([t.text_with_ws for t in self])
|
||||||
|
|
||||||
property label_:
|
property label_:
|
||||||
"""The span's label.
|
"""RETURNS (unicode): The span's label."""
|
||||||
|
|
||||||
RETURNS (unicode): The span's label.
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.doc.vocab.strings[self.label]
|
return self.doc.vocab.strings[self.label]
|
||||||
|
|
||||||
|
@ -570,7 +565,8 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
|
||||||
n += 1
|
n += 1
|
||||||
if n >= sent_length:
|
if n >= sent_length:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"Array bounds exceeded while searching for root word. This likely "
|
"Array bounds exceeded while searching for root word. This "
|
||||||
"means the parse tree is in an invalid state. Please report this "
|
"likely means the parse tree is in an invalid state. Please "
|
||||||
"issue here: http://github.com/explosion/spaCy/issues")
|
"report this issue here: "
|
||||||
|
"http://github.com/explosion/spaCy/issues")
|
||||||
return n
|
return n
|
||||||
|
|
|
@ -14,17 +14,18 @@ from ..typedefs cimport hash_t
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
from .. import parts_of_speech
|
from .. import parts_of_speech
|
||||||
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||||
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
|
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
|
||||||
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL
|
||||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
|
||||||
from ..attrs cimport LEMMA, POS, TAG, DEP
|
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
|
||||||
from ..compat import is_config
|
from ..compat import is_config
|
||||||
from .. import about
|
from .. import about
|
||||||
from .underscore import Underscore
|
from .underscore import Underscore
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
"""An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
|
"""An individual token – i.e. a word, punctuation symbol, whitespace,
|
||||||
|
etc."""
|
||||||
@classmethod
|
@classmethod
|
||||||
def set_extension(cls, name, default=None, method=None,
|
def set_extension(cls, name, default=None, method=None,
|
||||||
getter=None, setter=None):
|
getter=None, setter=None):
|
||||||
|
@ -144,37 +145,33 @@ cdef class Token:
|
||||||
return self.doc.user_token_hooks['similarity'](self)
|
return self.doc.user_token_hooks['similarity'](self)
|
||||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return (numpy.dot(self.vector, other.vector) /
|
||||||
|
(self.vector_norm * other.vector_norm))
|
||||||
|
|
||||||
property lex_id:
|
property lex_id:
|
||||||
"""ID of the token's lexical type.
|
"""RETURNS (int): Sequential ID of the token's lexical type."""
|
||||||
|
|
||||||
RETURNS (int): ID of the token's lexical type."""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.id
|
return self.c.lex.id
|
||||||
|
|
||||||
property rank:
|
property rank:
|
||||||
# TODO: add docstring
|
"""RETURNS (int): Sequential ID of the token's lexical type, used to
|
||||||
|
index into tables, e.g. for word vectors."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.id
|
return self.c.lex.id
|
||||||
|
|
||||||
property string:
|
property string:
|
||||||
|
"""Deprecated: Use Token.text_with_ws instead."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.text_with_ws
|
return self.text_with_ws
|
||||||
|
|
||||||
property text:
|
property text:
|
||||||
"""A unicode representation of the token text.
|
"""RETURNS (unicode): The original verbatim text of the token."""
|
||||||
|
|
||||||
RETURNS (unicode): The original verbatim text of the token.
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.orth_
|
return self.orth_
|
||||||
|
|
||||||
property text_with_ws:
|
property text_with_ws:
|
||||||
"""The text content of the token with a trailing whitespace character if
|
"""RETURNS (unicode): The text content of the span (with trailing
|
||||||
it has one.
|
whitespace).
|
||||||
|
|
||||||
RETURNS (unicode): The text content of the span (with trailing whitespace).
|
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
|
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
|
||||||
|
@ -184,74 +181,104 @@ cdef class Token:
|
||||||
return orth
|
return orth
|
||||||
|
|
||||||
property prob:
|
property prob:
|
||||||
|
"""RETURNS (float): Smoothed log probability estimate of token type."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.prob
|
return self.c.lex.prob
|
||||||
|
|
||||||
property sentiment:
|
property sentiment:
|
||||||
|
"""RETURNS (float): A scalar value indicating the positivity or
|
||||||
|
negativity of the token."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'sentiment' in self.doc.user_token_hooks:
|
if 'sentiment' in self.doc.user_token_hooks:
|
||||||
return self.doc.user_token_hooks['sentiment'](self)
|
return self.doc.user_token_hooks['sentiment'](self)
|
||||||
return self.c.lex.sentiment
|
return self.c.lex.sentiment
|
||||||
|
|
||||||
property lang:
|
property lang:
|
||||||
|
"""RETURNS (uint64): ID of the language of the parent document's
|
||||||
|
vocabulary.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.lang
|
return self.c.lex.lang
|
||||||
|
|
||||||
property idx:
|
property idx:
|
||||||
|
"""RETURNS (int): The character offset of the token within the parent
|
||||||
|
document.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.idx
|
return self.c.idx
|
||||||
|
|
||||||
property cluster:
|
property cluster:
|
||||||
|
"""RETURNS (int): Brown cluster ID."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.cluster
|
return self.c.lex.cluster
|
||||||
|
|
||||||
property orth:
|
property orth:
|
||||||
|
"""RETURNS (uint64): ID of the verbatim text content."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.orth
|
return self.c.lex.orth
|
||||||
|
|
||||||
property lower:
|
property lower:
|
||||||
|
"""RETURNS (uint64): ID of the lowercase token text."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.lower
|
return self.c.lex.lower
|
||||||
|
|
||||||
property norm:
|
property norm:
|
||||||
|
"""RETURNS (uint64): ID of the token's norm, i.e. a normalised form of
|
||||||
|
the token text. Usually set in the language's tokenizer exceptions
|
||||||
|
or norm exceptions.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.norm
|
return self.c.lex.norm
|
||||||
|
|
||||||
property shape:
|
property shape:
|
||||||
|
"""RETURNS (uint64): ID of the token's shape, a transform of the
|
||||||
|
tokens's string, to show orthographic features (e.g. "Xxxx", "dd").
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.shape
|
return self.c.lex.shape
|
||||||
|
|
||||||
property prefix:
|
property prefix:
|
||||||
|
"""RETURNS (uint64): ID of a length-N substring from the start of the
|
||||||
|
token. Defaults to `N=1`.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.prefix
|
return self.c.lex.prefix
|
||||||
|
|
||||||
property suffix:
|
property suffix:
|
||||||
|
"""RETURNS (uint64): ID of a length-N substring from the end of the
|
||||||
|
token. Defaults to `N=3`.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lex.suffix
|
return self.c.lex.suffix
|
||||||
|
|
||||||
property lemma:
|
property lemma:
|
||||||
"""Base form of the word, with no inflectional suffixes.
|
"""RETURNS (uint64): ID of the base form of the word, with no
|
||||||
|
inflectional suffixes.
|
||||||
RETURNS (uint64): Token lemma.
|
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.lemma
|
return self.c.lemma
|
||||||
|
|
||||||
def __set__(self, attr_t lemma):
|
def __set__(self, attr_t lemma):
|
||||||
self.c.lemma = lemma
|
self.c.lemma = lemma
|
||||||
|
|
||||||
property pos:
|
property pos:
|
||||||
|
"""RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.pos
|
return self.c.pos
|
||||||
|
|
||||||
property tag:
|
property tag:
|
||||||
|
"""RETURNS (uint64): ID of fine-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.tag
|
return self.c.tag
|
||||||
|
|
||||||
def __set__(self, attr_t tag):
|
def __set__(self, attr_t tag):
|
||||||
self.vocab.morphology.assign_tag(self.c, tag)
|
self.vocab.morphology.assign_tag(self.c, tag)
|
||||||
|
|
||||||
property dep:
|
property dep:
|
||||||
|
"""RETURNS (uint64): ID of syntactic dependency label."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.dep
|
return self.c.dep
|
||||||
|
|
||||||
def __set__(self, attr_t label):
|
def __set__(self, attr_t label):
|
||||||
self.c.dep = label
|
self.c.dep = label
|
||||||
|
|
||||||
|
@ -292,23 +319,29 @@ cdef class Token:
|
||||||
return numpy.sqrt((vector ** 2).sum())
|
return numpy.sqrt((vector ** 2).sum())
|
||||||
|
|
||||||
property n_lefts:
|
property n_lefts:
|
||||||
|
"""RETURNS (int): The number of leftward immediate children of the
|
||||||
|
word, in the syntactic dependency parse.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.l_kids
|
return self.c.l_kids
|
||||||
|
|
||||||
property n_rights:
|
property n_rights:
|
||||||
|
"""RETURNS (int): The number of rightward immediate children of the
|
||||||
|
word, in the syntactic dependency parse.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.r_kids
|
return self.c.r_kids
|
||||||
|
|
||||||
property sent_start:
|
property sent_start:
|
||||||
|
# TODO: fix and document
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.sent_start
|
return self.c.sent_start
|
||||||
|
|
||||||
def __set__(self, value):
|
def __set__(self, value):
|
||||||
if self.doc.is_parsed:
|
if self.doc.is_parsed:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
'Refusing to write to token.sent_start if its document is parsed, '
|
"Refusing to write to token.sent_start if its document "
|
||||||
'because this may cause inconsistent state. '
|
"is parsed, because this may cause inconsistent state.")
|
||||||
'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
|
|
||||||
if value is None:
|
if value is None:
|
||||||
self.c.sent_start = 0
|
self.c.sent_start = 0
|
||||||
elif value is True:
|
elif value is True:
|
||||||
|
@ -316,15 +349,16 @@ cdef class Token:
|
||||||
elif value is False:
|
elif value is False:
|
||||||
self.c.sent_start = -1
|
self.c.sent_start = -1
|
||||||
else:
|
else:
|
||||||
raise ValueError("Invalid value for token.sent_start -- must be one of "
|
raise ValueError("Invalid value for token.sent_start. Must be "
|
||||||
"None, True, False")
|
"one of: None, True, False")
|
||||||
|
|
||||||
property lefts:
|
property lefts:
|
||||||
|
"""The leftward immediate children of the word, in the syntactic
|
||||||
|
dependency parse.
|
||||||
|
|
||||||
|
YIELDS (Token): A left-child of the token.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
"""
|
|
||||||
The leftward immediate children of the word, in the syntactic
|
|
||||||
dependency parse.
|
|
||||||
"""
|
|
||||||
cdef int nr_iter = 0
|
cdef int nr_iter = 0
|
||||||
cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
|
cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
|
||||||
while ptr < self.c:
|
while ptr < self.c:
|
||||||
|
@ -334,15 +368,16 @@ cdef class Token:
|
||||||
nr_iter += 1
|
nr_iter += 1
|
||||||
# This is ugly, but it's a way to guard out infinite loops
|
# This is ugly, but it's a way to guard out infinite loops
|
||||||
if nr_iter >= 10000000:
|
if nr_iter >= 10000000:
|
||||||
raise RuntimeError(
|
raise RuntimeError("Possibly infinite loop encountered "
|
||||||
"Possibly infinite loop encountered while looking for token.lefts")
|
"while looking for token.lefts")
|
||||||
|
|
||||||
property rights:
|
property rights:
|
||||||
|
"""The rightward immediate children of the word, in the syntactic
|
||||||
|
dependency parse.
|
||||||
|
|
||||||
|
YIELDS (Token): A right-child of the token.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
"""
|
|
||||||
The rightward immediate children of the word, in the syntactic
|
|
||||||
dependency parse.
|
|
||||||
"""
|
|
||||||
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
|
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
|
||||||
tokens = []
|
tokens = []
|
||||||
cdef int nr_iter = 0
|
cdef int nr_iter = 0
|
||||||
|
@ -352,27 +387,26 @@ cdef class Token:
|
||||||
ptr -= 1
|
ptr -= 1
|
||||||
nr_iter += 1
|
nr_iter += 1
|
||||||
if nr_iter >= 10000000:
|
if nr_iter >= 10000000:
|
||||||
raise RuntimeError(
|
raise RuntimeError("Possibly infinite loop encountered "
|
||||||
"Possibly infinite loop encountered while looking for token.rights")
|
"while looking for token.rights")
|
||||||
tokens.reverse()
|
tokens.reverse()
|
||||||
for t in tokens:
|
for t in tokens:
|
||||||
yield t
|
yield t
|
||||||
|
|
||||||
property children:
|
property children:
|
||||||
"""
|
"""A sequence of the token's immediate syntactic children.
|
||||||
A sequence of the token's immediate syntactic children.
|
|
||||||
|
|
||||||
Yields: Token A child token such that child.head==self
|
YIELDS (Token): A child token such that child.head==self
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
yield from self.lefts
|
yield from self.lefts
|
||||||
yield from self.rights
|
yield from self.rights
|
||||||
|
|
||||||
property subtree:
|
property subtree:
|
||||||
"""
|
"""A sequence of all the token's syntactic descendents.
|
||||||
A sequence of all the token's syntactic descendents.
|
|
||||||
|
|
||||||
Yields: Token A descendent token such that self.is_ancestor(descendent)
|
YIELDS (Token): A descendent token such that
|
||||||
|
`self.is_ancestor(descendent)`.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
for word in self.lefts:
|
for word in self.lefts:
|
||||||
|
@ -422,18 +456,17 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
if self.doc is not descendant.doc:
|
if self.doc is not descendant.doc:
|
||||||
return False
|
return False
|
||||||
return any( ancestor.i == self.i for ancestor in descendant.ancestors )
|
return any(ancestor.i == self.i for ancestor in descendant.ancestors)
|
||||||
|
|
||||||
property head:
|
property head:
|
||||||
"""The syntactic parent, or "governor", of this token.
|
"""The syntactic parent, or "governor", of this token.
|
||||||
|
|
||||||
RETURNS (Token): The token head.
|
RETURNS (Token): The token predicted by the parser to be the head of
|
||||||
|
the current token.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
"""The token predicted by the parser to be the head of the current
|
|
||||||
token.
|
|
||||||
"""
|
|
||||||
return self.doc[self.i + self.c.head]
|
return self.doc[self.i + self.c.head]
|
||||||
|
|
||||||
def __set__(self, Token new_head):
|
def __set__(self, Token new_head):
|
||||||
# this function sets the head of self to new_head
|
# this function sets the head of self to new_head
|
||||||
# and updates the counters for left/right dependents
|
# and updates the counters for left/right dependents
|
||||||
|
@ -453,16 +486,18 @@ cdef class Token:
|
||||||
cdef Token anc, child
|
cdef Token anc, child
|
||||||
|
|
||||||
# update number of deps of old head
|
# update number of deps of old head
|
||||||
if self.c.head > 0: # left dependent
|
if self.c.head > 0: # left dependent
|
||||||
old_head.c.l_kids -= 1
|
old_head.c.l_kids -= 1
|
||||||
if self.c.l_edge == old_head.c.l_edge:
|
if self.c.l_edge == old_head.c.l_edge:
|
||||||
# the token dominates the left edge so the left edge of the head
|
# the token dominates the left edge so the left edge of
|
||||||
# may change when the token is reattached
|
# the head may change when the token is reattached, it may
|
||||||
# it may not change if the new head is a descendant of the current head
|
# not change if the new head is a descendant of the current
|
||||||
|
# head
|
||||||
|
|
||||||
new_edge = self.c.l_edge
|
new_edge = self.c.l_edge
|
||||||
# the new l_edge is the left-most l_edge on any of the other dependents
|
# the new l_edge is the left-most l_edge on any of the
|
||||||
# where the l_edge is left of the head, otherwise it is the head
|
# other dependents where the l_edge is left of the head,
|
||||||
|
# otherwise it is the head
|
||||||
if not is_desc:
|
if not is_desc:
|
||||||
new_edge = old_head.i
|
new_edge = old_head.i
|
||||||
for child in old_head.children:
|
for child in old_head.children:
|
||||||
|
@ -472,14 +507,15 @@ cdef class Token:
|
||||||
new_edge = child.c.l_edge
|
new_edge = child.c.l_edge
|
||||||
old_head.c.l_edge = new_edge
|
old_head.c.l_edge = new_edge
|
||||||
|
|
||||||
# walk up the tree from old_head and assign new l_edge to ancestors
|
# walk up the tree from old_head and assign new l_edge to
|
||||||
# until an ancestor already has an l_edge that's further left
|
# ancestors until an ancestor already has an l_edge that's
|
||||||
|
# further left
|
||||||
for anc in old_head.ancestors:
|
for anc in old_head.ancestors:
|
||||||
if anc.c.l_edge <= new_edge:
|
if anc.c.l_edge <= new_edge:
|
||||||
break
|
break
|
||||||
anc.c.l_edge = new_edge
|
anc.c.l_edge = new_edge
|
||||||
|
|
||||||
elif self.c.head < 0: # right dependent
|
elif self.c.head < 0: # right dependent
|
||||||
old_head.c.r_kids -= 1
|
old_head.c.r_kids -= 1
|
||||||
# do the same thing as for l_edge
|
# do the same thing as for l_edge
|
||||||
if self.c.r_edge == old_head.c.r_edge:
|
if self.c.r_edge == old_head.c.r_edge:
|
||||||
|
@ -500,7 +536,7 @@ cdef class Token:
|
||||||
anc.c.r_edge = new_edge
|
anc.c.r_edge = new_edge
|
||||||
|
|
||||||
# update number of deps of new head
|
# update number of deps of new head
|
||||||
if rel_newhead_i > 0: # left dependent
|
if rel_newhead_i > 0: # left dependent
|
||||||
new_head.c.l_kids += 1
|
new_head.c.l_kids += 1
|
||||||
# walk up the tree from new head and set l_edge to self.l_edge
|
# walk up the tree from new head and set l_edge to self.l_edge
|
||||||
# until you hit a token with an l_edge further to the left
|
# until you hit a token with an l_edge further to the left
|
||||||
|
@ -511,7 +547,7 @@ cdef class Token:
|
||||||
break
|
break
|
||||||
anc.c.l_edge = self.c.l_edge
|
anc.c.l_edge = self.c.l_edge
|
||||||
|
|
||||||
elif rel_newhead_i < 0: # right dependent
|
elif rel_newhead_i < 0: # right dependent
|
||||||
new_head.c.r_kids += 1
|
new_head.c.r_kids += 1
|
||||||
# do the same as for l_edge
|
# do the same as for l_edge
|
||||||
if self.c.r_edge > new_head.c.r_edge:
|
if self.c.r_edge > new_head.c.r_edge:
|
||||||
|
@ -542,12 +578,10 @@ cdef class Token:
|
||||||
yield from word.conjuncts
|
yield from word.conjuncts
|
||||||
|
|
||||||
property ent_type:
|
property ent_type:
|
||||||
"""Named entity type.
|
"""RETURNS (uint64): Named entity type."""
|
||||||
|
|
||||||
RETURNS (uint64): Named entity type.
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.ent_type
|
return self.c.ent_type
|
||||||
|
|
||||||
def __set__(self, ent_type):
|
def __set__(self, ent_type):
|
||||||
self.c.ent_type = ent_type
|
self.c.ent_type = ent_type
|
||||||
|
|
||||||
|
@ -561,19 +595,17 @@ cdef class Token:
|
||||||
return self.c.ent_iob
|
return self.c.ent_iob
|
||||||
|
|
||||||
property ent_type_:
|
property ent_type_:
|
||||||
"""Named entity type.
|
"""RETURNS (unicode): Named entity type."""
|
||||||
|
|
||||||
RETURNS (unicode): Named entity type.
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.ent_type]
|
return self.vocab.strings[self.c.ent_type]
|
||||||
|
|
||||||
def __set__(self, ent_type):
|
def __set__(self, ent_type):
|
||||||
self.c.ent_type = self.vocab.strings.add(ent_type)
|
self.c.ent_type = self.vocab.strings.add(ent_type)
|
||||||
|
|
||||||
property ent_iob_:
|
property ent_iob_:
|
||||||
"""IOB code of named entity tag. "B" means the token begins an entity,
|
"""IOB code of named entity tag. "B" means the token begins an entity,
|
||||||
"I" means it is inside an entity, "O" means it is outside an entity, and
|
"I" means it is inside an entity, "O" means it is outside an entity,
|
||||||
"" means no entity tag is set.
|
and "" means no entity tag is set.
|
||||||
|
|
||||||
RETURNS (unicode): IOB code of named entity tag.
|
RETURNS (unicode): IOB code of named entity tag.
|
||||||
"""
|
"""
|
||||||
|
@ -582,10 +614,8 @@ cdef class Token:
|
||||||
return iob_strings[self.c.ent_iob]
|
return iob_strings[self.c.ent_iob]
|
||||||
|
|
||||||
property ent_id:
|
property ent_id:
|
||||||
"""ID of the entity the token is an instance of, if any. Usually
|
"""RETURNS (uint64): ID of the entity the token is an instance of,
|
||||||
assigned by patterns in the Matcher.
|
if any.
|
||||||
|
|
||||||
RETURNS (uint64): ID of the entity.
|
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.c.ent_id
|
return self.c.ent_id
|
||||||
|
@ -594,10 +624,8 @@ cdef class Token:
|
||||||
self.c.ent_id = key
|
self.c.ent_id = key
|
||||||
|
|
||||||
property ent_id_:
|
property ent_id_:
|
||||||
"""ID of the entity the token is an instance of, if any. Usually
|
"""RETURNS (unicode): ID of the entity the token is an instance of,
|
||||||
assigned by patterns in the Matcher.
|
if any.
|
||||||
|
|
||||||
RETURNS (unicode): ID of the entity.
|
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.ent_id]
|
return self.vocab.strings[self.c.ent_id]
|
||||||
|
@ -606,107 +634,192 @@ cdef class Token:
|
||||||
self.c.ent_id = self.vocab.strings.add(name)
|
self.c.ent_id = self.vocab.strings.add(name)
|
||||||
|
|
||||||
property whitespace_:
|
property whitespace_:
|
||||||
|
"""RETURNS (unicode): The trailing whitespace character, if present.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return ' ' if self.c.spacy else ''
|
return ' ' if self.c.spacy else ''
|
||||||
|
|
||||||
property orth_:
|
property orth_:
|
||||||
|
"""RETURNS (unicode): Verbatim text content (identical to
|
||||||
|
`Token.text`). Exists mostly for consistency with the other
|
||||||
|
attributes.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lex.orth]
|
return self.vocab.strings[self.c.lex.orth]
|
||||||
|
|
||||||
property lower_:
|
property lower_:
|
||||||
|
"""RETURNS (unicode): The lowercase token text. Equivalent to
|
||||||
|
`Token.text.lower()`.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lex.lower]
|
return self.vocab.strings[self.c.lex.lower]
|
||||||
|
|
||||||
property norm_:
|
property norm_:
|
||||||
|
"""RETURNS (unicode): The token's norm, i.e. a normalised form of the
|
||||||
|
token text. Usually set in the language's tokenizer exceptions or
|
||||||
|
norm exceptions.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lex.norm]
|
return self.vocab.strings[self.c.lex.norm]
|
||||||
|
|
||||||
property shape_:
|
property shape_:
|
||||||
|
"""RETURNS (unicode): Transform of the token's string, to show
|
||||||
|
orthographic features. For example, "Xxxx" or "dd".
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lex.shape]
|
return self.vocab.strings[self.c.lex.shape]
|
||||||
|
|
||||||
property prefix_:
|
property prefix_:
|
||||||
|
"""RETURNS (unicode): A length-N substring from the start of the token.
|
||||||
|
Defaults to `N=1`.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lex.prefix]
|
return self.vocab.strings[self.c.lex.prefix]
|
||||||
|
|
||||||
property suffix_:
|
property suffix_:
|
||||||
|
"""RETURNS (unicode): A length-N substring from the end of the token.
|
||||||
|
Defaults to `N=3`.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lex.suffix]
|
return self.vocab.strings[self.c.lex.suffix]
|
||||||
|
|
||||||
property lang_:
|
property lang_:
|
||||||
|
"""RETURNS (unicode): Language of the parent document's vocabulary,
|
||||||
|
e.g. 'en'.
|
||||||
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lex.lang]
|
return self.vocab.strings[self.c.lex.lang]
|
||||||
|
|
||||||
property lemma_:
|
property lemma_:
|
||||||
"""Base form of the word, with no inflectional suffixes.
|
"""RETURNS (unicode): The token lemma, i.e. the base form of the word,
|
||||||
|
with no inflectional suffixes.
|
||||||
RETURNS (unicode): Token lemma.
|
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lemma]
|
return self.vocab.strings[self.c.lemma]
|
||||||
|
|
||||||
def __set__(self, unicode lemma_):
|
def __set__(self, unicode lemma_):
|
||||||
self.c.lemma = self.vocab.strings.add(lemma_)
|
self.c.lemma = self.vocab.strings.add(lemma_)
|
||||||
|
|
||||||
property pos_:
|
property pos_:
|
||||||
|
"""RETURNS (unicode): Coarse-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return parts_of_speech.NAMES[self.c.pos]
|
return parts_of_speech.NAMES[self.c.pos]
|
||||||
|
|
||||||
property tag_:
|
property tag_:
|
||||||
|
"""RETURNS (unicode): Fine-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.tag]
|
return self.vocab.strings[self.c.tag]
|
||||||
|
|
||||||
def __set__(self, tag):
|
def __set__(self, tag):
|
||||||
self.tag = self.vocab.strings.add(tag)
|
self.tag = self.vocab.strings.add(tag)
|
||||||
|
|
||||||
property dep_:
|
property dep_:
|
||||||
|
"""RETURNS (unicode): The syntactic dependency label."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.dep]
|
return self.vocab.strings[self.c.dep]
|
||||||
|
|
||||||
def __set__(self, unicode label):
|
def __set__(self, unicode label):
|
||||||
self.c.dep = self.vocab.strings.add(label)
|
self.c.dep = self.vocab.strings.add(label)
|
||||||
|
|
||||||
property is_oov:
|
property is_oov:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
|
"""RETURNS (bool): Whether the token is out-of-vocabulary."""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, IS_OOV)
|
||||||
|
|
||||||
property is_stop:
|
property is_stop:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP)
|
"""RETURNS (bool): Whether the token is a stop word, i.e. part of a
|
||||||
|
"stop list" defined by the language data.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, IS_STOP)
|
||||||
|
|
||||||
property is_alpha:
|
property is_alpha:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
|
"""RETURNS (bool): Whether the token consists of alpha characters.
|
||||||
|
Equivalent to `token.text.isalpha()`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
|
||||||
|
|
||||||
property is_ascii:
|
property is_ascii:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
|
"""RETURNS (bool): Whether the token consists of ASCII characters.
|
||||||
|
Equivalent to `all(ord(c) < 128 for c in token.text)`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
|
||||||
|
|
||||||
property is_digit:
|
property is_digit:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
|
"""RETURNS (bool): Whether the token consists of digits. Equivalent to
|
||||||
|
`token.text.isdigit()`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
|
||||||
|
|
||||||
property is_lower:
|
property is_lower:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
|
"""RETURNS (bool): Whether the token is in lowercase. Equivalent to
|
||||||
|
`token.text.islower()`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
|
||||||
|
|
||||||
|
property is_upper:
|
||||||
|
"""RETURNS (bool): Whether the token is in uppercase. Equivalent to
|
||||||
|
`token.text.isupper()`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
|
||||||
|
|
||||||
property is_title:
|
property is_title:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
|
"""RETURNS (bool): Whether the token is in titlecase. Equivalent to
|
||||||
|
`token.text.istitle()`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
|
||||||
|
|
||||||
property is_punct:
|
property is_punct:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
|
"""RETURNS (bool): Whether the token is punctuation."""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
|
||||||
|
|
||||||
property is_space:
|
property is_space:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
|
"""RETURNS (bool): Whether the token consists of whitespace characters.
|
||||||
|
Equivalent to `token.text.isspace()`.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
|
||||||
|
|
||||||
property is_bracket:
|
property is_bracket:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
|
"""RETURNS (bool): Whether the token is a bracket."""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
|
||||||
|
|
||||||
property is_quote:
|
property is_quote:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
|
"""RETURNS (bool): Whether the token is a quotation mark."""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
|
||||||
|
|
||||||
property is_left_punct:
|
property is_left_punct:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
|
"""RETURNS (bool): Whether the token is a left punctuation mark."""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
|
||||||
|
|
||||||
property is_right_punct:
|
property is_right_punct:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
|
"""RETURNS (bool): Whether the token is a right punctuation mark."""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
|
||||||
|
|
||||||
property like_url:
|
property like_url:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
|
"""RETURNS (bool): Whether the token resembles a URL."""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
|
||||||
|
|
||||||
property like_num:
|
property like_num:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
|
"""RETURNS (bool): Whether the token resembles a number, e.g. "10.9",
|
||||||
|
"10", "ten", etc.
|
||||||
|
"""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
|
||||||
|
|
||||||
property like_email:
|
property like_email:
|
||||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
|
"""RETURNS (bool): Whether the token resembles an email address."""
|
||||||
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
|
||||||
|
|
|
@ -1,5 +1,9 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import functools
|
import functools
|
||||||
|
|
||||||
|
|
||||||
class Underscore(object):
|
class Underscore(object):
|
||||||
doc_extensions = {}
|
doc_extensions = {}
|
||||||
span_extensions = {}
|
span_extensions = {}
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
|
|
|
@ -10,25 +10,27 @@ from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
import textwrap
|
import textwrap
|
||||||
import random
|
import random
|
||||||
import numpy
|
|
||||||
import io
|
|
||||||
import dill
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from thinc.neural._classes.model import Model
|
from thinc.neural._classes.model import Model
|
||||||
import functools
|
import functools
|
||||||
|
|
||||||
|
from .symbols import ORTH
|
||||||
|
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
|
||||||
|
from .compat import import_file
|
||||||
|
|
||||||
import msgpack
|
import msgpack
|
||||||
import msgpack_numpy
|
import msgpack_numpy
|
||||||
msgpack_numpy.patch()
|
msgpack_numpy.patch()
|
||||||
import ujson
|
|
||||||
|
|
||||||
from .symbols import ORTH
|
|
||||||
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
|
|
||||||
from .compat import copy_array, normalize_string_keys, getattr_, import_file
|
|
||||||
|
|
||||||
|
|
||||||
LANGUAGES = {}
|
LANGUAGES = {}
|
||||||
_data_path = Path(__file__).parent / 'data'
|
_data_path = Path(__file__).parent / 'data'
|
||||||
|
_PRINT_ENV = False
|
||||||
|
|
||||||
|
|
||||||
|
def set_env_log(value):
|
||||||
|
global _PRINT_ENV
|
||||||
|
_PRINT_ENV = value
|
||||||
|
|
||||||
|
|
||||||
def get_lang_class(lang):
|
def get_lang_class(lang):
|
||||||
|
@ -38,11 +40,12 @@ def get_lang_class(lang):
|
||||||
RETURNS (Language): Language class.
|
RETURNS (Language): Language class.
|
||||||
"""
|
"""
|
||||||
global LANGUAGES
|
global LANGUAGES
|
||||||
if not lang in LANGUAGES:
|
if lang not in LANGUAGES:
|
||||||
try:
|
try:
|
||||||
module = importlib.import_module('.lang.%s' % lang, 'spacy')
|
module = importlib.import_module('.lang.%s' % lang, 'spacy')
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError("Can't import language %s from spacy.lang." %lang)
|
msg = "Can't import language %s from spacy.lang."
|
||||||
|
raise ImportError(msg % lang)
|
||||||
LANGUAGES[lang] = getattr(module, module.__all__[0])
|
LANGUAGES[lang] = getattr(module, module.__all__[0])
|
||||||
return LANGUAGES[lang]
|
return LANGUAGES[lang]
|
||||||
|
|
||||||
|
@ -100,14 +103,14 @@ def load_model(name, **overrides):
|
||||||
data_path = get_data_path()
|
data_path = get_data_path()
|
||||||
if not data_path or not data_path.exists():
|
if not data_path or not data_path.exists():
|
||||||
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
|
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
|
||||||
if isinstance(name, basestring_):
|
if isinstance(name, basestring_): # in data dir / shortcut
|
||||||
if name in set([d.name for d in data_path.iterdir()]): # in data dir / shortcut
|
if name in set([d.name for d in data_path.iterdir()]):
|
||||||
return load_model_from_link(name, **overrides)
|
return load_model_from_link(name, **overrides)
|
||||||
if is_package(name): # installed as package
|
if is_package(name): # installed as package
|
||||||
return load_model_from_package(name, **overrides)
|
return load_model_from_package(name, **overrides)
|
||||||
if Path(name).exists(): # path to model data directory
|
if Path(name).exists(): # path to model data directory
|
||||||
return load_model_from_path(Path(name), **overrides)
|
return load_model_from_path(Path(name), **overrides)
|
||||||
elif hasattr(name, 'exists'): # Path or Path-like to model data
|
elif hasattr(name, 'exists'): # Path or Path-like to model data
|
||||||
return load_model_from_path(name, **overrides)
|
return load_model_from_path(name, **overrides)
|
||||||
raise IOError("Can't find model '%s'" % name)
|
raise IOError("Can't find model '%s'" % name)
|
||||||
|
|
||||||
|
@ -120,7 +123,7 @@ def load_model_from_link(name, **overrides):
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
raise IOError(
|
raise IOError(
|
||||||
"Cant' load '%s'. If you're using a shortcut link, make sure it "
|
"Cant' load '%s'. If you're using a shortcut link, make sure it "
|
||||||
"points to a valid model package (not just a data directory)." % name)
|
"points to a valid package (not just a data directory)." % name)
|
||||||
return cls.load(**overrides)
|
return cls.load(**overrides)
|
||||||
|
|
||||||
|
|
||||||
|
@ -164,7 +167,8 @@ def load_model_from_init_py(init_file, **overrides):
|
||||||
data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
|
data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
|
||||||
data_path = model_path / data_dir
|
data_path = model_path / data_dir
|
||||||
if not model_path.exists():
|
if not model_path.exists():
|
||||||
raise ValueError("Can't find model directory: %s" % path2str(data_path))
|
msg = "Can't find model directory: %s"
|
||||||
|
raise ValueError(msg % path2str(data_path))
|
||||||
return load_model_from_path(data_path, meta, **overrides)
|
return load_model_from_path(data_path, meta, **overrides)
|
||||||
|
|
||||||
|
|
||||||
|
@ -176,14 +180,16 @@ def get_model_meta(path):
|
||||||
"""
|
"""
|
||||||
model_path = ensure_path(path)
|
model_path = ensure_path(path)
|
||||||
if not model_path.exists():
|
if not model_path.exists():
|
||||||
raise ValueError("Can't find model directory: %s" % path2str(model_path))
|
msg = "Can't find model directory: %s"
|
||||||
|
raise ValueError(msg % path2str(model_path))
|
||||||
meta_path = model_path / 'meta.json'
|
meta_path = model_path / 'meta.json'
|
||||||
if not meta_path.is_file():
|
if not meta_path.is_file():
|
||||||
raise IOError("Could not read meta.json from %s" % meta_path)
|
raise IOError("Could not read meta.json from %s" % meta_path)
|
||||||
meta = read_json(meta_path)
|
meta = read_json(meta_path)
|
||||||
for setting in ['lang', 'name', 'version']:
|
for setting in ['lang', 'name', 'version']:
|
||||||
if setting not in meta or not meta[setting]:
|
if setting not in meta or not meta[setting]:
|
||||||
raise ValueError("No valid '%s' setting found in model meta.json" % setting)
|
msg = "No valid '%s' setting found in model meta.json"
|
||||||
|
raise ValueError(msg % setting)
|
||||||
return meta
|
return meta
|
||||||
|
|
||||||
|
|
||||||
|
@ -240,7 +246,7 @@ def get_async(stream, numpy_array):
|
||||||
return numpy_array
|
return numpy_array
|
||||||
else:
|
else:
|
||||||
array = cupy.ndarray(numpy_array.shape, order='C',
|
array = cupy.ndarray(numpy_array.shape, order='C',
|
||||||
dtype=numpy_array.dtype)
|
dtype=numpy_array.dtype)
|
||||||
array.set(numpy_array, stream=stream)
|
array.set(numpy_array, stream=stream)
|
||||||
return array
|
return array
|
||||||
|
|
||||||
|
@ -274,12 +280,6 @@ def itershuffle(iterable, bufsize=1000):
|
||||||
raise StopIteration
|
raise StopIteration
|
||||||
|
|
||||||
|
|
||||||
_PRINT_ENV = False
|
|
||||||
def set_env_log(value):
|
|
||||||
global _PRINT_ENV
|
|
||||||
_PRINT_ENV = value
|
|
||||||
|
|
||||||
|
|
||||||
def env_opt(name, default=None):
|
def env_opt(name, default=None):
|
||||||
if type(default) is float:
|
if type(default) is float:
|
||||||
type_convert = float
|
type_convert = float
|
||||||
|
@ -305,17 +305,20 @@ def read_regex(path):
|
||||||
path = ensure_path(path)
|
path = ensure_path(path)
|
||||||
with path.open() as file_:
|
with path.open() as file_:
|
||||||
entries = file_.read().split('\n')
|
entries = file_.read().split('\n')
|
||||||
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
|
expression = '|'.join(['^' + re.escape(piece)
|
||||||
|
for piece in entries if piece.strip()])
|
||||||
return re.compile(expression)
|
return re.compile(expression)
|
||||||
|
|
||||||
|
|
||||||
def compile_prefix_regex(entries):
|
def compile_prefix_regex(entries):
|
||||||
if '(' in entries:
|
if '(' in entries:
|
||||||
# Handle deprecated data
|
# Handle deprecated data
|
||||||
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
|
expression = '|'.join(['^' + re.escape(piece)
|
||||||
|
for piece in entries if piece.strip()])
|
||||||
return re.compile(expression)
|
return re.compile(expression)
|
||||||
else:
|
else:
|
||||||
expression = '|'.join(['^' + piece for piece in entries if piece.strip()])
|
expression = '|'.join(['^' + piece
|
||||||
|
for piece in entries if piece.strip()])
|
||||||
return re.compile(expression)
|
return re.compile(expression)
|
||||||
|
|
||||||
|
|
||||||
|
@ -359,16 +362,15 @@ def update_exc(base_exceptions, *addition_dicts):
|
||||||
exc = dict(base_exceptions)
|
exc = dict(base_exceptions)
|
||||||
for additions in addition_dicts:
|
for additions in addition_dicts:
|
||||||
for orth, token_attrs in additions.items():
|
for orth, token_attrs in additions.items():
|
||||||
if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs):
|
if not all(isinstance(attr[ORTH], unicode_)
|
||||||
msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
|
for attr in token_attrs):
|
||||||
|
msg = "Invalid ORTH value in exception: key='%s', orths='%s'"
|
||||||
raise ValueError(msg % (orth, token_attrs))
|
raise ValueError(msg % (orth, token_attrs))
|
||||||
described_orth = ''.join(attr[ORTH] for attr in token_attrs)
|
described_orth = ''.join(attr[ORTH] for attr in token_attrs)
|
||||||
if orth != described_orth:
|
if orth != described_orth:
|
||||||
raise ValueError("Invalid tokenizer exception: ORTH values "
|
msg = ("Invalid tokenizer exception: ORTH values combined "
|
||||||
"combined don't match original string. "
|
"don't match original string. key='%s', orths='%s'")
|
||||||
"key='%s', orths='%s'" % (orth, described_orth))
|
raise ValueError(msg % (orth, described_orth))
|
||||||
# overlap = set(exc.keys()).intersection(set(additions))
|
|
||||||
# assert not overlap, overlap
|
|
||||||
exc.update(additions)
|
exc.update(additions)
|
||||||
exc = expand_exc(exc, "'", "’")
|
exc = expand_exc(exc, "'", "’")
|
||||||
return exc
|
return exc
|
||||||
|
@ -401,17 +403,15 @@ def normalize_slice(length, start, stop, step=None):
|
||||||
raise ValueError("Stepped slices not supported in Span objects."
|
raise ValueError("Stepped slices not supported in Span objects."
|
||||||
"Try: list(tokens)[start:stop:step] instead.")
|
"Try: list(tokens)[start:stop:step] instead.")
|
||||||
if start is None:
|
if start is None:
|
||||||
start = 0
|
start = 0
|
||||||
elif start < 0:
|
elif start < 0:
|
||||||
start += length
|
start += length
|
||||||
start = min(length, max(0, start))
|
start = min(length, max(0, start))
|
||||||
|
|
||||||
if stop is None:
|
if stop is None:
|
||||||
stop = length
|
stop = length
|
||||||
elif stop < 0:
|
elif stop < 0:
|
||||||
stop += length
|
stop += length
|
||||||
stop = min(length, max(start, stop))
|
stop = min(length, max(start, stop))
|
||||||
|
|
||||||
assert 0 <= start <= stop <= length
|
assert 0 <= start <= stop <= length
|
||||||
return start, stop
|
return start, stop
|
||||||
|
|
||||||
|
@ -428,7 +428,7 @@ def compounding(start, stop, compound):
|
||||||
>>> assert next(sizes) == 1.5 * 1.5
|
>>> assert next(sizes) == 1.5 * 1.5
|
||||||
"""
|
"""
|
||||||
def clip(value):
|
def clip(value):
|
||||||
return max(value, stop) if (start>stop) else min(value, stop)
|
return max(value, stop) if (start > stop) else min(value, stop)
|
||||||
curr = float(start)
|
curr = float(start)
|
||||||
while True:
|
while True:
|
||||||
yield clip(curr)
|
yield clip(curr)
|
||||||
|
@ -438,7 +438,7 @@ def compounding(start, stop, compound):
|
||||||
def decaying(start, stop, decay):
|
def decaying(start, stop, decay):
|
||||||
"""Yield an infinite series of linearly decaying values."""
|
"""Yield an infinite series of linearly decaying values."""
|
||||||
def clip(value):
|
def clip(value):
|
||||||
return max(value, stop) if (start>stop) else min(value, stop)
|
return max(value, stop) if (start > stop) else min(value, stop)
|
||||||
nr_upd = 1.
|
nr_upd = 1.
|
||||||
while True:
|
while True:
|
||||||
yield clip(start * 1./(1. + decay * nr_upd))
|
yield clip(start * 1./(1. + decay * nr_upd))
|
||||||
|
@ -530,17 +530,19 @@ def print_markdown(data, title=None):
|
||||||
|
|
||||||
if isinstance(data, dict):
|
if isinstance(data, dict):
|
||||||
data = list(data.items())
|
data = list(data.items())
|
||||||
markdown = ["* **{}:** {}".format(l, unicode_(v)) for l, v in data if not excl_value(v)]
|
markdown = ["* **{}:** {}".format(l, unicode_(v))
|
||||||
|
for l, v in data if not excl_value(v)]
|
||||||
if title:
|
if title:
|
||||||
print("\n## {}".format(title))
|
print("\n## {}".format(title))
|
||||||
print('\n{}\n'.format('\n'.join(markdown)))
|
print('\n{}\n'.format('\n'.join(markdown)))
|
||||||
|
|
||||||
|
|
||||||
def prints(*texts, **kwargs):
|
def prints(*texts, **kwargs):
|
||||||
"""Print formatted message (manual ANSI escape sequences to avoid dependency)
|
"""Print formatted message (manual ANSI escape sequences to avoid
|
||||||
|
dependency)
|
||||||
|
|
||||||
*texts (unicode): Texts to print. Each argument is rendered as paragraph.
|
*texts (unicode): Texts to print. Each argument is rendered as paragraph.
|
||||||
**kwargs: 'title' becomes coloured headline. 'exits'=True performs sys exit.
|
**kwargs: 'title' becomes coloured headline. exits=True performs sys exit.
|
||||||
"""
|
"""
|
||||||
exits = kwargs.get('exits', None)
|
exits = kwargs.get('exits', None)
|
||||||
title = kwargs.get('title', None)
|
title = kwargs.get('title', None)
|
||||||
|
@ -570,7 +572,8 @@ def _wrap(text, wrap_max=80, indent=4):
|
||||||
|
|
||||||
def minify_html(html):
|
def minify_html(html):
|
||||||
"""Perform a template-specific, rudimentary HTML minification for displaCy.
|
"""Perform a template-specific, rudimentary HTML minification for displaCy.
|
||||||
Disclaimer: NOT a general-purpose solution, only removes indentation/newlines.
|
Disclaimer: NOT a general-purpose solution, only removes indentation and
|
||||||
|
newlines.
|
||||||
|
|
||||||
html (unicode): Markup to minify.
|
html (unicode): Markup to minify.
|
||||||
RETURNS (unicode): "Minified" HTML.
|
RETURNS (unicode): "Minified" HTML.
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from libc.stdint cimport int32_t, uint64_t
|
|
||||||
import numpy
|
import numpy
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
import msgpack
|
import msgpack
|
||||||
|
@ -9,23 +10,20 @@ cimport numpy as np
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module
|
||||||
from thinc.neural._classes.model import Model
|
from thinc.neural._classes.model import Model
|
||||||
|
|
||||||
from .typedefs cimport attr_t
|
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
from . import util
|
|
||||||
from .compat import basestring_, path2str
|
from .compat import basestring_, path2str
|
||||||
|
from . import util
|
||||||
|
|
||||||
|
|
||||||
cdef class Vectors:
|
cdef class Vectors:
|
||||||
'''Store, save and load word vectors.
|
"""Store, save and load word vectors.
|
||||||
|
|
||||||
Vectors data is kept in the vectors.data attribute, which should be an
|
Vectors data is kept in the vectors.data attribute, which should be an
|
||||||
instance of numpy.ndarray (for CPU vectors)
|
instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
|
||||||
or cupy.ndarray (for GPU vectors).
|
(for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
|
||||||
|
rows in the vectors.data table. The array `vectors.keys` keeps the keys in
|
||||||
vectors.key2row is a dictionary mapping word hashes to rows
|
order, such that `keys[vectors.key2row[key]] == key`.
|
||||||
in the vectors.data table. The array `vectors.keys` keeps
|
"""
|
||||||
the keys in order, such that keys[vectors.key2row[key]] == key.
|
|
||||||
'''
|
|
||||||
cdef public object data
|
cdef public object data
|
||||||
cdef readonly StringStore strings
|
cdef readonly StringStore strings
|
||||||
cdef public object key2row
|
cdef public object key2row
|
||||||
|
@ -33,6 +31,16 @@ cdef class Vectors:
|
||||||
cdef public int i
|
cdef public int i
|
||||||
|
|
||||||
def __init__(self, strings, width=0, data=None):
|
def __init__(self, strings, width=0, data=None):
|
||||||
|
"""Create a new vector store. To keep the vector table empty, pass
|
||||||
|
`width=0`. You can also create the vector table and add vectors one by
|
||||||
|
one, or set the vector values directly on initialisation.
|
||||||
|
|
||||||
|
strings (StringStore or list): List of strings or StringStore that maps
|
||||||
|
strings to hash values, and vice versa.
|
||||||
|
width (int): Number of dimensions.
|
||||||
|
data (numpy.ndarray): The vector data.
|
||||||
|
RETURNS (Vectors): The newly created object.
|
||||||
|
"""
|
||||||
if isinstance(strings, StringStore):
|
if isinstance(strings, StringStore):
|
||||||
self.strings = strings
|
self.strings = strings
|
||||||
else:
|
else:
|
||||||
|
@ -55,11 +63,13 @@ cdef class Vectors:
|
||||||
return (Vectors, (self.strings, self.data))
|
return (Vectors, (self.strings, self.data))
|
||||||
|
|
||||||
def __getitem__(self, key):
|
def __getitem__(self, key):
|
||||||
'''Get a vector by key. If key is a string, it is hashed
|
"""Get a vector by key. If key is a string, it is hashed to an integer
|
||||||
to an integer ID using the vectors.strings table.
|
ID using the vectors.strings table. If the integer key is not found in
|
||||||
|
the table, a KeyError is raised.
|
||||||
|
|
||||||
If the integer key is not found in the table, a KeyError is raised.
|
key (unicode / int): The key to get the vector for.
|
||||||
'''
|
RETURNS (numpy.ndarray): The vector for the key.
|
||||||
|
"""
|
||||||
if isinstance(key, basestring):
|
if isinstance(key, basestring):
|
||||||
key = self.strings[key]
|
key = self.strings[key]
|
||||||
i = self.key2row[key]
|
i = self.key2row[key]
|
||||||
|
@ -69,30 +79,47 @@ cdef class Vectors:
|
||||||
return self.data[i]
|
return self.data[i]
|
||||||
|
|
||||||
def __setitem__(self, key, vector):
|
def __setitem__(self, key, vector):
|
||||||
'''Set a vector for the given key. If key is a string, it is hashed
|
"""Set a vector for the given key. If key is a string, it is hashed
|
||||||
to an integer ID using the vectors.strings table.
|
to an integer ID using the vectors.strings table.
|
||||||
'''
|
|
||||||
|
key (unicode / int): The key to set the vector for.
|
||||||
|
vector (numpy.ndarray): The vector to set.
|
||||||
|
"""
|
||||||
if isinstance(key, basestring):
|
if isinstance(key, basestring):
|
||||||
key = self.strings.add(key)
|
key = self.strings.add(key)
|
||||||
i = self.key2row[key]
|
i = self.key2row[key]
|
||||||
self.data[i] = vector
|
self.data[i] = vector
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
'''Yield vectors from the table.'''
|
"""Yield vectors from the table.
|
||||||
|
|
||||||
|
YIELDS (numpy.ndarray): A vector.
|
||||||
|
"""
|
||||||
yield from self.data
|
yield from self.data
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
'''Return the number of vectors that have been assigned.'''
|
"""Return the number of vectors that have been assigned.
|
||||||
|
|
||||||
|
RETURNS (int): The number of vectors in the data.
|
||||||
|
"""
|
||||||
return self.i
|
return self.i
|
||||||
|
|
||||||
def __contains__(self, key):
|
def __contains__(self, key):
|
||||||
'''Check whether a key has a vector entry in the table.'''
|
"""Check whether a key has a vector entry in the table.
|
||||||
|
|
||||||
|
key (unicode / int): The key to check.
|
||||||
|
RETURNS (bool): Whether the key has a vector entry.
|
||||||
|
"""
|
||||||
if isinstance(key, basestring_):
|
if isinstance(key, basestring_):
|
||||||
key = self.strings[key]
|
key = self.strings[key]
|
||||||
return key in self.key2row
|
return key in self.key2row
|
||||||
|
|
||||||
def add(self, key, vector=None):
|
def add(self, key, vector=None):
|
||||||
'''Add a key to the table, optionally setting a vector value as well.'''
|
"""Add a key to the table, optionally setting a vector value as well.
|
||||||
|
|
||||||
|
key (unicode / int): The key to add.
|
||||||
|
vector (numpy.ndarray): An optional vector to add.
|
||||||
|
"""
|
||||||
if isinstance(key, basestring_):
|
if isinstance(key, basestring_):
|
||||||
key = self.strings.add(key)
|
key = self.strings.add(key)
|
||||||
if key not in self.key2row:
|
if key not in self.key2row:
|
||||||
|
@ -110,24 +137,36 @@ cdef class Vectors:
|
||||||
return i
|
return i
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
'''Iterate over (string key, vector) pairs, in order.'''
|
"""Iterate over `(string key, vector)` pairs, in order.
|
||||||
|
|
||||||
|
YIELDS (tuple): A key/vector pair.
|
||||||
|
"""
|
||||||
for i, key in enumerate(self.keys):
|
for i, key in enumerate(self.keys):
|
||||||
string = self.strings[key]
|
string = self.strings[key]
|
||||||
yield string, self.data[i]
|
yield string, self.data[i]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def shape(self):
|
def shape(self):
|
||||||
|
"""Get `(rows, dims)` tuples of number of rows and number of dimensions
|
||||||
|
in the vector table.
|
||||||
|
|
||||||
|
RETURNS (tuple): A `(rows, dims)` pair.
|
||||||
|
"""
|
||||||
return self.data.shape
|
return self.data.shape
|
||||||
|
|
||||||
def most_similar(self, key):
|
def most_similar(self, key):
|
||||||
|
# TODO: implement
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def from_glove(self, path):
|
def from_glove(self, path):
|
||||||
'''Load GloVe vectors from a directory. Assumes binary format,
|
"""Load GloVe vectors from a directory. Assumes binary format,
|
||||||
that the vocab is in a vocab.txt, and that vectors are named
|
that the vocab is in a vocab.txt, and that vectors are named
|
||||||
vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
|
vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
|
||||||
vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
|
vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
|
||||||
By default GloVe outputs 64-bit vectors.'''
|
By default GloVe outputs 64-bit vectors.
|
||||||
|
|
||||||
|
path (unicode / Path): The path to load the GloVe vectors from.
|
||||||
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
for name in path.iterdir():
|
for name in path.iterdir():
|
||||||
if name.parts[-1].startswith('vectors'):
|
if name.parts[-1].startswith('vectors'):
|
||||||
|
@ -150,9 +189,15 @@ cdef class Vectors:
|
||||||
self.data
|
self.data
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, **exclude):
|
||||||
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
|
path (unicode / Path): A path to a directory, which will be created if
|
||||||
|
it doesn't exist. Either a string or a Path-like object.
|
||||||
|
"""
|
||||||
xp = get_array_module(self.data)
|
xp = get_array_module(self.data)
|
||||||
if xp is numpy:
|
if xp is numpy:
|
||||||
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
|
save_array = lambda arr, file_: xp.save(file_, arr,
|
||||||
|
allow_pickle=False)
|
||||||
else:
|
else:
|
||||||
save_array = lambda arr, file_: xp.save(file_, arr)
|
save_array = lambda arr, file_: xp.save(file_, arr)
|
||||||
serializers = OrderedDict((
|
serializers = OrderedDict((
|
||||||
|
@ -162,6 +207,12 @@ cdef class Vectors:
|
||||||
return util.to_disk(path, serializers, exclude)
|
return util.to_disk(path, serializers, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, **exclude):
|
||||||
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
|
returns it.
|
||||||
|
|
||||||
|
path (unicode / Path): Directory path, string or Path-like object.
|
||||||
|
RETURNS (Vectors): The modified object.
|
||||||
|
"""
|
||||||
def load_keys(path):
|
def load_keys(path):
|
||||||
if path.exists():
|
if path.exists():
|
||||||
self.keys = numpy.load(path2str(path))
|
self.keys = numpy.load(path2str(path))
|
||||||
|
@ -182,6 +233,11 @@ cdef class Vectors:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
|
"""Serialize the current state to a binary string.
|
||||||
|
|
||||||
|
**exclude: Named attributes to prevent from being serialized.
|
||||||
|
RETURNS (bytes): The serialized form of the `Vectors` object.
|
||||||
|
"""
|
||||||
def serialize_weights():
|
def serialize_weights():
|
||||||
if hasattr(self.data, 'to_bytes'):
|
if hasattr(self.data, 'to_bytes'):
|
||||||
return self.data.to_bytes()
|
return self.data.to_bytes()
|
||||||
|
@ -194,6 +250,12 @@ cdef class Vectors:
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
def from_bytes(self, data, **exclude):
|
def from_bytes(self, data, **exclude):
|
||||||
|
"""Load state from a binary string.
|
||||||
|
|
||||||
|
data (bytes): The data to load from.
|
||||||
|
**exclude: Named attributes to prevent from being loaded.
|
||||||
|
RETURNS (Vectors): The `Vectors` object.
|
||||||
|
"""
|
||||||
def deserialize_weights(b):
|
def deserialize_weights(b):
|
||||||
if hasattr(self.data, 'from_bytes'):
|
if hasattr(self.data, 'from_bytes'):
|
||||||
self.data.from_bytes()
|
self.data.from_bytes()
|
||||||
|
|
120
spacy/vocab.pyx
120
spacy/vocab.pyx
|
@ -1,33 +1,24 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import bz2
|
|
||||||
import ujson
|
|
||||||
import re
|
|
||||||
import numpy
|
import numpy
|
||||||
import dill
|
import dill
|
||||||
|
|
||||||
from libc.string cimport memset, memcpy
|
|
||||||
from libc.stdint cimport int32_t
|
|
||||||
from libc.math cimport sqrt
|
|
||||||
from cymem.cymem cimport Address
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from .lexeme cimport EMPTY_LEXEME
|
from .lexeme cimport EMPTY_LEXEME
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
from .strings cimport hash_string
|
from .strings cimport hash_string
|
||||||
from .typedefs cimport attr_t
|
from .typedefs cimport attr_t
|
||||||
from .tokens.token cimport Token
|
from .tokens.token cimport Token
|
||||||
from .attrs cimport PROB, LANG
|
from .attrs cimport PROB, LANG, ORTH, TAG
|
||||||
from .structs cimport SerializedLexemeC
|
from .structs cimport SerializedLexemeC
|
||||||
|
|
||||||
from .compat import copy_reg, pickle, basestring_
|
from .compat import copy_reg, basestring_
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
from .attrs import intify_attrs
|
from .attrs import intify_attrs
|
||||||
from .vectors import Vectors
|
from .vectors import Vectors
|
||||||
from . import util
|
|
||||||
from . import attrs
|
|
||||||
from . import symbols
|
|
||||||
from ._ml import link_vectors_to_models
|
from ._ml import link_vectors_to_models
|
||||||
|
from . import util
|
||||||
|
|
||||||
|
|
||||||
cdef class Vocab:
|
cdef class Vocab:
|
||||||
|
@ -36,23 +27,22 @@ cdef class Vocab:
|
||||||
C-data that is shared between `Doc` objects.
|
C-data that is shared between `Doc` objects.
|
||||||
"""
|
"""
|
||||||
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
||||||
strings=tuple(), **deprecated_kwargs):
|
strings=tuple(), **deprecated_kwargs):
|
||||||
"""Create the vocabulary.
|
"""Create the vocabulary.
|
||||||
|
|
||||||
lex_attr_getters (dict): A dictionary mapping attribute IDs to functions
|
lex_attr_getters (dict): A dictionary mapping attribute IDs to
|
||||||
to compute them. Defaults to `None`.
|
functions to compute them. Defaults to `None`.
|
||||||
tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained
|
tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained
|
||||||
parts-of-speech, and optionally morphological attributes.
|
parts-of-speech, and optionally morphological attributes.
|
||||||
lemmatizer (object): A lemmatizer. Defaults to `None`.
|
lemmatizer (object): A lemmatizer. Defaults to `None`.
|
||||||
strings (StringStore): StringStore that maps strings to integers, and
|
strings (StringStore): StringStore that maps strings to integers, and
|
||||||
vice versa.
|
vice versa.
|
||||||
RETURNS (Vocab): The newly constructed vocab object.
|
RETURNS (Vocab): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
||||||
tag_map = tag_map if tag_map is not None else {}
|
tag_map = tag_map if tag_map is not None else {}
|
||||||
if lemmatizer in (None, True, False):
|
if lemmatizer in (None, True, False):
|
||||||
lemmatizer = Lemmatizer({}, {}, {})
|
lemmatizer = Lemmatizer({}, {}, {})
|
||||||
|
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._by_hash = PreshMap()
|
self._by_hash = PreshMap()
|
||||||
self._by_orth = PreshMap()
|
self._by_orth = PreshMap()
|
||||||
|
@ -84,19 +74,20 @@ cdef class Vocab:
|
||||||
|
|
||||||
The flag_getter function will be called over the words currently in the
|
The flag_getter function will be called over the words currently in the
|
||||||
vocab, and then applied to new words as they occur. You'll then be able
|
vocab, and then applied to new words as they occur. You'll then be able
|
||||||
to access the flag value on each token, using token.check_flag(flag_id).
|
to access the flag value on each token using token.check_flag(flag_id).
|
||||||
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
|
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
|
||||||
`Token.check_flag`.
|
`Token.check_flag`.
|
||||||
|
|
||||||
flag_getter (callable): A function `f(unicode) -> bool`, to get the flag
|
flag_getter (callable): A function `f(unicode) -> bool`, to get the
|
||||||
value.
|
flag value.
|
||||||
flag_id (int): An integer between 1 and 63 (inclusive), specifying
|
flag_id (int): An integer between 1 and 63 (inclusive), specifying
|
||||||
the bit at which the flag will be stored. If -1, the lowest
|
the bit at which the flag will be stored. If -1, the lowest
|
||||||
available bit will be chosen.
|
available bit will be chosen.
|
||||||
RETURNS (int): The integer ID by which the flag value can be checked.
|
RETURNS (int): The integer ID by which the flag value can be checked.
|
||||||
|
|
||||||
EXAMPLE:
|
EXAMPLE:
|
||||||
>>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'dislaCy'])
|
>>> my_product_getter = lambda text: text in ['spaCy', 'dislaCy']
|
||||||
|
>>> MY_PRODUCT = nlp.vocab.add_flag(my_product_getter)
|
||||||
>>> doc = nlp(u'I like spaCy')
|
>>> doc = nlp(u'I like spaCy')
|
||||||
>>> assert doc[2].check_flag(MY_PRODUCT) == True
|
>>> assert doc[2].check_flag(MY_PRODUCT) == True
|
||||||
"""
|
"""
|
||||||
|
@ -107,9 +98,10 @@ cdef class Vocab:
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Cannot find empty bit for new lexical flag. All bits between "
|
"Cannot find empty bit for new lexical flag. All bits "
|
||||||
"0 and 63 are occupied. You can replace one by specifying the "
|
"between 0 and 63 are occupied. You can replace one by "
|
||||||
"flag_id explicitly, e.g. nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA")
|
"specifying the flag_id explicitly, e.g. "
|
||||||
|
"`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
|
||||||
elif flag_id >= 64 or flag_id < 1:
|
elif flag_id >= 64 or flag_id < 1:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Invalid value for flag_id: %d. Flag IDs must be between "
|
"Invalid value for flag_id: %d. Flag IDs must be between "
|
||||||
|
@ -120,9 +112,9 @@ cdef class Vocab:
|
||||||
return flag_id
|
return flag_id
|
||||||
|
|
||||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
||||||
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
|
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
|
||||||
if necessary, using memory acquired from the given pool. If the pool
|
`Lexeme` if necessary using memory acquired from the given pool. If the
|
||||||
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||||
"""
|
"""
|
||||||
if string == u'':
|
if string == u'':
|
||||||
return &EMPTY_LEXEME
|
return &EMPTY_LEXEME
|
||||||
|
@ -139,9 +131,9 @@ cdef class Vocab:
|
||||||
return self._new_lexeme(mem, string)
|
return self._new_lexeme(mem, string)
|
||||||
|
|
||||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
|
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
|
||||||
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
|
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
|
||||||
if necessary, using memory acquired from the given pool. If the pool
|
`Lexeme` if necessary using memory acquired from the given pool. If the
|
||||||
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||||
"""
|
"""
|
||||||
if orth == 0:
|
if orth == 0:
|
||||||
return &EMPTY_LEXEME
|
return &EMPTY_LEXEME
|
||||||
|
@ -203,8 +195,8 @@ cdef class Vocab:
|
||||||
for orth, addr in self._by_orth.items():
|
for orth, addr in self._by_orth.items():
|
||||||
yield Lexeme(self, orth)
|
yield Lexeme(self, orth)
|
||||||
|
|
||||||
def __getitem__(self, id_or_string):
|
def __getitem__(self, id_or_string):
|
||||||
"""Retrieve a lexeme, given an int ID or a unicode string. If a
|
"""Retrieve a lexeme, given an int ID or a unicode string. If a
|
||||||
previously unseen unicode string is given, a new lexeme is created and
|
previously unseen unicode string is given, a new lexeme is created and
|
||||||
stored.
|
stored.
|
||||||
|
|
||||||
|
@ -229,13 +221,14 @@ cdef class Vocab:
|
||||||
cdef int i
|
cdef int i
|
||||||
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
||||||
for i, props in enumerate(substrings):
|
for i, props in enumerate(substrings):
|
||||||
props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True)
|
props = intify_attrs(props, strings_map=self.strings,
|
||||||
|
_do_deprecated=True)
|
||||||
token = &tokens[i]
|
token = &tokens[i]
|
||||||
# Set the special tokens up to have arbitrary attributes
|
# Set the special tokens up to have arbitrary attributes
|
||||||
lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
|
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
|
||||||
token.lex = lex
|
token.lex = lex
|
||||||
if attrs.TAG in props:
|
if TAG in props:
|
||||||
self.morphology.assign_tag(token, props[attrs.TAG])
|
self.morphology.assign_tag(token, props[TAG])
|
||||||
for attr_id, value in props.items():
|
for attr_id, value in props.items():
|
||||||
Token.set_struct_attr(token, attr_id, value)
|
Token.set_struct_attr(token, attr_id, value)
|
||||||
Lexeme.set_struct_attr(lex, attr_id, value)
|
Lexeme.set_struct_attr(lex, attr_id, value)
|
||||||
|
@ -254,16 +247,13 @@ cdef class Vocab:
|
||||||
self.vectors = Vectors(self.strings, width=new_dim)
|
self.vectors = Vectors(self.strings, width=new_dim)
|
||||||
|
|
||||||
def get_vector(self, orth):
|
def get_vector(self, orth):
|
||||||
"""Retrieve a vector for a word in the vocabulary.
|
"""Retrieve a vector for a word in the vocabulary. Words can be looked
|
||||||
|
up by string or int ID. If no vectors data is loaded, ValueError is
|
||||||
|
raised.
|
||||||
|
|
||||||
Words can be looked up by string or int ID.
|
RETURNS (numpy.ndarray): A word vector. Size
|
||||||
|
and shape determined by the `vocab.vectors` instance. Usually, a
|
||||||
RETURNS:
|
numpy ndarray of shape (300,) and dtype float32.
|
||||||
A word vector. Size and shape determined by the
|
|
||||||
vocab.vectors instance. Usually, a numpy ndarray
|
|
||||||
of shape (300,) and dtype float32.
|
|
||||||
|
|
||||||
RAISES: If no vectors data is loaded, ValueError is raised.
|
|
||||||
"""
|
"""
|
||||||
if isinstance(orth, basestring_):
|
if isinstance(orth, basestring_):
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth)
|
||||||
|
@ -273,21 +263,16 @@ cdef class Vocab:
|
||||||
return numpy.zeros((self.vectors_length,), dtype='f')
|
return numpy.zeros((self.vectors_length,), dtype='f')
|
||||||
|
|
||||||
def set_vector(self, orth, vector):
|
def set_vector(self, orth, vector):
|
||||||
"""Set a vector for a word in the vocabulary.
|
"""Set a vector for a word in the vocabulary. Words can be referenced
|
||||||
|
by string or int ID.
|
||||||
Words can be referenced by string or int ID.
|
|
||||||
|
|
||||||
RETURNS:
|
|
||||||
None
|
|
||||||
"""
|
"""
|
||||||
if not isinstance(orth, basestring_):
|
if not isinstance(orth, basestring_):
|
||||||
orth = self.strings[orth]
|
orth = self.strings[orth]
|
||||||
self.vectors.add(orth, vector=vector)
|
self.vectors.add(orth, vector=vector)
|
||||||
|
|
||||||
def has_vector(self, orth):
|
def has_vector(self, orth):
|
||||||
"""Check whether a word has a vector. Returns False if no
|
"""Check whether a word has a vector. Returns False if no vectors have
|
||||||
vectors have been loaded. Words can be looked up by string
|
been loaded. Words can be looked up by string or int ID."""
|
||||||
or int ID."""
|
|
||||||
if isinstance(orth, basestring_):
|
if isinstance(orth, basestring_):
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth)
|
||||||
return orth in self.vectors
|
return orth in self.vectors
|
||||||
|
@ -296,7 +281,7 @@ cdef class Vocab:
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
path (unicode or Path): A path to a directory, which will be created if
|
path (unicode or Path): A path to a directory, which will be created if
|
||||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
|
@ -421,16 +406,13 @@ def pickle_vocab(vocab):
|
||||||
length = vocab.length
|
length = vocab.length
|
||||||
data_dir = vocab.data_dir
|
data_dir = vocab.data_dir
|
||||||
lex_attr_getters = dill.dumps(vocab.lex_attr_getters)
|
lex_attr_getters = dill.dumps(vocab.lex_attr_getters)
|
||||||
|
|
||||||
lexemes_data = vocab.lexemes_to_bytes()
|
lexemes_data = vocab.lexemes_to_bytes()
|
||||||
|
|
||||||
return (unpickle_vocab,
|
return (unpickle_vocab,
|
||||||
(sstore, morph, data_dir, lex_attr_getters,
|
(sstore, morph, data_dir, lex_attr_getters, lexemes_data, length))
|
||||||
lexemes_data, length))
|
|
||||||
|
|
||||||
|
|
||||||
def unpickle_vocab(sstore, morphology, data_dir,
|
def unpickle_vocab(sstore, morphology, data_dir,
|
||||||
lex_attr_getters, bytes lexemes_data, int length):
|
lex_attr_getters, bytes lexemes_data, int length):
|
||||||
cdef Vocab vocab = Vocab()
|
cdef Vocab vocab = Vocab()
|
||||||
vocab.length = length
|
vocab.length = length
|
||||||
vocab.strings = sstore
|
vocab.strings = sstore
|
||||||
|
@ -450,12 +432,10 @@ class LookupError(Exception):
|
||||||
@classmethod
|
@classmethod
|
||||||
def mismatched_strings(cls, id_, id_string, original_string):
|
def mismatched_strings(cls, id_, id_string, original_string):
|
||||||
return cls(
|
return cls(
|
||||||
"Error fetching a Lexeme from the Vocab. When looking up a string, "
|
"Error fetching a Lexeme from the Vocab. When looking up a "
|
||||||
"the lexeme returned had an orth ID that did not match the query string. "
|
"string, the lexeme returned had an orth ID that did not match "
|
||||||
"This means that the cached lexeme structs are mismatched to the "
|
"the query string. This means that the cached lexeme structs are "
|
||||||
"string encoding table. The mismatched:\n"
|
"mismatched to the string encoding table. The mismatched:\n"
|
||||||
"Query string: {query}\n"
|
"Query string: {}\n"
|
||||||
"Orth cached: {orth_str}\n"
|
"Orth cached: {}\n"
|
||||||
"ID of orth: {orth_id}".format(
|
"Orth ID: {}".format(repr(original_string), repr(id_string), id_))
|
||||||
query=repr(original_string), orth_str=repr(id_string), orth_id=id_)
|
|
||||||
)
|
|
||||||
|
|
|
@ -784,3 +784,10 @@ p
|
||||||
+cell
|
+cell
|
||||||
| A dictionary that allows customisation of properties of
|
| A dictionary that allows customisation of properties of
|
||||||
| #[code Span] children.
|
| #[code Span] children.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code _]
|
||||||
|
+cell #[code Underscore]
|
||||||
|
+cell
|
||||||
|
| User space for adding custom
|
||||||
|
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].
|
||||||
|
|
|
@ -157,27 +157,61 @@ p The L2 norm of the lexeme's vector representation.
|
||||||
+row
|
+row
|
||||||
+cell #[code vocab]
|
+cell #[code vocab]
|
||||||
+cell #[code Vocab]
|
+cell #[code Vocab]
|
||||||
+cell
|
+cell The lexeme's vocabulary.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code text]
|
+cell #[code text]
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell Verbatim text content.
|
+cell Verbatim text content.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code orth]
|
||||||
|
+cell int
|
||||||
|
+cell ID of the verbatim text content.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code orth_]
|
||||||
|
+cell unicode
|
||||||
|
+cell
|
||||||
|
| Verbatim text content (identical to #[code Lexeme.text]). Exists
|
||||||
|
| mostly for consistency with the other attributes.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code lex_id]
|
+cell #[code lex_id]
|
||||||
+cell int
|
+cell int
|
||||||
+cell ID of the lexeme's lexical type.
|
+cell ID of the lexeme's lexical type.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code rank]
|
||||||
|
+cell int
|
||||||
|
+cell
|
||||||
|
| Sequential ID of the lexeme's lexical type, used to index into
|
||||||
|
| tables, e.g. for word vectors.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code flags]
|
||||||
|
+cell int
|
||||||
|
+cell Container of the lexeme's binary flags.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code norm]
|
||||||
|
+cell int
|
||||||
|
+cell The lexeme's norm, i.e. a normalised form of the lexeme text.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code norm_]
|
||||||
|
+cell unicode
|
||||||
|
+cell The lexeme's norm, i.e. a normalised form of the lexeme text.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code lower]
|
+cell #[code lower]
|
||||||
+cell int
|
+cell int
|
||||||
+cell Lower-case form of the word.
|
+cell Lowercase form of the word.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code lower_]
|
+cell #[code lower_]
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell Lower-case form of the word.
|
+cell Lowercase form of the word.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code shape]
|
+cell #[code shape]
|
||||||
|
@ -192,22 +226,30 @@ p The L2 norm of the lexeme's vector representation.
|
||||||
+row
|
+row
|
||||||
+cell #[code prefix]
|
+cell #[code prefix]
|
||||||
+cell int
|
+cell int
|
||||||
+cell Length-N substring from the start of the word. Defaults to #[code N=1].
|
+cell
|
||||||
|
| Length-N substring from the start of the word. Defaults to
|
||||||
|
| #[code N=1].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code prefix_]
|
+cell #[code prefix_]
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell Length-N substring from the start of the word. Defaults to #[code N=1].
|
+cell
|
||||||
|
| Length-N substring from the start of the word. Defaults to
|
||||||
|
| #[code N=1].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code suffix]
|
+cell #[code suffix]
|
||||||
+cell int
|
+cell int
|
||||||
+cell Length-N substring from the end of the word. Defaults to #[code N=3].
|
+cell
|
||||||
|
| Length-N substring from the end of the word. Defaults to
|
||||||
|
| #[code N=3].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code suffix_]
|
+cell #[code suffix_]
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell Length-N substring from the start of the word. Defaults to #[code N=3].
|
+cell
|
||||||
|
| Length-N substring from the end of the word. Defaults to
|
||||||
|
| #[code N=3].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code is_alpha]
|
+cell #[code is_alpha]
|
||||||
|
@ -237,6 +279,13 @@ p The L2 norm of the lexeme's vector representation.
|
||||||
| Is the lexeme in lowercase? Equivalent to
|
| Is the lexeme in lowercase? Equivalent to
|
||||||
| #[code lexeme.text.islower()].
|
| #[code lexeme.text.islower()].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code is_upper]
|
||||||
|
+cell bool
|
||||||
|
+cell
|
||||||
|
| Is the lexeme in uppercase? Equivalent to
|
||||||
|
| #[code lexeme.text.isupper()].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code is_title]
|
+cell #[code is_title]
|
||||||
+cell bool
|
+cell bool
|
||||||
|
@ -249,6 +298,16 @@ p The L2 norm of the lexeme's vector representation.
|
||||||
+cell bool
|
+cell bool
|
||||||
+cell Is the lexeme punctuation?
|
+cell Is the lexeme punctuation?
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code is_left_punct]
|
||||||
|
+cell bool
|
||||||
|
+cell Is the lexeme a left punctuation mark, e.g. #[code (]?
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code is_right_punct]
|
||||||
|
+cell bool
|
||||||
|
+cell Is the lexeme a right punctuation mark, e.g. #[code )]?
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code is_space]
|
+cell #[code is_space]
|
||||||
+cell bool
|
+cell bool
|
||||||
|
@ -256,6 +315,16 @@ p The L2 norm of the lexeme's vector representation.
|
||||||
| Does the lexeme consist of whitespace characters? Equivalent to
|
| Does the lexeme consist of whitespace characters? Equivalent to
|
||||||
| #[code lexeme.text.isspace()].
|
| #[code lexeme.text.isspace()].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code is_bracket]
|
||||||
|
+cell bool
|
||||||
|
+cell Is the lexeme a bracket?
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code is_quote]
|
||||||
|
+cell bool
|
||||||
|
+cell Is the lexeme a quotation mark?
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code like_url]
|
+cell #[code like_url]
|
||||||
+cell bool
|
+cell bool
|
||||||
|
@ -285,6 +354,7 @@ p The L2 norm of the lexeme's vector representation.
|
||||||
+cell #[code lang]
|
+cell #[code lang]
|
||||||
+cell int
|
+cell int
|
||||||
+cell Language of the parent vocabulary.
|
+cell Language of the parent vocabulary.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code lang_]
|
+cell #[code lang_]
|
||||||
+cell unicode
|
+cell unicode
|
||||||
|
@ -293,9 +363,16 @@ p The L2 norm of the lexeme's vector representation.
|
||||||
+row
|
+row
|
||||||
+cell #[code prob]
|
+cell #[code prob]
|
||||||
+cell float
|
+cell float
|
||||||
+cell Smoothed log probability estimate of lexeme's type.
|
+cell Smoothed log probability estimate of the lexeme's type.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code cluster]
|
||||||
|
+cell int
|
||||||
|
+cell Brown cluster ID.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code sentiment]
|
+cell #[code sentiment]
|
||||||
+cell float
|
+cell float
|
||||||
+cell A scalar value indicating the positivity or negativity of the lexeme.
|
+cell
|
||||||
|
| A scalar value indicating the positivity or negativity of the
|
||||||
|
| lexeme.
|
||||||
|
|
|
@ -248,6 +248,28 @@ p
|
||||||
+cell float
|
+cell float
|
||||||
+cell A scalar similarity score. Higher is more similar.
|
+cell A scalar similarity score. Higher is more similar.
|
||||||
|
|
||||||
|
+h(2, "get_lca_matrix") Span.get_lca_matrix
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p
|
||||||
|
| Calculates the lowest common ancestor matrix for a given #[code Span].
|
||||||
|
| Returns LCA matrix containing the integer index of the ancestor, or
|
||||||
|
| #[code -1] if no common ancestor is found, e.g. if span excludes a
|
||||||
|
| necessary ancestor.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
doc = nlp(u'I like New York in Autumn')
|
||||||
|
span = doc[1:4]
|
||||||
|
matrix = span.get_lca_matrix()
|
||||||
|
# array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row("foot")
|
||||||
|
+cell returns
|
||||||
|
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
|
||||||
|
+cell The lowest common ancestor matrix of the #[code Span].
|
||||||
|
|
||||||
|
|
||||||
+h(2, "to_array") Span.to_array
|
+h(2, "to_array") Span.to_array
|
||||||
+tag method
|
+tag method
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
@ -347,7 +369,7 @@ p
|
||||||
+tag property
|
+tag property
|
||||||
+tag-model("parse")
|
+tag-model("parse")
|
||||||
|
|
||||||
p Tokens that are to the left of the span, whose head is within the span.
|
p Tokens that are to the left of the span, whose heads are within the span.
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
doc = nlp(u'I like New York in Autumn.')
|
doc = nlp(u'I like New York in Autumn.')
|
||||||
|
@ -364,7 +386,7 @@ p Tokens that are to the left of the span, whose head is within the span.
|
||||||
+tag property
|
+tag property
|
||||||
+tag-model("parse")
|
+tag-model("parse")
|
||||||
|
|
||||||
p Tokens that are to the right of the span, whose head is within the span.
|
p Tokens that are to the right of the span, whose heads are within the span.
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
doc = nlp(u'I like New York in Autumn.')
|
doc = nlp(u'I like New York in Autumn.')
|
||||||
|
@ -377,6 +399,42 @@ p Tokens that are to the right of the span, whose head is within the span.
|
||||||
+cell #[code Token]
|
+cell #[code Token]
|
||||||
+cell A right-child of a token of the span.
|
+cell A right-child of a token of the span.
|
||||||
|
|
||||||
|
+h(2, "n_lefts") Span.n_lefts
|
||||||
|
+tag property
|
||||||
|
+tag-model("parse")
|
||||||
|
|
||||||
|
p
|
||||||
|
| The number of tokens that are to the left of the span, whose heads are
|
||||||
|
| within the span.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
doc = nlp(u'I like New York in Autumn.')
|
||||||
|
assert doc[3:7].n_lefts == 1
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row("foot")
|
||||||
|
+cell returns
|
||||||
|
+cell int
|
||||||
|
+cell The number of left-child tokens.
|
||||||
|
|
||||||
|
+h(2, "n_rights") Span.n_rights
|
||||||
|
+tag property
|
||||||
|
+tag-model("parse")
|
||||||
|
|
||||||
|
p
|
||||||
|
| The number of tokens that are to the right of the span, whose heads are
|
||||||
|
| within the span.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
doc = nlp(u'I like New York in Autumn.')
|
||||||
|
assert doc[2:4].n_rights == 1
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row("foot")
|
||||||
|
+cell returns
|
||||||
|
+cell int
|
||||||
|
+cell The number of right-child tokens.
|
||||||
|
|
||||||
+h(2, "subtree") Span.subtree
|
+h(2, "subtree") Span.subtree
|
||||||
+tag property
|
+tag property
|
||||||
+tag-model("parse")
|
+tag-model("parse")
|
||||||
|
@ -495,6 +553,18 @@ p
|
||||||
| The text content of the span with a trailing whitespace character
|
| The text content of the span with a trailing whitespace character
|
||||||
| if the last token has one.
|
| if the last token has one.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code orth]
|
||||||
|
+cell int
|
||||||
|
+cell ID of the verbatim text content.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code orth_]
|
||||||
|
+cell unicode
|
||||||
|
+cell
|
||||||
|
| Verbatim text content (identical to #[code Span.text]). Exists
|
||||||
|
| mostly for consistency with the other attributes.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code label]
|
+cell #[code label]
|
||||||
+cell int
|
+cell int
|
||||||
|
@ -519,3 +589,17 @@ p
|
||||||
+cell #[code ent_id_]
|
+cell #[code ent_id_]
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell The string ID of the named entity the token is an instance of.
|
+cell The string ID of the named entity the token is an instance of.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code sentiment]
|
||||||
|
+cell float
|
||||||
|
+cell
|
||||||
|
| A scalar value indicating the positivity or negativity of the
|
||||||
|
| span.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code _]
|
||||||
|
+cell #[code Underscore]
|
||||||
|
+cell
|
||||||
|
| User space for adding custom
|
||||||
|
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].
|
||||||
|
|
|
@ -302,6 +302,80 @@ p A sequence of the token's immediate syntactic children.
|
||||||
+cell #[code Token]
|
+cell #[code Token]
|
||||||
+cell A child token such that #[code child.head==self].
|
+cell A child token such that #[code child.head==self].
|
||||||
|
|
||||||
|
+h(2, "lefts") Token.lefts
|
||||||
|
+tag property
|
||||||
|
+tag-model("parse")
|
||||||
|
|
||||||
|
p
|
||||||
|
| The leftward immediate children of the word, in the syntactic dependency
|
||||||
|
| parse.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
doc = nlp(u'I like New York in Autumn.')
|
||||||
|
lefts = [t.text for t in doc[3].lefts]
|
||||||
|
assert lefts == [u'New']
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row("foot")
|
||||||
|
+cell yields
|
||||||
|
+cell #[code Token]
|
||||||
|
+cell A left-child of the token.
|
||||||
|
|
||||||
|
+h(2, "rights") Token.rights
|
||||||
|
+tag property
|
||||||
|
+tag-model("parse")
|
||||||
|
|
||||||
|
p
|
||||||
|
| The rightward immediate children of the word, in the syntactic
|
||||||
|
| dependency parse.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
doc = nlp(u'I like New York in Autumn.')
|
||||||
|
rights = [t.text for t in doc[3].rights]
|
||||||
|
assert rights == [u'in']
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row("foot")
|
||||||
|
+cell yields
|
||||||
|
+cell #[code Token]
|
||||||
|
+cell A right-child of the token.
|
||||||
|
|
||||||
|
+h(2, "n_lefts") Token.n_lefts
|
||||||
|
+tag property
|
||||||
|
+tag-model("parse")
|
||||||
|
|
||||||
|
p
|
||||||
|
| The number of leftward immediate children of the word, in the syntactic
|
||||||
|
| dependency parse.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
doc = nlp(u'I like New York in Autumn.')
|
||||||
|
assert doc[3].n_lefts == 1
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row("foot")
|
||||||
|
+cell returns
|
||||||
|
+cell int
|
||||||
|
+cell The number of left-child tokens.
|
||||||
|
|
||||||
|
+h(2, "n_rights") Token.n_rights
|
||||||
|
+tag property
|
||||||
|
+tag-model("parse")
|
||||||
|
|
||||||
|
p
|
||||||
|
| The number of rightward immediate children of the word, in the syntactic
|
||||||
|
| dependency parse.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
doc = nlp(u'I like New York in Autumn.')
|
||||||
|
assert doc[3].n_rights == 1
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row("foot")
|
||||||
|
+cell returns
|
||||||
|
+cell int
|
||||||
|
+cell The number of right-child tokens.
|
||||||
|
|
||||||
+h(2, "subtree") Token.subtree
|
+h(2, "subtree") Token.subtree
|
||||||
+tag property
|
+tag property
|
||||||
+tag-model("parse")
|
+tag-model("parse")
|
||||||
|
@ -489,15 +563,35 @@ p The L2 norm of the token's vector representation.
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell Base form of the token, with no inflectional suffixes.
|
+cell Base form of the token, with no inflectional suffixes.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code norm]
|
||||||
|
+cell int
|
||||||
|
+cell
|
||||||
|
| The token's norm, i.e. a normalised form of the token text.
|
||||||
|
| Usually set in the language's
|
||||||
|
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
|
||||||
|
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code norm_]
|
||||||
|
+cell unicode
|
||||||
|
+cell
|
||||||
|
| The token's norm, i.e. a normalised form of the token text.
|
||||||
|
| Usually set in the language's
|
||||||
|
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
|
||||||
|
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code lower]
|
+cell #[code lower]
|
||||||
+cell int
|
+cell int
|
||||||
+cell Lower-case form of the token.
|
+cell Lowercase form of the token.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code lower_]
|
+cell #[code lower_]
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell Lower-case form of the token.
|
+cell
|
||||||
|
| Lowercase form of the token text. Equivalent to
|
||||||
|
| #[code Token.text.lower()].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code shape]
|
+cell #[code shape]
|
||||||
|
@ -537,7 +631,9 @@ p The L2 norm of the token's vector representation.
|
||||||
+row
|
+row
|
||||||
+cell #[code suffix_]
|
+cell #[code suffix_]
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell Length-N substring from the end of the token. Defaults to #[code N=3].
|
+cell
|
||||||
|
| Length-N substring from the end of the token. Defaults to
|
||||||
|
| #[code N=3].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code is_alpha]
|
+cell #[code is_alpha]
|
||||||
|
@ -672,6 +768,7 @@ p The L2 norm of the token's vector representation.
|
||||||
+cell #[code lang]
|
+cell #[code lang]
|
||||||
+cell int
|
+cell int
|
||||||
+cell Language of the parent document's vocabulary.
|
+cell Language of the parent document's vocabulary.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code lang_]
|
+cell #[code lang_]
|
||||||
+cell unicode
|
+cell unicode
|
||||||
|
@ -690,9 +787,30 @@ p The L2 norm of the token's vector representation.
|
||||||
+row
|
+row
|
||||||
+cell #[code sentiment]
|
+cell #[code sentiment]
|
||||||
+cell float
|
+cell float
|
||||||
+cell A scalar value indicating the positivity or negativity of the token.
|
+cell
|
||||||
|
| A scalar value indicating the positivity or negativity of the
|
||||||
|
| token.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code lex_id]
|
+cell #[code lex_id]
|
||||||
+cell int
|
+cell int
|
||||||
+cell ID of the token's lexical type.
|
+cell Sequential ID of the token's lexical type.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code rank]
|
||||||
|
+cell int
|
||||||
|
+cell
|
||||||
|
| Sequential ID of the token's lexical type, used to index into
|
||||||
|
| tables, e.g. for word vectors.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code cluster]
|
||||||
|
+cell int
|
||||||
|
+cell Brown cluster ID.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code _]
|
||||||
|
+cell #[code Underscore]
|
||||||
|
+cell
|
||||||
|
| User space for adding custom
|
||||||
|
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].
|
||||||
|
|
|
@ -36,12 +36,14 @@ p
|
||||||
| that maps strings to hash values, and vice versa.
|
| that maps strings to hash values, and vice versa.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code data]
|
+cell #[code width]
|
||||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
+cell int
|
||||||
|
+cell Number of dimensions.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code width]
|
+cell #[code data]
|
||||||
+cell Number of dimensions.
|
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||||
|
+cell The vector data.
|
||||||
|
|
||||||
+row("foot")
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
|
@ -208,7 +210,7 @@ p
|
||||||
+row("foot")
|
+row("foot")
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell tuple
|
+cell tuple
|
||||||
+cell #[code (rows, dims)] pairs.
|
+cell A #[code (rows, dims)] pair.
|
||||||
|
|
||||||
+h(2, "from_glove") Vectors.from_glove
|
+h(2, "from_glove") Vectors.from_glove
|
||||||
+tag method
|
+tag method
|
||||||
|
@ -238,11 +240,16 @@ p Save the current state to a directory.
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code path]
|
+cell #[code path]
|
||||||
+cell unicode or #[code Path]
|
+cell unicode / #[code Path]
|
||||||
+cell
|
+cell
|
||||||
| A path to a directory, which will be created if it doesn't exist.
|
| A path to a directory, which will be created if it doesn't exist.
|
||||||
| Paths may be either strings or #[code Path]-like objects.
|
| Paths may be either strings or #[code Path]-like objects.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code **exclude]
|
||||||
|
+cell -
|
||||||
|
+cell Named attributes to prevent from being saved.
|
||||||
|
|
||||||
+h(2, "from_disk") Vectors.from_disk
|
+h(2, "from_disk") Vectors.from_disk
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
|
@ -255,7 +262,7 @@ p Loads state from a directory. Modifies the object in place and returns it.
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code path]
|
+cell #[code path]
|
||||||
+cell unicode or #[code Path]
|
+cell unicode / #[code Path]
|
||||||
+cell
|
+cell
|
||||||
| A path to a directory. Paths may be either strings or
|
| A path to a directory. Paths may be either strings or
|
||||||
| #[code Path]-like objects.
|
| #[code Path]-like objects.
|
||||||
|
@ -297,7 +304,7 @@ p Load state from a binary string.
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code bytes_data]
|
+cell #[code data]
|
||||||
+cell bytes
|
+cell bytes
|
||||||
+cell The data to load from.
|
+cell The data to load from.
|
||||||
|
|
||||||
|
|
|
@ -111,11 +111,13 @@ p
|
||||||
|
|
||||||
p
|
p
|
||||||
| A few more convenience attributes are provided for iterating around the
|
| A few more convenience attributes are provided for iterating around the
|
||||||
| local tree from the token. The #[code .lefts] and #[code .rights]
|
| local tree from the token. The #[+api("token#lefts") #[code Token.lefts]]
|
||||||
| attributes provide sequences of syntactic children that occur before and
|
| and #[+api("token#rights") #[code Token.rights]] attributes provide
|
||||||
| after the token. Both sequences are in sentences order. There are also
|
| sequences of syntactic children that occur before and after the token.
|
||||||
| two integer-typed attributes, #[code .n_rights] and #[code .n_lefts],
|
| Both sequences are in sentence order. There are also two integer-typed
|
||||||
| that give the number of left and right children.
|
| attributes, #[+api("token#n_rights") #[code Token.n_rights]] and
|
||||||
|
| #[+api("token#n_lefts") #[code Token.n_lefts]], that give the number of
|
||||||
|
| right and left children, respectively.
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
doc = nlp(u'bright red apples on the tree')
|
doc = nlp(u'bright red apples on the tree')
|
||||||
|
@ -126,10 +128,11 @@ p
|
||||||
|
|
||||||
p
|
p
|
||||||
| You can get a whole phrase by its syntactic head using the
|
| You can get a whole phrase by its syntactic head using the
|
||||||
| #[code .subtree] attribute. This returns an ordered sequence of tokens.
|
| #[+api("token#subtree") #[code Token.subtree]] attribute. This returns an
|
||||||
| You can walk up the tree with the #[code .ancestors] attribute, and
|
| ordered sequence of tokens. You can walk up the tree with the
|
||||||
| check dominance with the #[+api("token#is_ancestor") #[code .is_ancestor()]]
|
| #[+api("token#ancestors") #[code Token.ancestors]] attribute, and
|
||||||
| method.
|
| check dominance with
|
||||||
|
| #[+api("token#is_ancestor") #[code Token.is_ancestor()]].
|
||||||
|
|
||||||
+aside("Projective vs. non-projective")
|
+aside("Projective vs. non-projective")
|
||||||
| For the #[+a("/models/en") default English model], the
|
| For the #[+a("/models/en") default English model], the
|
||||||
|
|
Loading…
Reference in New Issue
Block a user