Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 00:46:28 +03:00)

Commit 64e4ff7c4b: Merge 'tidy-up' changes into branch. Resolve conflicts.
@@ -1,93 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import plac
|
||||
import joblib
|
||||
from os import path
|
||||
import os
|
||||
import bz2
|
||||
import ujson
|
||||
from preshed.counter import PreshCounter
|
||||
from joblib import Parallel, delayed
|
||||
import io
|
||||
|
||||
from spacy.en import English
|
||||
from spacy.strings import StringStore
|
||||
from spacy.attrs import ORTH
|
||||
from spacy.tokenizer import Tokenizer
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
|
||||
def iter_comments(loc):
|
||||
with bz2.BZ2File(loc) as file_:
|
||||
for line in file_:
|
||||
yield ujson.loads(line)
|
||||
|
||||
|
||||
def count_freqs(input_loc, output_loc):
|
||||
print(output_loc)
|
||||
vocab = English.default_vocab(get_lex_attr=None)
|
||||
tokenizer = Tokenizer.from_dir(vocab,
|
||||
path.join(English.default_data_dir(), 'tokenizer'))
|
||||
|
||||
counts = PreshCounter()
|
||||
for json_comment in iter_comments(input_loc):
|
||||
doc = tokenizer(json_comment['body'])
|
||||
doc.count_by(ORTH, counts=counts)
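# count_by(ORTH, counts=...) adds one count per token orth ID into the shared
# PreshCounter, so `counts` accumulates corpus-wide token frequencies.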
|
||||
|
||||
with io.open(output_loc, 'w', encoding='utf8') as file_:
|
||||
for orth, freq in counts:
|
||||
string = tokenizer.vocab.strings[orth]
|
||||
if not string.isspace():
|
||||
file_.write('%d\t%s\n' % (freq, string))
|
||||
|
||||
|
||||
def parallelize(func, iterator, n_jobs):
|
||||
Parallel(n_jobs=n_jobs)(delayed(func)(*item) for item in iterator)
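# joblib fans the (input_loc, output_loc) task tuples out to n_jobs worker
# processes; each call of func writes its own partial frequency file, which
# merge_counts() combines afterwards.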
|
||||
|
||||
|
||||
def merge_counts(locs, out_loc):
|
||||
string_map = StringStore()
|
||||
counts = PreshCounter()
|
||||
for loc in locs:
|
||||
with io.open(loc, 'r', encoding='utf8') as file_:
|
||||
for line in file_:
|
||||
freq, word = line.strip().split('\t', 1)
|
||||
orth = string_map[word]
|
||||
counts.inc(orth, int(freq))
|
||||
with io.open(out_loc, 'w', encoding='utf8') as file_:
|
||||
for orth, count in counts:
|
||||
string = string_map[orth]
|
||||
file_.write('%d\t%s\n' % (count, string))
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
input_loc=("Location of input file list"),
|
||||
freqs_dir=("Directory for frequency files"),
|
||||
output_loc=("Location for output file"),
|
||||
n_jobs=("Number of workers", "option", "n", int),
|
||||
skip_existing=("Skip inputs where an output file exists", "flag", "s", bool),
|
||||
)
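# Each plac annotation tuple is (help, kind, abbrev, type): "option" arguments
# become command-line options such as `-n 2`, "flag" arguments become boolean
# switches such as `-s`, and bare help strings stay positional arguments.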
|
||||
def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
|
||||
tasks = []
|
||||
outputs = []
|
||||
for input_path in open(input_loc):
|
||||
input_path = input_path.strip()
|
||||
if not input_path:
|
||||
continue
|
||||
filename = input_path.split('/')[-1]
|
||||
output_path = path.join(freqs_dir, filename.replace('bz2', 'freq'))
|
||||
outputs.append(output_path)
|
||||
if not path.exists(output_path) or not skip_existing:
|
||||
tasks.append((input_path, output_path))
|
||||
|
||||
if tasks:
|
||||
parallelize(count_freqs, tasks, n_jobs)
|
||||
|
||||
print("Merge")
|
||||
merge_counts(outputs, output_loc)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
|
@@ -1,89 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from xml.etree import cElementTree as ElementTree
|
||||
import json
|
||||
import re
|
||||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
from os import path
|
||||
|
||||
|
||||
escaped_tokens = {
|
||||
'-LRB-': '(',
|
||||
'-RRB-': ')',
|
||||
'-LSB-': '[',
|
||||
'-RSB-': ']',
|
||||
'-LCB-': '{',
|
||||
'-RCB-': '}',
|
||||
}
|
||||
|
||||
def read_parses(parse_loc):
|
||||
offset = 0
|
||||
doc = []
|
||||
for parse in open(str(parse_loc) + '.dep').read().strip().split('\n\n'):
|
||||
parse = _adjust_token_ids(parse, offset)
|
||||
offset += len(parse.split('\n'))
|
||||
doc.append(parse)
|
||||
return doc
|
||||
|
||||
def _adjust_token_ids(parse, offset):
|
||||
output = []
|
||||
for line in parse.split('\n'):
|
||||
pieces = line.split()
|
||||
pieces[0] = str(int(pieces[0]) + offset)
|
||||
pieces[5] = str(int(pieces[5]) + offset) if pieces[5] != '0' else '0'
|
||||
output.append('\t'.join(pieces))
|
||||
return '\n'.join(output)
|
||||
|
||||
|
||||
def _fmt_doc(filename, paras):
|
||||
return {'id': filename, 'paragraphs': [_fmt_para(*para) for para in paras]}
|
||||
|
||||
|
||||
def _fmt_para(raw, sents):
|
||||
return {'raw': raw, 'sentences': [_fmt_sent(sent) for sent in sents]}
|
||||
|
||||
|
||||
def _fmt_sent(sent):
|
||||
return {
|
||||
'tokens': [_fmt_token(*t.split()) for t in sent.strip().split('\n')],
|
||||
'brackets': []}
|
||||
|
||||
|
||||
def _fmt_token(id_, word, hyph, pos, ner, head, dep, blank1, blank2, blank3):
|
||||
head = int(head) - 1
|
||||
id_ = int(id_) - 1
|
||||
head = (head - id_) if head != -1 else 0
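# heads arrive as 1-based absolute token indices; after the -1 shift they are
# re-expressed relative to the token itself, with 0 marking the sentence root.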
|
||||
return {'id': id_, 'orth': word, 'tag': pos, 'dep': dep, 'head': head}
|
||||
|
||||
|
||||
tags_re = re.compile(r'<[\w\?/][^>]+>')
|
||||
def main(out_dir, ewtb_dir='/usr/local/data/eng_web_tbk'):
|
||||
ewtb_dir = Path(ewtb_dir)
|
||||
out_dir = Path(out_dir)
|
||||
if not out_dir.exists():
|
||||
out_dir.mkdir()
|
||||
for genre_dir in ewtb_dir.joinpath('data').iterdir():
|
||||
#if 'answers' in str(genre_dir): continue
|
||||
parse_dir = genre_dir.joinpath('penntree')
|
||||
docs = []
|
||||
for source_loc in genre_dir.joinpath('source').joinpath('source_original').iterdir():
|
||||
filename = source_loc.parts[-1].replace('.sgm.sgm', '')
|
||||
filename = filename.replace('.xml', '')
|
||||
filename = filename.replace('.txt', '')
|
||||
parse_loc = parse_dir.joinpath(filename + '.xml.tree')
|
||||
parses = read_parses(parse_loc)
|
||||
source = source_loc.open().read().strip()
|
||||
if 'answers' in str(genre_dir):
|
||||
source = tags_re.sub('', source).strip()
|
||||
docs.append(_fmt_doc(filename, [[source, parses]]))
|
||||
|
||||
out_loc = out_dir.joinpath(genre_dir.parts[-1] + '.json')
|
||||
with open(str(out_loc), 'w') as out_file:
|
||||
out_file.write(json.dumps(docs, indent=4))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
|
@@ -1,32 +0,0 @@
|
|||
import io
|
||||
import plac
|
||||
|
||||
from spacy.en import English
|
||||
|
||||
|
||||
def main(text_loc):
|
||||
with io.open(text_loc, 'r', encoding='utf8') as file_:
|
||||
text = file_.read()
|
||||
NLU = English()
|
||||
for paragraph in text.split('\n\n'):
|
||||
tokens = NLU(paragraph)
|
||||
|
||||
ent_starts = {}
|
||||
ent_ends = {}
|
||||
for span in tokens.ents:
|
||||
ent_starts[span.start] = span.label_
|
||||
ent_ends[span.end] = span.label_
|
||||
|
||||
output = []
|
||||
for token in tokens:
|
||||
if token.i in ent_starts:
|
||||
output.append('<%s>' % ent_starts[token.i])
|
||||
output.append(token.orth_)
|
||||
if (token.i+1) in ent_ends:
|
||||
output.append('</%s>' % ent_ends[token.i+1])
|
||||
output.append('\n\n')
|
||||
print(' '.join(output))
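# Illustrative output (labels depend on the loaded English model):
#   <PERSON> Barack Obama </PERSON> visited <GPE> Paris </GPE> last week .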
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
|
@@ -1,157 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import division
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
from os import path
|
||||
import shutil
|
||||
import io
|
||||
import random
|
||||
import time
|
||||
import gzip
|
||||
|
||||
import plac
|
||||
import cProfile
|
||||
import pstats
|
||||
|
||||
import spacy.util
|
||||
from spacy.en import English
|
||||
from spacy.gold import GoldParse
|
||||
|
||||
from spacy.syntax.util import Config
|
||||
from spacy.syntax.arc_eager import ArcEager
|
||||
from spacy.syntax.parser import Parser
|
||||
from spacy.scorer import Scorer
|
||||
from spacy.tagger import Tagger
|
||||
|
||||
# Last updated for spaCy v0.97
|
||||
|
||||
|
||||
def read_conll(file_):
|
||||
"""Read a standard CoNLL/MALT-style format"""
|
||||
sents = []
|
||||
for sent_str in file_.read().strip().split('\n\n'):
|
||||
ids = []
|
||||
words = []
|
||||
heads = []
|
||||
labels = []
|
||||
tags = []
|
||||
for i, line in enumerate(sent_str.split('\n')):
|
||||
word, pos_string, head_idx, label = _parse_line(line)
|
||||
words.append(word)
|
||||
if head_idx < 0:
|
||||
head_idx = i
|
||||
ids.append(i)
|
||||
heads.append(head_idx)
|
||||
labels.append(label)
|
||||
tags.append(pos_string)
|
||||
text = ' '.join(words)
|
||||
annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
|
||||
sents.append((None, [(annot, [])]))
|
||||
return sents
|
||||
|
||||
|
||||
def _parse_line(line):
|
||||
pieces = line.split()
|
||||
if len(pieces) == 4:
|
||||
word, pos, head_idx, label = pieces
|
||||
head_idx = int(head_idx)
|
||||
elif len(pieces) == 15:
|
||||
id_ = int(pieces[0].split('_')[-1])
|
||||
word = pieces[1]
|
||||
pos = pieces[4]
|
||||
head_idx = int(pieces[8])-1
|
||||
label = pieces[10]
|
||||
else:
|
||||
id_ = int(pieces[0].split('_')[-1])
|
||||
word = pieces[1]
|
||||
pos = pieces[4]
|
||||
head_idx = int(pieces[6])-1
|
||||
label = pieces[7]
|
||||
if head_idx == 0:
|
||||
label = 'ROOT'
|
||||
return word, pos, head_idx, label
|
||||
|
||||
|
||||
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
|
||||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
||||
nlp.tagger(tokens)
|
||||
nlp.parser(tokens)
|
||||
gold = GoldParse(tokens, annot_tuples, make_projective=False)
|
||||
scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))
|
||||
|
||||
|
||||
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
||||
gold_preproc=False, force_gold=False):
|
||||
dep_model_dir = path.join(model_dir, 'deps')
|
||||
pos_model_dir = path.join(model_dir, 'pos')
|
||||
if path.exists(dep_model_dir):
|
||||
shutil.rmtree(dep_model_dir)
|
||||
if path.exists(pos_model_dir):
|
||||
shutil.rmtree(pos_model_dir)
|
||||
os.mkdir(dep_model_dir)
|
||||
os.mkdir(pos_model_dir)
|
||||
|
||||
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
|
||||
labels=ArcEager.get_labels(gold_tuples))
|
||||
|
||||
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
|
||||
nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
|
||||
nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
|
||||
|
||||
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
|
||||
for itn in range(n_iter):
|
||||
scorer = Scorer()
|
||||
loss = 0
|
||||
for _, sents in gold_tuples:
|
||||
for annot_tuples, _ in sents:
|
||||
if len(annot_tuples[1]) == 1:
|
||||
continue
|
||||
|
||||
score_model(scorer, nlp, None, annot_tuples, verbose=False)
|
||||
|
||||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
||||
nlp.tagger(tokens)
|
||||
gold = GoldParse(tokens, annot_tuples, make_projective=True)
|
||||
if not gold.is_projective:
|
||||
raise Exception(
|
||||
"Non-projective sentence in training, after we should "
|
||||
"have enforced projectivity: %s" % annot_tuples
|
||||
)
|
||||
|
||||
loss += nlp.parser.train(tokens, gold)
|
||||
nlp.tagger.train(tokens, gold.tags)
|
||||
random.shuffle(gold_tuples)
|
||||
print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
|
||||
scorer.tags_acc, scorer.token_acc))
|
||||
print('end training')
|
||||
nlp.end_training(model_dir)
|
||||
print('done')
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
train_loc=("Location of CoNLL 09 formatted training file"),
|
||||
dev_loc=("Location of CoNLL 09 formatted development file"),
|
||||
model_dir=("Location of output model directory"),
|
||||
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
|
||||
n_iter=("Number of training iterations", "option", "i", int),
|
||||
)
|
||||
def main(train_loc, dev_loc, model_dir, n_iter=15, eval_only=False):
|
||||
with io.open(train_loc, 'r', encoding='utf8') as file_:
|
||||
train_sents = read_conll(file_)
|
||||
if not eval_only:
|
||||
train(English, train_sents, model_dir, n_iter=n_iter)
|
||||
nlp = English(data_dir=model_dir)
|
||||
dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
|
||||
scorer = Scorer()
|
||||
for _, sents in dev_sents:
|
||||
for annot_tuples, _ in sents:
|
||||
score_model(scorer, nlp, None, annot_tuples)
|
||||
print('TOK', 100-scorer.token_acc)
|
||||
print('POS', scorer.tags_acc)
|
||||
print('UAS', scorer.uas)
|
||||
print('LAS', scorer.las)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
|
@@ -1,187 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import division
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
from os import path
|
||||
import shutil
|
||||
import io
|
||||
import random
|
||||
|
||||
import plac
|
||||
import re
|
||||
|
||||
import spacy.util
|
||||
|
||||
from spacy.syntax.util import Config
|
||||
from spacy.gold import read_json_file
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.gold import merge_sents
|
||||
|
||||
from spacy.scorer import Scorer
|
||||
|
||||
from spacy.syntax.arc_eager import ArcEager
|
||||
from spacy.syntax.ner import BiluoPushDown
|
||||
from spacy.tagger import Tagger
|
||||
from spacy.syntax.parser import Parser
|
||||
from spacy.syntax.nonproj import PseudoProjectivity
|
||||
|
||||
|
||||
def _corrupt(c, noise_level):
|
||||
if random.random() >= noise_level:
|
||||
return c
|
||||
elif c == ' ':
|
||||
return '\n'
|
||||
elif c == '\n':
|
||||
return ' '
|
||||
elif c in ['.', "'", "!", "?"]:
|
||||
return ''
|
||||
else:
|
||||
return c.lower()
|
||||
|
||||
|
||||
def add_noise(orig, noise_level):
|
||||
if random.random() >= noise_level:
|
||||
return orig
|
||||
elif type(orig) == list:
|
||||
corrupted = [_corrupt(word, noise_level) for word in orig]
|
||||
corrupted = [w for w in corrupted if w]
|
||||
return corrupted
|
||||
else:
|
||||
return ''.join(_corrupt(c, noise_level) for c in orig)
|
||||
|
||||
|
||||
def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
|
||||
if raw_text is None:
|
||||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
||||
else:
|
||||
tokens = nlp.tokenizer(raw_text)
|
||||
nlp.tagger(tokens)
|
||||
nlp.entity(tokens)
|
||||
nlp.parser(tokens)
|
||||
gold = GoldParse(tokens, annot_tuples)
|
||||
scorer.score(tokens, gold, verbose=verbose)
|
||||
|
||||
|
||||
def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
|
||||
n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
|
||||
print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
|
||||
format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
|
||||
with Language.train(model_dir, train_data,
|
||||
tagger_cfg, parser_cfg, entity_cfg) as trainer:
|
||||
loss = 0
|
||||
for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=gold_preproc,
|
||||
augment_data=None)):
|
||||
for doc, gold in epoch:
|
||||
trainer.update(doc, gold)
|
||||
dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
|
||||
print(format_str.format(itn, trainer.nlp.parser.model.nr_weight,
|
||||
trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores))
|
||||
|
||||
|
||||
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
|
||||
beam_width=None, cand_preproc=None):
|
||||
print("Load parser", model_dir)
|
||||
nlp = Language(path=model_dir)
|
||||
if nlp.lang == 'de':
|
||||
nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
|
||||
if beam_width is not None:
|
||||
nlp.parser.cfg.beam_width = beam_width
|
||||
scorer = Scorer()
|
||||
for raw_text, sents in gold_tuples:
|
||||
if gold_preproc:
|
||||
raw_text = None
|
||||
else:
|
||||
sents = merge_sents(sents)
|
||||
for annot_tuples, brackets in sents:
|
||||
if raw_text is None:
|
||||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
||||
nlp.tagger(tokens)
|
||||
nlp.parser(tokens)
|
||||
nlp.entity(tokens)
|
||||
else:
|
||||
tokens = nlp(raw_text)
|
||||
gold = GoldParse.from_annot_tuples(tokens, annot_tuples)
|
||||
scorer.score(tokens, gold, verbose=verbose)
|
||||
return scorer
|
||||
|
||||
|
||||
def write_parses(Language, dev_loc, model_dir, out_loc):
|
||||
nlp = Language(data_dir=model_dir)
|
||||
gold_tuples = read_json_file(dev_loc)
|
||||
scorer = Scorer()
|
||||
out_file = io.open(out_loc, 'w', encoding='utf8')
|
||||
for raw_text, sents in gold_tuples:
|
||||
sents = merge_sents(sents)
|
||||
for annot_tuples, brackets in sents:
|
||||
if raw_text is None:
|
||||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
||||
nlp.tagger(tokens)
|
||||
nlp.entity(tokens)
|
||||
nlp.parser(tokens)
|
||||
else:
|
||||
tokens = nlp(raw_text)
|
||||
#gold = GoldParse(tokens, annot_tuples)
|
||||
#scorer.score(tokens, gold, verbose=False)
|
||||
for sent in tokens.sents:
|
||||
for t in sent:
|
||||
if not t.is_space:
|
||||
out_file.write(
|
||||
'%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_)
|
||||
)
|
||||
out_file.write('\n')
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
language=("The language to train", "positional", None, str, ['en','de', 'zh']),
|
||||
train_loc=("Location of training file or directory"),
|
||||
dev_loc=("Location of development file or directory"),
|
||||
model_dir=("Location of output model directory",),
|
||||
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
|
||||
corruption_level=("Amount of noise to add to training data", "option", "c", float),
|
||||
gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
|
||||
out_loc=("Out location", "option", "o", str),
|
||||
n_sents=("Number of training sentences", "option", "n", int),
|
||||
n_iter=("Number of training iterations", "option", "i", int),
|
||||
verbose=("Verbose error reporting", "flag", "v", bool),
|
||||
debug=("Debug mode", "flag", "d", bool),
|
||||
pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
|
||||
L1=("L1 regularization penalty", "option", "L", float),
|
||||
)
|
||||
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
|
||||
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False,
|
||||
L1=1e-6):
|
||||
parser_cfg = dict(locals())
|
||||
tagger_cfg = dict(locals())
|
||||
entity_cfg = dict(locals())
|
||||
|
||||
lang = spacy.util.get_lang_class(language)
|
||||
|
||||
parser_cfg['features'] = lang.Defaults.parser_features
|
||||
entity_cfg['features'] = lang.Defaults.entity_features
|
||||
|
||||
if not eval_only:
|
||||
gold_train = list(read_json_file(train_loc))
|
||||
gold_dev = list(read_json_file(dev_loc))
|
||||
if n_sents > 0:
|
||||
gold_train = gold_train[:n_sents]
|
||||
train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
|
||||
n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
|
||||
n_iter=n_iter)
|
||||
if out_loc:
|
||||
write_parses(lang, dev_loc, model_dir, out_loc)
|
||||
scorer = evaluate(lang, list(read_json_file(dev_loc)),
|
||||
model_dir, gold_preproc=gold_preproc, verbose=verbose)
|
||||
print('TOK', scorer.token_acc)
|
||||
print('POS', scorer.tags_acc)
|
||||
print('UAS', scorer.uas)
|
||||
print('LAS', scorer.las)
|
||||
|
||||
print('NER P', scorer.ents_p)
|
||||
print('NER R', scorer.ents_r)
|
||||
print('NER F', scorer.ents_f)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
|
@@ -1,201 +0,0 @@
|
|||
from __future__ import unicode_literals, print_function
|
||||
import plac
|
||||
import json
|
||||
import random
|
||||
import pathlib
|
||||
|
||||
from spacy.tokens import Doc
|
||||
from spacy.syntax.nonproj import PseudoProjectivity
|
||||
from spacy.language import Language
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.tagger import Tagger
|
||||
from spacy.pipeline import DependencyParser, TokenVectorEncoder
|
||||
from spacy.syntax.parser import get_templates
|
||||
from spacy.syntax.arc_eager import ArcEager
|
||||
from spacy.scorer import Scorer
|
||||
from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
|
||||
import spacy.attrs
|
||||
import io
|
||||
from thinc.neural.ops import CupyOps
|
||||
from thinc.neural import Model
|
||||
from spacy.es import Spanish
|
||||
from spacy.attrs import POS
|
||||
|
||||
|
||||
from thinc.neural import Model
|
||||
|
||||
|
||||
try:
|
||||
import cupy
|
||||
from thinc.neural.ops import CupyOps
|
||||
except:
|
||||
cupy = None
|
||||
|
||||
|
||||
def read_conllx(loc, n=0):
|
||||
with io.open(loc, 'r', encoding='utf8') as file_:
|
||||
text = file_.read()
|
||||
i = 0
|
||||
for sent in text.strip().split('\n\n'):
|
||||
lines = sent.strip().split('\n')
|
||||
if lines:
|
||||
while lines[0].startswith('#'):
|
||||
lines.pop(0)
|
||||
tokens = []
|
||||
for line in lines:
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, \
|
||||
_2 = line.split('\t')
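# CoNLL-U ids containing '-' (multi-word token ranges like 4-5) or '.'
# (empty nodes like 5.1) have no single tree position and are skipped below.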
|
||||
if '-' in id_ or '.' in id_:
|
||||
continue
|
||||
try:
|
||||
id_ = int(id_) - 1
|
||||
head = (int(head) - 1) if head != '0' else id_
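# HEAD is 1-based in CoNLL-U and '0' means root; roots are encoded here as
# tokens attached to themselves (head == own index).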
|
||||
dep = 'ROOT' if dep == 'root' else dep #'unlabelled'
|
||||
tag = pos+'__'+dep+'__'+morph
|
||||
Spanish.Defaults.tag_map[tag] = {POS: pos}
|
||||
tokens.append((id_, word, tag, head, dep, 'O'))
|
||||
except:
|
||||
raise
|
||||
tuples = [list(t) for t in zip(*tokens)]
|
||||
yield (None, [[tuples, []]])
|
||||
i += 1
|
||||
if n >= 1 and i >= n:
|
||||
break
|
||||
|
||||
|
||||
def score_model(vocab, encoder, parser, Xs, ys, verbose=False):
|
||||
scorer = Scorer()
|
||||
correct = 0.
|
||||
total = 0.
|
||||
for doc, gold in zip(Xs, ys):
|
||||
doc = Doc(vocab, words=[w.text for w in doc])
|
||||
encoder(doc)
|
||||
parser(doc)
|
||||
PseudoProjectivity.deprojectivize(doc)
|
||||
scorer.score(doc, gold, verbose=verbose)
|
||||
for token, tag in zip(doc, gold.tags):
|
||||
if '_' in token.tag_:
|
||||
univ_guess, _ = token.tag_.split('_', 1)
|
||||
else:
|
||||
univ_guess = ''
|
||||
univ_truth, _ = tag.split('_', 1)
|
||||
correct += univ_guess == univ_truth
|
||||
total += 1
|
||||
return scorer
|
||||
|
||||
|
||||
def organize_data(vocab, train_sents):
|
||||
Xs = []
|
||||
ys = []
|
||||
for _, doc_sents in train_sents:
|
||||
for (ids, words, tags, heads, deps, ner), _ in doc_sents:
|
||||
doc = Doc(vocab, words=words)
|
||||
gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
|
||||
Xs.append(doc)
|
||||
ys.append(gold)
|
||||
return Xs, ys
|
||||
|
||||
|
||||
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
|
||||
LangClass = spacy.util.get_lang_class(lang_name)
|
||||
train_sents = list(read_conllx(train_loc))
|
||||
dev_sents = list(read_conllx(dev_loc))
|
||||
train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
|
||||
|
||||
actions = ArcEager.get_actions(gold_parses=train_sents)
|
||||
features = get_templates('basic')
|
||||
|
||||
model_dir = pathlib.Path(model_dir)
|
||||
if not model_dir.exists():
|
||||
model_dir.mkdir()
|
||||
if not (model_dir / 'deps').exists():
|
||||
(model_dir / 'deps').mkdir()
|
||||
if not (model_dir / 'pos').exists():
|
||||
(model_dir / 'pos').mkdir()
|
||||
with (model_dir / 'deps' / 'config.json').open('wb') as file_:
|
||||
file_.write(
|
||||
json.dumps(
|
||||
{'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))
|
||||
|
||||
vocab = LangClass.Defaults.create_vocab()
|
||||
if not (model_dir / 'vocab').exists():
|
||||
(model_dir / 'vocab').mkdir()
|
||||
else:
|
||||
if (model_dir / 'vocab' / 'strings.json').exists():
|
||||
with (model_dir / 'vocab' / 'strings.json').open() as file_:
|
||||
vocab.strings.load(file_)
|
||||
if (model_dir / 'vocab' / 'lexemes.bin').exists():
|
||||
vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
|
||||
|
||||
if clusters_loc is not None:
|
||||
clusters_loc = pathlib.Path(clusters_loc)
|
||||
with clusters_loc.open() as file_:
|
||||
for line in file_:
|
||||
try:
|
||||
cluster, word, freq = line.split()
|
||||
except ValueError:
|
||||
continue
|
||||
lex = vocab[word]
|
||||
lex.cluster = int(cluster[::-1], 2)
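# The Brown cluster path is a bit string; it is reversed before int(..., 2),
# which puts the first bits of the path into the low bits of the integer
# (presumably so cluster prefixes can be recovered by masking).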
|
||||
# Populate vocab
|
||||
for _, doc_sents in train_sents:
|
||||
for (ids, words, tags, heads, deps, ner), _ in doc_sents:
|
||||
for word in words:
|
||||
_ = vocab[word]
|
||||
for dep in deps:
|
||||
_ = vocab[dep]
|
||||
for tag in tags:
|
||||
_ = vocab[tag]
|
||||
if vocab.morphology.tag_map:
|
||||
for tag in tags:
|
||||
vocab.morphology.tag_map[tag] = {POS: tag.split('__', 1)[0]}
|
||||
tagger = Tagger(vocab)
|
||||
encoder = TokenVectorEncoder(vocab, width=64)
|
||||
parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)
|
||||
|
||||
Xs, ys = organize_data(vocab, train_sents)
|
||||
dev_Xs, dev_ys = organize_data(vocab, dev_sents)
|
||||
with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
|
||||
docs = list(Xs)
|
||||
for doc in docs:
|
||||
encoder(doc)
|
||||
nn_loss = [0.]
|
||||
def track_progress():
|
||||
with encoder.tagger.use_params(optimizer.averages):
|
||||
with parser.model.use_params(optimizer.averages):
|
||||
scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
|
||||
itn = len(nn_loss)
|
||||
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
|
||||
nn_loss.append(0.)
|
||||
track_progress()
|
||||
trainer.each_epoch.append(track_progress)
|
||||
trainer.batch_size = 24
|
||||
trainer.nb_epoch = 40
|
||||
for docs, golds in trainer.iterate(Xs, ys, progress_bar=True):
|
||||
docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
|
||||
tokvecs, upd_tokvecs = encoder.begin_update(docs)
|
||||
for doc, tokvec in zip(docs, tokvecs):
|
||||
doc.tensor = tokvec
|
||||
d_tokvecs = parser.update(docs, golds, sgd=optimizer)
|
||||
upd_tokvecs(d_tokvecs, sgd=optimizer)
|
||||
encoder.update(docs, golds, sgd=optimizer)
|
||||
nlp = LangClass(vocab=vocab, parser=parser)
|
||||
scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
|
||||
print('Dev:\t%.3f\t%.3f\t%.3f' % (scorer.uas, scorer.las, scorer.tags_acc))
|
||||
#nlp.end_training(model_dir)
|
||||
#scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
|
||||
#print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import cProfile
|
||||
import pstats
|
||||
if 1:
|
||||
plac.call(main)
|
||||
else:
|
||||
cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
|
||||
s = pstats.Stats("Profile.prof")
|
||||
s.strip_dirs().sort_stats("time").print_stats()
|
||||
|
||||
|
||||
|
|
@@ -1,194 +0,0 @@
|
|||
"""Convert OntoNotes into a json format.
|
||||
|
||||
doc: {
|
||||
id: string,
|
||||
paragraphs: [{
|
||||
raw: string,
|
||||
sents: [int],
|
||||
tokens: [{
|
||||
start: int,
|
||||
tag: string,
|
||||
head: int,
|
||||
dep: string}],
|
||||
ner: [{
|
||||
start: int,
|
||||
end: int,
|
||||
label: string}],
|
||||
brackets: [{
|
||||
start: int,
|
||||
end: int,
|
||||
label: string}]}]}
|
||||
|
||||
Consumes output of spacy/munge/align_raw.py
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
import plac
|
||||
import json
|
||||
from os import path
|
||||
import os
|
||||
import re
|
||||
import io
|
||||
from collections import defaultdict
|
||||
|
||||
from spacy.munge import read_ptb
|
||||
from spacy.munge import read_conll
|
||||
from spacy.munge import read_ner
|
||||
|
||||
|
||||
def _iter_raw_files(raw_loc):
|
||||
files = json.load(open(raw_loc))
|
||||
for f in files:
|
||||
yield f
|
||||
|
||||
|
||||
def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
|
||||
ptb_sents = read_ptb.split(ptb_text)
|
||||
dep_sents = read_conll.split(dep_text)
|
||||
if len(ptb_sents) != len(dep_sents):
|
||||
return None
|
||||
if ner_text is not None:
|
||||
ner_sents = read_ner.split(ner_text)
|
||||
else:
|
||||
ner_sents = [None] * len(ptb_sents)
|
||||
|
||||
i = 0
|
||||
doc = {'id': file_id}
|
||||
if raw_paras is None:
|
||||
doc['paragraphs'] = [format_para(None, ptb_sents, dep_sents, ner_sents)]
|
||||
#for ptb_sent, dep_sent, ner_sent in zip(ptb_sents, dep_sents, ner_sents):
|
||||
# doc['paragraphs'].append(format_para(None, [ptb_sent], [dep_sent], [ner_sent]))
|
||||
else:
|
||||
doc['paragraphs'] = []
|
||||
for raw_sents in raw_paras:
|
||||
para = format_para(
|
||||
' '.join(raw_sents).replace('<SEP>', ''),
|
||||
ptb_sents[i:i+len(raw_sents)],
|
||||
dep_sents[i:i+len(raw_sents)],
|
||||
ner_sents[i:i+len(raw_sents)])
|
||||
if para['sentences']:
|
||||
doc['paragraphs'].append(para)
|
||||
i += len(raw_sents)
|
||||
return doc
|
||||
|
||||
|
||||
def format_para(raw_text, ptb_sents, dep_sents, ner_sents):
|
||||
para = {'raw': raw_text, 'sentences': []}
|
||||
offset = 0
|
||||
assert len(ptb_sents) == len(dep_sents) == len(ner_sents)
|
||||
for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents):
|
||||
_, deps = read_conll.parse(dep_text, strip_bad_periods=True)
|
||||
if deps and 'VERB' in [t['tag'] for t in deps]:
|
||||
continue
|
||||
if ner_text is not None:
|
||||
_, ner = read_ner.parse(ner_text, strip_bad_periods=True)
|
||||
else:
|
||||
ner = ['-' for _ in deps]
|
||||
_, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True)
|
||||
# Necessary because the ClearNLP converter deletes EDITED words.
|
||||
if len(ner) != len(deps):
|
||||
ner = ['-' for _ in deps]
|
||||
para['sentences'].append(format_sentence(deps, ner, brackets))
|
||||
return para
|
||||
|
||||
|
||||
def format_sentence(deps, ner, brackets):
|
||||
sent = {'tokens': [], 'brackets': []}
|
||||
for token_id, (token, token_ent) in enumerate(zip(deps, ner)):
|
||||
sent['tokens'].append(format_token(token_id, token, token_ent))
|
||||
|
||||
for label, start, end in brackets:
|
||||
if start != end:
|
||||
sent['brackets'].append({
|
||||
'label': label,
|
||||
'first': start,
|
||||
'last': (end-1)})
|
||||
return sent
|
||||
|
||||
|
||||
def format_token(token_id, token, ner):
|
||||
assert token_id == token['id']
|
||||
head = (token['head'] - token_id) if token['head'] != -1 else 0
|
||||
return {
|
||||
'id': token_id,
|
||||
'orth': token['word'],
|
||||
'tag': token['tag'],
|
||||
'head': head,
|
||||
'dep': token['dep'],
|
||||
'ner': ner}
|
||||
|
||||
|
||||
def read_file(*pieces):
|
||||
loc = path.join(*pieces)
|
||||
if not path.exists(loc):
|
||||
return None
|
||||
else:
|
||||
return io.open(loc, 'r', encoding='utf8').read().strip()
|
||||
|
||||
|
||||
def get_file_names(section_dir, subsection):
|
||||
filenames = []
|
||||
for fn in os.listdir(path.join(section_dir, subsection)):
|
||||
filenames.append(fn.rsplit('.', 1)[0])
|
||||
return list(sorted(set(filenames)))
|
||||
|
||||
|
||||
def read_wsj_with_source(onto_dir, raw_dir):
|
||||
# Now do WSJ, with source alignment
|
||||
onto_dir = path.join(onto_dir, 'data', 'english', 'annotations', 'nw', 'wsj')
|
||||
docs = {}
|
||||
for i in range(25):
|
||||
section = str(i) if i >= 10 else ('0' + str(i))
|
||||
raw_loc = path.join(raw_dir, 'wsj%s.json' % section)
|
||||
for j, (filename, raw_paras) in enumerate(_iter_raw_files(raw_loc)):
|
||||
if section == '00':
|
||||
j += 1
|
||||
if section == '04' and filename == '55':
|
||||
continue
|
||||
ptb = read_file(onto_dir, section, '%s.parse' % filename)
|
||||
dep = read_file(onto_dir, section, '%s.parse.dep' % filename)
|
||||
ner = read_file(onto_dir, section, '%s.name' % filename)
|
||||
if ptb is not None and dep is not None:
|
||||
docs[filename] = format_doc(filename, raw_paras, ptb, dep, ner)
|
||||
return docs
|
||||
|
||||
|
||||
def get_doc(onto_dir, file_path, wsj_docs):
|
||||
filename = file_path.rsplit('/', 1)[1]
|
||||
if filename in wsj_docs:
|
||||
return wsj_docs[filename]
|
||||
else:
|
||||
ptb = read_file(onto_dir, file_path + '.parse')
|
||||
dep = read_file(onto_dir, file_path + '.parse.dep')
|
||||
ner = read_file(onto_dir, file_path + '.name')
|
||||
if ptb is not None and dep is not None:
|
||||
return format_doc(filename, None, ptb, dep, ner)
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def read_ids(loc):
|
||||
return open(loc).read().strip().split('\n')
|
||||
|
||||
|
||||
def main(onto_dir, raw_dir, out_dir):
|
||||
wsj_docs = read_wsj_with_source(onto_dir, raw_dir)
|
||||
|
||||
for partition in ('train', 'test', 'development'):
|
||||
ids = read_ids(path.join(onto_dir, '%s.id' % partition))
|
||||
docs_by_genre = defaultdict(list)
|
||||
for file_path in ids:
|
||||
doc = get_doc(onto_dir, file_path, wsj_docs)
|
||||
if doc is not None:
|
||||
genre = file_path.split('/')[3]
|
||||
docs_by_genre[genre].append(doc)
|
||||
part_dir = path.join(out_dir, partition)
|
||||
if not path.exists(part_dir):
|
||||
os.mkdir(part_dir)
|
||||
for genre, docs in sorted(docs_by_genre.items()):
|
||||
out_loc = path.join(part_dir, genre + '.json')
|
||||
with open(out_loc, 'w') as file_:
|
||||
json.dump(docs, file_, indent=4)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
|
@@ -1,13 +0,0 @@
|
|||
"""Read a vector file, and prepare it as binary data, for easy consumption"""
|
||||
|
||||
import plac
|
||||
|
||||
from spacy.vocab import write_binary_vectors
|
||||
|
||||
|
||||
def main(in_loc, out_loc):
|
||||
write_binary_vectors(in_loc, out_loc)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
|
@@ -1,175 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import division
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
from os import path
|
||||
import shutil
|
||||
import codecs
|
||||
import random
|
||||
|
||||
import plac
|
||||
import re
|
||||
|
||||
import spacy.util
|
||||
from spacy.en import English
|
||||
|
||||
from spacy.tagger import Tagger
|
||||
|
||||
from spacy.syntax.util import Config
|
||||
from spacy.gold import read_json_file
|
||||
from spacy.gold import GoldParse
|
||||
|
||||
from spacy.scorer import Scorer
|
||||
|
||||
|
||||
def score_model(scorer, nlp, raw_text, annot_tuples):
|
||||
if raw_text is None:
|
||||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
||||
else:
|
||||
tokens = nlp.tokenizer(raw_text)
|
||||
nlp.tagger(tokens)
|
||||
gold = GoldParse(tokens, annot_tuples)
|
||||
scorer.score(tokens, gold)
|
||||
|
||||
|
||||
def _merge_sents(sents):
|
||||
m_deps = [[], [], [], [], [], []]
|
||||
m_brackets = []
|
||||
i = 0
|
||||
for (ids, words, tags, heads, labels, ner), brackets in sents:
|
||||
m_deps[0].extend(id_ + i for id_ in ids)
|
||||
m_deps[1].extend(words)
|
||||
m_deps[2].extend(tags)
|
||||
m_deps[3].extend(head + i for head in heads)
|
||||
m_deps[4].extend(labels)
|
||||
m_deps[5].extend(ner)
|
||||
m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
|
||||
i += len(ids)
|
||||
return [(m_deps, m_brackets)]
|
||||
|
||||
|
||||
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
|
||||
seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
|
||||
beam_width=1, verbose=False,
|
||||
use_orig_arc_eager=False):
|
||||
if n_sents > 0:
|
||||
gold_tuples = gold_tuples[:n_sents]
|
||||
|
||||
templates = Tagger.default_templates()
|
||||
nlp = Language(data_dir=model_dir, tagger=False)
|
||||
nlp.tagger = Tagger.blank(nlp.vocab, templates)
|
||||
|
||||
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
|
||||
for itn in range(n_iter):
|
||||
scorer = Scorer()
|
||||
loss = 0
|
||||
for raw_text, sents in gold_tuples:
|
||||
if gold_preproc:
|
||||
raw_text = None
|
||||
else:
|
||||
sents = _merge_sents(sents)
|
||||
for annot_tuples, ctnt in sents:
|
||||
words = annot_tuples[1]
|
||||
gold_tags = annot_tuples[2]
|
||||
score_model(scorer, nlp, raw_text, annot_tuples)
|
||||
if raw_text is None:
|
||||
tokens = nlp.tokenizer.tokens_from_list(words)
|
||||
else:
|
||||
tokens = nlp.tokenizer(raw_text)
|
||||
loss += nlp.tagger.train(tokens, gold_tags)
|
||||
random.shuffle(gold_tuples)
|
||||
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
|
||||
scorer.tags_acc,
|
||||
scorer.token_acc))
|
||||
nlp.end_training(model_dir)
|
||||
|
||||
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
|
||||
beam_width=None):
|
||||
nlp = Language(data_dir=model_dir)
|
||||
if beam_width is not None:
|
||||
nlp.parser.cfg.beam_width = beam_width
|
||||
scorer = Scorer()
|
||||
for raw_text, sents in gold_tuples:
|
||||
if gold_preproc:
|
||||
raw_text = None
|
||||
else:
|
||||
sents = _merge_sents(sents)
|
||||
for annot_tuples, brackets in sents:
|
||||
if raw_text is None:
|
||||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
||||
nlp.tagger(tokens)
|
||||
nlp.entity(tokens)
|
||||
nlp.parser(tokens)
|
||||
else:
|
||||
tokens = nlp(raw_text, merge_mwes=False)
|
||||
gold = GoldParse(tokens, annot_tuples)
|
||||
scorer.score(tokens, gold, verbose=verbose)
|
||||
return scorer
|
||||
|
||||
|
||||
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
|
||||
nlp = Language(data_dir=model_dir)
|
||||
if beam_width is not None:
|
||||
nlp.parser.cfg.beam_width = beam_width
|
||||
gold_tuples = read_json_file(dev_loc)
|
||||
scorer = Scorer()
|
||||
out_file = codecs.open(out_loc, 'w', 'utf8')
|
||||
for raw_text, sents in gold_tuples:
|
||||
sents = _merge_sents(sents)
|
||||
for annot_tuples, brackets in sents:
|
||||
if raw_text is None:
|
||||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
||||
nlp.tagger(tokens)
|
||||
nlp.entity(tokens)
|
||||
nlp.parser(tokens)
|
||||
else:
|
||||
tokens = nlp(raw_text, merge_mwes=False)
|
||||
gold = GoldParse(tokens, annot_tuples)
|
||||
scorer.score(tokens, gold, verbose=False)
|
||||
for t in tokens:
|
||||
out_file.write(
|
||||
'%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
|
||||
)
|
||||
return scorer
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
train_loc=("Location of training file or directory"),
|
||||
dev_loc=("Location of development file or directory"),
|
||||
model_dir=("Location of output model directory",),
|
||||
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
|
||||
corruption_level=("Amount of noise to add to training data", "option", "c", float),
|
||||
gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
|
||||
out_loc=("Out location", "option", "o", str),
|
||||
n_sents=("Number of training sentences", "option", "n", int),
|
||||
n_iter=("Number of training iterations", "option", "i", int),
|
||||
verbose=("Verbose error reporting", "flag", "v", bool),
|
||||
debug=("Debug mode", "flag", "d", bool),
|
||||
)
|
||||
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
|
||||
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
|
||||
if not eval_only:
|
||||
gold_train = list(read_json_file(train_loc))
|
||||
train(English, gold_train, model_dir,
|
||||
feat_set='basic' if not debug else 'debug',
|
||||
gold_preproc=gold_preproc, n_sents=n_sents,
|
||||
corruption_level=corruption_level, n_iter=n_iter,
|
||||
verbose=verbose)
|
||||
#if out_loc:
|
||||
# write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
|
||||
scorer = evaluate(English, list(read_json_file(dev_loc)),
|
||||
model_dir, gold_preproc=gold_preproc, verbose=verbose)
|
||||
print('TOK', scorer.token_acc)
|
||||
print('POS', scorer.tags_acc)
|
||||
print('UAS', scorer.uas)
|
||||
print('LAS', scorer.las)
|
||||
|
||||
print('NER P', scorer.ents_p)
|
||||
print('NER R', scorer.ents_r)
|
||||
print('NER F', scorer.ents_f)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
|
@@ -1,160 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import division
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
from os import path
|
||||
import shutil
|
||||
import io
|
||||
import random
|
||||
import time
|
||||
import gzip
|
||||
import ujson
|
||||
|
||||
import plac
|
||||
import cProfile
|
||||
import pstats
|
||||
|
||||
import spacy.util
|
||||
from spacy.de import German
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.tagger import Tagger
|
||||
from spacy.scorer import PRFScore
|
||||
|
||||
from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags
|
||||
from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags
|
||||
from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags
|
||||
from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags
|
||||
from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS
|
||||
|
||||
|
||||
def default_templates():
|
||||
return spacy.tagger.Tagger.default_templates()
|
||||
|
||||
def default_templates_without_clusters():
|
||||
return (
|
||||
(W_orth,),
|
||||
(P1_lemma, P1_pos),
|
||||
(P2_lemma, P2_pos),
|
||||
(N1_orth,),
|
||||
(N2_orth,),
|
||||
|
||||
(W_suffix,),
|
||||
(W_prefix,),
|
||||
|
||||
(P1_pos,),
|
||||
(P2_pos,),
|
||||
(P1_pos, P2_pos),
|
||||
(P1_pos, W_orth),
|
||||
(P1_suffix,),
|
||||
(N1_suffix,),
|
||||
|
||||
(W_shape,),
|
||||
|
||||
(W_flags,),
|
||||
(N1_flags,),
|
||||
(N2_flags,),
|
||||
(P1_flags,),
|
||||
(P2_flags,),
|
||||
)
|
||||
|
||||
|
||||
def make_tagger(vocab, templates):
|
||||
model = spacy.tagger.TaggerModel(templates)
|
||||
return spacy.tagger.Tagger(vocab, model)
|
||||
|
||||
|
||||
def read_conll(file_):
|
||||
def sentences():
|
||||
words, tags = [], []
|
||||
for line in file_:
|
||||
line = line.strip()
|
||||
if line:
|
||||
word, tag = line.split('\t')[1::3][:2] # get column 1 and 4 (CoNLL09)
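# Assuming standard CoNLL-09 column order, [1::3] selects indices 1, 4, 7, 10
# (FORM, POS, PFEAT, DEPREL) and [:2] keeps just (FORM, POS).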
|
||||
words.append(word)
|
||||
tags.append(tag)
|
||||
elif words:
|
||||
yield words, tags
|
||||
words, tags = [], []
|
||||
if words:
|
||||
yield words, tags
|
||||
return list(sentences())
|
||||
|
||||
|
||||
def score_model(score, nlp, words, gold_tags):
|
||||
tokens = nlp.tokenizer.tokens_from_list(words)
|
||||
assert(len(tokens) == len(gold_tags))
|
||||
nlp.tagger(tokens)
|
||||
|
||||
for token, gold_tag in zip(tokens,gold_tags):
|
||||
score.score_set(set([token.tag_]),set([gold_tag]))
|
||||
|
||||
|
||||
def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21):
|
||||
# make shuffling deterministic
|
||||
random.seed(seed)
|
||||
|
||||
# set up directory for model
|
||||
pos_model_dir = path.join(model_dir, 'pos')
|
||||
if path.exists(pos_model_dir):
|
||||
shutil.rmtree(pos_model_dir)
|
||||
os.mkdir(pos_model_dir)
|
||||
|
||||
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
|
||||
nlp.tagger = make_tagger(nlp.vocab,default_templates())
|
||||
|
||||
print("Itn.\ttrain acc %\tdev acc %")
|
||||
for itn in range(n_iter):
|
||||
# train on train set
|
||||
#train_acc = PRFScore()
|
||||
correct, total = 0., 0.
|
||||
for words, gold_tags in train_sents:
|
||||
tokens = nlp.tokenizer.tokens_from_list(words)
|
||||
correct += nlp.tagger.train(tokens, gold_tags)
|
||||
total += len(words)
|
||||
train_acc = correct/total
|
||||
|
||||
# test on dev set
|
||||
dev_acc = PRFScore()
|
||||
for words, gold_tags in dev_sents:
|
||||
score_model(dev_acc, nlp, words, gold_tags)
|
||||
|
||||
random.shuffle(train_sents)
|
||||
print('%d:\t%6.2f\t%6.2f' % (itn, 100*train_acc, 100*dev_acc.precision))
|
||||
|
||||
|
||||
print('end training')
|
||||
nlp.end_training(model_dir)
|
||||
print('done')
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
train_loc=("Location of CoNLL 09 formatted training file"),
|
||||
dev_loc=("Location of CoNLL 09 formatted development file"),
|
||||
model_dir=("Location of output model directory"),
|
||||
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
|
||||
n_iter=("Number of training iterations", "option", "i", int),
|
||||
)
|
||||
def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15):
|
||||
# training
|
||||
if not eval_only:
|
||||
with io.open(train_loc, 'r', encoding='utf8') as trainfile_, \
|
||||
io.open(dev_loc, 'r', encoding='utf8') as devfile_:
|
||||
train_sents = read_conll(trainfile_)
|
||||
dev_sents = read_conll(devfile_)
|
||||
train(German, train_sents, dev_sents, model_dir, n_iter=n_iter)
|
||||
|
||||
# testing
|
||||
with io.open(dev_loc, 'r', encoding='utf8') as file_:
|
||||
dev_sents = read_conll(file_)
|
||||
nlp = German(data_dir=model_dir)
|
||||
|
||||
dev_acc = PRFScore()
|
||||
for words, gold_tags in dev_sents:
|
||||
score_model(dev_acc, nlp, words, gold_tags)
|
||||
|
||||
print('POS: %6.2f %%' % (100*dev_acc.precision))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
setup.py: 1 line changed
|
@@ -24,7 +24,6 @@ MOD_NAMES = [
|
|||
'spacy.vocab',
|
||||
'spacy.attrs',
|
||||
'spacy.morphology',
|
||||
'spacy.tagger',
|
||||
'spacy.pipeline',
|
||||
'spacy.syntax.stateclass',
|
||||
'spacy.syntax._state',
|
||||
|
|
|
@@ -3,8 +3,6 @@ from __future__ import unicode_literals
|
|||
|
||||
from .cli.info import info as cli_info
|
||||
from .glossary import explain
|
||||
from .deprecated import resolve_load_name
|
||||
#from .about import __version__
|
||||
from .about import __version__
|
||||
from . import util
|
||||
|
||||
|
|
|
@@ -1,7 +1,7 @@
|
|||
# coding: utf8
|
||||
from __future__ import print_function
|
||||
# NB! This breaks in plac on Python 2!!
|
||||
#from __future__ import unicode_literals
|
||||
# from __future__ import unicode_literals
|
||||
|
||||
if __name__ == '__main__':
|
||||
import plac
|
||||
|
|
spacy/_ml.py: 221 lines changed
|
@@ -1,49 +1,42 @@
|
|||
import ujson
|
||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy
|
||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
|
||||
from thinc.i2v import HashEmbed, StaticVectors
|
||||
from thinc.t2t import ExtractWindow, ParametricAttention
|
||||
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
|
||||
from thinc.t2v import Pooling, sum_pool
|
||||
from thinc.misc import Residual
|
||||
from thinc.misc import BatchNorm as BN
|
||||
from thinc.misc import LayerNorm as LN
|
||||
|
||||
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
||||
from thinc.api import FeatureExtracter, with_getitem
|
||||
from thinc.api import uniqued, wrap, flatten_add_lengths, noop
|
||||
|
||||
from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
|
||||
from thinc.api import uniqued, wrap, noop
|
||||
from thinc.linear.linear import LinearModel
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.util import get_array_module, copy_array
|
||||
from thinc.neural._lsuv import svd_orthonormal
|
||||
|
||||
import random
|
||||
import cytoolz
|
||||
|
||||
from thinc import describe
|
||||
from thinc.describe import Dimension, Synapses, Biases, Gradient
|
||||
from thinc.neural._classes.affine import _set_dimensions_if_needed
|
||||
import thinc.extra.load_nlp
|
||||
from thinc.neural._lsuv import svd_orthonormal
|
||||
|
||||
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
|
||||
from .tokens.doc import Doc
|
||||
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
|
||||
from . import util
|
||||
|
||||
import numpy
|
||||
import io
|
||||
|
||||
# TODO: Unset this once we no longer want to support previous models.
|
||||
import thinc.neural._classes.layernorm
|
||||
thinc.neural._classes.layernorm.set_compat_six_eight(False)
|
||||
|
||||
VECTORS_KEY = 'spacy_pretrained_vectors'
|
||||
|
||||
|
||||
@layerize
|
||||
def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
||||
ops = Model.ops
|
||||
lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')
|
||||
|
||||
def finish_update(d_X, sgd=None):
|
||||
return ops.unflatten(d_X, lengths, pad=pad)
|
||||
|
||||
X = ops.flatten(seqs, pad=pad)
|
||||
return (X, lengths), finish_update
|
||||
|
||||
|
@@ -57,33 +50,14 @@ def _logistic(X, drop=0.):
|
|||
X = xp.minimum(X, 10., X)
|
||||
X = xp.maximum(X, -10., X)
|
||||
Y = 1. / (1. + xp.exp(-X))
|
||||
|
||||
def logistic_bwd(dY, sgd=None):
|
||||
dX = dY * (Y * (1-Y))
|
||||
return dX
|
||||
|
||||
return Y, logistic_bwd
|
||||
|
||||
|
||||
@layerize
|
||||
def add_tuples(X, drop=0.):
|
||||
"""Give inputs of sequence pairs, where each sequence is (vals, length),
|
||||
sum the values, returning a single sequence.
|
||||
|
||||
If input is:
|
||||
((vals1, length), (vals2, length))
|
||||
Output is:
|
||||
(vals1+vals2, length)
|
||||
|
||||
vals are a single tensor for the whole batch.
|
||||
"""
|
||||
(vals1, length1), (vals2, length2) = X
|
||||
assert length1 == length2
|
||||
|
||||
def add_tuples_bwd(dY, sgd=None):
|
||||
return (dY, dY)
|
||||
|
||||
return (vals1+vals2, length), add_tuples_bwd
|
||||
|
||||
|
||||
def _zero_init(model):
|
||||
def _zero_init_impl(self, X, y):
|
||||
self.W.fill(0)
|
||||
|
@@ -111,13 +85,11 @@ def _preprocess_doc(docs, drop=0.):
|
|||
nO=Dimension("Output size"),
|
||||
nP=Dimension("Maxout pieces"),
|
||||
W=Synapses("Weights matrix",
|
||||
lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI) if obj.nP >= 2
|
||||
else (obj.nF, obj.nO, obj.nI)),
|
||||
lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
|
||||
b=Biases("Bias vector",
|
||||
lambda obj: (obj.nO, obj.nP) if obj.nP >= 2 else (obj.nO,)),
|
||||
lambda obj: (obj.nO, obj.nP)),
|
||||
d_W=Gradient("W"),
|
||||
d_b=Gradient("b")
|
||||
)
|
||||
d_b=Gradient("b"))
|
||||
class PrecomputableAffine(Model):
|
||||
def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
|
||||
Model.__init__(self, **kwargs)
|
||||
|
@@ -203,89 +175,6 @@ class PrecomputableAffine(Model):
|
|||
break
|
||||
|
||||
|
||||
# Thinc's Embed class is a bit broken atm, so drop this here.
|
||||
from thinc import describe
|
||||
from thinc.neural._classes.embed import _uniform_init
|
||||
|
||||
|
||||
@describe.attributes(
|
||||
nV=describe.Dimension("Number of vectors"),
|
||||
nO=describe.Dimension("Size of output"),
|
||||
vectors=describe.Weights("Embedding table",
|
||||
lambda obj: (obj.nV, obj.nO),
|
||||
_uniform_init(-0.1, 0.1)
|
||||
),
|
||||
d_vectors=describe.Gradient("vectors")
|
||||
)
|
||||
class Embed(Model):
|
||||
name = 'embed'
|
||||
|
||||
def __init__(self, nO, nV=None, **kwargs):
|
||||
if nV is not None:
|
||||
nV += 1
|
||||
Model.__init__(self, **kwargs)
|
||||
if 'name' in kwargs:
|
||||
self.name = kwargs['name']
|
||||
self.column = kwargs.get('column', 0)
|
||||
self.nO = nO
|
||||
self.nV = nV
|
||||
|
||||
def predict(self, ids):
|
||||
if ids.ndim == 2:
|
||||
ids = ids[:, self.column]
|
||||
return self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f')
|
||||
|
||||
def begin_update(self, ids, drop=0.):
|
||||
if ids.ndim == 2:
|
||||
ids = ids[:, self.column]
|
||||
vectors = self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f')
|
||||
def backprop_embed(d_vectors, sgd=None):
|
||||
n_vectors = d_vectors.shape[0]
|
||||
self.ops.scatter_add(self.d_vectors, ids, d_vectors)
|
||||
if sgd is not None:
|
||||
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
||||
return None
|
||||
return vectors, backprop_embed
|
||||
|
||||
|
||||
def HistoryFeatures(nr_class, hist_size=8, nr_dim=8):
|
||||
'''Wrap a model, adding features representing action history.'''
|
||||
if hist_size == 0:
|
||||
return layerize(noop())
|
||||
embed_tables = [Embed(nr_dim, nr_class, column=i, name='embed%d' % i)
|
||||
for i in range(hist_size)]
|
||||
embed = chain(concatenate(*embed_tables),
|
||||
LN(Maxout(hist_size*nr_dim, hist_size*nr_dim)))
|
||||
ops = embed.ops
|
||||
def add_history_fwd(vectors_hists, drop=0.):
|
||||
vectors, hist_ids = vectors_hists
|
||||
hist_feats, bp_hists = embed.begin_update(hist_ids, drop=drop)
|
||||
outputs = ops.xp.hstack((vectors, hist_feats))
|
||||
|
||||
def add_history_bwd(d_outputs, sgd=None):
|
||||
d_vectors = d_outputs[:, :vectors.shape[1]]
|
||||
d_hists = d_outputs[:, vectors.shape[1]:]
|
||||
bp_hists(d_hists, sgd=sgd)
|
||||
return embed.ops.xp.ascontiguousarray(d_vectors)
|
||||
return outputs, add_history_bwd
|
||||
return wrap(add_history_fwd, embed)
|
||||
|
||||
|
||||
def drop_layer(layer, factor=2.):
|
||||
def drop_layer_fwd(X, drop=0.):
|
||||
if drop <= 0.:
|
||||
return layer.begin_update(X, drop=drop)
|
||||
else:
|
||||
coinflip = layer.ops.xp.random.random()
|
||||
if (coinflip / factor) >= drop:
|
||||
return layer.begin_update(X, drop=drop)
|
||||
else:
|
||||
return X, lambda dX, sgd=None: dX
|
||||
|
||||
model = wrap(drop_layer_fwd, layer)
|
||||
model.predict = layer
|
||||
return model
|
||||
|
||||
def link_vectors_to_models(vocab):
|
||||
vectors = vocab.vectors
|
||||
ops = Model.ops
|
||||
|
@@ -299,16 +188,21 @@ def link_vectors_to_models(vocab):
|
|||
# (unideal, I know)
|
||||
thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
|
||||
|
||||
|
||||
def Tok2Vec(width, embed_size, **kwargs):
|
||||
pretrained_dims = kwargs.get('pretrained_dims', 0)
|
||||
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
|
||||
'*': reapply}):
|
||||
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
|
||||
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
|
||||
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
|
||||
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
|
||||
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone,
|
||||
'+': add, '*': reapply}):
|
||||
norm = HashEmbed(width, embed_size, column=cols.index(NORM),
|
||||
name='embed_norm')
|
||||
prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX),
|
||||
name='embed_prefix')
|
||||
suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX),
|
||||
name='embed_suffix')
|
||||
shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
|
||||
name='embed_shape')
|
||||
if pretrained_dims is not None and pretrained_dims >= 1:
|
||||
glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
|
||||
|
||||
|
@@ -320,7 +214,6 @@ def Tok2Vec(width, embed_size, **kwargs):
|
|||
(norm | prefix | suffix | shape)
|
||||
>> LN(Maxout(width, width*4, pieces=3)), column=5)
|
||||
|
||||
|
||||
convolution = Residual(
|
||||
ExtractWindow(nW=1)
|
||||
>> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
|
||||
|
@@ -344,6 +237,7 @@ def reapply(layer, n_times):
|
|||
Y, backprop = layer.begin_update(X, drop=drop)
|
||||
X = Y
|
||||
backprops.append(backprop)
|
||||
|
||||
def reapply_bwd(dY, sgd=None):
|
||||
dX = None
|
||||
for backprop in reversed(backprops):
|
||||
|
@@ -353,6 +247,7 @@ def reapply(layer, n_times):
|
|||
else:
|
||||
dX += dY
|
||||
return dX
|
||||
|
||||
return Y, reapply_bwd
|
||||
return wrap(reapply_fwd, layer)
|
||||
|
||||
|
@@ -367,13 +262,14 @@ def _divide_array(X, size):
|
|||
parts = []
|
||||
index = 0
|
||||
while index < len(X):
|
||||
parts.append(X[index : index + size])
|
||||
parts.append(X[index:index + size])
|
||||
index += size
|
||||
return parts
|
||||
|
||||
|
||||
def get_col(idx):
|
||||
assert idx >= 0, idx
|
||||
|
||||
def forward(X, drop=0.):
|
||||
assert idx >= 0, idx
|
||||
if isinstance(X, numpy.ndarray):
|
||||
|
@@ -381,30 +277,28 @@ def get_col(idx):
|
|||
else:
|
||||
ops = CupyOps()
|
||||
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
|
||||
|
||||
def backward(y, sgd=None):
|
||||
assert idx >= 0, idx
|
||||
dX = ops.allocate(X.shape)
|
||||
dX[:, idx] += y
|
||||
return dX
|
||||
|
||||
return output, backward
|
||||
|
||||
return layerize(forward)
|
||||
|
||||
|
||||
def zero_init(model):
|
||||
def _hook(self, X, y=None):
|
||||
self.W.fill(0)
|
||||
model.on_data_hooks.append(_hook)
|
||||
return model
|
||||
|
||||
|
||||
def doc2feats(cols=None):
|
||||
if cols is None:
|
||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||
|
||||
def forward(docs, drop=0.):
|
||||
feats = []
|
||||
for doc in docs:
|
||||
feats.append(doc.to_array(cols))
|
||||
return feats, None
|
||||
|
||||
model = layerize(forward)
|
||||
model.cols = cols
|
||||
return model
|
||||
|
@@ -418,28 +312,14 @@ def print_shape(prefix):
|
|||
|
||||
@layerize
|
||||
def get_token_vectors(tokens_attrs_vectors, drop=0.):
|
||||
ops = Model.ops
|
||||
tokens, attrs, vectors = tokens_attrs_vectors
|
||||
|
||||
def backward(d_output, sgd=None):
|
||||
return (tokens, d_output)
|
||||
|
||||
return vectors, backward
|
||||
|
||||
|
||||
@layerize
|
||||
def flatten(seqs, drop=0.):
|
||||
if isinstance(seqs[0], numpy.ndarray):
|
||||
ops = NumpyOps()
|
||||
elif hasattr(CupyOps.xp, 'ndarray') and isinstance(seqs[0], CupyOps.xp.ndarray):
|
||||
ops = CupyOps()
|
||||
else:
|
||||
raise ValueError("Unable to flatten sequence of type %s" % type(seqs[0]))
|
||||
lengths = [len(seq) for seq in seqs]
|
||||
def finish_update(d_X, sgd=None):
|
||||
return ops.unflatten(d_X, lengths)
|
||||
X = ops.xp.vstack(seqs)
|
||||
return X, finish_update
|
||||
|
||||
|
||||
@layerize
|
||||
def logistic(X, drop=0.):
|
||||
xp = get_array_module(X)
|
||||
|
@ -449,9 +329,11 @@ def logistic(X, drop=0.):
|
|||
X = xp.minimum(X, 10., X)
|
||||
X = xp.maximum(X, -10., X)
|
||||
Y = 1. / (1. + xp.exp(-X))
|
||||
|
||||
def logistic_bwd(dY, sgd=None):
|
||||
dX = dY * (Y * (1-Y))
|
||||
return dX
|
||||
|
||||
return Y, logistic_bwd
|
||||
|
||||
|
||||
|
@ -461,6 +343,7 @@ def zero_init(model):
|
|||
model.on_data_hooks.append(_zero_init_impl)
|
||||
return model
|
||||
|
||||
|
||||
@layerize
|
||||
def preprocess_doc(docs, drop=0.):
|
||||
keys = [doc.to_array([LOWER]) for doc in docs]
|
||||
|
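
# logistic() above clips the activations to [-10, 10] before applying the
# sigmoid, and its backward pass uses dX = dY * Y * (1 - Y). A small numpy
# check of that gradient (illustrative only):
import numpy

X = numpy.array([-50., -1., 0., 1., 50.])
Xc = numpy.clip(X, -10., 10.)
Y = 1. / (1. + numpy.exp(-Xc))
dY = numpy.ones_like(Y)
dX = dY * (Y * (1 - Y))          # sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))
assert numpy.all(dX >= 0.) and numpy.all(dX <= 0.25)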

@@ -501,8 +384,6 @@ def build_tagger_model(nr_class, **cfg):


@layerize
def SpacyVectors(docs, drop=0.):
    xp = get_array_module(docs[0].vocab.vectors.data)
    width = docs[0].vocab.vectors.data.shape[1]
    batch = []
    for doc in docs:
        indices = numpy.zeros((len(doc),), dtype='i')

@@ -525,9 +406,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
        model = (
            SpacyVectors
            >> flatten_add_lengths
            >> with_getitem(0,
                Affine(width, pretrained_dims)
            )
            >> with_getitem(0, Affine(width, pretrained_dims))
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(ReLu(width, width)) ** 2

@@ -536,7 +415,6 @@ def build_text_classifier(nr_class, width=64, **cfg):
        )
        return model

    lower = HashEmbed(width, nr_vector, column=1)
    prefix = HashEmbed(width//2, nr_vector, column=2)
    suffix = HashEmbed(width//2, nr_vector, column=3)

@@ -594,33 +472,40 @@ def build_text_classifier(nr_class, width=64, **cfg):
    model.lsuv = False
    return model


@layerize
def flatten(seqs, drop=0.):
    ops = Model.ops
    lengths = ops.asarray([len(seq) for seq in seqs], dtype='i')

    def finish_update(d_X, sgd=None):
        return ops.unflatten(d_X, lengths, pad=0)

    X = ops.flatten(seqs, pad=0)
    return X, finish_update


def concatenate_lists(*layers, **kwargs): # pragma: no cover
    '''Compose two or more models `f`, `g`, etc, such that their outputs are
def concatenate_lists(*layers, **kwargs): # pragma: no cover
    """Compose two or more models `f`, `g`, etc, such that their outputs are
    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
    '''
    """
    if not layers:
        return noop()
    drop_factor = kwargs.get('drop_factor', 1.0)
    ops = layers[0].ops
    layers = [chain(layer, flatten) for layer in layers]
    concat = concatenate(*layers)

    def concatenate_lists_fwd(Xs, drop=0.):
        drop *= drop_factor
        lengths = ops.asarray([len(X) for X in Xs], dtype='i')
        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
        ys = ops.unflatten(flat_y, lengths)

        def concatenate_lists_bwd(d_ys, sgd=None):
            return bp_flat_y(ops.flatten(d_ys), sgd=sgd)

        return ys, concatenate_lists_bwd

    model = wrap(concatenate_lists_fwd, concat)
    return model
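
# Both flatten() variants above concatenate a ragged batch into one array and
# remember the sequence lengths so the gradient can be split back up again.
# The core bookkeeping, in plain numpy (illustrative, not the thinc ops API):
import numpy

seqs = [numpy.ones((2, 4)), numpy.ones((5, 4)), numpy.ones((1, 4))]
lengths = [len(seq) for seq in seqs]
X = numpy.vstack(seqs)                       # shape (8, 4)

d_X = numpy.ones_like(X)                     # pretend gradient from above
splits = numpy.cumsum(lengths)[:-1]
d_seqs = numpy.split(d_X, splits, axis=0)    # unflatten by the saved lengths
assert [len(d) for d in d_seqs] == lengths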

@@ -101,17 +101,12 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    """
    Normalize a dictionary of attributes, converting them to ints.

    Arguments:
        stringy_attrs (dict):
            Dictionary keyed by attribute string names. Values can be ints or strings.
        strings_map (StringStore):
            Defaults to None. If provided, encodes string values into ints.
    Returns:
        inty_attrs (dict):
            Attributes dictionary with keys and optionally values converted to
            ints.
    stringy_attrs (dict): Dictionary keyed by attribute string names. Values
        can be ints or strings.
    strings_map (StringStore): Defaults to None. If provided, encodes string
        values into ints.
    RETURNS (dict): Attributes dictionary with keys and optionally values
        converted to ints.
    """
    inty_attrs = {}
    if _do_deprecated:
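
# The docstring above describes the same behaviour before and after the
# reflow: intify_attrs() maps string attribute names (and, given a
# StringStore, string values) to integer IDs. A toy version of that
# normalisation, using plain dicts instead of the real StringStore and
# made-up attribute IDs (purely illustrative):
ATTR_IDS = {'ORTH': 65, 'LEMMA': 73}          # made-up IDs for the sketch

def intify_attrs_sketch(stringy_attrs, strings_map=None):
    inty_attrs = {}
    for name, value in stringy_attrs.items():
        key = ATTR_IDS.get(name, name)
        if strings_map is not None and isinstance(value, str):
            value = strings_map.setdefault(value, len(strings_map))
        inty_attrs[key] = value
    return inty_attrs

strings = {}
assert intify_attrs_sketch({'LEMMA': 'run'}, strings) == {73: 0}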

@@ -7,10 +7,9 @@ from pathlib import Path
from .converters import conllu2json, iob2json, conll_ner2json
from ..util import prints

# Converters are matched by file extension. To add a converter, add a new entry
# to this dict with the file extension mapped to the converter function imported
# from /converters.

# Converters are matched by file extension. To add a converter, add a new
# entry to this dict with the file extension mapped to the converter function
# imported from /converters.
CONVERTERS = {
    'conllu': conllu2json,
    'conll': conllu2json,

@@ -24,8 +23,7 @@ CONVERTERS = {
    output_dir=("output directory for converted file", "positional", None, str),
    n_sents=("Number of sentences per doc", "option", "n", int),
    converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
    morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
    morphology=("Enable appending morphology to tags", "flag", "m", bool))
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
            converter='auto'):
    """

@@ -40,7 +38,7 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
        prints(output_path, title="Output directory not found", exits=1)
    if converter == 'auto':
        converter = input_path.suffix[1:]
    if not converter in CONVERTERS:
    if converter not in CONVERTERS:
        prints("Can't find converter for %s" % converter,
               title="Unknown format", exits=1)
    func = CONVERTERS[converter]


@@ -8,7 +8,8 @@ from ...gold import iob_to_biluo

def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
    """
    Convert files in the CoNLL-2003 NER format into JSON format for use with train cli.
    Convert files in the CoNLL-2003 NER format into JSON format for use with
    train cli.
    """
    docs = read_conll_ner(input_path)
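
# The CONVERTERS dict above dispatches purely on file extension: convert()
# strips the input file's suffix and looks the converter up by that key, so
# supporting another format only needs one more entry mapping an extension to
# a callable with the same signature. Sketch (my_format2json is hypothetical,
# not a real spaCy converter):
def my_format2json(input_path, output_path, n_sents=10, use_morphology=False):
    # read input_path, write spaCy's JSON training format to output_path
    pass

# e.g. CONVERTERS['myfmt'] = my_format2json, then: spacy convert file.myfmt out/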
|
|
|
@ -13,10 +13,9 @@ from .. import about
|
|||
|
||||
|
||||
@plac.annotations(
|
||||
model=("model to download (shortcut or model name)", "positional", None, str),
|
||||
model=("model to download, shortcut or name)", "positional", None, str),
|
||||
direct=("force direct download. Needs model name with version and won't "
|
||||
"perform compatibility check", "flag", "d", bool)
|
||||
)
|
||||
"perform compatibility check", "flag", "d", bool))
|
||||
def download(cmd, model, direct=False):
|
||||
"""
|
||||
Download compatible model from default download path using pip. Model
|
||||
|
@ -30,21 +29,25 @@ def download(cmd, model, direct=False):
|
|||
model_name = shortcuts.get(model, model)
|
||||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
||||
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name,
|
||||
v=version))
|
||||
if dl == 0:
|
||||
try:
|
||||
# Get package path here because link uses
|
||||
# pip.get_installed_distributions() to check if model is a package,
|
||||
# which fails if model was just installed via subprocess
|
||||
# pip.get_installed_distributions() to check if model is a
|
||||
# package, which fails if model was just installed via
|
||||
# subprocess
|
||||
package_path = get_package_path(model_name)
|
||||
link(None, model_name, model, force=True, model_path=package_path)
|
||||
link(None, model_name, model, force=True,
|
||||
model_path=package_path)
|
||||
except:
|
||||
# Dirty, but since spacy.download and the auto-linking is mostly
|
||||
# a convenience wrapper, it's best to show a success message and
|
||||
# loading instructions, even if linking fails.
|
||||
prints("Creating a shortcut link for 'en' didn't work (maybe you "
|
||||
"don't have admin permissions?), but you can still load "
|
||||
"the model via its full package name:",
|
||||
# Dirty, but since spacy.download and the auto-linking is
|
||||
# mostly a convenience wrapper, it's best to show a success
|
||||
# message and loading instructions, even if linking fails.
|
||||
prints(
|
||||
"Creating a shortcut link for 'en' didn't work (maybe "
|
||||
"you don't have admin permissions?), but you can still "
|
||||
"load the model via its full package name:",
|
||||
"nlp = spacy.load('%s')" % model_name,
|
||||
title="Download successful")
|
||||
|
||||
|
@ -52,9 +55,10 @@ def download(cmd, model, direct=False):
|
|||
def get_json(url, desc):
|
||||
r = requests.get(url)
|
||||
if r.status_code != 200:
|
||||
prints("Couldn't fetch %s. Please find a model for your spaCy installation "
|
||||
"(v%s), and download it manually." % (desc, about.__version__),
|
||||
about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1)
|
||||
msg = ("Couldn't fetch %s. Please find a model for your spaCy "
|
||||
"installation (v%s), and download it manually.")
|
||||
prints(msg % (desc, about.__version__), about.__docs_models__,
|
||||
title="Server error (%d)" % r.status_code, exits=1)
|
||||
return r.json()
|
||||
|
||||
|
||||
|
@ -71,13 +75,13 @@ def get_compatibility():
|
|||
def get_version(model, comp):
|
||||
if model not in comp:
|
||||
version = about.__version__
|
||||
prints("No compatible model found for '%s' (spaCy v%s)." % (model, version),
|
||||
title="Compatibility error", exits=1)
|
||||
msg = "No compatible model found for '%s' (spaCy v%s)."
|
||||
prints(msg % (model, version), title="Compatibility error", exits=1)
|
||||
return comp[model][0]
|
||||
|
||||
|
||||
def download_model(filename):
|
||||
download_url = about.__download_url__ + '/' + filename
|
||||
return subprocess.call([sys.executable, '-m',
|
||||
'pip', 'install', '--no-cache-dir', download_url],
|
||||
env=os.environ.copy())
|
||||
return subprocess.call(
|
||||
[sys.executable, '-m', 'pip', 'install', '--no-cache-dir',
|
||||
download_url], env=os.environ.copy())
|
||||
|
|
|
@ -2,27 +2,15 @@
|
|||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
import json
|
||||
from collections import defaultdict
|
||||
import cytoolz
|
||||
from pathlib import Path
|
||||
import dill
|
||||
import tqdm
|
||||
from thinc.neural._classes.model import Model
|
||||
from thinc.neural.optimizers import linear_decay
|
||||
from timeit import default_timer as timer
|
||||
import random
|
||||
import numpy.random
|
||||
|
||||
from ..tokens.doc import Doc
|
||||
from ..scorer import Scorer
|
||||
from ..gold import GoldParse, merge_sents
|
||||
from ..gold import GoldCorpus, minibatch
|
||||
from ..gold import GoldCorpus
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
from .. import about
|
||||
from .. import displacy
|
||||
from ..compat import json_dumps
|
||||
|
||||
|
||||
random.seed(0)
|
||||
numpy.random.seed(0)
|
||||
|
@ -30,17 +18,18 @@ numpy.random.seed(0)
|
|||
|
||||
@plac.annotations(
|
||||
model=("Model name or path", "positional", None, str),
|
||||
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
|
||||
data_path=("Location of JSON-formatted evaluation data", "positional",
|
||||
None, str),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
gpu_id=("Use GPU", "option", "g", int),
|
||||
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
|
||||
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)
|
||||
)
|
||||
displacy_path=("Directory to output rendered parses as HTML", "option",
|
||||
"dp", str),
|
||||
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int))
|
||||
def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
||||
displacy_path=None, displacy_limit=25):
|
||||
"""
|
||||
Evaluate a model. To render a sample of parses in a HTML file, set an output
|
||||
directory as the displacy_path argument.
|
||||
Evaluate a model. To render a sample of parses in a HTML file, set an
|
||||
output directory as the displacy_path argument.
|
||||
"""
|
||||
if gpu_id >= 0:
|
||||
util.use_gpu(gpu_id)
|
||||
|
@ -50,7 +39,8 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
|||
if not data_path.exists():
|
||||
prints(data_path, title="Evaluation data not found", exits=1)
|
||||
if displacy_path and not displacy_path.exists():
|
||||
prints(displacy_path, title="Visualization output directory not found", exits=1)
|
||||
prints(displacy_path, title="Visualization output directory not found",
|
||||
exits=1)
|
||||
corpus = GoldCorpus(data_path, data_path)
|
||||
nlp = util.load_model(model)
|
||||
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
||||
|
@ -64,12 +54,14 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False,
|
|||
docs, golds = zip(*dev_docs)
|
||||
render_deps = 'parser' in nlp.meta.get('pipeline', [])
|
||||
render_ents = 'ner' in nlp.meta.get('pipeline', [])
|
||||
render_parses(docs, displacy_path, model_name=model, limit=displacy_limit,
|
||||
deps=render_deps, ents=render_ents)
|
||||
prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit)
|
||||
render_parses(docs, displacy_path, model_name=model,
|
||||
limit=displacy_limit, deps=render_deps, ents=render_ents)
|
||||
msg = "Generated %s parses as HTML" % displacy_limit
|
||||
prints(displacy_path, title=msg)
|
||||
|
||||
|
||||
def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True):
|
||||
def render_parses(docs, output_path, model_name='', limit=250, deps=True,
|
||||
ents=True):
|
||||
docs[0].user_data['title'] = model_name
|
||||
if ents:
|
||||
with (output_path / 'entities.html').open('w') as file_:
|
||||
|
@ -77,7 +69,8 @@ def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=T
|
|||
file_.write(html)
|
||||
if deps:
|
||||
with (output_path / 'parses.html').open('w') as file_:
|
||||
html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True})
|
||||
html = displacy.render(docs[:limit], style='dep', page=True,
|
||||
options={'compact': True})
|
||||
file_.write(html)
|
||||
|
||||
|
||||
|
|
|
@ -12,8 +12,7 @@ from .. import util
|
|||
|
||||
@plac.annotations(
|
||||
model=("optional: shortcut link of model", "positional", None, str),
|
||||
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
|
||||
)
|
||||
markdown=("generate Markdown for GitHub issues", "flag", "md", str))
|
||||
def info(cmd, model=None, markdown=False):
|
||||
"""Print info about spaCy installation. If a model shortcut link is
|
||||
speficied as an argument, print model information. Flag --markdown
|
||||
|
|
|
@ -12,8 +12,7 @@ from .. import util
|
|||
@plac.annotations(
|
||||
origin=("package name or local path to model", "positional", None, str),
|
||||
link_name=("name of shortuct link to create", "positional", None, str),
|
||||
force=("force overwriting of existing link", "flag", "f", bool)
|
||||
)
|
||||
force=("force overwriting of existing link", "flag", "f", bool))
|
||||
def link(cmd, origin, link_name, force=False, model_path=None):
|
||||
"""
|
||||
Create a symlink for models within the spacy/data directory. Accepts
|
||||
|
@ -46,8 +45,9 @@ def link(cmd, origin, link_name, force=False, model_path=None):
|
|||
# This is quite dirty, but just making sure other errors are caught.
|
||||
prints("Creating a symlink in spacy/data failed. Make sure you have "
|
||||
"the required permissions and try re-running the command as "
|
||||
"admin, or use a virtualenv. You can still import the model as a "
|
||||
"module and call its load() method, or create the symlink manually.",
|
||||
"admin, or use a virtualenv. You can still import the model as "
|
||||
"a module and call its load() method, or create the symlink "
|
||||
"manually.",
|
||||
"%s --> %s" % (path2str(model_path), path2str(link_path)),
|
||||
title="Error: Couldn't link model to '%s'" % link_name)
|
||||
raise
|
||||
|
|
|
@ -16,10 +16,12 @@ from .. import about
|
|||
input_dir=("directory with model data", "positional", None, str),
|
||||
output_dir=("output parent directory", "positional", None, str),
|
||||
meta_path=("path to meta.json", "option", "m", str),
|
||||
create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool),
|
||||
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
|
||||
)
|
||||
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False):
|
||||
create_meta=("create meta.json, even if one exists in directory", "flag",
|
||||
"c", bool),
|
||||
force=("force overwriting of existing folder in output directory", "flag",
|
||||
"f", bool))
|
||||
def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False,
|
||||
force=False):
|
||||
"""
|
||||
Generate Python package for model data, including meta and required
|
||||
installation files. A new directory will be created in the specified
|
||||
|
@ -52,13 +54,15 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force
|
|||
package_path = main_path / model_name
|
||||
|
||||
create_dirs(package_path, force)
|
||||
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
||||
shutil.copytree(path2str(input_path),
|
||||
path2str(package_path / model_name_v))
|
||||
create_file(main_path / 'meta.json', json_dumps(meta))
|
||||
create_file(main_path / 'setup.py', template_setup)
|
||||
create_file(main_path / 'MANIFEST.in', template_manifest)
|
||||
create_file(package_path / '__init__.py', template_init)
|
||||
prints(main_path, "To build the package, run `python setup.py sdist` in this "
|
||||
"directory.", title="Successfully created package '%s'" % model_name_v)
|
||||
prints(main_path, "To build the package, run `python setup.py sdist` in "
|
||||
"this directory.",
|
||||
title="Successfully created package '%s'" % model_name_v)
|
||||
|
||||
|
||||
def create_dirs(package_path, force):
|
||||
|
@ -66,9 +70,10 @@ def create_dirs(package_path, force):
|
|||
if force:
|
||||
shutil.rmtree(path2str(package_path))
|
||||
else:
|
||||
prints(package_path, "Please delete the directory and try again, or "
|
||||
"use the --force flag to overwrite existing directories.",
|
||||
title="Package directory already exists", exits=1)
|
||||
prints(package_path, "Please delete the directory and try again, "
|
||||
"or use the --force flag to overwrite existing "
|
||||
"directories.", title="Package directory already exists",
|
||||
exits=1)
|
||||
Path.mkdir(package_path, parents=True)
|
||||
|
||||
|
||||
|
@ -82,7 +87,8 @@ def generate_meta(model_path):
|
|||
settings = [('lang', 'Model language', 'en'),
|
||||
('name', 'Model name', 'model'),
|
||||
('version', 'Model version', '0.0.0'),
|
||||
('spacy_version', 'Required spaCy version', '>=%s,<3.0.0' % about.__version__),
|
||||
('spacy_version', 'Required spaCy version',
|
||||
'>=%s,<3.0.0' % about.__version__),
|
||||
('description', 'Model description', False),
|
||||
('author', 'Author', False),
|
||||
('email', 'Author email', False),
|
||||
|
|
|
@@ -27,15 +27,15 @@ def read_inputs(loc):

@plac.annotations(
    lang=("model/language", "positional", None, str),
    inputs=("Location of input file", "positional", None, read_inputs)
)
    inputs=("Location of input file", "positional", None, read_inputs))
def profile(cmd, lang, inputs=None):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    """
    nlp = spacy.load(lang)
    nlp = spacy.load(lang)
    texts = list(cytoolz.take(10000, inputs))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
                    "Profile.prof")
    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()
|
|
@ -2,21 +2,14 @@
|
|||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
import json
|
||||
from collections import defaultdict
|
||||
import cytoolz
|
||||
from pathlib import Path
|
||||
import dill
|
||||
import tqdm
|
||||
from thinc.neural._classes.model import Model
|
||||
from thinc.neural.optimizers import linear_decay
|
||||
from timeit import default_timer as timer
|
||||
import random
|
||||
import numpy.random
|
||||
|
||||
from ..tokens.doc import Doc
|
||||
from ..scorer import Scorer
|
||||
from ..gold import GoldParse, merge_sents
|
||||
from ..gold import GoldCorpus, minibatch
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
|
@ -31,8 +24,10 @@ numpy.random.seed(0)
|
|||
@plac.annotations(
|
||||
lang=("model language", "positional", None, str),
|
||||
output_dir=("output directory to store model in", "positional", None, str),
|
||||
train_data=("location of JSON-formatted training data", "positional", None, str),
|
||||
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
|
||||
train_data=("location of JSON-formatted training data", "positional",
|
||||
None, str),
|
||||
dev_data=("location of JSON-formatted development data (optional)",
|
||||
"positional", None, str),
|
||||
n_iter=("number of iterations", "option", "n", int),
|
||||
n_sents=("number of sentences", "option", "ns", int),
|
||||
use_gpu=("Use GPU", "option", "g", int),
|
||||
|
@ -42,11 +37,12 @@ numpy.random.seed(0)
|
|||
no_entities=("Don't train NER", "flag", "N", bool),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
version=("Model version", "option", "V", str),
|
||||
meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
|
||||
)
|
||||
meta_path=("Optional path to meta.json. All relevant properties will be "
|
||||
"overwritten.", "option", "m", Path))
|
||||
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
|
||||
gold_preproc=False, version="0.0.0", meta_path=None):
|
||||
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False,
|
||||
no_entities=False, gold_preproc=False, version="0.0.0",
|
||||
meta_path=None):
|
||||
"""
|
||||
Train a model. Expects data in spaCy's JSON format.
|
||||
"""
|
||||
|
@ -72,9 +68,12 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
|||
meta.setdefault('name', 'unnamed')
|
||||
|
||||
pipeline = ['tagger', 'parser', 'ner']
|
||||
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
|
||||
if no_parser and 'parser' in pipeline: pipeline.remove('parser')
|
||||
if no_entities and 'ner' in pipeline: pipeline.remove('ner')
|
||||
if no_tagger and 'tagger' in pipeline:
|
||||
pipeline.remove('tagger')
|
||||
if no_parser and 'parser' in pipeline:
|
||||
pipeline.remove('parser')
|
||||
if no_entities and 'ner' in pipeline:
|
||||
pipeline.remove('ner')
|
||||
|
||||
# Take dropout and batch size as generators of values -- dropout
|
||||
# starts high and decays sharply, to force the optimizer to explore.
|
||||
|
@ -139,7 +138,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
|||
scorer = nlp_loaded.evaluate(dev_docs)
|
||||
end_time = timer()
|
||||
cpu_wps = nwords/(end_time-start_time)
|
||||
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
|
||||
acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
|
||||
with acc_loc.open('w') as file_:
|
||||
file_.write(json_dumps(scorer.scores))
|
||||
meta_loc = output_path / ('model%d' % i) / 'meta.json'
|
||||
|
@ -157,7 +156,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
|||
with meta_loc.open('w') as file_:
|
||||
file_.write(json_dumps(meta))
|
||||
util.set_env_log(True)
|
||||
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
|
||||
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
|
||||
gpu_wps=gpu_wps)
|
||||
finally:
|
||||
print("Saving model...")
|
||||
try:
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import requests
|
||||
import pkg_resources
|
||||
|
@ -29,8 +29,10 @@ def validate(cmd):
|
|||
model_links = get_model_links(current_compat)
|
||||
model_pkgs = get_model_pkgs(current_compat, all_models)
|
||||
incompat_links = {l for l, d in model_links.items() if not d['compat']}
|
||||
incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']}
|
||||
incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']])
|
||||
incompat_models = {d['name'] for _, d in model_pkgs.items()
|
||||
if not d['compat']}
|
||||
incompat_models.update([d['name'] for _, d in model_links.items()
|
||||
if not d['compat']])
|
||||
na_models = [m for m in incompat_models if m not in current_compat]
|
||||
update_models = [m for m in incompat_models if m in current_compat]
|
||||
|
||||
|
@ -90,7 +92,6 @@ def get_model_pkgs(compat, all_models):
|
|||
|
||||
|
||||
def get_model_row(compat, name, data, type='package'):
|
||||
tpl_row = ' {:<10}' + (' {:<20}' * 4)
|
||||
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
|
||||
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
|
||||
if data['compat']:
|
||||
|
@ -110,7 +111,8 @@ def get_row(*args):
|
|||
def is_model_path(model_path):
|
||||
exclude = ['cache', 'pycache', '__pycache__']
|
||||
name = model_path.parts[-1]
|
||||
return model_path.is_dir() and name not in exclude and not name.startswith('.')
|
||||
return (model_path.is_dir() and name not in exclude
|
||||
and not name.startswith('.'))
|
||||
|
||||
|
||||
def is_compat(compat, name, version):
|
||||
|
@ -118,6 +120,7 @@ def is_compat(compat, name, version):
|
|||
|
||||
|
||||
def reformat_version(version):
|
||||
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
|
||||
if version.endswith('-alpha'):
|
||||
return version.replace('-alpha', 'a0')
|
||||
return version.replace('-alpha', 'a')
|
||||
|
|
|
@@ -91,15 +91,15 @@ def symlink_to(orig, dest):


def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
    return ((python2 == None or python2 == is_python2) and
            (python3 == None or python3 == is_python3) and
            (windows == None or windows == is_windows) and
            (linux == None or linux == is_linux) and
            (osx == None or osx == is_osx))
    return ((python2 is None or python2 == is_python2) and
            (python3 is None or python3 == is_python3) and
            (windows is None or windows == is_windows) and
            (linux is None or linux == is_linux) and
            (osx is None or osx == is_osx))


def normalize_string_keys(old):
    '''Given a dictionary, make sure keys are unicode strings, not bytes.'''
    """Given a dictionary, make sure keys are unicode strings, not bytes."""
    new = {}
    for key, value in old.items():
        if isinstance(key, bytes_):
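
# is_config() above answers "does the current runtime match these constraints?",
# with None meaning "don't care". A usage sketch, assuming the helper is
# importable from spacy.compat as in this tree:
from spacy.compat import is_config

if is_config(python2=True, windows=True):
    pass  # only true on Python 2 under Windows
if is_config(python3=True):
    pass  # true on any Python 3 platform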
@ -24,7 +24,7 @@ def depr_model_download(lang):
|
|||
|
||||
|
||||
def resolve_load_name(name, **overrides):
|
||||
"""Resolve model loading if deprecated path kwarg is specified in overrides.
|
||||
"""Resolve model loading if deprecated path kwarg in overrides.
|
||||
|
||||
name (unicode): Name of model to load.
|
||||
**overrides: Overrides specified in spacy.load().
|
||||
|
@ -32,8 +32,9 @@ def resolve_load_name(name, **overrides):
|
|||
"""
|
||||
if overrides.get('path') not in (None, False, True):
|
||||
name = overrides.get('path')
|
||||
prints("To load a model from a path, you can now use the first argument. "
|
||||
"The model meta is used to load the required Language class.",
|
||||
"OLD: spacy.load('en', path='/some/path')", "NEW: spacy.load('/some/path')",
|
||||
prints("To load a model from a path, you can now use the first "
|
||||
"argument. The model meta is used to load the Language class.",
|
||||
"OLD: spacy.load('en', path='/some/path')",
|
||||
"NEW: spacy.load('/some/path')",
|
||||
title="Warning: deprecated argument 'path'")
|
||||
return name
|
||||
|
|
|
@ -12,7 +12,7 @@ IS_JUPYTER = is_in_jupyter()
|
|||
|
||||
|
||||
def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
||||
options={}, manual=False):
|
||||
options={}, manual=False):
|
||||
"""Render displaCy visualisation.
|
||||
|
||||
docs (list or Doc): Document(s) to visualise.
|
||||
|
@ -21,7 +21,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
|||
minify (bool): Minify HTML markup.
|
||||
jupyter (bool): Experimental, use Jupyter's `display()` to output markup.
|
||||
options (dict): Visualiser-specific options, e.g. colors.
|
||||
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
|
||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
RETURNS (unicode): Rendered HTML markup.
|
||||
"""
|
||||
factories = {'dep': (DependencyRenderer, parse_deps),
|
||||
|
@ -35,7 +35,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
|||
parsed = [converter(doc, options) for doc in docs] if not manual else docs
|
||||
_html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip()
|
||||
html = _html['parsed']
|
||||
if jupyter: # return HTML rendered by IPython display()
|
||||
if jupyter: # return HTML rendered by IPython display()
|
||||
from IPython.core.display import display, HTML
|
||||
return display(HTML(html))
|
||||
return html
|
||||
|
@ -50,13 +50,15 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
|||
page (bool): Render markup as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
options (dict): Visualiser-specific options, e.g. colors.
|
||||
manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts.
|
||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
port (int): Port to serve visualisation.
|
||||
"""
|
||||
from wsgiref import simple_server
|
||||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||
render(docs, style=style, page=page, minify=minify, options=options,
|
||||
manual=manual)
|
||||
httpd = simple_server.make_server('0.0.0.0', port, app)
|
||||
prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port)
|
||||
prints("Using the '%s' visualizer" % style,
|
||||
title="Serving on port %d..." % port)
|
||||
try:
|
||||
httpd.serve_forever()
|
||||
except KeyboardInterrupt:
|
||||
|
@ -67,7 +69,8 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
|||
|
||||
def app(environ, start_response):
|
||||
# headers and status need to be bytes in Python 2, see #1227
|
||||
headers = [(b_to_str(b'Content-type'), b_to_str(b'text/html; charset=utf-8'))]
|
||||
headers = [(b_to_str(b'Content-type'),
|
||||
b_to_str(b'text/html; charset=utf-8'))]
|
||||
start_response(b_to_str(b'200 OK'), headers)
|
||||
res = _html['parsed'].encode(encoding='utf-8')
|
||||
return [res]
|
||||
|
@ -89,9 +92,9 @@ def parse_deps(orig_doc, options={}):
|
|||
end = word.i + 1
|
||||
while end < len(doc) and doc[end].is_punct:
|
||||
end += 1
|
||||
span = doc[start : end]
|
||||
span = doc[start:end]
|
||||
spans.append((span.start_char, span.end_char, word.tag_,
|
||||
word.lemma_, word.ent_type_))
|
||||
word.lemma_, word.ent_type_))
|
||||
for span_props in spans:
|
||||
doc.merge(*span_props)
|
||||
words = [{'text': w.text, 'tag': w.tag_} for w in doc]
|
||||
|
@ -113,6 +116,7 @@ def parse_ents(doc, options={}):
|
|||
RETURNS (dict): Generated entities keyed by text (original text) and ents.
|
||||
"""
|
||||
ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
|
||||
for ent in doc.ents]
|
||||
title = doc.user_data.get('title', None) if hasattr(doc, 'user_data') else None
|
||||
for ent in doc.ents]
|
||||
title = (doc.user_data.get('title', None)
|
||||
if hasattr(doc, 'user_data') else None)
|
||||
return {'text': doc.text, 'ents': ents, 'title': title}
|
||||
|
|
|
@ -14,13 +14,15 @@ class DependencyRenderer(object):
|
|||
"""Initialise dependency renderer.
|
||||
|
||||
options (dict): Visualiser-specific options (compact, word_spacing,
|
||||
arrow_spacing, arrow_width, arrow_stroke, distance,
|
||||
offset_x, color, bg, font)
|
||||
arrow_spacing, arrow_width, arrow_stroke, distance, offset_x,
|
||||
color, bg, font)
|
||||
"""
|
||||
self.compact = options.get('compact', False)
|
||||
self.word_spacing = options.get('word_spacing', 45)
|
||||
self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20)
|
||||
self.arrow_width = options.get('arrow_width', 6 if self.compact else 10)
|
||||
self.arrow_spacing = options.get('arrow_spacing',
|
||||
12 if self.compact else 20)
|
||||
self.arrow_width = options.get('arrow_width',
|
||||
6 if self.compact else 10)
|
||||
self.arrow_stroke = options.get('arrow_stroke', 2)
|
||||
self.distance = options.get('distance', 150 if self.compact else 175)
|
||||
self.offset_x = options.get('offset_x', 50)
|
||||
|
@ -39,7 +41,8 @@ class DependencyRenderer(object):
|
|||
rendered = [self.render_svg(i, p['words'], p['arcs'])
|
||||
for i, p in enumerate(parsed)]
|
||||
if page:
|
||||
content = ''.join([TPL_FIGURE.format(content=svg) for svg in rendered])
|
||||
content = ''.join([TPL_FIGURE.format(content=svg)
|
||||
for svg in rendered])
|
||||
markup = TPL_PAGE.format(content=content)
|
||||
else:
|
||||
markup = ''.join(rendered)
|
||||
|
@ -63,12 +66,13 @@ class DependencyRenderer(object):
|
|||
self.id = render_id
|
||||
words = [self.render_word(w['text'], w['tag'], i)
|
||||
for i, w in enumerate(words)]
|
||||
arcs = [self.render_arrow(a['label'], a['start'], a['end'], a['dir'], i)
|
||||
arcs = [self.render_arrow(a['label'], a['start'],
|
||||
a['end'], a['dir'], i)
|
||||
for i, a in enumerate(arcs)]
|
||||
content = ''.join(words) + ''.join(arcs)
|
||||
return TPL_DEP_SVG.format(id=self.id, width=self.width, height=self.height,
|
||||
color=self.color, bg=self.bg, font=self.font,
|
||||
content=content)
|
||||
return TPL_DEP_SVG.format(id=self.id, width=self.width,
|
||||
height=self.height, color=self.color,
|
||||
bg=self.bg, font=self.font, content=content)
|
||||
|
||||
def render_word(self, text, tag, i):
|
||||
"""Render individual word.
|
||||
|
@ -96,7 +100,7 @@ class DependencyRenderer(object):
|
|||
x_start = self.offset_x+start*self.distance+self.arrow_spacing
|
||||
y = self.offset_y
|
||||
x_end = (self.offset_x+(end-start)*self.distance+start*self.distance
|
||||
-self.arrow_spacing*(self.highest_level-level)/4)
|
||||
- self.arrow_spacing*(self.highest_level-level)/4)
|
||||
y_curve = self.offset_y-level*self.distance/2
|
||||
if self.compact:
|
||||
y_curve = self.offset_y-level*self.distance/6
|
||||
|
@ -133,8 +137,10 @@ class DependencyRenderer(object):
|
|||
if direction is 'left':
|
||||
pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2)
|
||||
else:
|
||||
pos1, pos2, pos3 = (end, end+self.arrow_width-2, end-self.arrow_width+2)
|
||||
arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3, y-self.arrow_width)
|
||||
pos1, pos2, pos3 = (end, end+self.arrow_width-2,
|
||||
end-self.arrow_width+2)
|
||||
arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3,
|
||||
y-self.arrow_width)
|
||||
return "M{},{} L{},{} {},{}".format(*arrowhead)
|
||||
|
||||
def get_levels(self, arcs):
|
||||
|
@ -159,9 +165,10 @@ class EntityRenderer(object):
|
|||
"""
|
||||
colors = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74',
|
||||
'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb',
|
||||
'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LANGUAGE': '#ff8197',
|
||||
'WORK_OF_ART': '#f0d0ff', 'DATE': '#bfe1d9', 'TIME': '#bfe1d9',
|
||||
'MONEY': '#e4e7d2', 'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2',
|
||||
'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LAW': '#ff8197',
|
||||
'LANGUAGE': '#ff8197', 'WORK_OF_ART': '#f0d0ff',
|
||||
'DATE': '#bfe1d9', 'TIME': '#bfe1d9', 'MONEY': '#e4e7d2',
|
||||
'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2',
|
||||
'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'}
|
||||
colors.update(options.get('colors', {}))
|
||||
self.default_color = '#ddd'
|
||||
|
@ -176,9 +183,11 @@ class EntityRenderer(object):
|
|||
minify (bool): Minify HTML markup.
|
||||
RETURNS (unicode): Rendered HTML markup.
|
||||
"""
|
||||
rendered = [self.render_ents(p['text'], p['ents'], p.get('title', None)) for p in parsed]
|
||||
rendered = [self.render_ents(p['text'], p['ents'],
|
||||
p.get('title', None)) for p in parsed]
|
||||
if page:
|
||||
docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered])
|
||||
docs = ''.join([TPL_FIGURE.format(content=doc)
|
||||
for doc in rendered])
|
||||
markup = TPL_PAGE.format(content=docs)
|
||||
else:
|
||||
markup = ''.join(rendered)
|
||||
|
|
|
@@ -264,7 +264,6 @@ GLOSSARY = {
    'nk': 'noun kernel element',
    'nmc': 'numerical component',
    'oa': 'accusative object',
    'oa': 'second accusative object',
    'oc': 'clausal object',
    'og': 'genitive object',
    'op': 'prepositional object',
@ -2,7 +2,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import io
|
||||
import re
|
||||
import ujson
|
||||
import random
|
||||
|
@ -10,9 +9,8 @@ import cytoolz
|
|||
import itertools
|
||||
|
||||
from .syntax import nonproj
|
||||
from .util import ensure_path
|
||||
from . import util
|
||||
from .tokens import Doc
|
||||
from . import util
|
||||
|
||||
|
||||
def tags_to_entities(tags):
|
||||
|
@ -54,7 +52,8 @@ def merge_sents(sents):
|
|||
m_deps[3].extend(head + i for head in heads)
|
||||
m_deps[4].extend(labels)
|
||||
m_deps[5].extend(ner)
|
||||
m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
|
||||
m_brackets.extend((b['first'] + i, b['last'] + i, b['label'])
|
||||
for b in brackets)
|
||||
i += len(ids)
|
||||
return [(m_deps, m_brackets)]
|
||||
|
||||
|
@ -80,6 +79,8 @@ def align(cand_words, gold_words):
|
|||
|
||||
|
||||
punct_re = re.compile(r'\W')
|
||||
|
||||
|
||||
def _min_edit_path(cand_words, gold_words):
|
||||
cdef:
|
||||
Pool mem
|
||||
|
@ -98,9 +99,9 @@ def _min_edit_path(cand_words, gold_words):
|
|||
mem = Pool()
|
||||
n_cand = len(cand_words)
|
||||
n_gold = len(gold_words)
|
||||
# Levenshtein distance, except we need the history, and we may want different
|
||||
# costs.
|
||||
# Mark operations with a string, and score the history using _edit_cost.
|
||||
# Levenshtein distance, except we need the history, and we may want
|
||||
# different costs. Mark operations with a string, and score the history
|
||||
# using _edit_cost.
|
||||
previous_row = []
|
||||
prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
||||
curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
||||
|
@ -144,9 +145,9 @@ def _min_edit_path(cand_words, gold_words):
|
|||
|
||||
|
||||
def minibatch(items, size=8):
|
||||
'''Iterate over batches of items. `size` may be an iterator,
|
||||
"""Iterate over batches of items. `size` may be an iterator,
|
||||
so that batch-size can vary on each step.
|
||||
'''
|
||||
"""
|
||||
if isinstance(size, int):
|
||||
size_ = itertools.repeat(8)
|
||||
else:
|
||||
|
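
# minibatch() above accepts either a fixed int or an iterator of sizes, so the
# batch size can grow as training progresses. A sketch of driving it with a
# simple doubling schedule (the generator here is illustrative):
def doubling_sizes(start=1, limit=32):
    size = start
    while True:
        yield size
        size = min(size * 2, limit)

# for batch in minibatch(train_docs, size=doubling_sizes()):
#     ...  # first batch has 1 item, then 2, 4, ... up to 32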
@ -168,6 +169,7 @@ class GoldCorpus(object):
|
|||
|
||||
train_path (unicode or Path): File or directory of training data.
|
||||
dev_path (unicode or Path): File or directory of development data.
|
||||
RETURNS (GoldCorpus): The newly created object.
|
||||
"""
|
||||
self.train_path = util.ensure_path(train_path)
|
||||
self.dev_path = util.ensure_path(dev_path)
|
||||
|
@ -213,7 +215,7 @@ class GoldCorpus(object):
|
|||
train_tuples = self.train_tuples
|
||||
if projectivize:
|
||||
train_tuples = nonproj.preprocess_training_data(
|
||||
self.train_tuples, label_freq_cutoff=100)
|
||||
self.train_tuples, label_freq_cutoff=100)
|
||||
random.shuffle(train_tuples)
|
||||
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
||||
max_length=max_length,
|
||||
|
@ -222,7 +224,6 @@ class GoldCorpus(object):
|
|||
|
||||
def dev_docs(self, nlp, gold_preproc=False):
|
||||
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
|
||||
#gold_docs = nlp.preprocess_gold(gold_docs)
|
||||
yield from gold_docs
|
||||
|
||||
@classmethod
|
||||
|
@ -233,7 +234,6 @@ class GoldCorpus(object):
|
|||
raw_text = None
|
||||
else:
|
||||
paragraph_tuples = merge_sents(paragraph_tuples)
|
||||
|
||||
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
||||
gold_preproc, noise_level=noise_level)
|
||||
golds = cls._make_golds(docs, paragraph_tuples)
|
||||
|
@ -248,17 +248,20 @@ class GoldCorpus(object):
|
|||
raw_text = add_noise(raw_text, noise_level)
|
||||
return [nlp.make_doc(raw_text)]
|
||||
else:
|
||||
return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
|
||||
for (sent_tuples, brackets) in paragraph_tuples]
|
||||
return [Doc(nlp.vocab,
|
||||
words=add_noise(sent_tuples[1], noise_level))
|
||||
for (sent_tuples, brackets) in paragraph_tuples]
|
||||
|
||||
@classmethod
|
||||
def _make_golds(cls, docs, paragraph_tuples):
|
||||
assert len(docs) == len(paragraph_tuples)
|
||||
if len(docs) == 1:
|
||||
return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
|
||||
return [GoldParse.from_annot_tuples(docs[0],
|
||||
paragraph_tuples[0][0])]
|
||||
else:
|
||||
return [GoldParse.from_annot_tuples(doc, sent_tuples)
|
||||
for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]
|
||||
for doc, (sent_tuples, brackets)
|
||||
in zip(docs, paragraph_tuples)]
|
||||
|
||||
@staticmethod
|
||||
def walk_corpus(path):
|
||||
|
@ -305,7 +308,7 @@ def _corrupt(c, noise_level):
|
|||
|
||||
|
||||
def read_json_file(loc, docs_filter=None, limit=None):
|
||||
loc = ensure_path(loc)
|
||||
loc = util.ensure_path(loc)
|
||||
if loc.is_dir():
|
||||
for filename in loc.iterdir():
|
||||
yield from read_json_file(loc / filename, limit=limit)
|
||||
|
@ -330,16 +333,16 @@ def read_json_file(loc, docs_filter=None, limit=None):
|
|||
for i, token in enumerate(sent['tokens']):
|
||||
words.append(token['orth'])
|
||||
ids.append(i)
|
||||
tags.append(token.get('tag','-'))
|
||||
heads.append(token.get('head',0) + i)
|
||||
labels.append(token.get('dep',''))
|
||||
tags.append(token.get('tag', '-'))
|
||||
heads.append(token.get('head', 0) + i)
|
||||
labels.append(token.get('dep', ''))
|
||||
# Ensure ROOT label is case-insensitive
|
||||
if labels[-1].lower() == 'root':
|
||||
labels[-1] = 'ROOT'
|
||||
ner.append(token.get('ner', '-'))
|
||||
sents.append([
|
||||
[ids, words, tags, heads, labels, ner],
|
||||
sent.get('brackets', [])])
|
||||
sent.get('brackets', [])])
|
||||
if sents:
|
||||
yield [paragraph.get('raw', None), sents]
|
||||
|
||||
|
@ -382,19 +385,21 @@ cdef class GoldParse:
|
|||
@classmethod
|
||||
def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
|
||||
_, words, tags, heads, deps, entities = annot_tuples
|
||||
return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities,
|
||||
make_projective=make_projective)
|
||||
return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
|
||||
entities=entities, make_projective=make_projective)
|
||||
|
||||
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
|
||||
deps=None, entities=None, make_projective=False,
|
||||
def __init__(self, doc, annot_tuples=None, words=None, tags=None,
|
||||
heads=None, deps=None, entities=None, make_projective=False,
|
||||
cats=None):
|
||||
"""Create a GoldParse.
|
||||
|
||||
doc (Doc): The document the annotations refer to.
|
||||
words (iterable): A sequence of unicode word strings.
|
||||
tags (iterable): A sequence of strings, representing tag annotations.
|
||||
heads (iterable): A sequence of integers, representing syntactic head offsets.
|
||||
deps (iterable): A sequence of strings, representing the syntactic relation types.
|
||||
heads (iterable): A sequence of integers, representing syntactic
|
||||
head offsets.
|
||||
deps (iterable): A sequence of strings, representing the syntactic
|
||||
relation types.
|
||||
entities (iterable): A sequence of named entity annotations, either as
|
||||
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
|
||||
representing the entity positions.
|
||||
|
@ -404,9 +409,10 @@ cdef class GoldParse:
|
|||
document (usually a sentence). Unlike entity annotations, label
|
||||
annotations can overlap, i.e. a single word can be covered by
|
||||
multiple labelled spans. The TextCategorizer component expects
|
||||
true examples of a label to have the value 1.0, and negative examples
|
||||
of a label to have the value 0.0. Labels not in the dictionary are
|
||||
treated as missing -- the gradient for those labels will be zero.
|
||||
true examples of a label to have the value 1.0, and negative
|
||||
examples of a label to have the value 0.0. Labels not in the
|
||||
dictionary are treated as missing - the gradient for those labels
|
||||
will be zero.
|
||||
RETURNS (GoldParse): The newly constructed object.
|
||||
"""
|
||||
if words is None:
|
||||
|
@ -470,11 +476,11 @@ cdef class GoldParse:
|
|||
self.ner[i] = entities[gold_i]
|
||||
|
||||
cycle = nonproj.contains_cycle(self.heads)
|
||||
if cycle != None:
|
||||
if cycle is not None:
|
||||
raise Exception("Cycle found: %s" % cycle)
|
||||
|
||||
if make_projective:
|
||||
proj_heads,_ = nonproj.projectivize(self.heads, self.labels)
|
||||
proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
|
||||
self.heads = proj_heads
|
||||
|
||||
def __len__(self):
|
||||
|
@ -497,20 +503,19 @@ cdef class GoldParse:
|
|||
|
||||
|
||||
def biluo_tags_from_offsets(doc, entities, missing='O'):
|
||||
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
||||
scheme (BILUO).
|
||||
"""Encode labelled spans into per-token tags, using the
|
||||
Begin/In/Last/Unit/Out scheme (BILUO).
|
||||
|
||||
doc (Doc): The document that the entity offsets refer to. The output tags
|
||||
will refer to the token boundaries within the document.
|
||||
entities (iterable): A sequence of `(start, end, label)` triples. `start` and
|
||||
`end` should be character-offset integers denoting the slice into the
|
||||
original string.
|
||||
|
||||
entities (iterable): A sequence of `(start, end, label)` triples. `start`
|
||||
and `end` should be character-offset integers denoting the slice into
|
||||
the original string.
|
||||
RETURNS (list): A list of unicode strings, describing the tags. Each tag
|
||||
string will be of the form either "", "O" or "{action}-{label}", where
|
||||
action is one of "B", "I", "L", "U". The string "-" is used where the
|
||||
entity offsets don't align with the tokenization in the `Doc` object. The
|
||||
training algorithm will view these as missing values. "O" denotes a
|
||||
entity offsets don't align with the tokenization in the `Doc` object.
|
||||
The training algorithm will view these as missing values. "O" denotes a
|
||||
non-entity token. "B" denotes the beginning of a multi-token entity,
|
||||
"I" the inside of an entity of three or more tokens, and "L" the end
|
||||
of an entity of two or more tokens. "U" denotes a single-token entity.
|
||||
|
|
|
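
# Roughly, the docstring above documents a usage like the following; the
# import paths and the exact tags shown are assumptions based on this tree,
# given for illustration only:
from spacy.lang.en import English
from spacy.gold import biluo_tags_from_offsets

nlp = English()                              # blank pipeline, tokenizer only
doc = nlp(u'I flew to Berlin')
tags = biluo_tags_from_offsets(doc, [(10, 16, 'GPE')])
# expected: ['O', 'O', 'O', 'U-GPE'], since 'Berlin' is a single-token entity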
@ -1,31 +1,28 @@
|
|||
# coding: utf8
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
from contextlib import contextmanager
|
||||
import copy
|
||||
|
||||
from thinc.neural import Model
|
||||
import random
|
||||
import ujson
|
||||
from collections import OrderedDict
|
||||
import itertools
|
||||
import weakref
|
||||
import functools
|
||||
import tqdm
|
||||
from collections import OrderedDict
|
||||
from contextlib import contextmanager
|
||||
from copy import copy
|
||||
from thinc.neural import Model
|
||||
from thinc.neural.optimizers import Adam
|
||||
|
||||
from .tokenizer import Tokenizer
|
||||
from .vocab import Vocab
|
||||
from .tagger import Tagger
|
||||
from .lemmatizer import Lemmatizer
|
||||
|
||||
from .pipeline import DependencyParser, Tensorizer, Tagger
|
||||
from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
|
||||
|
||||
from .compat import Optimizer
|
||||
from .compat import json_dumps, izip, copy_reg
|
||||
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
|
||||
from .pipeline import SimilarityHook, TextCategorizer
|
||||
from .compat import json_dumps, izip
|
||||
from .scorer import Scorer
|
||||
from ._ml import link_vectors_to_models
|
||||
from .attrs import IS_STOP
|
||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
from .lang.punctuation import TOKENIZER_INFIXES
|
||||
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
||||
from .lang.tag_map import TAG_MAP
|
||||
from .lang.lex_attrs import LEX_ATTRS, is_stop
|
||||
|
@ -57,16 +54,18 @@ class BaseDefaults(object):
|
|||
def create_tokenizer(cls, nlp=None):
|
||||
rules = cls.tokenizer_exceptions
|
||||
token_match = cls.token_match
|
||||
prefix_search = util.compile_prefix_regex(cls.prefixes).search \
|
||||
if cls.prefixes else None
|
||||
suffix_search = util.compile_suffix_regex(cls.suffixes).search \
|
||||
if cls.suffixes else None
|
||||
infix_finditer = util.compile_infix_regex(cls.infixes).finditer \
|
||||
if cls.infixes else None
|
||||
prefix_search = (util.compile_prefix_regex(cls.prefixes).search
|
||||
if cls.prefixes else None)
|
||||
suffix_search = (util.compile_suffix_regex(cls.suffixes).search
|
||||
if cls.suffixes else None)
|
||||
infix_finditer = (util.compile_infix_regex(cls.infixes).finditer
|
||||
if cls.infixes else None)
|
||||
vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
||||
return Tokenizer(vocab, rules=rules,
|
||||
prefix_search=prefix_search, suffix_search=suffix_search,
|
||||
infix_finditer=infix_finditer, token_match=token_match)
|
||||
prefix_search=prefix_search,
|
||||
suffix_search=suffix_search,
|
||||
infix_finditer=infix_finditer,
|
||||
token_match=token_match)
|
||||
|
||||
pipe_names = ['tensorizer', 'tagger', 'parser', 'ner']
|
||||
token_match = TOKEN_MATCH
|
||||
|
@ -98,7 +97,7 @@ class Language(object):
|
|||
|
||||
factories = {
|
||||
'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
|
||||
'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
|
||||
'tensorizer': lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
|
||||
'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
|
||||
'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
|
||||
'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
|
||||
|
@ -218,14 +217,14 @@ class Language(object):
|
|||
def add_pipe(self, component, name=None, before=None, after=None,
|
||||
first=None, last=None):
|
||||
"""Add a component to the processing pipeline. Valid components are
|
||||
callables that take a `Doc` object, modify it and return it. Only one of
|
||||
before, after, first or last can be set. Default behaviour is "last".
|
||||
callables that take a `Doc` object, modify it and return it. Only one
|
||||
of before/after/first/last can be set. Default behaviour is "last".
|
||||
|
||||
component (callable): The pipeline component.
|
||||
name (unicode): Name of pipeline component. Overwrites existing
|
||||
component.name attribute if available. If no name is set and
|
||||
the component exposes no name attribute, component.__name__ is
|
||||
used. An error is raised if the name already exists in the pipeline.
|
||||
used. An error is raised if a name already exists in the pipeline.
|
||||
before (unicode): Component name to insert component directly before.
|
||||
after (unicode): Component name to insert component directly after.
|
||||
first (bool): Insert component first / not first in the pipeline.
|
||||
|
@ -240,7 +239,8 @@ class Language(object):
|
|||
name = component.name
|
||||
elif hasattr(component, '__name__'):
|
||||
name = component.__name__
|
||||
elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'):
|
||||
elif (hasattr(component, '__class__') and
|
||||
hasattr(component.__class__, '__name__')):
|
||||
name = component.__class__.__name__
|
||||
else:
|
||||
name = repr(component)
|
||||
|
@ -269,7 +269,7 @@ class Language(object):
|
|||
`name in nlp.pipe_names`.
|
||||
|
||||
name (unicode): Name of the component.
|
||||
RETURNS (bool): Whether a component of that name exists in the pipeline.
|
||||
RETURNS (bool): Whether a component of the name exists in the pipeline.
|
||||
"""
|
||||
return name in self.pipe_names
|
||||
|
||||
|
@ -332,15 +332,12 @@ class Language(object):
|
|||
return doc
|
||||
|
||||
def disable_pipes(self, *names):
|
||||
'''Disable one or more pipeline components.
|
||||
|
||||
If used as a context manager, the pipeline will be restored to the initial
|
||||
state at the end of the block. Otherwise, a DisabledPipes object is
|
||||
returned, that has a `.restore()` method you can use to undo your
|
||||
changes.
|
||||
"""Disable one or more pipeline components. If used as a context
|
||||
manager, the pipeline will be restored to the initial state at the end
|
||||
of the block. Otherwise, a DisabledPipes object is returned, that has
|
||||
a `.restore()` method you can use to undo your changes.
|
||||
|
||||
EXAMPLE:
|
||||
|
||||
>>> nlp.add_pipe('parser')
|
||||
>>> nlp.add_pipe('tagger')
|
||||
>>> with nlp.disable_pipes('parser', 'tagger'):
|
||||
|
@ -351,7 +348,7 @@ class Language(object):
|
|||
>>> assert not nlp.has_pipe('parser')
|
||||
>>> disabled.restore()
|
||||
>>> assert nlp.has_pipe('parser')
|
||||
'''
|
||||
"""
|
||||
return DisabledPipes(self, *names)
|
||||
|
||||
def make_doc(self, text):
|
||||
|
@ -367,14 +364,14 @@ class Language(object):
|
|||
RETURNS (dict): Results from the update.
|
||||
|
||||
EXAMPLE:
|
||||
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||
>>> with nlp.begin_training(gold) as (trainer, optimizer):
|
||||
>>> for epoch in trainer.epochs(gold):
|
||||
>>> for docs, golds in epoch:
|
||||
>>> state = nlp.update(docs, golds, sgd=optimizer)
|
||||
"""
|
||||
if len(docs) != len(golds):
|
||||
raise IndexError("Update expects same number of docs and golds "
|
||||
"Got: %d, %d" % (len(docs), len(golds)))
|
||||
"Got: %d, %d" % (len(docs), len(golds)))
|
||||
if len(docs) == 0:
|
||||
return
|
||||
if sgd is None:
|
||||
|
@ -382,8 +379,10 @@ class Language(object):
|
|||
self._optimizer = Adam(Model.ops, 0.001)
|
||||
sgd = self._optimizer
|
||||
grads = {}
|
||||
|
||||
def get_grads(W, dW, key=None):
|
||||
grads[key] = (W, dW)
|
||||
|
||||
pipes = list(self.pipeline)
|
||||
random.shuffle(pipes)
|
||||
for name, proc in pipes:
|
||||
|
@ -420,8 +419,8 @@ class Language(object):
|
|||
eps = util.env_opt('optimizer_eps', 1e-08)
|
||||
L2 = util.env_opt('L2_penalty', 1e-6)
|
||||
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
||||
self._optimizer = Optimizer(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
||||
beta2=beta2, eps=eps)
|
||||
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
||||
beta2=beta2, eps=eps)
|
||||
self._optimizer.max_grad_norm = max_grad_norm
|
||||
self._optimizer.device = device
|
||||
return self._optimizer
|
||||
|
@@ -460,8 +459,8 @@ class Language(object):
         eps = util.env_opt('optimizer_eps', 1e-08)
         L2 = util.env_opt('L2_penalty', 1e-6)
         max_grad_norm = util.env_opt('grad_norm_clip', 1.)
-        self._optimizer = Optimizer(Model.ops, learn_rate, L2=L2, beta1=beta1,
-                                    beta2=beta2, eps=eps)
+        self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
+                               beta2=beta2, eps=eps)
         self._optimizer.max_grad_norm = max_grad_norm
         self._optimizer.device = device
         return self._optimizer
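
The two hunks above read the optimizer hyperparameters through `util.env_opt`, which falls back to a default when no environment override is present. A rough sketch of that pattern follows; the helper below is an assumption for illustration and is not spaCy's actual `env_opt` implementation:

# Hedged sketch of an env_opt-style helper (not spaCy's real one).
import os


def env_opt(name, default=None):
    """Read a hyperparameter from the environment, else use the default.

    The value is coerced to the type of the default, so numeric defaults
    stay numeric.
    """
    raw = os.environ.get(name.upper())
    if raw is None:
        return default
    if isinstance(default, float):
        return float(raw)
    if isinstance(default, int):
        return int(raw)
    return raw


learn_rate = env_opt('learn_rate', 0.001)
L2 = env_opt('L2_penalty', 1e-6)
max_grad_norm = env_opt('grad_norm_clip', 1.)
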
@@ -512,17 +511,17 @@ class Language(object):
             pass

     def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
-            disable=[]):
-        """Process texts as a stream, and yield `Doc` objects in order. Supports
-        GIL-free multi-threading.
+             disable=[]):
+        """Process texts as a stream, and yield `Doc` objects in order.
+        Supports GIL-free multi-threading.

         texts (iterator): A sequence of texts to process.
         as_tuples (bool):
             If set to True, inputs should be a sequence of
             (text, context) tuples. Output will then be a sequence of
             (doc, context) tuples. Defaults to False.
-        n_threads (int): The number of worker threads to use. If -1, OpenMP will
-            decide how many to use at run time. Default is 2.
+        n_threads (int): The number of worker threads to use. If -1, OpenMP
+            will decide how many to use at run time. Default is 2.
         batch_size (int): The number of texts to buffer.
         disable (list): Names of the pipeline components to disable.
         YIELDS (Doc): Documents in the order of the original text.
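
The docstring above describes streaming processing with `Language.pipe`, including the `as_tuples` mode that threads a context object through alongside each text. A small usage sketch, assuming `nlp` is an already loaded spaCy 2.x pipeline:

# Usage sketch for nlp.pipe; assumes `nlp` is a loaded spaCy pipeline.
texts = [
    ('This is the first document.', {'id': 1}),
    ('And here is another one.', {'id': 2}),
]

# With as_tuples=True, (text, context) pairs go in and (doc, context)
# pairs come out, in the original order.
for doc, context in nlp.pipe(texts, as_tuples=True, batch_size=50):
    print(context['id'], [token.text for token in doc])
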
@@ -546,7 +545,8 @@ class Language(object):
             if name in disable:
                 continue
             if hasattr(proc, 'pipe'):
-                docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
+                docs = proc.pipe(docs, n_threads=n_threads,
+                                 batch_size=batch_size)
             else:
                 # Apply the function, but yield the doc
                 docs = _pipe(proc, docs)
@@ -583,7 +583,7 @@ class Language(object):
         will include the model.

         path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist. Paths may be either strings or `Path`-like objects.
+            it doesn't exist. Paths may be strings or `Path`-like objects.
         disable (list): Names of pipeline components to disable and prevent
             from being saved.

@@ -649,7 +649,7 @@ class Language(object):
         serializers = OrderedDict((
             ('vocab', lambda: self.vocab.to_bytes()),
             ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
-            ('meta', lambda: ujson.dumps(self.meta))
+            ('meta', lambda: json_dumps(self.meta))
         ))
         for i, (name, proc) in enumerate(self.pipeline):
             if name in disable:
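
The hunk above builds the `to_bytes` output from an ordered mapping of section names to serializer callbacks, so each part of the pipeline controls its own byte representation. A simplified sketch of the same pattern using msgpack; the helper and section data below are illustrative assumptions, not spaCy's actual `util.to_bytes`:

# Rough sketch of the "named serializer callbacks" pattern.
from collections import OrderedDict
import json

import msgpack


def to_bytes(getters, exclude=tuple()):
    # Call each getter lazily and pack the results under their section names.
    serialized = OrderedDict()
    for name, getter in getters.items():
        if name not in exclude:
            serialized[name] = getter()
    return msgpack.dumps(serialized, use_bin_type=True)


serializers = OrderedDict((
    ('vocab', lambda: b'...vocab bytes...'),
    ('meta', lambda: json.dumps({'lang': 'en'})),
))
data = to_bytes(serializers)
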
@@ -682,14 +682,14 @@ class Language(object):


 class DisabledPipes(list):
-    '''Manager for temporary pipeline disabling.'''
+    """Manager for temporary pipeline disabling."""
     def __init__(self, nlp, *names):
         self.nlp = nlp
         self.names = names
         # Important! Not deep copy -- we just want the container (but we also
         # want to support people providing arbitrarily typed nlp.pipeline
         # objects.)
-        self.original_pipeline = copy.copy(nlp.pipeline)
+        self.original_pipeline = copy(nlp.pipeline)
         list.__init__(self)
         self.extend(nlp.remove_pipe(name) for name in names)

@@ -702,7 +702,8 @@ class DisabledPipes(list):
     def restore(self):
         '''Restore the pipeline to its state when DisabledPipes was created.'''
         current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline
-        unexpected = [name for name, pipe in current if not self.nlp.has_pipe(name)]
+        unexpected = [name for name, pipe in current
+                      if not self.nlp.has_pipe(name)]
         if unexpected:
             # Don't change the pipeline if we're raising an error.
             self.nlp.pipeline = current

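
`DisabledPipes` keeps a shallow copy of the pipeline list so that `restore()` can put the original container back without cloning the components themselves. The same idea in isolation, as a self-contained toy (the class and pipeline names below are hypothetical, not spaCy's):

# Toy illustration of the shallow-copy-and-restore idea behind DisabledPipes.
from copy import copy


class Disabled(list):
    def __init__(self, owner, *names):
        self.owner = owner
        # Shallow copy: a new list object, but the same component objects.
        self.original = copy(owner.pipeline)
        list.__init__(self)
        self.extend(p for p in owner.pipeline if p[0] in names)
        owner.pipeline = [p for p in owner.pipeline if p[0] not in names]

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.restore()

    def restore(self):
        self.owner.pipeline = self.original
        self[:] = []


class FakeNLP(object):
    def __init__(self):
        self.pipeline = [('tagger', object()), ('parser', object())]


nlp = FakeNLP()
with Disabled(nlp, 'parser'):
    assert [name for name, _ in nlp.pipeline] == ['tagger']
assert [name for name, _ in nlp.pipeline] == ['tagger', 'parser']
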
@@ -43,16 +43,15 @@ class Lemmatizer(object):
         morphology = {} if morphology is None else morphology
         others = [key for key in morphology
                   if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
-        true_morph_key = morphology.get('morph', 0)
         if univ_pos == 'noun' and morphology.get('Number') == 'sing':
             return True
         elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
             return True
         # This maps 'VBP' to base form -- probably just need 'IS_BASE'
         # morphology
-        elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
-                                     morphology.get('Tense') == 'pres' and \
-                                     morphology.get('Number') is None and \
+        elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and
+                                     morphology.get('Tense') == 'pres' and
+                                     morphology.get('Number') is None and
                                      not others):
             return True
         elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
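
The hunk above is the rule deciding whether a token is already in its base form, based on the universal POS tag and morphological features (for example, singular nouns and infinitive verbs are left alone). A standalone, slightly simplified sketch of that decision logic, using plain string keys throughout:

# Simplified, standalone version of the base-form check shown above.
def is_base_form(univ_pos, morphology=None):
    morphology = {} if morphology is None else morphology
    others = [key for key in morphology
              if key not in ('POS', 'Number', 'VerbForm', 'Tense')]
    if univ_pos == 'noun' and morphology.get('Number') == 'sing':
        return True
    elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
        return True
    # Finite present-tense verbs with no number marking (roughly 'VBP').
    elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and
                                 morphology.get('Tense') == 'pres' and
                                 morphology.get('Number') is None and
                                 not others):
        return True
    elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
        return True
    return False


assert is_base_form('noun', {'Number': 'sing'})
assert not is_base_form('noun', {'Number': 'plur'})
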
@@ -89,9 +88,6 @@ class Lemmatizer(object):
 def lemmatize(string, index, exceptions, rules):
     string = string.lower()
     forms = []
-    # TODO: Is this correct? See discussion in Issue #435.
-    #if string in index:
-    #    forms.append(string)
     forms.extend(exceptions.get(string, []))
     oov_forms = []
     if not forms:

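
`lemmatize()` above works by first consulting an exceptions table and then applying suffix-rewrite rules against an index of known forms. A compact sketch of that mechanism with a tiny toy table (the data below is made up, not spaCy's lemma data, and the fallback handling is simplified):

# Toy suffix-rule lemmatizer in the spirit of lemmatize() above.
def lemmatize(string, index, exceptions, rules):
    string = string.lower()
    forms = []
    forms.extend(exceptions.get(string, []))
    if not forms:
        for old, new in rules:
            if string.endswith(old):
                form = string[:len(string) - len(old)] + new
                if form in index:
                    forms.append(form)
    if not forms:
        forms.append(string)      # fall back to the surface form
    return set(forms)


index = {'strive', 'rat'}
exceptions = {'striven': ['strive']}
rules = [('s', ''), ('ing', 'e')]

assert lemmatize('striven', index, exceptions, rules) == {'strive'}
assert lemmatize('rats', index, exceptions, rules) == {'rat'}
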
344  spacy/lexeme.pyx

@ -2,27 +2,17 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from libc.math cimport sqrt
|
||||
from cpython.ref cimport Py_INCREF
|
||||
from cymem.cymem cimport Pool
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
# Compiler crashes on memory view coercion without this. Should report bug.
|
||||
from cython.view cimport array as cvarray
|
||||
cimport numpy as np
|
||||
np.import_array()
|
||||
|
||||
from libc.string cimport memset
|
||||
import numpy
|
||||
|
||||
from .typedefs cimport attr_t, flags_t
|
||||
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||
from .attrs cimport IS_BRACKET
|
||||
from .attrs cimport IS_QUOTE
|
||||
from .attrs cimport IS_LEFT_PUNCT
|
||||
from .attrs cimport IS_RIGHT_PUNCT
|
||||
from .attrs cimport IS_OOV
|
||||
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
|
||||
from . import about
|
||||
|
||||
|
||||
|
@@ -32,8 +22,8 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 cdef class Lexeme:
     """An entry in the vocabulary. A `Lexeme` has no string context – it's a
     word-type, as opposed to a word token. It therefore has no part-of-speech
-    tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
-    tag).
+    tag, dependency parse, or lemma (lemmatization depends on the
+    part-of-speech tag).
     """
     def __init__(self, Vocab vocab, attr_t orth):
         """Create a Lexeme object.
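
As the docstring above notes, a `Lexeme` is a context-independent word type stored in the vocabulary, so it carries orthographic and distributional attributes but nothing that depends on a parse. A quick look-up sketch, assuming an installed spaCy 2.x model with vectors (the model name is an assumption):

# Assumes a spaCy 2.x model with vectors, e.g. 'en_core_web_md', is installed.
import spacy

nlp = spacy.load('en_core_web_md')
lexeme = nlp.vocab['coffee']          # a Lexeme, not a Token

print(lexeme.text, lexeme.is_alpha, lexeme.is_stop, lexeme.like_num)
print(lexeme.vector_norm > 0)         # has a word vector in this model
# No .pos_ or .dep_ here -- those live on tokens, not lexemes.
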
@ -60,17 +50,17 @@ cdef class Lexeme:
|
|||
else:
|
||||
a = 0
|
||||
b = 1
|
||||
if op == 2: # ==
|
||||
if op == 2: # ==
|
||||
return a == b
|
||||
elif op == 3: # !=
|
||||
elif op == 3: # !=
|
||||
return a != b
|
||||
elif op == 0: # <
|
||||
elif op == 0: # <
|
||||
return a < b
|
||||
elif op == 1: # <=
|
||||
elif op == 1: # <=
|
||||
return a <= b
|
||||
elif op == 4: # >
|
||||
elif op == 4: # >
|
||||
return a > b
|
||||
elif op == 5: # >=
|
||||
elif op == 5: # >=
|
||||
return a >= b
|
||||
else:
|
||||
raise NotImplementedError(op)
|
||||
|
@@ -104,7 +94,8 @@ cdef class Lexeme:
         """
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
-        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
+        return (numpy.dot(self.vector, other.vector) /
+                (self.vector_norm * other.vector_norm))

     def to_bytes(self):
         lex_data = Lexeme.c_to_bytes(self.c)
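
The similarity above is a plain cosine: the dot product of the two vectors divided by the product of their L2 norms, with a guard for zero vectors. The same computation in plain numpy, as an illustration rather than the Cython code path:

# Cosine similarity as computed above, in plain numpy.
import numpy


def cosine(v1, v2):
    norm1 = numpy.sqrt((v1 ** 2).sum())
    norm2 = numpy.sqrt((v2 ** 2).sum())
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return numpy.dot(v1, v2) / (norm1 * norm2)


a = numpy.array([1.0, 2.0, 3.0])
b = numpy.array([2.0, 4.0, 6.0])
assert abs(cosine(a, b) - 1.0) < 1e-6   # parallel vectors
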
@ -130,19 +121,13 @@ cdef class Lexeme:
|
|||
self.orth = self.c.orth
|
||||
|
||||
property has_vector:
|
||||
"""A boolean value indicating whether a word vector is associated with
|
||||
the object.
|
||||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.has_vector(self.c.orth)
|
||||
|
||||
property vector_norm:
|
||||
"""The L2 norm of the lexeme's vector representation.
|
||||
|
||||
RETURNS (float): The L2 norm of the vector representation.
|
||||
"""
|
||||
"""RETURNS (float): The L2 norm of the vector representation."""
|
||||
def __get__(self):
|
||||
vector = self.vector
|
||||
return numpy.sqrt((vector**2).sum())
|
||||
|
@ -169,149 +154,320 @@ cdef class Lexeme:
|
|||
self.vocab.set_vector(self.c.orth, vector)
|
||||
|
||||
property rank:
|
||||
"""RETURNS (unicode): Sequential ID of the lexemes's lexical type, used
|
||||
to index into tables, e.g. for word vectors."""
|
||||
def __get__(self):
|
||||
return self.c.id
|
||||
|
||||
def __set__(self, value):
|
||||
self.c.id = value
|
||||
|
||||
property sentiment:
|
||||
"""RETURNS (float): A scalar value indicating the positivity or
|
||||
negativity of the lexeme."""
|
||||
def __get__(self):
|
||||
return self.c.sentiment
|
||||
|
||||
def __set__(self, float sentiment):
|
||||
self.c.sentiment = sentiment
|
||||
|
||||
property orth_:
|
||||
"""RETURNS (unicode): The original verbatim text of the lexeme
|
||||
(identical to `Lexeme.text`). Exists mostly for consistency with
|
||||
the other attributes."""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.orth]
|
||||
|
||||
property text:
|
||||
"""A unicode representation of the token text.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the token.
|
||||
"""
|
||||
"""RETURNS (unicode): The original verbatim text of the lexeme."""
|
||||
def __get__(self):
|
||||
return self.orth_
|
||||
|
||||
property lower:
|
||||
def __get__(self): return self.c.lower
|
||||
def __set__(self, attr_t x): self.c.lower = x
|
||||
"""RETURNS (unicode): Lowercase form of the lexeme."""
|
||||
def __get__(self):
|
||||
return self.c.lower
|
||||
|
||||
def __set__(self, attr_t x):
|
||||
self.c.lower = x
|
||||
|
||||
property norm:
|
||||
def __get__(self): return self.c.norm
|
||||
def __set__(self, attr_t x): self.c.norm = x
|
||||
"""RETURNS (uint64): The lexemes's norm, i.e. a normalised form of the
|
||||
lexeme text.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.norm
|
||||
|
||||
def __set__(self, attr_t x):
|
||||
self.c.norm = x
|
||||
|
||||
property shape:
|
||||
def __get__(self): return self.c.shape
|
||||
def __set__(self, attr_t x): self.c.shape = x
|
||||
"""RETURNS (uint64): Transform of the word's string, to show
|
||||
orthographic features.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.shape
|
||||
|
||||
def __set__(self, attr_t x):
|
||||
self.c.shape = x
|
||||
|
||||
property prefix:
|
||||
def __get__(self): return self.c.prefix
|
||||
def __set__(self, attr_t x): self.c.prefix = x
|
||||
"""RETURNS (uint64): Length-N substring from the start of the word.
|
||||
Defaults to `N=1`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.prefix
|
||||
|
||||
def __set__(self, attr_t x):
|
||||
self.c.prefix = x
|
||||
|
||||
property suffix:
|
||||
def __get__(self): return self.c.suffix
|
||||
def __set__(self, attr_t x): self.c.suffix = x
|
||||
"""RETURNS (uint64): Length-N substring from the end of the word.
|
||||
Defaults to `N=3`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.suffix
|
||||
|
||||
def __set__(self, attr_t x):
|
||||
self.c.suffix = x
|
||||
|
||||
property cluster:
|
||||
def __get__(self): return self.c.cluster
|
||||
def __set__(self, attr_t x): self.c.cluster = x
|
||||
"""RETURNS (int): Brown cluster ID."""
|
||||
def __get__(self):
|
||||
return self.c.cluster
|
||||
|
||||
def __set__(self, attr_t x):
|
||||
self.c.cluster = x
|
||||
|
||||
property lang:
|
||||
def __get__(self): return self.c.lang
|
||||
def __set__(self, attr_t x): self.c.lang = x
|
||||
"""RETURNS (uint64): Language of the parent vocabulary."""
|
||||
def __get__(self):
|
||||
return self.c.lang
|
||||
|
||||
def __set__(self, attr_t x):
|
||||
self.c.lang = x
|
||||
|
||||
property prob:
|
||||
def __get__(self): return self.c.prob
|
||||
def __set__(self, float x): self.c.prob = x
|
||||
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
|
||||
type."""
|
||||
def __get__(self):
|
||||
return self.c.prob
|
||||
|
||||
def __set__(self, float x):
|
||||
self.c.prob = x
|
||||
|
||||
property lower_:
|
||||
def __get__(self): return self.vocab.strings[self.c.lower]
|
||||
def __set__(self, unicode x): self.c.lower = self.vocab.strings.add(x)
|
||||
"""RETURNS (unicode): Lowercase form of the word."""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lower]
|
||||
|
||||
def __set__(self, unicode x):
|
||||
self.c.lower = self.vocab.strings.add(x)
|
||||
|
||||
property norm_:
|
||||
def __get__(self): return self.vocab.strings[self.c.norm]
|
||||
def __set__(self, unicode x): self.c.norm = self.vocab.strings.add(x)
|
||||
"""RETURNS (unicode): The lexemes's norm, i.e. a normalised form of the
|
||||
lexeme text.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.norm]
|
||||
|
||||
def __set__(self, unicode x):
|
||||
self.c.norm = self.vocab.strings.add(x)
|
||||
|
||||
property shape_:
|
||||
def __get__(self): return self.vocab.strings[self.c.shape]
|
||||
def __set__(self, unicode x): self.c.shape = self.vocab.strings.add(x)
|
||||
"""RETURNS (unicode): Transform of the word's string, to show
|
||||
orthographic features.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.shape]
|
||||
|
||||
def __set__(self, unicode x):
|
||||
self.c.shape = self.vocab.strings.add(x)
|
||||
|
||||
property prefix_:
|
||||
def __get__(self): return self.vocab.strings[self.c.prefix]
|
||||
def __set__(self, unicode x): self.c.prefix = self.vocab.strings.add(x)
|
||||
"""RETURNS (unicode): Length-N substring from the start of the word.
|
||||
Defaults to `N=1`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.prefix]
|
||||
|
||||
def __set__(self, unicode x):
|
||||
self.c.prefix = self.vocab.strings.add(x)
|
||||
|
||||
property suffix_:
|
||||
def __get__(self): return self.vocab.strings[self.c.suffix]
|
||||
def __set__(self, unicode x): self.c.suffix = self.vocab.strings.add(x)
|
||||
"""RETURNS (unicode): Length-N substring from the end of the word.
|
||||
Defaults to `N=3`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.suffix]
|
||||
|
||||
def __set__(self, unicode x):
|
||||
self.c.suffix = self.vocab.strings.add(x)
|
||||
|
||||
property lang_:
|
||||
def __get__(self): return self.vocab.strings[self.c.lang]
|
||||
def __set__(self, unicode x): self.c.lang = self.vocab.strings.add(x)
|
||||
"""RETURNS (unicode): Language of the parent vocabulary."""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lang]
|
||||
|
||||
def __set__(self, unicode x):
|
||||
self.c.lang = self.vocab.strings.add(x)
|
||||
|
||||
property flags:
|
||||
def __get__(self): return self.c.flags
|
||||
def __set__(self, flags_t x): self.c.flags = x
|
||||
"""RETURNS (uint64): Container of the lexeme's binary flags."""
|
||||
def __get__(self):
|
||||
return self.c.flags
|
||||
|
||||
def __set__(self, flags_t x):
|
||||
self.c.flags = x
|
||||
|
||||
property is_oov:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)
|
||||
def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x)
|
||||
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, IS_OOV)
|
||||
|
||||
def __set__(self, attr_t x):
|
||||
Lexeme.c_set_flag(self.c, IS_OOV, x)
|
||||
|
||||
property is_stop:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_STOP, x)
|
||||
"""RETURNS (bool): Whether the lexeme is a stop word."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, IS_STOP)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, IS_STOP, x)
|
||||
|
||||
property is_alpha:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_ALPHA)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ALPHA, x)
|
||||
"""RETURNS (bool): Whether the lexeme consists of alphanumeric
|
||||
characters. Equivalent to `lexeme.text.isalpha()`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, IS_ALPHA)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
|
||||
|
||||
property is_ascii:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_ASCII)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ASCII, x)
|
||||
"""RETURNS (bool): Whether the lexeme consists of ASCII characters.
|
||||
Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, IS_ASCII)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, IS_ASCII, x)
|
||||
|
||||
property is_digit:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_DIGIT)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_DIGIT, x)
|
||||
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent
|
||||
to `lexeme.text.isdigit()`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, IS_DIGIT)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
|
||||
|
||||
property is_lower:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_LOWER)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LOWER, x)
|
||||
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
|
||||
`lexeme.text.islower()`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, IS_LOWER)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, IS_LOWER, x)
|
||||
|
||||
property is_upper:
|
||||
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
|
||||
`lexeme.text.isupper()`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, IS_UPPER)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, IS_UPPER, x)
|
||||
|
||||
property is_title:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_TITLE)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_TITLE, x)
|
||||
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
|
||||
`lexeme.text.istitle()`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, IS_TITLE)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, IS_TITLE, x)
|
||||
|
||||
property is_punct:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_PUNCT)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_PUNCT, x)
|
||||
"""RETURNS (bool): Whether the lexeme is punctuation."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, IS_PUNCT)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
|
||||
|
||||
property is_space:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)
|
||||
"""RETURNS (bool): Whether the lexeme consist of whitespace characters.
|
||||
Equivalent to `lexeme.text.isspace()`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, IS_SPACE)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, IS_SPACE, x)
|
||||
|
||||
property is_bracket:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x)
|
||||
"""RETURNS (bool): Whether the lexeme is a bracket."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, IS_BRACKET)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
|
||||
|
||||
property is_quote:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x)
|
||||
"""RETURNS (bool): Whether the lexeme is a quotation mark."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, IS_QUOTE)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
|
||||
|
||||
property is_left_punct:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
|
||||
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. )."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
|
||||
|
||||
property is_right_punct:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
||||
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
||||
|
||||
property like_url:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
||||
"""RETURNS (bool): Whether the lexeme resembles a URL."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, LIKE_URL)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
||||
|
||||
property like_num:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_NUM)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_NUM, x)
|
||||
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
|
||||
"10", "ten", etc.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, LIKE_NUM)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
|
||||
|
||||
property like_email:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
|
||||
"""RETURNS (bool): Whether the lexeme resembles an email address."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
|
||||
|
||||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
|
||||
|
|
|
@ -4,12 +4,6 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import ujson
|
||||
|
||||
from .typedefs cimport attr_t
|
||||
from .typedefs cimport hash_t
|
||||
from .attrs cimport attr_id_t
|
||||
from .structs cimport TokenC
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
from libcpp.vector cimport vector
|
||||
|
@ -17,14 +11,15 @@ from libcpp.pair cimport pair
|
|||
from murmurhash.mrmr cimport hash64
|
||||
from libc.stdint cimport int32_t
|
||||
|
||||
from .attrs cimport ID, NULL_ATTR, ENT_TYPE
|
||||
from . import attrs
|
||||
from .tokens.doc cimport get_token_attr
|
||||
from .tokens.doc cimport Doc
|
||||
from .typedefs cimport attr_t
|
||||
from .typedefs cimport hash_t
|
||||
from .structs cimport TokenC
|
||||
from .tokens.doc cimport Doc, get_token_attr
|
||||
from .vocab cimport Vocab
|
||||
|
||||
from .attrs import IDS
|
||||
from .attrs cimport attr_id_t, ID, NULL_ATTR
|
||||
from .attrs import FLAG61 as U_ENT
|
||||
|
||||
from .attrs import FLAG60 as B2_ENT
|
||||
from .attrs import FLAG59 as B3_ENT
|
||||
from .attrs import FLAG58 as B4_ENT
|
||||
|
@ -34,7 +29,6 @@ from .attrs import FLAG55 as B7_ENT
|
|||
from .attrs import FLAG54 as B8_ENT
|
||||
from .attrs import FLAG53 as B9_ENT
|
||||
from .attrs import FLAG52 as B10_ENT
|
||||
|
||||
from .attrs import FLAG51 as I3_ENT
|
||||
from .attrs import FLAG50 as I4_ENT
|
||||
from .attrs import FLAG49 as I5_ENT
|
||||
|
@ -43,7 +37,6 @@ from .attrs import FLAG47 as I7_ENT
|
|||
from .attrs import FLAG46 as I8_ENT
|
||||
from .attrs import FLAG45 as I9_ENT
|
||||
from .attrs import FLAG44 as I10_ENT
|
||||
|
||||
from .attrs import FLAG43 as L2_ENT
|
||||
from .attrs import FLAG42 as L3_ENT
|
||||
from .attrs import FLAG41 as L4_ENT
|
||||
|
@ -153,7 +146,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
|
|||
def _convert_strings(token_specs, string_store):
|
||||
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
|
||||
operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
|
||||
'?': (ZERO_ONE,), '1': (ONE,)}
|
||||
'?': (ZERO_ONE,), '1': (ONE,)}
|
||||
tokens = []
|
||||
op = ONE
|
||||
for spec in token_specs:
|
||||
|
@ -168,10 +161,10 @@ def _convert_strings(token_specs, string_store):
|
|||
if value in operators:
|
||||
ops = operators[value]
|
||||
else:
|
||||
raise KeyError(
|
||||
"Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys())))
|
||||
msg = "Unknown operator '%s'. Options: %s"
|
||||
raise KeyError(msg % (value, ', '.join(operators.keys())))
|
||||
if isinstance(attr, basestring):
|
||||
attr = attrs.IDS.get(attr.upper())
|
||||
attr = IDS.get(attr.upper())
|
||||
if isinstance(value, basestring):
|
||||
value = string_store.add(value)
|
||||
if isinstance(value, bool):
|
||||
|
@ -186,7 +179,7 @@ def _convert_strings(token_specs, string_store):
|
|||
def merge_phrase(matcher, doc, i, matches):
|
||||
"""Callback to merge a phrase on match."""
|
||||
ent_id, label, start, end = matches[i]
|
||||
span = doc[start : end]
|
||||
span = doc[start:end]
|
||||
span.merge(ent_type=label, ent_id=ent_id)
|
||||
|
||||
|
||||
|
@ -233,13 +226,13 @@ cdef class Matcher:
|
|||
return self._normalize_key(key) in self._patterns
|
||||
|
||||
def add(self, key, on_match, *patterns):
|
||||
"""Add a match-rule to the matcher. A match-rule consists of: an ID key,
|
||||
an on_match callback, and one or more patterns.
|
||||
"""Add a match-rule to the matcher. A match-rule consists of: an ID
|
||||
key, an on_match callback, and one or more patterns.
|
||||
|
||||
If the key exists, the patterns are appended to the previous ones, and
|
||||
the previous on_match callback is replaced. The `on_match` callback will
|
||||
receive the arguments `(matcher, doc, i, matches)`. You can also set
|
||||
`on_match` to `None` to not perform any actions.
|
||||
the previous on_match callback is replaced. The `on_match` callback
|
||||
will receive the arguments `(matcher, doc, i, matches)`. You can also
|
||||
set `on_match` to `None` to not perform any actions.
|
||||
|
||||
A pattern consists of one or more `token_specs`, where a `token_spec`
|
||||
is a dictionary mapping attribute IDs to values, and optionally a
|
||||
|
@ -253,8 +246,8 @@ cdef class Matcher:
|
|||
The + and * operators are usually interpretted "greedily", i.e. longer
|
||||
matches are returned where possible. However, if you specify two '+'
|
||||
and '*' patterns in a row and their matches overlap, the first
|
||||
operator will behave non-greedily. This quirk in the semantics
|
||||
makes the matcher more efficient, by avoiding the need for back-tracking.
|
||||
operator will behave non-greedily. This quirk in the semantics makes
|
||||
the matcher more efficient, by avoiding the need for back-tracking.
|
||||
|
||||
key (unicode): The match ID.
|
||||
on_match (callable): Callback executed on match.
|
||||
|
@ -268,7 +261,6 @@ cdef class Matcher:
|
|||
key = self._normalize_key(key)
|
||||
self._patterns.setdefault(key, [])
|
||||
self._callbacks[key] = on_match
|
||||
|
||||
for pattern in patterns:
|
||||
specs = _convert_strings(pattern, self.vocab.strings)
|
||||
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
||||
|
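
The `Matcher.add` docstring above takes an ID key, an optional `on_match` callback, and one or more token patterns (lists of attribute dicts, optionally carrying quantifier operators such as '+', '*' and '?'). A usage sketch written against the released spaCy 2.x API, which may differ in detail from the in-progress code above:

# Usage sketch for the token Matcher; assumes a spaCy 2.x install.
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)


def on_match(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    print('matched:', doc[start:end].text)


# "hello" followed by optional punctuation and "world".
pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'world'}]
matcher.add('HelloWorld', on_match, pattern)

doc = nlp(u'Hello, world! Hello world!')
matches = matcher(doc)   # list of (key, start, end) tuples
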
@ -315,9 +307,9 @@ cdef class Matcher:
|
|||
"""Match a stream of documents, yielding them in turn.
|
||||
|
||||
docs (iterable): A stream of documents.
|
||||
batch_size (int): The number of documents to accumulate into a working set.
|
||||
batch_size (int): Number of documents to accumulate into a working set.
|
||||
n_threads (int): The number of threads with which to work on the buffer
|
||||
in parallel, if the `Matcher` implementation supports multi-threading.
|
||||
in parallel, if the implementation supports multi-threading.
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
for doc in docs:
|
||||
|
@ -325,7 +317,7 @@ cdef class Matcher:
|
|||
yield doc
|
||||
|
||||
def __call__(self, Doc doc):
|
||||
"""Find all token sequences matching the supplied patterns on the `Doc`.
|
||||
"""Find all token sequences matching the supplied pattern.
|
||||
|
||||
doc (Doc): The document to match over.
|
||||
RETURNS (list): A list of `(key, start, end)` tuples,
|
||||
|
@ -342,8 +334,8 @@ cdef class Matcher:
|
|||
for token_i in range(doc.length):
|
||||
token = &doc.c[token_i]
|
||||
q = 0
|
||||
# Go over the open matches, extending or finalizing if able. Otherwise,
|
||||
# we over-write them (q doesn't advance)
|
||||
# Go over the open matches, extending or finalizing if able.
|
||||
# Otherwise, we over-write them (q doesn't advance)
|
||||
for state in partials:
|
||||
action = get_action(state.second, token)
|
||||
if action == PANIC:
|
||||
|
@ -356,8 +348,8 @@ cdef class Matcher:
|
|||
|
||||
if action == REPEAT:
|
||||
# Leave the state in the queue, and advance to next slot
|
||||
# (i.e. we don't overwrite -- we want to greedily match more
|
||||
# pattern.
|
||||
# (i.e. we don't overwrite -- we want to greedily match
|
||||
# more pattern.
|
||||
q += 1
|
||||
elif action == REJECT:
|
||||
pass
|
||||
|
@ -366,8 +358,8 @@ cdef class Matcher:
|
|||
partials[q].second += 1
|
||||
q += 1
|
||||
elif action in (ACCEPT, ACCEPT_PREV):
|
||||
# TODO: What to do about patterns starting with ZERO? Need to
|
||||
# adjust the start position.
|
||||
# TODO: What to do about patterns starting with ZERO? Need
|
||||
# to adjust the start position.
|
||||
start = state.first
|
||||
end = token_i+1 if action == ACCEPT else token_i
|
||||
ent_id = state.second[1].attrs[0].value
|
||||
|
@ -388,8 +380,8 @@ cdef class Matcher:
|
|||
state.second = pattern
|
||||
partials.push_back(state)
|
||||
elif action == ADVANCE:
|
||||
# TODO: What to do about patterns starting with ZERO? Need to
|
||||
# adjust the start position.
|
||||
# TODO: What to do about patterns starting with ZERO? Need
|
||||
# to adjust the start position.
|
||||
state.first = token_i
|
||||
state.second = pattern + 1
|
||||
partials.push_back(state)
|
||||
|
@ -413,7 +405,6 @@ cdef class Matcher:
|
|||
on_match = self._callbacks.get(ent_id)
|
||||
if on_match is not None:
|
||||
on_match(self, doc, i, matches)
|
||||
# TODO: only return (match_id, start, end)
|
||||
return matches
|
||||
|
||||
def _normalize_key(self, key):
|
||||
|
@ -441,7 +432,8 @@ def get_bilou(length):
|
|||
elif length == 8:
|
||||
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
|
||||
elif length == 9:
|
||||
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT]
|
||||
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
|
||||
L9_ENT]
|
||||
elif length == 10:
|
||||
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
|
||||
I10_ENT, I10_ENT, L10_ENT]
|
||||
|
@ -454,10 +446,8 @@ cdef class PhraseMatcher:
|
|||
cdef Vocab vocab
|
||||
cdef Matcher matcher
|
||||
cdef PreshMap phrase_ids
|
||||
|
||||
cdef int max_length
|
||||
cdef attr_t* _phrase_key
|
||||
|
||||
cdef public object _callbacks
|
||||
cdef public object _patterns
|
||||
|
||||
|
@ -470,7 +460,8 @@ cdef class PhraseMatcher:
|
|||
self.phrase_ids = PreshMap()
|
||||
abstract_patterns = []
|
||||
for length in range(1, max_length):
|
||||
abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
|
||||
abstract_patterns.append([{tag: True}
|
||||
for tag in get_bilou(length)])
|
||||
self.matcher.add('Candidate', None, *abstract_patterns)
|
||||
self._callbacks = {}
|
||||
|
||||
|
@ -496,8 +487,8 @@ cdef class PhraseMatcher:
|
|||
return (self.__class__, (self.vocab,), None, None)
|
||||
|
||||
def add(self, key, on_match, *docs):
|
||||
"""Add a match-rule to the matcher. A match-rule consists of: an ID key,
|
||||
an on_match callback, and one or more patterns.
|
||||
"""Add a match-rule to the matcher. A match-rule consists of: an ID
|
||||
key, an on_match callback, and one or more patterns.
|
||||
|
||||
key (unicode): The match ID.
|
||||
on_match (callable): Callback executed on match.
|
||||
|
@ -513,7 +504,6 @@ cdef class PhraseMatcher:
|
|||
raise ValueError(msg % (len(doc), self.max_length))
|
||||
cdef hash_t ent_id = self.matcher._normalize_key(key)
|
||||
self._callbacks[ent_id] = on_match
|
||||
|
||||
cdef int length
|
||||
cdef int i
|
||||
cdef hash_t phrase_hash
|
||||
|
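
`PhraseMatcher.add` above takes an ID key, a callback, and one or more `Doc` objects to use as literal phrase patterns (internally encoded with the BILOU flags produced by `get_bilou`). A usage sketch against the released spaCy 2.x API, which may differ slightly from the in-progress code above:

# Usage sketch for the PhraseMatcher; assumes a spaCy 2.x install.
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')
matcher = PhraseMatcher(nlp.vocab)

# Each pattern is a Doc; make_doc avoids running the full pipeline.
patterns = [nlp.make_doc(text) for text in ('Barack Obama', 'Angela Merkel')]
matcher.add('PEOPLE', None, *patterns)

doc = nlp(u'Angela Merkel met Barack Obama yesterday.')
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
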
@ -553,9 +543,9 @@ cdef class PhraseMatcher:
|
|||
"""Match a stream of documents, yielding them in turn.
|
||||
|
||||
docs (iterable): A stream of documents.
|
||||
batch_size (int): The number of documents to accumulate into a working set.
|
||||
batch_size (int): Number of documents to accumulate into a working set.
|
||||
n_threads (int): The number of threads with which to work on the buffer
|
||||
in parallel, if the `Matcher` implementation supports multi-threading.
|
||||
in parallel, if the implementation supports multi-threading.
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
for doc in stream:
|
||||
|
@ -569,7 +559,8 @@ cdef class PhraseMatcher:
|
|||
self._phrase_key[i] = 0
|
||||
for i, j in enumerate(range(start, end)):
|
||||
self._phrase_key[i] = doc.c[j].lex.orth
|
||||
cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
|
||||
cdef hash_t key = hash64(self._phrase_key,
|
||||
self.max_length * sizeof(attr_t), 0)
|
||||
ent_id = <hash_t>self.phrase_ids.get(key)
|
||||
if ent_id == 0:
|
||||
return None
|
||||
|
|
|
@ -4,17 +4,15 @@ from __future__ import unicode_literals
|
|||
|
||||
from libc.string cimport memset
|
||||
|
||||
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
|
||||
from .attrs cimport POS, IS_SPACE
|
||||
from .attrs import LEMMA, intify_attrs
|
||||
from .parts_of_speech cimport SPACE
|
||||
from .parts_of_speech import IDS as POS_IDS
|
||||
from .lexeme cimport Lexeme
|
||||
from .attrs import LEMMA, intify_attrs
|
||||
|
||||
|
||||
def _normalize_props(props):
|
||||
"""
|
||||
Transform deprecated string keys to correct names.
|
||||
"""
|
||||
"""Transform deprecated string keys to correct names."""
|
||||
out = {}
|
||||
for key, value in props.items():
|
||||
if key == POS:
|
||||
|
@ -77,7 +75,8 @@ cdef class Morphology:
|
|||
cdef int assign_untagged(self, TokenC* token) except -1:
|
||||
"""Set morphological attributes on a token without a POS tag. Uses
|
||||
the lemmatizer's lookup() method, which looks up the string in the
|
||||
table provided by the language data as lemma_lookup (if available)."""
|
||||
table provided by the language data as lemma_lookup (if available).
|
||||
"""
|
||||
if token.lemma == 0:
|
||||
orth_str = self.strings[token.lex.orth]
|
||||
lemma = self.lemmatizer.lookup(orth_str)
|
||||
|
@ -95,11 +94,10 @@ cdef class Morphology:
|
|||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||
if tag_id > self.n_tags:
|
||||
raise ValueError("Unknown tag ID: %s" % tag_id)
|
||||
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
|
||||
# is that this is where the specific word and the tag interact. Still,
|
||||
# we should have a better way to enforce this rule, or figure out why
|
||||
# the statistical model fails.
|
||||
# Related to Issue #220
|
||||
# TODO: It's pretty arbitrary to put this logic here. I guess the
|
||||
# justification is that this is where the specific word and the tag
|
||||
# interact. Still, we should have a better way to enforce this rule, or
|
||||
# figure out why the statistical model fails. Related to Issue #220
|
||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||
tag_id = self.reverse_index[self.strings.add('_SP')]
|
||||
rich_tag = self.rich_tags[tag_id]
|
||||
|
@ -123,14 +121,13 @@ cdef class Morphology:
|
|||
else:
|
||||
flags[0] &= ~(one << flag_id)
|
||||
|
||||
def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
|
||||
"""
|
||||
Add a special-case rule to the morphological analyser. Tokens whose
|
||||
def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
|
||||
force=False):
|
||||
"""Add a special-case rule to the morphological analyser. Tokens whose
|
||||
tag and orth match the rule will receive the specified properties.
|
||||
|
||||
Arguments:
|
||||
tag (unicode): The part-of-speech tag to key the exception.
|
||||
orth (unicode): The word-form to key the exception.
|
||||
tag (unicode): The part-of-speech tag to key the exception.
|
||||
orth (unicode): The word-form to key the exception.
|
||||
"""
|
||||
self.exc[(tag_str, orth_str)] = dict(attrs)
|
||||
tag = self.strings.add(tag_str)
|
||||
|
@ -144,10 +141,9 @@ cdef class Morphology:
|
|||
elif force:
|
||||
memset(cached, 0, sizeof(cached[0]))
|
||||
else:
|
||||
msg = ("Conflicting morphology exception for (%s, %s). Use force=True "
|
||||
"to overwrite.")
|
||||
msg = msg % (tag_str, orth_str)
|
||||
raise ValueError(msg)
|
||||
raise ValueError(
|
||||
"Conflicting morphology exception for (%s, %s). Use "
|
||||
"force=True to overwrite." % (tag_str, orth_str))
|
||||
|
||||
cached.tag = rich_tag
|
||||
# TODO: Refactor this to take arbitrary attributes.
|
||||
|
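
`add_special_case` above registers per-(tag, orth) exceptions so that a specific word form with a specific fine-grained tag receives fixed attributes (such as a lemma) instead of the statistical guess, and it refuses to overwrite an existing exception unless `force=True`. A toy dictionary-based sketch of that lookup order, not the Cython cache shown above:

# Toy illustration of (tag, orth)-keyed morphology exceptions.
class ToyMorphology(object):
    def __init__(self):
        self.exc = {}

    def add_special_case(self, tag, orth, attrs, force=False):
        key = (tag, orth)
        if key in self.exc and not force:
            raise ValueError(
                "Conflicting morphology exception for (%s, %s). Use "
                "force=True to overwrite." % (tag, orth))
        self.exc[key] = dict(attrs)

    def assign(self, tag, orth, default_attrs):
        # Exceptions win over whatever the statistical model produced.
        return self.exc.get((tag, orth), default_attrs)


morph = ToyMorphology()
morph.add_special_case('VBZ', 'is', {'lemma': 'be'})
assert morph.assign('VBZ', 'is', {'lemma': 'is'}) == {'lemma': 'be'}
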
@ -218,7 +214,7 @@ IDS = {
|
|||
"Definite_two": Definite_two,
|
||||
"Definite_def": Definite_def,
|
||||
"Definite_red": Definite_red,
|
||||
"Definite_cons": Definite_cons, # U20
|
||||
"Definite_cons": Definite_cons, # U20
|
||||
"Definite_ind": Definite_ind,
|
||||
"Degree_cmp": Degree_cmp,
|
||||
"Degree_comp": Degree_comp,
|
||||
|
@ -227,7 +223,7 @@ IDS = {
|
|||
"Degree_sup": Degree_sup,
|
||||
"Degree_abs": Degree_abs,
|
||||
"Degree_com": Degree_com,
|
||||
"Degree_dim ": Degree_dim, # du
|
||||
"Degree_dim ": Degree_dim, # du
|
||||
"Gender_com": Gender_com,
|
||||
"Gender_fem": Gender_fem,
|
||||
"Gender_masc": Gender_masc,
|
||||
|
@ -242,15 +238,15 @@ IDS = {
|
|||
"Negative_neg": Negative_neg,
|
||||
"Negative_pos": Negative_pos,
|
||||
"Negative_yes": Negative_yes,
|
||||
"Polarity_neg": Polarity_neg, # U20
|
||||
"Polarity_pos": Polarity_pos, # U20
|
||||
"Polarity_neg": Polarity_neg, # U20
|
||||
"Polarity_pos": Polarity_pos, # U20
|
||||
"Number_com": Number_com,
|
||||
"Number_dual": Number_dual,
|
||||
"Number_none": Number_none,
|
||||
"Number_plur": Number_plur,
|
||||
"Number_sing": Number_sing,
|
||||
"Number_ptan ": Number_ptan, # bg
|
||||
"Number_count ": Number_count, # bg
|
||||
"Number_ptan ": Number_ptan, # bg
|
||||
"Number_count ": Number_count, # bg
|
||||
"NumType_card": NumType_card,
|
||||
"NumType_dist": NumType_dist,
|
||||
"NumType_frac": NumType_frac,
|
||||
|
@ -276,7 +272,7 @@ IDS = {
|
|||
"PronType_rel": PronType_rel,
|
||||
"PronType_tot": PronType_tot,
|
||||
"PronType_clit": PronType_clit,
|
||||
"PronType_exc ": PronType_exc, # es, ca, it, fa,
|
||||
"PronType_exc ": PronType_exc, # es, ca, it, fa,
|
||||
"Reflex_yes": Reflex_yes,
|
||||
"Tense_fut": Tense_fut,
|
||||
"Tense_imp": Tense_imp,
|
||||
|
@ -292,19 +288,19 @@ IDS = {
|
|||
"VerbForm_partPres": VerbForm_partPres,
|
||||
"VerbForm_sup": VerbForm_sup,
|
||||
"VerbForm_trans": VerbForm_trans,
|
||||
"VerbForm_conv": VerbForm_conv, # U20
|
||||
"VerbForm_gdv ": VerbForm_gdv, # la,
|
||||
"VerbForm_conv": VerbForm_conv, # U20
|
||||
"VerbForm_gdv ": VerbForm_gdv, # la,
|
||||
"Voice_act": Voice_act,
|
||||
"Voice_cau": Voice_cau,
|
||||
"Voice_pass": Voice_pass,
|
||||
"Voice_mid ": Voice_mid, # gkc,
|
||||
"Voice_int ": Voice_int, # hb,
|
||||
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
|
||||
"AdpType_prep ": AdpType_prep, # cz, U,
|
||||
"AdpType_post ": AdpType_post, # U,
|
||||
"AdpType_voc ": AdpType_voc, # cz,
|
||||
"AdpType_comprep ": AdpType_comprep, # cz,
|
||||
"AdpType_circ ": AdpType_circ, # U,
|
||||
"Voice_mid ": Voice_mid, # gkc,
|
||||
"Voice_int ": Voice_int, # hb,
|
||||
"Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
|
||||
"AdpType_prep ": AdpType_prep, # cz, U,
|
||||
"AdpType_post ": AdpType_post, # U,
|
||||
"AdpType_voc ": AdpType_voc, # cz,
|
||||
"AdpType_comprep ": AdpType_comprep, # cz,
|
||||
"AdpType_circ ": AdpType_circ, # U,
|
||||
"AdvType_man": AdvType_man,
|
||||
"AdvType_loc": AdvType_loc,
|
||||
"AdvType_tim": AdvType_tim,
|
||||
|
@ -314,122 +310,122 @@ IDS = {
|
|||
"AdvType_sta": AdvType_sta,
|
||||
"AdvType_ex": AdvType_ex,
|
||||
"AdvType_adadj": AdvType_adadj,
|
||||
"ConjType_oper ": ConjType_oper, # cz, U,
|
||||
"ConjType_comp ": ConjType_comp, # cz, U,
|
||||
"Connegative_yes ": Connegative_yes, # fi,
|
||||
"Derivation_minen ": Derivation_minen, # fi,
|
||||
"Derivation_sti ": Derivation_sti, # fi,
|
||||
"Derivation_inen ": Derivation_inen, # fi,
|
||||
"Derivation_lainen ": Derivation_lainen, # fi,
|
||||
"Derivation_ja ": Derivation_ja, # fi,
|
||||
"Derivation_ton ": Derivation_ton, # fi,
|
||||
"Derivation_vs ": Derivation_vs, # fi,
|
||||
"Derivation_ttain ": Derivation_ttain, # fi,
|
||||
"Derivation_ttaa ": Derivation_ttaa, # fi,
|
||||
"Echo_rdp ": Echo_rdp, # U,
|
||||
"Echo_ech ": Echo_ech, # U,
|
||||
"Foreign_foreign ": Foreign_foreign, # cz, fi, U,
|
||||
"Foreign_fscript ": Foreign_fscript, # cz, fi, U,
|
||||
"Foreign_tscript ": Foreign_tscript, # cz, U,
|
||||
"Foreign_yes ": Foreign_yes, # sl,
|
||||
"Gender_dat_masc ": Gender_dat_masc, # bq, U,
|
||||
"Gender_dat_fem ": Gender_dat_fem, # bq, U,
|
||||
"Gender_erg_masc ": Gender_erg_masc, # bq,
|
||||
"Gender_erg_fem ": Gender_erg_fem, # bq,
|
||||
"Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
|
||||
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
|
||||
"Gender_psor_neut ": Gender_psor_neut, # sl,
|
||||
"Hyph_yes ": Hyph_yes, # cz, U,
|
||||
"InfForm_one ": InfForm_one, # fi,
|
||||
"InfForm_two ": InfForm_two, # fi,
|
||||
"InfForm_three ": InfForm_three, # fi,
|
||||
"NameType_geo ": NameType_geo, # U, cz,
|
||||
"NameType_prs ": NameType_prs, # U, cz,
|
||||
"NameType_giv ": NameType_giv, # U, cz,
|
||||
"NameType_sur ": NameType_sur, # U, cz,
|
||||
"NameType_nat ": NameType_nat, # U, cz,
|
||||
"NameType_com ": NameType_com, # U, cz,
|
||||
"NameType_pro ": NameType_pro, # U, cz,
|
||||
"NameType_oth ": NameType_oth, # U, cz,
|
||||
"NounType_com ": NounType_com, # U,
|
||||
"NounType_prop ": NounType_prop, # U,
|
||||
"NounType_class ": NounType_class, # U,
|
||||
"Number_abs_sing ": Number_abs_sing, # bq, U,
|
||||
"Number_abs_plur ": Number_abs_plur, # bq, U,
|
||||
"Number_dat_sing ": Number_dat_sing, # bq, U,
|
||||
"Number_dat_plur ": Number_dat_plur, # bq, U,
|
||||
"Number_erg_sing ": Number_erg_sing, # bq, U,
|
||||
"Number_erg_plur ": Number_erg_plur, # bq, U,
|
||||
"Number_psee_sing ": Number_psee_sing, # U,
|
||||
"Number_psee_plur ": Number_psee_plur, # U,
|
||||
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
|
||||
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
|
||||
"NumForm_digit ": NumForm_digit, # cz, sl, U,
|
||||
"NumForm_roman ": NumForm_roman, # cz, sl, U,
|
||||
"NumForm_word ": NumForm_word, # cz, sl, U,
|
||||
"NumValue_one ": NumValue_one, # cz, U,
|
||||
"NumValue_two ": NumValue_two, # cz, U,
|
||||
"NumValue_three ": NumValue_three, # cz, U,
|
||||
"PartForm_pres ": PartForm_pres, # fi,
|
||||
"PartForm_past ": PartForm_past, # fi,
|
||||
"PartForm_agt ": PartForm_agt, # fi,
|
||||
"PartForm_neg ": PartForm_neg, # fi,
|
||||
"PartType_mod ": PartType_mod, # U,
|
||||
"PartType_emp ": PartType_emp, # U,
|
||||
"PartType_res ": PartType_res, # U,
|
||||
"PartType_inf ": PartType_inf, # U,
|
||||
"PartType_vbp ": PartType_vbp, # U,
|
||||
"Person_abs_one ": Person_abs_one, # bq, U,
|
||||
"Person_abs_two ": Person_abs_two, # bq, U,
|
||||
"Person_abs_three ": Person_abs_three, # bq, U,
|
||||
"Person_dat_one ": Person_dat_one, # bq, U,
|
||||
"Person_dat_two ": Person_dat_two, # bq, U,
|
||||
"Person_dat_three ": Person_dat_three, # bq, U,
|
||||
"Person_erg_one ": Person_erg_one, # bq, U,
|
||||
"Person_erg_two ": Person_erg_two, # bq, U,
|
||||
"Person_erg_three ": Person_erg_three, # bq, U,
|
||||
"Person_psor_one ": Person_psor_one, # fi, U,
|
||||
"Person_psor_two ": Person_psor_two, # fi, U,
|
||||
"Person_psor_three ": Person_psor_three, # fi, U,
|
||||
"Polite_inf ": Polite_inf, # bq, U,
|
||||
"Polite_pol ": Polite_pol, # bq, U,
|
||||
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
|
||||
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
|
||||
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
|
||||
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
|
||||
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
|
||||
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
|
||||
"Prefix_yes ": Prefix_yes, # U,
|
||||
"PrepCase_npr ": PrepCase_npr, # cz,
|
||||
"PrepCase_pre ": PrepCase_pre, # U,
|
||||
"PunctSide_ini ": PunctSide_ini, # U,
|
||||
"PunctSide_fin ": PunctSide_fin, # U,
|
||||
"PunctType_peri ": PunctType_peri, # U,
|
||||
"PunctType_qest ": PunctType_qest, # U,
|
||||
"PunctType_excl ": PunctType_excl, # U,
|
||||
"PunctType_quot ": PunctType_quot, # U,
|
||||
"PunctType_brck ": PunctType_brck, # U,
|
||||
"PunctType_comm ": PunctType_comm, # U,
|
||||
"PunctType_colo ": PunctType_colo, # U,
|
||||
"PunctType_semi ": PunctType_semi, # U,
|
||||
"PunctType_dash ": PunctType_dash, # U,
|
||||
"Style_arch ": Style_arch, # cz, fi, U,
|
||||
"Style_rare ": Style_rare, # cz, fi, U,
|
||||
"Style_poet ": Style_poet, # cz, U,
|
||||
"Style_norm ": Style_norm, # cz, U,
|
||||
"Style_coll ": Style_coll, # cz, U,
|
||||
"Style_vrnc ": Style_vrnc, # cz, U,
|
||||
"Style_sing ": Style_sing, # cz, U,
|
||||
"Style_expr ": Style_expr, # cz, U,
|
||||
"Style_derg ": Style_derg, # cz, U,
|
||||
"Style_vulg ": Style_vulg, # cz, U,
|
||||
"Style_yes ": Style_yes, # fi, U,
|
||||
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
|
||||
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
|
||||
"VerbType_aux ": VerbType_aux, # U,
|
||||
"VerbType_cop ": VerbType_cop, # U,
|
||||
"VerbType_mod ": VerbType_mod, # U,
|
||||
"VerbType_light ": VerbType_light, # U,
|
||||
"ConjType_oper ": ConjType_oper, # cz, U,
|
||||
"ConjType_comp ": ConjType_comp, # cz, U,
|
||||
"Connegative_yes ": Connegative_yes, # fi,
|
||||
"Derivation_minen ": Derivation_minen, # fi,
|
||||
"Derivation_sti ": Derivation_sti, # fi,
|
||||
"Derivation_inen ": Derivation_inen, # fi,
|
||||
"Derivation_lainen ": Derivation_lainen, # fi,
|
||||
"Derivation_ja ": Derivation_ja, # fi,
|
||||
"Derivation_ton ": Derivation_ton, # fi,
|
||||
"Derivation_vs ": Derivation_vs, # fi,
|
||||
"Derivation_ttain ": Derivation_ttain, # fi,
|
||||
"Derivation_ttaa ": Derivation_ttaa, # fi,
|
||||
"Echo_rdp ": Echo_rdp, # U,
|
||||
"Echo_ech ": Echo_ech, # U,
|
||||
"Foreign_foreign ": Foreign_foreign, # cz, fi, U,
|
||||
"Foreign_fscript ": Foreign_fscript, # cz, fi, U,
|
||||
"Foreign_tscript ": Foreign_tscript, # cz, U,
|
||||
"Foreign_yes ": Foreign_yes, # sl,
|
||||
"Gender_dat_masc ": Gender_dat_masc, # bq, U,
|
||||
"Gender_dat_fem ": Gender_dat_fem, # bq, U,
|
||||
"Gender_erg_masc ": Gender_erg_masc, # bq,
|
||||
"Gender_erg_fem ": Gender_erg_fem, # bq,
|
||||
"Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
|
||||
"Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
|
||||
"Gender_psor_neut ": Gender_psor_neut, # sl,
|
||||
"Hyph_yes ": Hyph_yes, # cz, U,
|
||||
"InfForm_one ": InfForm_one, # fi,
|
||||
"InfForm_two ": InfForm_two, # fi,
|
||||
"InfForm_three ": InfForm_three, # fi,
|
||||
"NameType_geo ": NameType_geo, # U, cz,
|
||||
"NameType_prs ": NameType_prs, # U, cz,
|
||||
"NameType_giv ": NameType_giv, # U, cz,
|
||||
"NameType_sur ": NameType_sur, # U, cz,
|
||||
"NameType_nat ": NameType_nat, # U, cz,
|
||||
"NameType_com ": NameType_com, # U, cz,
|
||||
"NameType_pro ": NameType_pro, # U, cz,
|
||||
"NameType_oth ": NameType_oth, # U, cz,
|
||||
"NounType_com ": NounType_com, # U,
|
||||
"NounType_prop ": NounType_prop, # U,
|
||||
"NounType_class ": NounType_class, # U,
|
||||
"Number_abs_sing ": Number_abs_sing, # bq, U,
|
||||
"Number_abs_plur ": Number_abs_plur, # bq, U,
|
||||
"Number_dat_sing ": Number_dat_sing, # bq, U,
|
||||
"Number_dat_plur ": Number_dat_plur, # bq, U,
|
||||
"Number_erg_sing ": Number_erg_sing, # bq, U,
|
||||
"Number_erg_plur ": Number_erg_plur, # bq, U,
|
||||
"Number_psee_sing ": Number_psee_sing, # U,
|
||||
"Number_psee_plur ": Number_psee_plur, # U,
|
||||
"Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
|
||||
"Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
|
||||
"NumForm_digit ": NumForm_digit, # cz, sl, U,
|
||||
"NumForm_roman ": NumForm_roman, # cz, sl, U,
|
||||
"NumForm_word ": NumForm_word, # cz, sl, U,
|
||||
"NumValue_one ": NumValue_one, # cz, U,
|
||||
"NumValue_two ": NumValue_two, # cz, U,
|
||||
"NumValue_three ": NumValue_three, # cz, U,
|
||||
"PartForm_pres ": PartForm_pres, # fi,
|
||||
"PartForm_past ": PartForm_past, # fi,
|
||||
"PartForm_agt ": PartForm_agt, # fi,
|
||||
"PartForm_neg ": PartForm_neg, # fi,
|
||||
"PartType_mod ": PartType_mod, # U,
|
||||
"PartType_emp ": PartType_emp, # U,
|
||||
"PartType_res ": PartType_res, # U,
|
||||
"PartType_inf ": PartType_inf, # U,
|
||||
"PartType_vbp ": PartType_vbp, # U,
|
||||
"Person_abs_one ": Person_abs_one, # bq, U,
|
||||
"Person_abs_two ": Person_abs_two, # bq, U,
|
||||
"Person_abs_three ": Person_abs_three, # bq, U,
|
||||
"Person_dat_one ": Person_dat_one, # bq, U,
|
||||
"Person_dat_two ": Person_dat_two, # bq, U,
|
||||
"Person_dat_three ": Person_dat_three, # bq, U,
|
||||
"Person_erg_one ": Person_erg_one, # bq, U,
|
||||
"Person_erg_two ": Person_erg_two, # bq, U,
|
||||
"Person_erg_three ": Person_erg_three, # bq, U,
|
||||
"Person_psor_one ": Person_psor_one, # fi, U,
|
||||
"Person_psor_two ": Person_psor_two, # fi, U,
|
||||
"Person_psor_three ": Person_psor_three, # fi, U,
|
||||
"Polite_inf ": Polite_inf, # bq, U,
|
||||
"Polite_pol ": Polite_pol, # bq, U,
|
||||
"Polite_abs_inf ": Polite_abs_inf, # bq, U,
|
||||
"Polite_abs_pol ": Polite_abs_pol, # bq, U,
|
||||
"Polite_erg_inf ": Polite_erg_inf, # bq, U,
|
||||
"Polite_erg_pol ": Polite_erg_pol, # bq, U,
|
||||
"Polite_dat_inf ": Polite_dat_inf, # bq, U,
|
||||
"Polite_dat_pol ": Polite_dat_pol, # bq, U,
|
||||
"Prefix_yes ": Prefix_yes, # U,
|
||||
"PrepCase_npr ": PrepCase_npr, # cz,
|
||||
"PrepCase_pre ": PrepCase_pre, # U,
|
||||
"PunctSide_ini ": PunctSide_ini, # U,
|
||||
"PunctSide_fin ": PunctSide_fin, # U,
|
||||
"PunctType_peri ": PunctType_peri, # U,
|
||||
"PunctType_qest ": PunctType_qest, # U,
|
||||
"PunctType_excl ": PunctType_excl, # U,
|
||||
"PunctType_quot ": PunctType_quot, # U,
|
||||
"PunctType_brck ": PunctType_brck, # U,
|
||||
"PunctType_comm ": PunctType_comm, # U,
|
||||
"PunctType_colo ": PunctType_colo, # U,
|
||||
"PunctType_semi ": PunctType_semi, # U,
|
||||
"PunctType_dash ": PunctType_dash, # U,
|
||||
"Style_arch ": Style_arch, # cz, fi, U,
|
||||
"Style_rare ": Style_rare, # cz, fi, U,
|
||||
"Style_poet ": Style_poet, # cz, U,
|
||||
"Style_norm ": Style_norm, # cz, U,
|
||||
"Style_coll ": Style_coll, # cz, U,
|
||||
"Style_vrnc ": Style_vrnc, # cz, U,
|
||||
"Style_sing ": Style_sing, # cz, U,
|
||||
"Style_expr ": Style_expr, # cz, U,
|
||||
"Style_derg ": Style_derg, # cz, U,
|
||||
"Style_vulg ": Style_vulg, # cz, U,
|
||||
"Style_yes ": Style_yes, # fi, U,
|
||||
"StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
|
||||
"StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
|
||||
"VerbType_aux ": VerbType_aux, # U,
|
||||
"VerbType_cop ": VerbType_cop, # U,
|
||||
"VerbType_mod ": VerbType_mod, # U,
|
||||
"VerbType_light ": VerbType_light, # U,
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ IDS = {
|
|||
"ADP": ADP,
|
||||
"ADV": ADV,
|
||||
"AUX": AUX,
|
||||
"CONJ": CONJ, # U20
|
||||
"CONJ": CONJ, # U20
|
||||
"CCONJ": CCONJ,
|
||||
"DET": DET,
|
||||
"INTJ": INTJ,
|
||||
|
|
|
@ -3,26 +3,17 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from thinc.api import chain, layerize, with_getitem
|
||||
import numpy
|
||||
cimport numpy as np
|
||||
import cytoolz
|
||||
import util
|
||||
from collections import OrderedDict
|
||||
import ujson
|
||||
import msgpack
|
||||
|
||||
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
|
||||
from thinc.i2v import HashEmbed
|
||||
from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
|
||||
from thinc.t2t import ExtractWindow, ParametricAttention
|
||||
from thinc.misc import Residual
|
||||
from thinc.misc import BatchNorm as BN
|
||||
from thinc.misc import LayerNorm as LN
|
||||
|
||||
from thinc.api import chain
|
||||
from thinc.v2v import Softmax
|
||||
from thinc.t2v import Pooling, max_pool, mean_pool
|
||||
from thinc.neural.util import to_categorical
|
||||
|
||||
from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
||||
|
||||
from .tokens.doc cimport Doc
|
||||
|
@ -30,29 +21,23 @@ from .syntax.nn_parser cimport Parser
|
|||
from .syntax import nonproj
|
||||
from .syntax.ner cimport BiluoPushDown
|
||||
from .syntax.arc_eager cimport ArcEager
|
||||
from .tagger import Tagger
|
||||
from .syntax.stateclass cimport StateClass
|
||||
from .gold cimport GoldParse
|
||||
from .morphology cimport Morphology
|
||||
from .vocab cimport Vocab
|
||||
from .syntax import nonproj
|
||||
from .compat import json_dumps
|
||||
|
||||
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
|
||||
from ._ml import Tok2Vec, flatten
|
||||
from ._ml import build_text_classifier, build_tagger_model
|
||||
from ._ml import link_vectors_to_models
|
||||
from .attrs import POS
|
||||
from .parts_of_speech import X
|
||||
from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
|
||||
from ._ml import link_vectors_to_models
|
||||
from . import util
|
||||
|
||||
|
||||
class SentenceSegmenter(object):
|
||||
"""A simple spaCy hook, to allow custom sentence boundary detection logic
|
||||
(that doesn't require the dependency parse).
|
||||
|
||||
To change the sentence boundary detection strategy, pass a generator
|
||||
function `strategy` on initialization, or assign a new strategy to
|
||||
the .strategy attribute.
|
||||
|
||||
(that doesn't require the dependency parse). To change the sentence
|
||||
boundary detection strategy, pass a generator function `strategy` on
|
||||
initialization, or assign a new strategy to the .strategy attribute.
|
||||
Sentence detection strategies should be generators that take `Doc` objects
|
||||
and yield `Span` objects for each sentence.
|
||||
"""
|
||||
|
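
As the docstring above describes, a sentence-boundary strategy for `SentenceSegmenter` is just a generator that takes a `Doc` and yields one `Span` per sentence. A sketch of a custom strategy that splits on newline tokens instead of sentence-final punctuation; it is shown being called directly, and under the API above it could be passed as the `strategy` argument when constructing the component:

# Sketch of a custom sentence strategy: a generator over a Doc yielding Spans.
import spacy


def split_on_newlines(doc):
    start = 0
    for word in doc:
        if '\n' in word.text:
            yield doc[start:word.i + 1]
            start = word.i + 1
    if start < len(doc):
        yield doc[start:len(doc)]


nlp = spacy.blank('en')
doc = nlp(u'first line\nsecond line')
print([span.text for span in split_on_newlines(doc)])
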
@ -74,16 +59,20 @@ class SentenceSegmenter(object):
|
|||
seen_period = False
|
||||
for i, word in enumerate(doc):
|
||||
if seen_period and not word.is_punct:
|
||||
yield doc[start : word.i]
|
||||
yield doc[start:word.i]
|
||||
start = word.i
|
||||
seen_period = False
|
||||
elif word.text in ['.', '!', '?']:
|
||||
seen_period = True
|
||||
if start < len(doc):
|
||||
yield doc[start : len(doc)]
|
||||
yield doc[start:len(doc)]
|
||||
|
||||
|
||||
class Pipe(object):
|
||||
"""This class is not instantiated directly. Components inherit from it, and
|
||||
it defines the interface that components should follow to function as
|
||||
components in a spaCy analysis pipeline.
|
||||
"""
|
||||
name = None
|
||||
|
||||
@classmethod
|
||||
|
@ -149,8 +138,7 @@ class Pipe(object):
|
|||
link_vectors_to_models(self.vocab)
|
||||
|
||||
def use_params(self, params):
|
||||
"""Modify the pipe's model, to use the given parameter values.
|
||||
"""
|
||||
"""Modify the pipe's model, to use the given parameter values."""
|
||||
with self.model.use_params(params):
|
||||
yield
|
||||
|
||||
|
@ -235,8 +223,8 @@ class Tensorizer(Pipe):
|
|||
"""Construct a new statistical model. Weights are not allocated on
|
||||
initialisation.
|
||||
|
||||
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
|
||||
instance with the `Doc` objects it will process.
|
||||
vocab (Vocab): A `Vocab` instance. The model must share the same
|
||||
`Vocab` instance with the `Doc` objects it will process.
|
||||
model (Model): A `Model` instance or `True` allocate one later.
|
||||
**cfg: Config parameters.
|
||||
|
||||
|
@ -280,7 +268,7 @@ class Tensorizer(Pipe):
|
|||
"""Return a single tensor for a batch of documents.
|
||||
|
||||
docs (iterable): A sequence of `Doc` objects.
|
||||
RETURNS (object): Vector representations for each token in the documents.
|
||||
RETURNS (object): Vector representations for each token in the docs.
|
||||
"""
|
||||
tokvecs = self.model(docs)
|
||||
return tokvecs
|
||||
|
@ -289,7 +277,7 @@ class Tensorizer(Pipe):
|
|||
"""Set the tensor attribute for a batch of documents.
|
||||
|
||||
docs (iterable): A sequence of `Doc` objects.
|
||||
tokvecs (object): Vector representation for each token in the documents.
|
||||
tokvecs (object): Vector representation for each token in the docs.
|
||||
"""
|
||||
for doc, tokvecs in zip(docs, tokvecses):
|
||||
assert tokvecs.shape[0] == len(doc)
|
||||
|
@ -328,12 +316,14 @@ class Tensorizer(Pipe):
|
|||
|
||||
class Tagger(Pipe):
|
||||
name = 'tagger'
|
||||
|
||||
def __init__(self, vocab, model=True, **cfg):
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.cfg = dict(cfg)
|
||||
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
||||
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
|
||||
self.cfg.setdefault('pretrained_dims',
|
||||
self.vocab.vectors.data.shape[1])
|
||||
|
||||
def __call__(self, doc):
|
||||
tags = self.predict([doc])
|
||||
|
@ -353,8 +343,7 @@ class Tagger(Pipe):
|
|||
guesses = scores.argmax(axis=1)
|
||||
if not isinstance(guesses, numpy.ndarray):
|
||||
guesses = guesses.get()
|
||||
guesses = self.model.ops.unflatten(guesses,
|
||||
[len(d) for d in docs])
|
||||
guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs])
|
||||
return guesses
|
||||
|
||||
def set_annotations(self, docs, batch_tag_ids):
|
||||
|
@ -387,8 +376,8 @@ class Tagger(Pipe):
|
|||
|
||||
def get_loss(self, docs, golds, scores):
|
||||
scores = self.model.ops.flatten(scores)
|
||||
tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)}
|
||||
|
||||
tag_index = {tag: i
|
||||
for i, tag in enumerate(self.vocab.morphology.tag_names)}
|
||||
cdef int idx = 0
|
||||
correct = numpy.zeros((scores.shape[0],), dtype='i')
|
||||
guesses = scores.argmax(axis=1)
|
||||
|
@ -443,17 +432,18 @@ class Tagger(Pipe):
|
|||
serialize['model'] = self.model.to_bytes
|
||||
serialize['vocab'] = self.vocab.to_bytes
|
||||
|
||||
serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map,
|
||||
use_bin_type=True,
|
||||
encoding='utf8')
|
||||
serialize['tag_map'] = lambda: msgpack.dumps(
|
||||
self.vocab.morphology.tag_map, use_bin_type=True, encoding='utf8')
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
def load_model(b):
|
||||
if self.model is True:
|
||||
token_vector_width = util.env_opt('token_vector_width',
|
||||
self.cfg.get('token_vector_width', 128))
|
||||
self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
|
||||
token_vector_width = util.env_opt(
|
||||
'token_vector_width',
|
||||
self.cfg.get('token_vector_width', 128))
|
||||
self.model = self.Model(self.vocab.morphology.n_tags,
|
||||
**self.cfg)
|
||||
self.model.from_bytes(b)
|
||||
|
||||
def load_tag_map(b):
|
||||
|
@ -509,11 +499,11 @@ class Tagger(Pipe):
|
|||
|
||||
|
||||
class MultitaskObjective(Tagger):
|
||||
'''Assist training of a parser or tagger, by training a side-objective.
|
||||
|
||||
Experimental
|
||||
'''
|
||||
"""Experimental: Assist training of a parser or tagger, by training a
|
||||
side-objective.
|
||||
"""
|
||||
name = 'nn_labeller'
|
||||
|
||||
def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@ -530,12 +520,12 @@ class MultitaskObjective(Tagger):
|
|||
elif hasattr(target, '__call__'):
|
||||
self.make_label = target
|
||||
else:
|
||||
raise ValueError(
|
||||
"MultitaskObjective target should be function or one of "
|
||||
"['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
|
||||
raise ValueError("MultitaskObjective target should be function or "
|
||||
"one of: dep, tag, ent, dep_tag_offset, ent_tag.")
|
||||
self.cfg = dict(cfg)
|
||||
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
||||
self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
|
||||
self.cfg.setdefault('pretrained_dims',
|
||||
self.vocab.vectors.data.shape[1])
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
|
@ -623,20 +613,19 @@ class MultitaskObjective(Tagger):
|
|||
|
||||
class SimilarityHook(Pipe):
|
||||
"""
|
||||
Experimental
|
||||
Experimental: A pipeline component to install a hook for supervised
|
||||
similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
|
||||
documents. The similarity model can be any object obeying the Thinc `Model`
|
||||
interface. By default, the model concatenates the elementwise mean and
|
||||
elementwise max of the two tensors, and compares them using the
|
||||
Cauchy-like similarity function from Chen (2013):
|
||||
|
||||
A pipeline component to install a hook for supervised similarity into
|
||||
Doc objects. Requires a Tensorizer to pre-process documents. The similarity
|
||||
model can be any object obeying the Thinc Model interface. By default,
|
||||
the model concatenates the elementwise mean and elementwise max of the two
|
||||
tensors, and compares them using the Cauchy-like similarity function
|
||||
from Chen (2013):
|
||||
|
||||
similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
|
||||
>>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
|
||||
|
||||
Where W is a vector of dimension weights, initialized to 1.
|
||||
"""
|
||||
name = 'similarity'
|
||||
|
||||
def __init__(self, vocab, model=True, **cfg):
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@ -662,8 +651,7 @@ class SimilarityHook(Pipe):
|
|||
sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
|
||||
|
||||
def begin_training(self, _=tuple(), pipeline=None):
|
||||
"""
|
||||
Allocate model, using width from tensorizer in pipeline.
|
||||
"""Allocate model, using width from tensorizer in pipeline.
|
||||
|
||||
gold_tuples (iterable): Gold-standard training data.
|
||||
pipeline (list): The pipeline the model is part of.
|
||||
|
@ -763,12 +751,14 @@ cdef class DependencyParser(Parser):
|
|||
for target in []:
|
||||
labeller = MultitaskObjective(self.vocab, target=target)
|
||||
tok2vec = self.model[0]
|
||||
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
|
||||
labeller.begin_training(gold_tuples, pipeline=pipeline,
|
||||
tok2vec=tok2vec)
|
||||
pipeline.append(labeller)
|
||||
self._multitasks.append(labeller)
|
||||
|
||||
def __reduce__(self):
|
||||
return (DependencyParser, (self.vocab, self.moves, self.model), None, None)
|
||||
return (DependencyParser, (self.vocab, self.moves, self.model),
|
||||
None, None)
|
||||
|
||||
|
||||
cdef class EntityRecognizer(Parser):
|
||||
|
@ -781,12 +771,14 @@ cdef class EntityRecognizer(Parser):
|
|||
for target in []:
|
||||
labeller = MultitaskObjective(self.vocab, target=target)
|
||||
tok2vec = self.model[0]
|
||||
labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
|
||||
labeller.begin_training(gold_tuples, pipeline=pipeline,
|
||||
tok2vec=tok2vec)
|
||||
pipeline.append(labeller)
|
||||
self._multitasks.append(labeller)
|
||||
|
||||
def __reduce__(self):
|
||||
return (EntityRecognizer, (self.vocab, self.moves, self.model), None, None)
|
||||
return (EntityRecognizer, (self.vocab, self.moves, self.model),
|
||||
None, None)
|
||||
|
||||
|
||||
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']
|
||||
|
|
|
@ -74,18 +74,21 @@ class Scorer(object):
|
|||
@property
|
||||
def scores(self):
|
||||
return {
|
||||
'uas': self.uas, 'las': self.las,
|
||||
'ents_p': self.ents_p, 'ents_r': self.ents_r, 'ents_f': self.ents_f,
|
||||
'uas': self.uas,
|
||||
'las': self.las,
|
||||
'ents_p': self.ents_p,
|
||||
'ents_r': self.ents_r,
|
||||
'ents_f': self.ents_f,
|
||||
'tags_acc': self.tags_acc,
|
||||
'token_acc': self.token_acc
|
||||
}
|
||||
|
||||
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
|
||||
assert len(tokens) == len(gold)
|
||||
|
||||
gold_deps = set()
|
||||
gold_tags = set()
|
||||
gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
|
||||
gold_ents = set(tags_to_entities([annot[-1]
|
||||
for annot in gold.orig_annot]))
|
||||
for id_, word, tag, head, dep, ner in gold.orig_annot:
|
||||
gold_tags.add((id_, tag))
|
||||
if dep not in (None, "") and dep.lower() not in punct_labels:
|
||||
|
|
|
@ -4,19 +4,15 @@ from __future__ import unicode_literals, absolute_import
|
|||
|
||||
cimport cython
|
||||
from libc.string cimport memcpy
|
||||
from libc.stdint cimport uint64_t, uint32_t
|
||||
from murmurhash.mrmr cimport hash64, hash32
|
||||
from preshed.maps cimport map_iter, key_t
|
||||
from libc.stdint cimport uint32_t
|
||||
from murmurhash.mrmr cimport hash64, hash32
|
||||
import ujson
|
||||
import dill
|
||||
|
||||
from .symbols import IDS as SYMBOLS_BY_STR
|
||||
from .symbols import NAMES as SYMBOLS_BY_INT
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
from . import util
|
||||
from .compat import json_dumps
|
||||
from . import util
|
||||
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0:
|
||||
|
@ -195,7 +191,7 @@ cdef class StringStore:
|
|||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
strings = list(self)
|
||||
|
@ -225,7 +221,7 @@ cdef class StringStore:
|
|||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `StringStore` object.
|
||||
"""
|
||||
return ujson.dumps(list(self))
|
||||
return json_dumps(list(self))
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
"""Load state from a binary string.
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
# coding: utf8
|
||||
#cython: optimize.unpack_method_calls=False
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
IDS = {
|
||||
"": NIL,
|
||||
"IS_ALPHA": IS_ALPHA,
|
||||
|
@ -464,9 +464,11 @@ IDS = {
|
|||
"LAW": LAW
|
||||
}
|
||||
|
||||
|
||||
def sort_nums(x):
|
||||
return x[1]
|
||||
|
||||
|
||||
NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
|
||||
# Unfortunate hack here, to work around problem with long cpdef enum
|
||||
# (which is generating an enormous amount of C++ in Cython 0.24+)
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# cython: profile=True
|
||||
cimport numpy as np
|
||||
import numpy
|
||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||
from cpython.ref cimport PyObject, Py_XDECREF
|
||||
from thinc.extra.search cimport Beam
|
||||
from thinc.extra.search import MaxViolation
|
||||
from thinc.typedefs cimport hash_t, class_t
|
||||
|
@ -11,7 +11,6 @@ from thinc.extra.search cimport MaxViolation
|
|||
from .transition_system cimport TransitionSystem, Transition
|
||||
from .stateclass cimport StateClass
|
||||
from ..gold cimport GoldParse
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
|
||||
# These are passed as callbacks to thinc.search.Beam
|
||||
|
@ -50,7 +49,7 @@ cdef class ParserBeam(object):
|
|||
cdef public object dones
|
||||
|
||||
def __init__(self, TransitionSystem moves, states, golds,
|
||||
int width, float density):
|
||||
int width, float density):
|
||||
self.moves = moves
|
||||
self.states = states
|
||||
self.golds = golds
|
||||
|
@ -59,7 +58,8 @@ cdef class ParserBeam(object):
|
|||
cdef StateClass state, st
|
||||
for state in states:
|
||||
beam = Beam(self.moves.n_moves, width, density)
|
||||
beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
|
||||
beam.initialize(self.moves.init_beam_state, state.c.length,
|
||||
state.c._sent)
|
||||
for i in range(beam.width):
|
||||
st = <StateClass>beam.at(i)
|
||||
st.c.offset = state.c.offset
|
||||
|
@ -74,7 +74,8 @@ cdef class ParserBeam(object):
|
|||
|
||||
@property
|
||||
def is_done(self):
|
||||
return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
|
||||
return all(b.is_done or self.dones[i]
|
||||
for i, b in enumerate(self.beams))
|
||||
|
||||
def __getitem__(self, i):
|
||||
return self.beams[i]
|
||||
|
@ -126,7 +127,8 @@ cdef class ParserBeam(object):
|
|||
for i in range(beam.size):
|
||||
state = <StateClass>beam.at(i)
|
||||
if not state.c.is_final():
|
||||
self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
|
||||
self.moves.set_costs(beam.is_valid[i], beam.costs[i],
|
||||
state, gold)
|
||||
if follow_gold:
|
||||
for j in range(beam.nr_class):
|
||||
if beam.costs[i][j] >= 1:
|
||||
|
@ -146,7 +148,10 @@ def get_token_ids(states, int n_tokens):
|
|||
c_ids += ids.shape[1]
|
||||
return ids
|
||||
|
||||
|
||||
nr_update = 0
|
||||
|
||||
|
||||
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
||||
states, golds,
|
||||
state2vec, vec2scores,
|
||||
|
@ -167,23 +172,27 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
|||
if pbeam.is_done and gbeam.is_done:
|
||||
break
|
||||
# The beam maps let us find the right row in the flattened scores
|
||||
# arrays for each state. States are identified by (example id, history).
|
||||
# We keep a different beam map for each step (since we'll have a flat
|
||||
# scores array for each step). The beam map will let us take the per-state
|
||||
# losses, and compute the gradient for each (step, state, class).
|
||||
# arrays for each state. States are identified by (example id,
|
||||
# history). We keep a different beam map for each step (since we'll
|
||||
# have a flat scores array for each step). The beam map will let us
|
||||
# take the per-state losses, and compute the gradient for each (step,
|
||||
# state, class).
|
||||
beam_maps.append({})
|
||||
# Gather all states from the two beams in a list. Some states may occur
|
||||
# in both beams. To figure out which beam each state belonged to,
|
||||
# we keep two lists of indices, p_indices and g_indices
|
||||
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
|
||||
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1],
|
||||
nr_update)
|
||||
if not states:
|
||||
break
|
||||
# Now that we have our flat list of states, feed them through the model
|
||||
token_ids = get_token_ids(states, nr_feature)
|
||||
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
|
||||
if hist_feats:
|
||||
hists = numpy.asarray([st.history[:hist_feats] for st in states], dtype='i')
|
||||
scores, bp_scores = vec2scores.begin_update((vectors, hists), drop=drop)
|
||||
hists = numpy.asarray([st.history[:hist_feats] for st in states],
|
||||
dtype='i')
|
||||
scores, bp_scores = vec2scores.begin_update((vectors, hists),
|
||||
drop=drop)
|
||||
else:
|
||||
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
|
||||
|
||||
|
@ -192,8 +201,10 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
|||
|
||||
# Unpack the flat scores into lists for the two beams. The indices arrays
|
||||
# tell us which example and state the scores-row refers to.
|
||||
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
|
||||
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
|
||||
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
|
||||
for indices in p_indices]
|
||||
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
|
||||
for indices in g_indices]
|
||||
# Now advance the states in the beams. The gold beam is constrained
|
||||
# to follow only gold analyses.
|
||||
pbeam.advance(p_scores)
|
||||
|
@ -249,8 +260,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
|
|||
|
||||
|
||||
def get_gradient(nr_class, beam_maps, histories, losses):
|
||||
"""
|
||||
The global model assigns a loss to each parse. The beam scores
|
||||
"""The global model assigns a loss to each parse. The beam scores
|
||||
are additive, so the same gradient is applied to each action
|
||||
in the history. This gives the gradient of a single *action*
|
||||
for a beam state -- so we have "the gradient of loss for taking
|
||||
|
@ -270,7 +280,8 @@ def get_gradient(nr_class, beam_maps, histories, losses):
|
|||
if loss != 0.0 and not numpy.isnan(loss):
|
||||
nr_step = max(nr_step, len(hist))
|
||||
for i in range(nr_step):
|
||||
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
|
||||
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
|
||||
dtype='f'))
|
||||
assert len(histories) == len(losses)
|
||||
for eg_id, hists in enumerate(histories):
|
||||
for loss, hist in zip(losses[eg_id], hists):
|
||||
|
@ -287,5 +298,3 @@ def get_gradient(nr_class, beam_maps, histories, losses):
|
|||
grads[j][i, clas] += loss
|
||||
key = key + tuple([clas])
|
||||
return grads
|
||||
|
||||
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
# test
|
|
@ -4,24 +4,16 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||
import ctypes
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.string cimport memcpy
|
||||
from cpython.ref cimport Py_INCREF
|
||||
from cymem.cymem cimport Pool
|
||||
from collections import OrderedDict
|
||||
from thinc.extra.search cimport Beam
|
||||
import numpy
|
||||
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC, is_space_token
|
||||
from ._state cimport StateC
|
||||
from .nonproj import is_nonproj_tree
|
||||
from .transition_system cimport do_func_t, get_cost_func_t
|
||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||
from ..gold cimport GoldParse
|
||||
from ..gold cimport GoldParseC
|
||||
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..gold cimport GoldParse, GoldParseC
|
||||
from ..structs cimport TokenC
|
||||
|
||||
|
||||
|
@ -316,14 +308,13 @@ cdef class ArcEager(TransitionSystem):
|
|||
|
||||
@classmethod
|
||||
def get_actions(cls, **kwargs):
|
||||
actions = kwargs.get('actions',
|
||||
OrderedDict((
|
||||
(SHIFT, ['']),
|
||||
(REDUCE, ['']),
|
||||
(RIGHT, []),
|
||||
(LEFT, []),
|
||||
(BREAK, ['ROOT'])
|
||||
)))
|
||||
actions = kwargs.get('actions', OrderedDict((
|
||||
(SHIFT, ['']),
|
||||
(REDUCE, ['']),
|
||||
(RIGHT, []),
|
||||
(LEFT, []),
|
||||
(BREAK, ['ROOT']))
|
||||
))
|
||||
seen_actions = set()
|
||||
for label in kwargs.get('left_labels', []):
|
||||
if label.upper() != 'ROOT':
|
||||
|
@ -363,7 +354,8 @@ cdef class ArcEager(TransitionSystem):
|
|||
if gold.cand_to_gold[i] is None:
|
||||
continue
|
||||
if state.safe_get(i).dep:
|
||||
predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
|
||||
predicted.add((i, state.H(i),
|
||||
self.strings[state.safe_get(i).dep]))
|
||||
else:
|
||||
predicted.add((i, state.H(i), 'ROOT'))
|
||||
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
|
||||
|
@ -381,7 +373,8 @@ cdef class ArcEager(TransitionSystem):
|
|||
if not self.has_gold(gold):
|
||||
return None
|
||||
for i in range(gold.length):
|
||||
if gold.heads[i] is None or gold.labels[i] is None: # Missing values
|
||||
# Missing values
|
||||
if gold.heads[i] is None or gold.labels[i] is None:
|
||||
gold.c.heads[i] = i
|
||||
gold.c.has_dep[i] = False
|
||||
else:
|
||||
|
@ -517,14 +510,15 @@ cdef class ArcEager(TransitionSystem):
|
|||
# Check projectivity --- leading cause
|
||||
if is_nonproj_tree(gold.heads):
|
||||
raise ValueError(
|
||||
"Could not find a gold-standard action to supervise the dependency "
|
||||
"parser.\n"
|
||||
"Likely cause: the tree is non-projective (i.e. it has crossing "
|
||||
"arcs -- see spacy/syntax/nonproj.pyx for definitions)\n"
|
||||
"The ArcEager transition system only supports projective trees.\n"
|
||||
"To learn non-projective representations, transform the data "
|
||||
"before training and after parsing. Either pass make_projective=True "
|
||||
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
|
||||
"Could not find a gold-standard action to supervise the "
|
||||
"dependency parser. Likely cause: the tree is "
|
||||
"non-projective (i.e. it has crossing arcs -- see "
|
||||
"spacy/syntax/nonproj.pyx for definitions). The ArcEager "
|
||||
"transition system only supports projective trees. To "
|
||||
"learn non-projective representations, transform the data "
|
||||
"before training and after parsing. Either pass "
|
||||
"make_projective=True to the GoldParse class, or use "
|
||||
"spacy.syntax.nonproj.preprocess_training_data.")
|
||||
else:
|
||||
print(gold.orig_annot)
|
||||
print(gold.words)
|
||||
|
@ -532,12 +526,10 @@ cdef class ArcEager(TransitionSystem):
|
|||
print(gold.labels)
|
||||
print(gold.sent_starts)
|
||||
raise ValueError(
|
||||
"Could not find a gold-standard action to supervise the dependency "
|
||||
"parser.\n"
|
||||
"The GoldParse was projective.\n"
|
||||
"The transition system has %d actions.\n"
|
||||
"State at failure:\n"
|
||||
"%s" % (self.n_moves, stcls.print_state(gold.words)))
|
||||
"Could not find a gold-standard action to supervise the"
|
||||
"dependency parser. The GoldParse was projective. The "
|
||||
"transition system has %d actions. State at failure: %s"
|
||||
% (self.n_moves, stcls.print_state(gold.words)))
|
||||
assert n_gold >= 1
|
||||
|
||||
def get_beam_annot(self, Beam beam):
|
||||
|
@ -558,4 +550,3 @@ cdef class ArcEager(TransitionSystem):
|
|||
deps[j].setdefault(dep, 0.0)
|
||||
deps[j][dep] += prob
|
||||
return heads, deps
|
||||
|
||||
|
|
|
@ -1,144 +0,0 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX
|
||||
|
||||
|
||||
def english_noun_chunks(obj):
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse.
|
||||
Works on both Doc and Span.
|
||||
"""
|
||||
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
|
||||
'attr', 'ROOT']
|
||||
doc = obj.doc # Ensure works on both Doc and Span.
|
||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
conj = doc.vocab.strings.add('conj')
|
||||
np_label = doc.vocab.strings.add('NP')
|
||||
seen = set()
|
||||
for i, word in enumerate(obj):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.i in seen:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.i+1))
|
||||
yield word.left_edge.i, word.i+1, np_label
|
||||
elif word.dep == conj:
|
||||
head = word.head
|
||||
while head.dep == conj and head.head.i < head.i:
|
||||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.i+1))
|
||||
yield word.left_edge.i, word.i+1, np_label
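These iterators back the `Doc.noun_chunks` property via the CHUNKERS table at the bottom of the file. A rough usage sketch, assuming an installed English model (the model name and the expected chunks are illustrative):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'The quick brown fox jumped over the lazy dog.')
    for chunk in doc.noun_chunks:
        print(chunk.text, chunk.label_, chunk.root.dep_)
    # e.g. 'The quick brown fox' (NP, nsubj) and 'the lazy dog' (NP, pobj)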
|
||||
|
||||
|
||||
# this iterator extracts spans headed by NOUNs starting from the left-most
|
||||
# syntactic dependent until the NOUN itself
|
||||
# for close apposition and measurement construction, the span is sometimes
|
||||
# extended to the right of the NOUN
|
||||
# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
|
||||
# just "eine Tasse", same for "das Thema Familie"
|
||||
def german_noun_chunks(obj):
|
||||
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
|
||||
doc = obj.doc # Ensure works on both Doc and Span.
|
||||
np_label = doc.vocab.strings.add('NP')
|
||||
np_deps = set(doc.vocab.strings.add(label) for label in labels)
|
||||
close_app = doc.vocab.strings.add('nk')
|
||||
|
||||
rbracket = 0
|
||||
for i, word in enumerate(obj):
|
||||
if i < rbracket:
|
||||
continue
|
||||
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
|
||||
rbracket = word.i+1
|
||||
# try to extend the span to the right
|
||||
# to capture close apposition/measurement constructions
|
||||
for rdep in doc[word.i].rights:
|
||||
if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:
|
||||
rbracket = rdep.i+1
|
||||
yield word.left_edge.i, rbracket, np_label
|
||||
|
||||
|
||||
def es_noun_chunks(obj):
|
||||
doc = obj.doc
|
||||
np_label = doc.vocab.strings['NP']
|
||||
left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
|
||||
right_labels = ['flat', 'fixed', 'compound', 'neg']
|
||||
stop_labels = ['punct']
|
||||
np_left_deps = [doc.vocab.strings[label] for label in left_labels]
|
||||
np_right_deps = [doc.vocab.strings[label] for label in right_labels]
|
||||
stop_deps = [doc.vocab.strings[label] for label in stop_labels]
|
||||
|
||||
def next_token(token):
|
||||
try:
|
||||
return token.nbor()
|
||||
except:
|
||||
return None
|
||||
|
||||
def noun_bounds(root):
|
||||
def is_verb_token(token):
|
||||
return token.pos in [VERB, AUX]
|
||||
|
||||
left_bound = root
|
||||
for token in reversed(list(root.lefts)):
|
||||
if token.dep in np_left_deps:
|
||||
left_bound = token
|
||||
right_bound = root
|
||||
for token in root.rights:
|
||||
if (token.dep in np_right_deps):
|
||||
left, right = noun_bounds(token)
|
||||
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
|
||||
doc[left_bound.i: right.i])):
|
||||
break
|
||||
else:
|
||||
right_bound = right
|
||||
return left_bound, right_bound
|
||||
|
||||
token = doc[0]
|
||||
while token and token.i < len(doc):
|
||||
if token.pos in [PROPN, NOUN, PRON]:
|
||||
left, right = noun_bounds(token)
|
||||
yield left.i, right.i+1, np_label
|
||||
token = right
|
||||
token = next_token(token)
|
||||
|
||||
|
||||
def french_noun_chunks(obj):
|
||||
labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
|
||||
doc = obj.doc # Ensure works on both Doc and Span.
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add('conj')
|
||||
np_label = doc.vocab.strings.add('NP')
|
||||
seen = set()
|
||||
for i, word in enumerate(obj):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.i in seen:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||
elif word.dep == conj:
|
||||
head = word.head
|
||||
while head.dep == conj and head.head.i < head.i:
|
||||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
|
||||
yield word.left_edge.i, word.right_edge.i+1, np_label
|
||||
|
||||
|
||||
CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,
|
||||
'es': es_noun_chunks, 'fr': french_noun_chunks}
|
|
@ -4,17 +4,12 @@ from __future__ import unicode_literals
|
|||
from thinc.typedefs cimport weight_t
|
||||
from thinc.extra.search cimport Beam
|
||||
from collections import OrderedDict
|
||||
import numpy
|
||||
from thinc.neural.ops import NumpyOps
|
||||
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
from .transition_system cimport Transition
|
||||
from .transition_system cimport do_func_t
|
||||
from ..structs cimport TokenC, Entity
|
||||
from ..gold cimport GoldParseC
|
||||
from ..gold cimport GoldParse
|
||||
from ..attrs cimport ENT_TYPE, ENT_IOB
|
||||
from ..gold cimport GoldParseC, GoldParse
|
||||
|
||||
|
||||
cdef enum:
|
||||
|
@ -69,15 +64,14 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
|
||||
@classmethod
|
||||
def get_actions(cls, **kwargs):
|
||||
actions = kwargs.get('actions',
|
||||
OrderedDict((
|
||||
(MISSING, ['']),
|
||||
(BEGIN, []),
|
||||
(IN, []),
|
||||
(LAST, []),
|
||||
(UNIT, []),
|
||||
(OUT, [''])
|
||||
)))
|
||||
actions = kwargs.get('actions', OrderedDict((
|
||||
(MISSING, ['']),
|
||||
(BEGIN, []),
|
||||
(IN, []),
|
||||
(LAST, []),
|
||||
(UNIT, []),
|
||||
(OUT, [''])
|
||||
)))
|
||||
seen_entities = set()
|
||||
for entity_type in kwargs.get('entity_types', []):
|
||||
if entity_type in seen_entities:
|
||||
|
@ -160,7 +154,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
cdef attr_t label
|
||||
if name == '-' or name == None:
|
||||
if name == '-' or name is None:
|
||||
return Transition(clas=0, move=MISSING, label=0, score=0)
|
||||
elif name == '!O':
|
||||
return Transition(clas=0, move=ISNT, label=0, score=0)
|
||||
|
@ -328,8 +322,8 @@ cdef class In:
|
|||
return False
|
||||
elif preset_ent_iob == 3:
|
||||
return False
|
||||
# TODO: Is this quite right?
|
||||
# I think it's supposed to be ensuring the gazetteer matches are maintained
|
||||
# TODO: Is this quite right? I think it's supposed to be ensuring the
|
||||
# gazetteer matches are maintained
|
||||
elif st.B_(1).ent_iob != preset_ent_iob:
|
||||
return False
|
||||
# Don't allow entities to extend across sentence boundaries
|
||||
|
@ -354,10 +348,12 @@ cdef class In:
|
|||
if g_act == MISSING:
|
||||
return 0
|
||||
elif g_act == BEGIN:
|
||||
# I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
|
||||
# I, Gold B --> True
|
||||
# (P of bad open entity sunk, R of this entity sunk)
|
||||
return 0
|
||||
elif g_act == IN:
|
||||
# I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
|
||||
# I, Gold I --> True
|
||||
# (label forced by prev, if mismatch, P and R both sunk)
|
||||
return 0
|
||||
elif g_act == LAST:
|
||||
# I, Gold L --> True iff this entity sunk and next tag == O
|
||||
|
@ -505,11 +501,3 @@ cdef class Out:
|
|||
return 1
|
||||
else:
|
||||
return 1
|
||||
|
||||
|
||||
class OracleError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class UnknownMove(Exception):
|
||||
pass
|
||||
|
|
|
@ -4,79 +4,56 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from collections import Counter, OrderedDict
|
||||
from collections import OrderedDict
|
||||
import ujson
|
||||
import json
|
||||
import contextlib
|
||||
import numpy
|
||||
|
||||
from libc.math cimport exp
|
||||
cimport cython
|
||||
cimport cython.parallel
|
||||
import cytoolz
|
||||
import dill
|
||||
|
||||
import numpy.random
|
||||
cimport numpy as np
|
||||
|
||||
from libcpp.vector cimport vector
|
||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||
from cpython.ref cimport PyObject, Py_XDECREF
|
||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
||||
from libc.stdint cimport uint32_t, uint64_t
|
||||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport malloc, calloc, free
|
||||
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
||||
from thinc.linear.avgtron cimport AveragedPerceptron
|
||||
from thinc.linalg cimport Vec, VecVec
|
||||
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
|
||||
from thinc.extra.eg cimport Example
|
||||
from libc.math cimport exp
|
||||
from libcpp.vector cimport vector
|
||||
from libc.string cimport memset
|
||||
from libc.stdlib cimport calloc, free
|
||||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport weight_t, class_t, hash_t
|
||||
from thinc.extra.search cimport Beam
|
||||
|
||||
from cymem.cymem cimport Pool, Address
|
||||
from murmurhash.mrmr cimport hash64
|
||||
from preshed.maps cimport MapStruct
|
||||
from preshed.maps cimport map_get
|
||||
|
||||
from thinc.api import layerize, chain, clone, with_flatten
|
||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
|
||||
from thinc.api import chain, clone
|
||||
from thinc.v2v import Model, Maxout, Affine
|
||||
from thinc.misc import LayerNorm
|
||||
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.ops import CupyOps
|
||||
from thinc.neural.util import get_array_module
|
||||
from thinc.linalg cimport Vec, VecVec
|
||||
|
||||
from .. import util
|
||||
from ..util import get_async, get_cuda_stream
|
||||
from .._ml import zero_init, PrecomputableAffine
|
||||
from .._ml import Tok2Vec, doc2feats
|
||||
from .._ml import Residual, drop_layer, flatten
|
||||
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
|
||||
from .._ml import link_vectors_to_models
|
||||
from .._ml import HistoryFeatures
|
||||
from ..compat import json_dumps, copy_array
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..gold cimport GoldParse
|
||||
from .. import util
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
from . import nonproj
|
||||
from .transition_system import OracleError
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
from ..structs cimport TokenC
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..strings cimport StringStore
|
||||
from ..gold cimport GoldParse
|
||||
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
|
||||
from . import _beam_utils
|
||||
from .transition_system cimport Transition
|
||||
from . import _beam_utils, nonproj
|
||||
|
||||
|
||||
def get_templates(*args, **kwargs):
|
||||
return []
|
||||
|
||||
|
||||
DEBUG = False
|
||||
|
||||
|
||||
def set_debug(val):
|
||||
global DEBUG
|
||||
DEBUG = val
|
||||
|
||||
|
||||
cdef class precompute_hiddens:
|
||||
'''Allow a model to be "primed" by pre-computing input features in bulk.
|
||||
"""Allow a model to be "primed" by pre-computing input features in bulk.
|
||||
|
||||
This is used for the parser, where we want to take a batch of documents,
|
||||
and compute vectors for each (token, position) pair. These vectors can then
|
||||
|
@ -91,7 +68,7 @@ cdef class precompute_hiddens:
|
|||
so we can save the factor k. This also gives a nice CPU/GPU division:
|
||||
we can do all our hard maths up front, packed into large multiplications,
|
||||
and do the hard-to-program parsing on the CPU.
|
||||
'''
|
||||
"""
|
||||
cdef int nF, nO, nP
|
||||
cdef bint _is_synchronized
|
||||
cdef public object ops
|
||||
|
@ -101,7 +78,8 @@ cdef class precompute_hiddens:
|
|||
cdef object _cuda_stream
|
||||
cdef object _bp_hiddens
|
||||
|
||||
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, drop=0.):
|
||||
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
|
||||
drop=0.):
|
||||
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
|
||||
cdef np.ndarray cached
|
||||
if not isinstance(gpu_cached, numpy.ndarray):
|
||||
|
@ -122,8 +100,7 @@ cdef class precompute_hiddens:
|
|||
self._bp_hiddens = bp_features
|
||||
|
||||
cdef const float* get_feat_weights(self) except NULL:
|
||||
if not self._is_synchronized \
|
||||
and self._cuda_stream is not None:
|
||||
if not self._is_synchronized and self._cuda_stream is not None:
|
||||
self._cuda_stream.synchronize()
|
||||
self._is_synchronized = True
|
||||
return <float*>self._cached.data
|
||||
|
@ -248,10 +225,10 @@ cdef class Parser:
|
|||
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
|
||||
if depth != 1:
|
||||
raise ValueError("Currently parser depth is hard-coded to 1.")
|
||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
|
||||
#if parser_maxout_pieces != 2:
|
||||
# raise ValueError("Currently parser_maxout_pieces is hard-coded to 2")
|
||||
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
|
||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
|
||||
cfg.get('maxout_pieces', 2))
|
||||
token_vector_width = util.env_opt('token_vector_width',
|
||||
cfg.get('token_vector_width', 128))
|
||||
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
|
||||
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
|
||||
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
|
||||
|
@ -289,23 +266,19 @@ cdef class Parser:
|
|||
return (tok2vec, lower, upper), cfg
|
||||
|
||||
def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
|
||||
"""
|
||||
Create a Parser.
|
||||
"""Create a Parser.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
The vocabulary object. Must be shared with documents to be processed.
|
||||
The value is set to the .vocab attribute.
|
||||
moves (TransitionSystem):
|
||||
Defines how the parse-state is created, updated and evaluated.
|
||||
The value is set to the .moves attribute unless True (default),
|
||||
in which case a new instance is created with Parser.Moves().
|
||||
model (object):
|
||||
Defines how the parse-state is created, updated and evaluated.
|
||||
The value is set to the .model attribute unless True (default),
|
||||
in which case a new instance is created with Parser.Model().
|
||||
**cfg:
|
||||
Arbitrary configuration parameters. Set to the .cfg attribute
|
||||
vocab (Vocab): The vocabulary object. Must be shared with documents
|
||||
to be processed. The value is set to the `.vocab` attribute.
|
||||
moves (TransitionSystem): Defines how the parse-state is created,
|
||||
updated and evaluated. The value is set to the .moves attribute
|
||||
unless True (default), in which case a new instance is created with
|
||||
`Parser.Moves()`.
|
||||
model (object): Defines how the parse-state is created, updated and
|
||||
evaluated. The value is set to the .model attribute unless True
|
||||
(default), in which case a new instance is created with
|
||||
`Parser.Model()`.
|
||||
**cfg: Arbitrary configuration parameters. Set to the `.cfg` attribute
|
||||
"""
|
||||
self.vocab = vocab
|
||||
if moves is True:
|
||||
|
@ -331,13 +304,10 @@ cdef class Parser:
|
|||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||
|
||||
def __call__(self, Doc doc, beam_width=None, beam_density=None):
|
||||
"""
|
||||
Apply the parser or entity recognizer, setting the annotations onto the Doc object.
|
||||
"""Apply the parser or entity recognizer, setting the annotations onto
|
||||
the `Doc` object.
|
||||
|
||||
Arguments:
|
||||
doc (Doc): The document to be processed.
|
||||
Returns:
|
||||
None
|
||||
doc (Doc): The document to be processed.
|
||||
"""
|
||||
if beam_width is None:
|
||||
beam_width = self.cfg.get('beam_width', 1)
|
||||
|
@ -359,16 +329,13 @@ cdef class Parser:
|
|||
|
||||
def pipe(self, docs, int batch_size=256, int n_threads=2,
|
||||
beam_width=None, beam_density=None):
|
||||
"""
|
||||
Process a stream of documents.
|
||||
"""Process a stream of documents.
|
||||
|
||||
Arguments:
|
||||
stream: The sequence of documents to process.
|
||||
batch_size (int):
|
||||
The number of documents to accumulate into a working set.
|
||||
n_threads (int):
|
||||
The number of threads with which to work on the buffer in parallel.
|
||||
Yields (Doc): Documents, in order.
|
||||
stream: The sequence of documents to process.
|
||||
batch_size (int): Number of documents to accumulate into a working set.
|
||||
n_threads (int): The number of threads with which to work on the buffer
|
||||
in parallel.
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
if beam_width is None:
|
||||
beam_width = self.cfg.get('beam_width', 1)
|
||||
|
@ -385,8 +352,8 @@ cdef class Parser:
|
|||
parse_states = self.parse_batch(subbatch)
|
||||
beams = []
|
||||
else:
|
||||
beams = self.beam_parse(subbatch,
|
||||
beam_width=beam_width, beam_density=beam_density)
|
||||
beams = self.beam_parse(subbatch, beam_width=beam_width,
|
||||
beam_density=beam_density)
|
||||
parse_states = []
|
||||
for beam in beams:
|
||||
parse_states.append(<StateClass>beam.at(0))
|
||||
|
@ -406,9 +373,9 @@ cdef class Parser:
|
|||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
|
||||
cuda_stream = get_cuda_stream()
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||
0.0)
|
||||
cuda_stream = util.get_cuda_stream()
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||
docs, cuda_stream, 0.0)
|
||||
nr_state = len(docs)
|
||||
nr_class = self.moves.n_moves
|
||||
nr_dim = tokvecs.shape[1]
|
||||
|
@ -422,7 +389,8 @@ cdef class Parser:
|
|||
|
||||
feat_weights = state2vec.get_feat_weights()
|
||||
cdef int i
|
||||
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(vec2scores._layers[-1].W.T)
|
||||
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(
|
||||
vec2scores._layers[-1].W.T)
|
||||
cdef np.ndarray hidden_bias = vec2scores._layers[-1].b
|
||||
|
||||
hW = <float*>hidden_weights.data
|
||||
|
@ -450,6 +418,7 @@ cdef class Parser:
|
|||
with gil:
|
||||
PyErr_SetFromErrno(MemoryError)
|
||||
PyErr_CheckSignals()
|
||||
cdef float feature
|
||||
while not state.is_final():
|
||||
state.set_context_tokens(token_ids, nr_feat)
|
||||
memset(vectors, 0, nr_hidden * nr_piece * sizeof(float))
|
||||
|
@ -489,9 +458,9 @@ cdef class Parser:
|
|||
cdef Doc doc
|
||||
cdef int nr_class = self.moves.n_moves
|
||||
cdef StateClass stcls, output
|
||||
cuda_stream = get_cuda_stream()
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||
0.0)
|
||||
cuda_stream = util.get_cuda_stream()
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||
docs, cuda_stream, 0.0)
|
||||
beams = []
|
||||
cdef int offset = 0
|
||||
cdef int j = 0
|
||||
|
@ -546,9 +515,7 @@ cdef class Parser:
|
|||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||
docs = [docs]
|
||||
golds = [golds]
|
||||
|
||||
cuda_stream = get_cuda_stream()
|
||||
|
||||
cuda_stream = util.get_cuda_stream()
|
||||
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||
drop)
|
||||
|
@ -563,7 +530,6 @@ cdef class Parser:
|
|||
n_steps = 0
|
||||
while todo:
|
||||
states, golds = zip(*todo)
|
||||
|
||||
token_ids = self.get_token_ids(states)
|
||||
vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
|
||||
if drop != 0:
|
||||
|
@ -585,8 +551,8 @@ cdef class Parser:
|
|||
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
||||
# Move token_ids and d_vector to GPU, asynchronously
|
||||
backprops.append((
|
||||
get_async(cuda_stream, token_ids),
|
||||
get_async(cuda_stream, d_vector),
|
||||
util.get_async(cuda_stream, token_ids),
|
||||
util.get_async(cuda_stream, d_vector),
|
||||
bp_vector
|
||||
))
|
||||
else:
|
||||
|
@ -619,15 +585,13 @@ cdef class Parser:
|
|||
states = self.moves.init_batch(docs)
|
||||
for gold in golds:
|
||||
self.moves.preprocess_gold(gold)
|
||||
|
||||
cuda_stream = get_cuda_stream()
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
|
||||
|
||||
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
|
||||
states, golds,
|
||||
state2vec, vec2scores,
|
||||
width, density, self.cfg.get('hist_size', 0),
|
||||
drop=drop, losses=losses)
|
||||
cuda_stream = util.get_cuda_stream()
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||
docs, cuda_stream, drop)
|
||||
states_d_scores, backprops = _beam_utils.update_beam(
|
||||
self.moves, self.nr_feature, 500, states, golds, state2vec,
|
||||
vec2scores, width, density, self.cfg.get('hist_size', 0),
|
||||
drop=drop, losses=losses)
|
||||
backprop_lower = []
|
||||
cdef float batch_size = len(docs)
|
||||
for i, d_scores in enumerate(states_d_scores):
|
||||
|
@ -639,13 +603,14 @@ cdef class Parser:
|
|||
if isinstance(self.model[0].ops, CupyOps) \
|
||||
and not isinstance(ids, state2vec.ops.xp.ndarray):
|
||||
backprop_lower.append((
|
||||
get_async(cuda_stream, ids),
|
||||
get_async(cuda_stream, d_vector),
|
||||
util.get_async(cuda_stream, ids),
|
||||
util.get_async(cuda_stream, d_vector),
|
||||
bp_vectors))
|
||||
else:
|
||||
backprop_lower.append((ids, d_vector, bp_vectors))
|
||||
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
||||
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)
|
||||
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd,
|
||||
cuda_stream)
|
||||
|
||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||
"""Make a square batch, of length equal to the shortest doc. A long
|
||||
|
@ -796,7 +761,8 @@ cdef class Parser:
|
|||
def begin_training(self, gold_tuples, pipeline=None, **cfg):
|
||||
if 'model' in cfg:
|
||||
self.model = cfg['model']
|
||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
|
||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples,
|
||||
label_freq_cutoff=100)
|
||||
actions = self.moves.get_actions(gold_parses=gold_tuples)
|
||||
for action, labels in actions.items():
|
||||
for label in labels:
|
||||
|
|
|
@ -1,39 +1,37 @@
|
|||
# coding: utf-8
|
||||
"""
|
||||
Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
|
||||
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
|
||||
for doing pseudo-projective parsing. The implementation uses the HEAD decoration
|
||||
scheme.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from copy import copy
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..attrs import DEP, HEAD
|
||||
|
||||
DELIMITER = '||'
|
||||
|
||||
|
||||
def ancestors(tokenid, heads):
|
||||
# returns all words going from the word up the path to the root
|
||||
# the path to root cannot be longer than the number of words in the sentence
|
||||
# this function ends after at most len(heads) steps
|
||||
# because it would otherwise loop indefinitely on cycles
|
||||
# Returns all words going from the word up the path to the root. The path
|
||||
# to root cannot be longer than the number of words in the sentence. This
|
||||
# function ends after at most len(heads) steps, because it would otherwise
|
||||
# loop indefinitely on cycles.
|
||||
head = tokenid
|
||||
cnt = 0
|
||||
while heads[head] != head and cnt < len(heads):
|
||||
head = heads[head]
|
||||
cnt += 1
|
||||
yield head
|
||||
if head == None:
|
||||
if head is None:
|
||||
break
|
||||
|
||||
|
||||
def contains_cycle(heads):
|
||||
# in an acyclic tree, the path from each word following
|
||||
# the head relation upwards always ends at the root node
|
||||
# in an acyclic tree, the path from each word following the head relation
|
||||
# upwards always ends at the root node
|
||||
for tokenid in range(len(heads)):
|
||||
seen = set([tokenid])
|
||||
for ancestor in ancestors(tokenid,heads):
|
||||
for ancestor in ancestors(tokenid, heads):
|
||||
if ancestor in seen:
|
||||
return seen
|
||||
seen.add(ancestor)
|
||||
|
@ -45,26 +43,26 @@ def is_nonproj_arc(tokenid, heads):
|
|||
# if there is a token k, h < k < d such that h is not
|
||||
# an ancestor of k. Same for h -> d, h > d
|
||||
head = heads[tokenid]
|
||||
if head == tokenid: # root arcs cannot be non-projective
|
||||
if head == tokenid: # root arcs cannot be non-projective
|
||||
return False
|
||||
elif head == None: # unattached tokens cannot be non-projective
|
||||
elif head is None: # unattached tokens cannot be non-projective
|
||||
return False
|
||||
|
||||
start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
|
||||
for k in range(start,end):
|
||||
for ancestor in ancestors(k,heads):
|
||||
if ancestor == None: # for unattached tokens/subtrees
|
||||
for k in range(start, end):
|
||||
for ancestor in ancestors(k, heads):
|
||||
if ancestor is None: # for unattached tokens/subtrees
|
||||
break
|
||||
elif ancestor == head: # normal case: k dominated by h
|
||||
elif ancestor == head: # normal case: k dominated by h
|
||||
break
|
||||
else: # head not in ancestors: d -> h is non-projective
|
||||
else: # head not in ancestors: d -> h is non-projective
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def is_nonproj_tree(heads):
|
||||
# a tree is non-projective if at least one arc is non-projective
|
||||
return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
|
||||
return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
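A small worked example of the head-array convention used throughout this module (each position holds the index of its head; the root points to itself). The head values are made up, and the import assumes the module is reachable as spacy.syntax.nonproj:

    from spacy.syntax import nonproj

    # 4-token sentence: tokens 0 and 1 attach to 2, 2 is the root,
    # and 3 attaches back to 0.
    heads = [2, 2, 2, 0]
    # The arc 3 -> 0 spans token 2, which 0 does not dominate, so:
    nonproj.is_nonproj_arc(3, heads)   # True
    nonproj.is_nonproj_tree(heads)     # True
    # projectivize() would lift that arc and decorate its label with '||'.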
|
||||
|
||||
|
||||
def decompose(label):
|
||||
|
@ -81,32 +79,32 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
|
|||
for raw_text, sents in gold_tuples:
|
||||
prepro_sents = []
|
||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||
proj_heads,deco_labels = projectivize(heads,labels)
|
||||
proj_heads, deco_labels = projectivize(heads, labels)
|
||||
# set the label to ROOT for each root dependent
|
||||
deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
|
||||
deco_labels = ['ROOT' if head == i else deco_labels[i]
|
||||
for i, head in enumerate(proj_heads)]
|
||||
# count label frequencies
|
||||
if label_freq_cutoff > 0:
|
||||
for label in deco_labels:
|
||||
if is_decorated(label):
|
||||
freqs[label] = freqs.get(label,0) + 1
|
||||
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
|
||||
freqs[label] = freqs.get(label, 0) + 1
|
||||
prepro_sents.append(
|
||||
((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
|
||||
preprocessed.append((raw_text, prepro_sents))
|
||||
|
||||
if label_freq_cutoff > 0:
|
||||
return _filter_labels(preprocessed,label_freq_cutoff,freqs)
|
||||
return _filter_labels(preprocessed, label_freq_cutoff, freqs)
|
||||
return preprocessed
|
||||
|
||||
|
||||
def projectivize(heads, labels):
|
||||
# use the algorithm by Nivre & Nilsson 2005
|
||||
# assumes heads to be a proper tree, i.e. connected and cycle-free
|
||||
# returns a new pair (heads,labels) which encode
|
||||
# a projective and decorated tree
|
||||
# Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
|
||||
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
|
||||
# which encode a projective and decorated tree.
|
||||
proj_heads = copy(heads)
|
||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
||||
if smallest_np_arc == None: # this sentence is already projective
|
||||
if smallest_np_arc is None: # this sentence is already projective
|
||||
return proj_heads, copy(labels)
|
||||
while smallest_np_arc != None:
|
||||
while smallest_np_arc is not None:
|
||||
_lift(smallest_np_arc, proj_heads)
|
||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
||||
deco_labels = _decorate(heads, proj_heads, labels)
|
||||
|
@ -114,24 +112,26 @@ def projectivize(heads, labels):
|
|||
|
||||
|
||||
def deprojectivize(tokens):
|
||||
# reattach arcs with decorated labels (following HEAD scheme)
|
||||
# for each decorated arc X||Y, search top-down, left-to-right,
|
||||
# breadth-first until hitting a Y then make this the new head
|
||||
# Reattach arcs with decorated labels (following HEAD scheme). For each
|
||||
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
|
||||
# hitting a Y then make this the new head.
|
||||
for token in tokens:
|
||||
if is_decorated(token.dep_):
|
||||
newlabel,headlabel = decompose(token.dep_)
|
||||
newhead = _find_new_head(token,headlabel)
|
||||
newlabel, headlabel = decompose(token.dep_)
|
||||
newhead = _find_new_head(token, headlabel)
|
||||
token.head = newhead
|
||||
token.dep_ = newlabel
|
||||
return tokens
|
||||
|
||||
|
||||
def _decorate(heads, proj_heads, labels):
|
||||
# uses decoration scheme HEAD from Nivre & Nilsson 2005
|
||||
assert(len(heads) == len(proj_heads) == len(labels))
|
||||
deco_labels = []
|
||||
for tokenid,head in enumerate(heads):
|
||||
for tokenid, head in enumerate(heads):
|
||||
if head != proj_heads[tokenid]:
|
||||
deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
|
||||
deco_labels.append(
|
||||
'%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
|
||||
else:
|
||||
deco_labels.append(labels[tokenid])
|
||||
return deco_labels
|
||||
|
@ -143,9 +143,9 @@ def _get_smallest_nonproj_arc(heads):
|
|||
# and ties are broken left to right
|
||||
smallest_size = float('inf')
|
||||
smallest_np_arc = None
|
||||
for tokenid,head in enumerate(heads):
|
||||
for tokenid, head in enumerate(heads):
|
||||
size = abs(tokenid-head)
|
||||
if size < smallest_size and is_nonproj_arc(tokenid,heads):
|
||||
if size < smallest_size and is_nonproj_arc(tokenid, heads):
|
||||
smallest_size = size
|
||||
smallest_np_arc = tokenid
|
||||
return smallest_np_arc
|
||||
|
@ -168,8 +168,10 @@ def _find_new_head(token, headlabel):
|
|||
next_queue = []
|
||||
for qtoken in queue:
|
||||
for child in qtoken.children:
|
||||
if child.is_space: continue
|
||||
if child == token: continue
|
||||
if child.is_space:
|
||||
continue
|
||||
if child == token:
|
||||
continue
|
||||
if child.dep_ == headlabel:
|
||||
return child
|
||||
next_queue.append(child)
|
||||
|
@ -184,7 +186,10 @@ def _filter_labels(gold_tuples, cutoff, freqs):
|
|||
for raw_text, sents in gold_tuples:
|
||||
filtered_sents = []
|
||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||
filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
|
||||
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
|
||||
filtered_labels = [decompose(label)[0]
|
||||
if freqs.get(label, cutoff) < cutoff
|
||||
else label for label in labels]
|
||||
filtered_sents.append(
|
||||
((ids, words, tags, heads, filtered_labels, iob), ctnts))
|
||||
filtered.append((raw_text, filtered_sents))
|
||||
return filtered
|
||||
|
|
|
@ -2,17 +2,8 @@
|
|||
# cython: infer_types=True
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from libc.string cimport memcpy, memset
|
||||
from libc.stdint cimport uint32_t, uint64_t
|
||||
import numpy
|
||||
|
||||
from ..vocab cimport EMPTY_LEXEME
|
||||
from ..structs cimport Entity
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..symbols cimport punct
|
||||
from ..attrs cimport IS_SPACE
|
||||
from ..attrs cimport attr_id_t
|
||||
from ..tokens.token cimport Token
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
|
||||
|
|
|
@ -2,17 +2,17 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||
from cpython.ref cimport Py_INCREF
|
||||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport weight_t
|
||||
from collections import defaultdict, OrderedDict
|
||||
from collections import OrderedDict
|
||||
import ujson
|
||||
|
||||
from .. import util
|
||||
from ..structs cimport TokenC
|
||||
from .stateclass cimport StateClass
|
||||
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
||||
from ..typedefs cimport attr_t
|
||||
from ..compat import json_dumps
|
||||
from .. import util
|
||||
|
||||
|
||||
cdef weight_t MIN_SCORE = -90000
|
||||
|
@ -136,11 +136,12 @@ cdef class TransitionSystem:
|
|||
print([gold.c.ner[i].clas for i in range(gold.length)])
|
||||
print([gold.c.ner[i].move for i in range(gold.length)])
|
||||
print([gold.c.ner[i].label for i in range(gold.length)])
|
||||
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
|
||||
print("Self labels",
|
||||
[self.c[i].label for i in range(self.n_moves)])
|
||||
raise ValueError(
|
||||
"Could not find a gold-standard action to supervise "
|
||||
"the entity recognizer\n"
|
||||
"The transition system has %d actions." % (self.n_moves))
|
||||
"the entity recognizer. The transition system has "
|
||||
"%d actions." % (self.n_moves))
|
||||
|
||||
def get_class_name(self, int clas):
|
||||
act = self.c[clas]
|
||||
|
@ -149,7 +150,7 @@ cdef class TransitionSystem:
|
|||
def add_action(self, int action, label_name):
|
||||
cdef attr_t label_id
|
||||
if not isinstance(label_name, int) and \
|
||||
not isinstance(label_name, long):
|
||||
not isinstance(label_name, long):
|
||||
label_id = self.strings.add(label_name)
|
||||
else:
|
||||
label_id = label_name
|
||||
|
@ -186,7 +187,7 @@ cdef class TransitionSystem:
|
|||
'name': self.move_name(trans.move, trans.label)
|
||||
})
|
||||
serializers = {
|
||||
'transitions': lambda: ujson.dumps(transitions),
|
||||
'transitions': lambda: json_dumps(transitions),
|
||||
'strings': lambda: self.strings.to_bytes()
|
||||
}
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
|
|
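The hunk above swaps `ujson.dumps` for the compat `json_dumps` inside a serializer dict. The helper below is an illustrative stand-in for the pattern (a dict of zero-argument callables filtered by `exclude`), not spaCy's actual `util.to_bytes`.

import json

def to_bytes_sketch(serializers, exclude=()):
    # Run every serializer callable whose key is not excluded.
    return {key: getter() for key, getter in serializers.items()
            if key not in exclude}

transitions = [{'clas': 0, 'name': 'S'}, {'clas': 1, 'name': 'L-nsubj'}]
serializers = {
    'transitions': lambda: json.dumps(transitions).encode('utf8'),
    'strings': lambda: b'placeholder for StringStore.to_bytes()',
}
msg = to_bytes_sketch(serializers, exclude=('strings',))
print(sorted(msg))   # ['transitions']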
@ -1,17 +0,0 @@
|
|||
from thinc.linear.avgtron cimport AveragedPerceptron
|
||||
from thinc.extra.eg cimport Example
|
||||
from thinc.structs cimport ExampleC
|
||||
|
||||
from .structs cimport TokenC
|
||||
from .vocab cimport Vocab
|
||||
|
||||
|
||||
cdef class TaggerModel(AveragedPerceptron):
|
||||
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *
|
||||
|
||||
|
||||
cdef class Tagger:
|
||||
cdef readonly Vocab vocab
|
||||
cdef readonly TaggerModel model
|
||||
cdef public dict freqs
|
||||
cdef public object cfg
|
spacy/tagger.pyx

|
@ -1,253 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport atom_t
|
||||
from thinc.extra.eg cimport Example
|
||||
from thinc.structs cimport ExampleC
|
||||
from thinc.linear.avgtron cimport AveragedPerceptron
|
||||
from thinc.linalg cimport VecVec
|
||||
|
||||
from .tokens.doc cimport Doc
|
||||
from .attrs cimport TAG
|
||||
from .gold cimport GoldParse
|
||||
from .attrs cimport *
|
||||
|
||||
|
||||
cpdef enum:
|
||||
P2_orth
|
||||
P2_cluster
|
||||
P2_shape
|
||||
P2_prefix
|
||||
P2_suffix
|
||||
P2_pos
|
||||
P2_lemma
|
||||
P2_flags
|
||||
|
||||
P1_orth
|
||||
P1_cluster
|
||||
P1_shape
|
||||
P1_prefix
|
||||
P1_suffix
|
||||
P1_pos
|
||||
P1_lemma
|
||||
P1_flags
|
||||
|
||||
W_orth
|
||||
W_cluster
|
||||
W_shape
|
||||
W_prefix
|
||||
W_suffix
|
||||
W_pos
|
||||
W_lemma
|
||||
W_flags
|
||||
|
||||
N1_orth
|
||||
N1_cluster
|
||||
N1_shape
|
||||
N1_prefix
|
||||
N1_suffix
|
||||
N1_pos
|
||||
N1_lemma
|
||||
N1_flags
|
||||
|
||||
N2_orth
|
||||
N2_cluster
|
||||
N2_shape
|
||||
N2_prefix
|
||||
N2_suffix
|
||||
N2_pos
|
||||
N2_lemma
|
||||
N2_flags
|
||||
|
||||
N_CONTEXT_FIELDS
|
||||
|
||||
|
||||
cdef class TaggerModel(AveragedPerceptron):
|
||||
def update(self, Example eg):
|
||||
self.time += 1
|
||||
guess = eg.guess
|
||||
best = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class)
|
||||
if guess != best:
|
||||
for feat in eg.c.features[:eg.c.nr_feat]:
|
||||
self.update_weight(feat.key, best, -feat.value)
|
||||
self.update_weight(feat.key, guess, feat.value)
|
||||
|
||||
cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
|
||||
_fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
|
||||
_fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
|
||||
_fill_from_token(&eg.atoms[W_orth], &tokens[i])
|
||||
_fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])
|
||||
_fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])
|
||||
|
||||
eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
|
||||
|
||||
|
||||
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
||||
context[0] = t.lex.lower
|
||||
context[1] = t.lex.cluster
|
||||
context[2] = t.lex.shape
|
||||
context[3] = t.lex.prefix
|
||||
context[4] = t.lex.suffix
|
||||
context[5] = t.tag
|
||||
context[6] = t.lemma
|
||||
if t.lex.flags & (1 << IS_ALPHA):
|
||||
context[7] = 1
|
||||
elif t.lex.flags & (1 << IS_PUNCT):
|
||||
context[7] = 2
|
||||
elif t.lex.flags & (1 << LIKE_URL):
|
||||
context[7] = 3
|
||||
elif t.lex.flags & (1 << LIKE_NUM):
|
||||
context[7] = 4
|
||||
else:
|
||||
context[7] = 0
|
||||
|
||||
|
||||
cdef class Tagger:
|
||||
"""Annotate part-of-speech tags on Doc objects."""
|
||||
|
||||
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
|
||||
"""Create a Tagger.
|
||||
|
||||
vocab (Vocab): The vocabulary object. Must be shared with documents to
|
||||
be processed.
|
||||
model (thinc.linear.AveragedPerceptron): The statistical model.
|
||||
RETURNS (Tagger): The newly constructed object.
|
||||
"""
|
||||
if model is None:
|
||||
model = TaggerModel(cfg.get('features', self.feature_templates),
|
||||
L1=0.0)
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.model.l1_penalty = 0.0
|
||||
# TODO: Move this to tag map
|
||||
self.freqs = {TAG: defaultdict(int)}
|
||||
for tag in self.tag_names:
|
||||
self.freqs[TAG][self.vocab.strings[tag]] = 1
|
||||
self.freqs[TAG][0] = 1
|
||||
self.cfg = cfg
|
||||
|
||||
@property
|
||||
def tag_names(self):
|
||||
return self.vocab.morphology.tag_names
|
||||
|
||||
def __reduce__(self):
|
||||
return (self.__class__, (self.vocab, self.model), None, None)
|
||||
|
||||
def tag_from_strings(self, Doc tokens, object tag_strs):
|
||||
cdef int i
|
||||
for i in range(tokens.length):
|
||||
self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i])
|
||||
tokens.is_tagged = True
|
||||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
def __call__(self, Doc tokens):
|
||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
||||
|
||||
doc (Doc): The tokens to be tagged.
|
||||
"""
|
||||
if tokens.length == 0:
|
||||
return 0
|
||||
|
||||
cdef Pool mem = Pool()
|
||||
|
||||
cdef int i, tag
|
||||
cdef Example eg = Example(nr_atom=N_CONTEXT_FIELDS,
|
||||
nr_class=self.vocab.morphology.n_tags,
|
||||
nr_feat=self.model.nr_feat)
|
||||
for i in range(tokens.length):
|
||||
if tokens.c[i].pos == 0:
|
||||
self.model.set_featuresC(&eg.c, tokens.c, i)
|
||||
self.model.set_scoresC(eg.c.scores,
|
||||
eg.c.features, eg.c.nr_feat)
|
||||
guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
|
||||
self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
|
||||
eg.fill_scores(0, eg.c.nr_class)
|
||||
tokens.is_tagged = True
|
||||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
def pipe(self, stream, batch_size=1000, n_threads=2):
|
||||
"""Tag a stream of documents.
|
||||
|
||||
Arguments:
|
||||
stream: The sequence of documents to tag.
|
||||
batch_size (int): The number of documents to accumulate into a working set.
|
||||
n_threads (int): The number of threads with which to work on the buffer
|
||||
in parallel, if the Matcher implementation supports multi-threading.
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
for doc in stream:
|
||||
self(doc)
|
||||
yield doc
|
||||
|
||||
def update(self, Doc tokens, GoldParse gold, itn=0):
|
||||
"""Update the statistical model, with tags supplied for the given document.
|
||||
|
||||
doc (Doc): The document to update on.
|
||||
gold (GoldParse): Manager for the gold-standard tags.
|
||||
RETURNS (int): Number of tags predicted correctly.
|
||||
"""
|
||||
gold_tag_strs = gold.tags
|
||||
assert len(tokens) == len(gold_tag_strs)
|
||||
for tag in gold_tag_strs:
|
||||
if tag != None and tag not in self.tag_names:
|
||||
msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "
|
||||
"gold tags, to maintain coarse-grained mapping.")
|
||||
raise ValueError(msg % tag)
|
||||
golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
|
||||
cdef int correct = 0
|
||||
cdef Pool mem = Pool()
|
||||
cdef Example eg = Example(
|
||||
nr_atom=N_CONTEXT_FIELDS,
|
||||
nr_class=self.vocab.morphology.n_tags,
|
||||
nr_feat=self.model.nr_feat)
|
||||
for i in range(tokens.length):
|
||||
self.model.set_featuresC(&eg.c, tokens.c, i)
|
||||
eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]
|
||||
self.model.set_scoresC(eg.c.scores,
|
||||
eg.c.features, eg.c.nr_feat)
|
||||
self.model.update(eg)
|
||||
|
||||
self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)
|
||||
|
||||
correct += eg.cost == 0
|
||||
self.freqs[TAG][tokens.c[i].tag] += 1
|
||||
eg.fill_scores(0, eg.c.nr_class)
|
||||
eg.fill_costs(0, eg.c.nr_class)
|
||||
tokens.is_tagged = True
|
||||
tokens._py_tokens = [None] * tokens.length
|
||||
return correct
|
||||
|
||||
|
||||
feature_templates = (
|
||||
(W_orth,),
|
||||
(P1_lemma, P1_pos),
|
||||
(P2_lemma, P2_pos),
|
||||
(N1_orth,),
|
||||
(N2_orth,),
|
||||
|
||||
(W_suffix,),
|
||||
(W_prefix,),
|
||||
|
||||
(P1_pos,),
|
||||
(P2_pos,),
|
||||
(P1_pos, P2_pos),
|
||||
(P1_pos, W_orth),
|
||||
(P1_suffix,),
|
||||
(N1_suffix,),
|
||||
|
||||
(W_shape,),
|
||||
(W_cluster,),
|
||||
(N1_cluster,),
|
||||
(N2_cluster,),
|
||||
(P1_cluster,),
|
||||
(P2_cluster,),
|
||||
|
||||
(W_flags,),
|
||||
(N1_flags,),
|
||||
(N2_flags,),
|
||||
(P1_flags,),
|
||||
(P2_flags,),
|
||||
)
|
|
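Since all of `spacy/tagger.pyx` is removed here, a compact pure-Python sketch of the perceptron update it implemented may help: when the guessed class differs from the best zero-cost class, every active feature's weight is moved towards the best class and away from the guess. The exact sign convention of thinc's `update_weight` is abstracted away.

from collections import defaultdict

class PerceptronSketch(object):
    def __init__(self):
        # weights[feature][class] -> float
        self.weights = defaultdict(lambda: defaultdict(float))

    def score(self, features, clas):
        return sum(value * self.weights[key][clas] for key, value in features)

    def update(self, features, guess, best):
        # Mirror of TaggerModel.update: only learn when the guess was wrong.
        if guess != best:
            for key, value in features:
                self.weights[key][best] += value
                self.weights[key][guess] -= value

model = PerceptronSketch()
feats = [('W_orth=dogs', 1.0), ('P1_pos=DET', 1.0)]
model.update(feats, guess=3, best=1)
print(model.weights['W_orth=dogs'][1], model.weights['W_orth=dogs'][3])  # 1.0 -1.0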
@ -40,6 +40,8 @@ def parser(vocab):
|
|||
def test_init_parser(parser):
|
||||
pass
|
||||
|
||||
# TODO: This is flakey, because it depends on what the parser first learns.
|
||||
@pytest.mark.xfail
|
||||
def test_add_label(parser):
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
doc = parser(doc)
|
||||
|
|
|
@ -8,12 +8,11 @@ from cython.operator cimport preincrement as preinc
|
|||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
import regex as re
|
||||
|
||||
from .strings cimport hash_string
|
||||
from . import util
|
||||
cimport cython
|
||||
|
||||
from .tokens.doc cimport Doc
|
||||
from .strings cimport hash_string
|
||||
from . import util
|
||||
|
||||
|
||||
cdef class Tokenizer:
|
||||
|
@ -21,7 +20,7 @@ cdef class Tokenizer:
|
|||
boundaries.
|
||||
"""
|
||||
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
|
||||
suffix_search=None, infix_finditer=None, token_match=None):
|
||||
suffix_search=None, infix_finditer=None, token_match=None):
|
||||
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
|
||||
|
||||
vocab (Vocab): A storage container for lexical types.
|
||||
|
@ -74,9 +73,8 @@ cdef class Tokenizer:
|
|||
RETURNS (Doc): A container for linguistic annotations.
|
||||
"""
|
||||
if len(string) >= (2 ** 30):
|
||||
raise ValueError(
|
||||
"String is too long: %d characters. Max is 2**30." % len(string)
|
||||
)
|
||||
msg = "String is too long: %d characters. Max is 2**30."
|
||||
raise ValueError(msg % len(string))
|
||||
cdef int length = len(string)
|
||||
cdef Doc doc = Doc(self.vocab)
|
||||
if length == 0:
|
||||
|
@ -122,8 +120,8 @@ cdef class Tokenizer:
|
|||
"""Tokenize a stream of texts.
|
||||
|
||||
texts: A sequence of unicode texts.
|
||||
batch_size (int): The number of texts to accumulate in an internal buffer.
|
||||
n_threads (int): The number of threads to use, if the implementation
|
||||
batch_size (int): Number of texts to accumulate in an internal buffer.
|
||||
n_threads (int): Number of threads to use, if the implementation
|
||||
supports multi-threading. The default tokenizer is single-threaded.
|
||||
YIELDS (Doc): A sequence of Doc objects, in order.
|
||||
"""
|
||||
|
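A small usage sketch for the `pipe` docstring rewrapped above, assuming a blank `English` pipeline from the v2-style `spacy.lang.en` module used on this branch:

from spacy.lang.en import English

nlp = English()
texts = [u'First text.', u'Second text.']
# Docs come back in the same order the texts went in.
for doc in nlp.tokenizer.pipe(texts, batch_size=2, n_threads=1):
    print([t.text for t in doc])
# ['First', 'text', '.']
# ['Second', 'text', '.']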
@ -232,8 +230,8 @@ cdef class Tokenizer:
|
|||
if not matches:
|
||||
tokens.push_back(self.vocab.get(tokens.mem, string), False)
|
||||
else:
|
||||
# let's say we have dyn-o-mite-dave
|
||||
# the regex finds the start and end positions of the hyphens
|
||||
# let's say we have dyn-o-mite-dave - the regex finds the
|
||||
# start and end positions of the hyphens
|
||||
start = 0
|
||||
for match in matches:
|
||||
infix_start = match.start()
|
||||
|
@ -293,8 +291,8 @@ cdef class Tokenizer:
|
|||
return list(self.infix_finditer(string))
|
||||
|
||||
def find_prefix(self, unicode string):
|
||||
"""Find the length of a prefix that should be segmented from the string,
|
||||
or None if no prefix rules match.
|
||||
"""Find the length of a prefix that should be segmented from the
|
||||
string, or None if no prefix rules match.
|
||||
|
||||
string (unicode): The string to segment.
|
||||
RETURNS (int): The length of the prefix if present, otherwise `None`.
|
||||
|
@ -305,8 +303,8 @@ cdef class Tokenizer:
|
|||
return (match.end() - match.start()) if match is not None else 0
|
||||
|
||||
def find_suffix(self, unicode string):
|
||||
"""Find the length of a suffix that should be segmented from the string,
|
||||
or None if no suffix rules match.
|
||||
"""Find the length of a suffix that should be segmented from the
|
||||
string, or None if no suffix rules match.
|
||||
|
||||
string (unicode): The string to segment.
|
||||
RETURNS (int): The length of the suffix if present, otherwise `None`.
|
||||
|
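To make `find_prefix`/`find_suffix` concrete, here is a stand-alone sketch that uses plain `re` patterns in place of the tokenizer's compiled `prefix_search`/`suffix_search` callables; the patterns are illustrative, not spaCy's real punctuation rules.

import re

prefix_search = re.compile(r"^[\[\(\"']").search
suffix_search = re.compile(r"[\]\)\"'.,!?]$").search

def find_prefix(string):
    # Length of the matched leading segment, or 0 when nothing matches.
    match = prefix_search(string)
    return (match.end() - match.start()) if match is not None else 0

def find_suffix(string):
    # Length of the matched trailing segment, or 0 when nothing matches.
    match = suffix_search(string)
    return (match.end() - match.start()) if match is not None else 0

print(find_prefix(u'"Hello'), find_suffix(u'world!'))   # 1 1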
@ -326,8 +324,8 @@ cdef class Tokenizer:
|
|||
|
||||
string (unicode): The string to specially tokenize.
|
||||
token_attrs (iterable): A sequence of dicts, where each dict describes
|
||||
a token and its attributes. The `ORTH` fields of the attributes must
|
||||
exactly match the string when they are concatenated.
|
||||
a token and its attributes. The `ORTH` fields of the attributes
|
||||
must exactly match the string when they are concatenated.
|
||||
"""
|
||||
substrings = list(substrings)
|
||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
|
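A usage sketch for `add_special_case` as documented above: the `ORTH` values must concatenate back to the original string exactly. The blank `English` class and the `ORTH`/`LEMMA` keys are assumptions about the surrounding API, not taken from this hunk.

from spacy.attrs import ORTH, LEMMA
from spacy.lang.en import English

nlp = English()
# Split 'gimme' into two tokens whenever the tokenizer sees it verbatim.
nlp.tokenizer.add_special_case(u'gimme',
                               [{ORTH: u'gim', LEMMA: u'give'},
                                {ORTH: u'me'}])
print([t.text for t in nlp.tokenizer(u'gimme that')])   # ['gim', 'me', 'that']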
@ -343,7 +341,7 @@ cdef class Tokenizer:
|
|||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||
"""
|
||||
with path.open('wb') as file_:
|
||||
file_.write(self.to_bytes(**exclude))
|
||||
|
|
|
@ -2,4 +2,4 @@ from .doc import Doc
|
|||
from .token import Token
|
||||
from .span import Span
|
||||
|
||||
__all__ = [Doc, Token, Span]
|
||||
__all__ = ['Doc', 'Token', 'Span']
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
cdef class Binder:
|
||||
def __init__(self, *docs):
|
||||
pass
|
||||
|
||||
def __iter__(self):
|
||||
pass
|
||||
|
||||
def __reduce__(self):
|
||||
pass
|
||||
|
||||
def to_bytes(self):
|
||||
pass
|
||||
|
||||
def from_bytes(cls, data):
|
||||
pass
|
||||
|
||||
def to_disk(self):
|
||||
pass
|
||||
|
||||
def from_disk(self, path):
|
||||
pass
|
|
@ -23,9 +23,9 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
|||
from ..typedefs cimport attr_t, flags_t
|
||||
from ..attrs import intify_attrs, IDS
|
||||
from ..attrs cimport attr_id_t
|
||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
||||
from ..attrs cimport SENT_START
|
||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
|
||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
|
||||
from ..attrs cimport ENT_TYPE, SENT_START
|
||||
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
||||
from ..util import normalize_slice
|
||||
from ..compat import is_config, copy_reg, pickle
|
||||
|
@ -78,24 +78,25 @@ def _get_chunker(lang):
|
|||
|
||||
cdef class Doc:
|
||||
"""A sequence of Token objects. Access sentences and named entities, export
|
||||
annotations to numpy arrays, losslessly serialize to compressed binary strings.
|
||||
The `Doc` object holds an array of `TokenC` structs. The Python-level
|
||||
`Token` and `Span` objects are views of this array, i.e. they don't own
|
||||
the data themselves.
|
||||
annotations to numpy arrays, losslessly serialize to compressed binary
|
||||
strings. The `Doc` object holds an array of `TokenC` structs. The
|
||||
Python-level `Token` and `Span` objects are views of this array, i.e.
|
||||
they don't own the data themselves.
|
||||
|
||||
EXAMPLE: Construction 1
|
||||
>>> doc = nlp(u'Some text')
|
||||
|
||||
Construction 2
|
||||
>>> from spacy.tokens import Doc
|
||||
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
|
||||
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
|
||||
spaces=[True, False, False])
|
||||
"""
|
||||
@classmethod
|
||||
def set_extension(cls, name, default=None, method=None,
|
||||
getter=None, setter=None):
|
||||
nr_defined = sum(t is not None for t in (default, getter, setter, method))
|
||||
assert nr_defined == 1
|
||||
Underscore.doc_extensions[name] = (default, method, getter, setter)
|
||||
Underscore.doc_extensions[name] = (default, method, getter, setter)
|
||||
|
||||
@classmethod
|
||||
def get_extension(cls, name):
|
||||
|
@ -109,15 +110,14 @@ cdef class Doc:
|
|||
orths_and_spaces=None):
|
||||
"""Create a Doc object.
|
||||
|
||||
vocab (Vocab): A vocabulary object, which must match any models you want
|
||||
to use (e.g. tokenizer, parser, entity recognizer).
|
||||
vocab (Vocab): A vocabulary object, which must match any models you
|
||||
want to use (e.g. tokenizer, parser, entity recognizer).
|
||||
words (list or None): A list of unicode strings to add to the document
|
||||
as words. If `None`, defaults to empty list.
|
||||
spaces (list or None): A list of boolean values, of the same length as
|
||||
words. True means that the word is followed by a space, False means
|
||||
it is not. If `None`, defaults to `[True]*len(words)`
|
||||
user_data (dict or None): Optional extra data to attach to the Doc.
|
||||
|
||||
RETURNS (Doc): The newly constructed object.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
|
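A minimal construction sketch to go with the rewrapped constructor docstring (no statistical model is needed):

from spacy.vocab import Vocab
from spacy.tokens import Doc

vocab = Vocab()
# `words` and `spaces` must be the same length; spaces defaults to all True.
doc = Doc(vocab, words=[u'hello', u'world', u'!'],
          spaces=[True, False, False])
print(doc.text)    # 'hello world!'
print(len(doc))    # 3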
@ -153,10 +153,10 @@ cdef class Doc:
|
|||
spaces = [True] * len(words)
|
||||
elif len(spaces) != len(words):
|
||||
raise ValueError(
|
||||
"Arguments 'words' and 'spaces' should be sequences of the "
|
||||
"same length, or 'spaces' should be left default at None. "
|
||||
"spaces should be a sequence of booleans, with True meaning "
|
||||
"that the word owns a ' ' character following it.")
|
||||
"Arguments 'words' and 'spaces' should be sequences of "
|
||||
"the same length, or 'spaces' should be left default at "
|
||||
"None. spaces should be a sequence of booleans, with True "
|
||||
"meaning that the word owns a ' ' character following it.")
|
||||
orths_and_spaces = zip(words, spaces)
|
||||
if orths_and_spaces is not None:
|
||||
for orth_space in orths_and_spaces:
|
||||
|
@ -166,7 +166,8 @@ cdef class Doc:
|
|||
elif isinstance(orth_space, bytes):
|
||||
raise ValueError(
|
||||
"orths_and_spaces expects either List(unicode) or "
|
||||
"List((unicode, bool)). Got bytes instance: %s" % (str(orth_space)))
|
||||
"List((unicode, bool)). "
|
||||
"Got bytes instance: %s" % (str(orth_space)))
|
||||
else:
|
||||
orth, has_space = orth_space
|
||||
# Note that we pass self.mem here --- we have ownership, if LexemeC
|
||||
|
@ -186,7 +187,8 @@ cdef class Doc:
|
|||
def __getitem__(self, object i):
|
||||
"""Get a `Token` or `Span` object.
|
||||
|
||||
i (int or tuple) The index of the token, or the slice of the document to get.
|
||||
i (int or tuple) The index of the token, or the slice of the document
|
||||
to get.
|
||||
RETURNS (Token or Span): The token at `doc[i]]`, or the span at
|
||||
`doc[start : end]`.
|
||||
|
||||
|
@ -199,11 +201,11 @@ cdef class Doc:
|
|||
>>> doc[start : end]
|
||||
Get a `Span` object, starting at position `start` and ending at
|
||||
position `end`, where `start` and `end` are token indices. For
|
||||
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
|
||||
Stepped slices (e.g. `doc[start : end : step]`) are not supported,
|
||||
as `Span` objects must be contiguous (cannot have gaps). You can use
|
||||
negative indices and open-ended ranges, which have their normal
|
||||
Python semantics.
|
||||
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and
|
||||
4. Stepped slices (e.g. `doc[start : end : step]`) are not
|
||||
supported, as `Span` objects must be contiguous (cannot have gaps).
|
||||
You can use negative indices and open-ended ranges, which have
|
||||
their normal Python semantics.
|
||||
"""
|
||||
if isinstance(i, slice):
|
||||
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
|
||||
|
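A quick indexing sketch matching the `__getitem__` docstring above: integer indices return `Token` views, contiguous slices return `Span` views, and stepped slices are rejected.

from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=[u'Give', u'it', u'back', u'to', u'me'])
print(doc[0].text)      # 'Give'
print(doc[2:5].text)    # 'back to me'  (a Span over tokens 2, 3 and 4)
print(doc[-2].text)     # 'to'          (negative indices are allowed)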
@ -262,8 +264,10 @@ cdef class Doc:
|
|||
doc (Doc): The parent document.
|
||||
start (int): The index of the first character of the span.
|
||||
end (int): The index of the first character after the span.
|
||||
label (uint64 or string): A label to attach to the Span, e.g. for named entities.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
||||
label (uint64 or string): A label to attach to the Span, e.g. for
|
||||
named entities.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
||||
the span.
|
||||
RETURNS (Span): The newly constructed object.
|
||||
"""
|
||||
if not isinstance(label, int):
|
||||
|
@ -322,7 +326,8 @@ cdef class Doc:
|
|||
if self._vector is not None:
|
||||
return self._vector
|
||||
elif not len(self):
|
||||
self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
|
||||
self._vector = numpy.zeros((self.vocab.vectors_length,),
|
||||
dtype='f')
|
||||
return self._vector
|
||||
elif self.has_vector:
|
||||
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
|
||||
|
@ -334,7 +339,8 @@ cdef class Doc:
|
|||
self._vector = self.tensor.mean(axis=0)
|
||||
return self._vector
|
||||
else:
|
||||
return numpy.zeros((self.vocab.vectors_length,), dtype='float32')
|
||||
return numpy.zeros((self.vocab.vectors_length,),
|
||||
dtype='float32')
|
||||
|
||||
def __set__(self, value):
|
||||
self._vector = value
|
||||
|
@ -377,13 +383,14 @@ cdef class Doc:
|
|||
return self.text
|
||||
|
||||
property ents:
|
||||
"""Iterate over the entities in the document. Yields named-entity `Span`
|
||||
objects, if the entity recognizer has been applied to the document.
|
||||
"""Iterate over the entities in the document. Yields named-entity
|
||||
`Span` objects, if the entity recognizer has been applied to the
|
||||
document.
|
||||
|
||||
YIELDS (Span): Entities in the document.
|
||||
|
||||
EXAMPLE: Iterate over the span to get individual Token objects, or access
|
||||
the label:
|
||||
EXAMPLE: Iterate over the span to get individual Token objects,
|
||||
or access the label:
|
||||
|
||||
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
||||
>>> ents = list(tokens.ents)
|
||||
|
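The `ents` property can also be assigned directly, without running the entity recognizer; a small round-trip sketch, with the label hash interned by hand:

from spacy.vocab import Vocab
from spacy.tokens import Doc, Span

doc = Doc(Vocab(), words=[u'Mr.', u'Best', u'flew', u'to', u'New', u'York'])
gpe = doc.vocab.strings.add(u'GPE')          # intern the label string
doc.ents = [Span(doc, 4, 6, label=gpe)]      # tokens 4..5 become one entity
print([(ent.text, ent.label_) for ent in doc.ents])   # [('New York', 'GPE')]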
@ -419,7 +426,8 @@ cdef class Doc:
|
|||
def __set__(self, ents):
|
||||
# TODO:
|
||||
# 1. Allow negative matches
|
||||
# 2. Ensure pre-set NERs are not over-written during statistical prediction
|
||||
# 2. Ensure pre-set NERs are not over-written during statistical
|
||||
# prediction
|
||||
# 3. Test basic data-driven ORTH gazetteer
|
||||
# 4. Test more nuanced date and currency regex
|
||||
cdef int i
|
||||
|
@ -428,7 +436,7 @@ cdef class Doc:
|
|||
# At this point we don't know whether the NER has run over the
|
||||
# Doc. If the ent_iob is missing, leave it missing.
|
||||
if self.c[i].ent_iob != 0:
|
||||
self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
|
||||
self.c[i].ent_iob = 2 # Means O. Non-O are set from ents.
|
||||
cdef attr_t ent_type
|
||||
cdef int start, end
|
||||
for ent_info in ents:
|
||||
|
@ -456,10 +464,11 @@ cdef class Doc:
|
|||
|
||||
property noun_chunks:
|
||||
"""Iterate over the base noun phrases in the document. Yields base
|
||||
noun-phrase #[code Span] objects, if the document has been syntactically
|
||||
parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
|
||||
not permit other NPs to be nested within it – so no NP-level
|
||||
coordination, no prepositional phrases, and no relative clauses.
|
||||
noun-phrase #[code Span] objects, if the document has been
|
||||
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
|
||||
phrase that does not permit other NPs to be nested within it – so no
|
||||
NP-level coordination, no prepositional phrases, and no relative
|
||||
clauses.
|
||||
|
||||
YIELDS (Span): Noun chunks in the document.
|
||||
"""
|
||||
|
@ -467,12 +476,14 @@ cdef class Doc:
|
|||
if not self.is_parsed:
|
||||
raise ValueError(
|
||||
"noun_chunks requires the dependency parse, which "
|
||||
"requires data to be installed. For more info, see the "
|
||||
"requires a statistical model to be installed and loaded. "
|
||||
"For more info, see the "
|
||||
"documentation: \n%s\n" % about.__docs_models__)
|
||||
# Accumulate the result before beginning to iterate over it. This prevents
|
||||
# the tokenisation from being changed out from under us during the iteration.
|
||||
# The tricky thing here is that Span accepts its tokenisation changing,
|
||||
# so it's okay once we have the Span objects. See Issue #375
|
||||
# Accumulate the result before beginning to iterate over it. This
|
||||
# prevents the tokenisation from being changed out from under us
|
||||
# during the iteration. The tricky thing here is that Span accepts
|
||||
# its tokenisation changing, so it's okay once we have the Span
|
||||
# objects. See Issue #375.
|
||||
spans = []
|
||||
for start, end, label in self.noun_chunks_iterator(self):
|
||||
spans.append(Span(self, start, end, label=label))
|
||||
|
@ -497,8 +508,9 @@ cdef class Doc:
|
|||
|
||||
if not self.is_parsed:
|
||||
raise ValueError(
|
||||
"sentence boundary detection requires the dependency parse, which "
|
||||
"requires data to be installed. For more info, see the "
|
||||
"Sentence boundary detection requires the dependency "
|
||||
"parse, which requires a statistical model to be "
|
||||
"installed and loaded. For more info, see the "
|
||||
"documentation: \n%s\n" % about.__docs_models__)
|
||||
cdef int i
|
||||
start = 0
|
||||
|
@ -537,12 +549,11 @@ cdef class Doc:
|
|||
@cython.boundscheck(False)
|
||||
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||
"""Export given token attributes to a numpy `ndarray`.
|
||||
|
||||
If `attr_ids` is a sequence of M attributes, the output array will
|
||||
be of shape `(N, M)`, where N is the length of the `Doc`
|
||||
(in tokens). If `attr_ids` is a single attribute, the output shape will
|
||||
be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
|
||||
or string name (e.g. 'LEMMA' or 'lemma').
|
||||
If `attr_ids` is a sequence of M attributes, the output array will be
|
||||
of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If
|
||||
`attr_ids` is a single attribute, the output shape will be (N,). You
|
||||
can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or
|
||||
string name (e.g. 'LEMMA' or 'lemma').
|
||||
|
||||
attr_ids (list[]): A list of attributes (int IDs or string names).
|
||||
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
|
||||
|
@ -566,18 +577,19 @@ cdef class Doc:
|
|||
# Allow strings, e.g. 'lemma' or 'LEMMA'
|
||||
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_)
|
||||
for id_ in py_attr_ids]
|
||||
# Make an array from the attributes --- otherwise our inner loop is Python
|
||||
# dict iteration.
|
||||
# Make an array from the attributes --- otherwise our inner loop is
|
||||
# Python dict iteration.
|
||||
attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
|
||||
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
|
||||
output = numpy.ndarray(shape=(self.length, len(attr_ids)),
|
||||
dtype=numpy.uint64)
|
||||
for i in range(self.length):
|
||||
for j, feature in enumerate(attr_ids):
|
||||
output[i, j] = get_token_attr(&self.c[i], feature)
|
||||
# Handle 1d case
|
||||
return output if len(attr_ids) >= 2 else output.reshape((self.length,))
|
||||
|
||||
|
||||
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
|
||||
def count_by(self, attr_id_t attr_id, exclude=None,
|
||||
PreshCounter counts=None):
|
||||
"""Count the frequencies of a given attribute. Produces a dict of
|
||||
`{attribute (int): count (ints)}` frequencies, keyed by the values of
|
||||
the given attribute ID.
|
||||
|
@ -641,13 +653,12 @@ cdef class Doc:
|
|||
def from_array(self, attrs, array):
|
||||
if SENT_START in attrs and HEAD in attrs:
|
||||
raise ValueError(
|
||||
"Conflicting attributes specified in doc.from_array():\n"
|
||||
"Conflicting attributes specified in doc.from_array(): "
|
||||
"(HEAD, SENT_START)\n"
|
||||
"The HEAD attribute currently sets sentence boundaries implicitly,\n"
|
||||
"based on the tree structure. This means the HEAD attribute would "
|
||||
"potentially override the sentence boundaries set by SENT_START.\n"
|
||||
"See https://github.com/spacy-io/spaCy/issues/235 for details and "
|
||||
"workarounds, and to propose solutions.")
|
||||
"The HEAD attribute currently sets sentence boundaries "
|
||||
"implicitly, based on the tree structure. This means the HEAD "
|
||||
"attribute would potentially override the sentence boundaries "
|
||||
"set by SENT_START.")
|
||||
cdef int i, col
|
||||
cdef attr_id_t attr_id
|
||||
cdef TokenC* tokens = self.c
|
||||
|
@ -675,18 +686,14 @@ cdef class Doc:
|
|||
return self
|
||||
|
||||
def get_lca_matrix(self):
|
||||
'''
|
||||
Calculates the lowest common ancestor matrix
|
||||
for a given Spacy doc.
|
||||
Returns LCA matrix containing the integer index
|
||||
of the ancestor, or -1 if no common ancestor is
|
||||
found (ex if span excludes a necessary ancestor).
|
||||
Apologies about the recursion, but the
|
||||
impact on performance is negligible given
|
||||
the natural limitations on the depth of a typical human sentence.
|
||||
'''
|
||||
"""Calculates the lowest common ancestor matrix for a given `Doc`.
|
||||
Returns LCA matrix containing the integer index of the ancestor, or -1
|
||||
if no common ancestor is found (ex if span excludes a necessary
|
||||
ancestor). Apologies about the recursion, but the impact on
|
||||
performance is negligible given the natural limitations on the depth
|
||||
of a typical human sentence.
|
||||
"""
|
||||
# Efficiency notes:
|
||||
#
|
||||
# We can easily improve the performance here by iterating in Cython.
|
||||
# To loop over the tokens in Cython, the easiest way is:
|
||||
# for token in doc.c[:doc.c.length]:
|
||||
|
@ -705,7 +712,8 @@ cdef class Doc:
|
|||
elif (token_j.head == token_j) and (token_k.head == token_k):
|
||||
lca_index = -1
|
||||
else:
|
||||
lca_index = __pairwise_lca(token_j.head, token_k.head, lca_matrix)
|
||||
lca_index = __pairwise_lca(token_j.head, token_k.head,
|
||||
lca_matrix)
|
||||
lca_matrix[token_j.i][token_k.i] = lca_index
|
||||
lca_matrix[token_k.i][token_j.i] = lca_index
|
||||
|
||||
|
@ -719,14 +727,13 @@ cdef class Doc:
|
|||
token_k = self[k]
|
||||
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix)
|
||||
lca_matrix[k][j] = lca_matrix[j][k]
|
||||
|
||||
return lca_matrix
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||
"""
|
||||
with path.open('wb') as file_:
|
||||
file_.write(self.to_bytes(**exclude))
|
||||
|
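The LCA computation above walks token heads recursively; the plain-Python restatement below takes a `heads` list (each token's head index, with the root pointing to itself) instead of a `Doc`, purely to show the logic:

import numpy

def lca_matrix_from_heads(heads):
    def ancestors(i):
        # The chain from a token up to the root, starting with the token itself.
        chain = [i]
        while heads[chain[-1]] != chain[-1]:
            chain.append(heads[chain[-1]])
        return chain
    n = len(heads)
    lca = numpy.full((n, n), -1, dtype=numpy.int32)
    for j in range(n):
        anc_j = ancestors(j)
        for k in range(n):
            common = [a for a in ancestors(k) if a in anc_j]
            if common:
                # The first shared node walking up from k is the lowest one.
                lca[j, k] = common[0]
    return lca

print(lca_matrix_from_heads([1, 1, 1]))
# [[0 1 1]
#  [1 1 1]
#  [1 1 2]]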
@ -749,7 +756,7 @@ cdef class Doc:
|
|||
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
|
||||
all annotations.
|
||||
"""
|
||||
array_head = [LENGTH,SPACY,TAG,LEMMA,HEAD,DEP,ENT_IOB,ENT_TYPE]
|
||||
array_head = [LENGTH, SPACY, TAG, LEMMA, HEAD, DEP, ENT_IOB, ENT_TYPE]
|
||||
# Msgpack doesn't distinguish between lists and tuples, which is
|
||||
# vexing for user data. As a best guess, we *know* that within
|
||||
# keys, we must have tuples. In values we just have to hope
|
||||
|
@ -792,7 +799,8 @@ cdef class Doc:
|
|||
# keys, we must have tuples. In values we just have to hope
|
||||
# users don't mind getting a list instead of a tuple.
|
||||
if 'user_data' not in exclude and 'user_data_keys' in msg:
|
||||
user_data_keys = msgpack.loads(msg['user_data_keys'], use_list=False)
|
||||
user_data_keys = msgpack.loads(msg['user_data_keys'],
|
||||
use_list=False)
|
||||
user_data_values = msgpack.loads(msg['user_data_values'])
|
||||
for key, value in zip(user_data_keys, user_data_values):
|
||||
self.user_data[key] = value
|
||||
|
@ -819,14 +827,15 @@ cdef class Doc:
|
|||
return self
|
||||
|
||||
def merge(self, int start_idx, int end_idx, *args, **attributes):
|
||||
"""Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
|
||||
is merged into a single token. If `start_idx` and `end_idx `do not mark
|
||||
start and end token boundaries, the document remains unchanged.
|
||||
"""Retokenize the document, such that the span at
|
||||
`doc.text[start_idx : end_idx]` is merged into a single token. If
|
||||
`start_idx` and `end_idx` do not mark start and end token boundaries,
|
||||
the document remains unchanged.
|
||||
|
||||
start_idx (int): The character index of the start of the slice to merge.
|
||||
end_idx (int): The character index after the end of the slice to merge.
|
||||
start_idx (int): Character index of the start of the slice to merge.
|
||||
end_idx (int): Character index after the end of the slice to merge.
|
||||
**attributes: Attributes to assign to the merged token. By default,
|
||||
attributes are inherited from the syntactic root token of the span.
|
||||
attributes are inherited from the syntactic root of the span.
|
||||
RETURNS (Token): The newly merged token, or `None` if the start and end
|
||||
indices did not fall at token boundaries.
|
||||
"""
|
||||
|
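A usage sketch for `merge` per the docstring above: character offsets must line up with token boundaries, and unspecified attributes are inherited from the span's syntactic root:

from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=[u'I', u'like', u'New', u'York'])
text = doc.text                            # 'I like New York '
start = text.index(u'New')
merged = doc.merge(start, start + len(u'New York'))
print([t.text for t in doc])               # ['I', 'like', 'New York']
print(merged.text)                         # 'New York'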
@ -847,10 +856,11 @@ cdef class Doc:
|
|||
attributes[ENT_TYPE] = attributes['ent_type']
|
||||
elif args:
|
||||
raise ValueError(
|
||||
"Doc.merge received %d non-keyword arguments. "
|
||||
"Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
|
||||
"Doc.merge received %d non-keyword arguments. Expected either "
|
||||
"3 arguments (deprecated), or 0 (use keyword arguments). "
|
||||
"Arguments supplied:\n%s\n"
|
||||
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
|
||||
"Keyword arguments: %s\n" % (len(args), repr(args),
|
||||
repr(attributes)))
|
||||
|
||||
# More deprecated attribute handling =/
|
||||
if 'label' in attributes:
|
||||
|
@ -882,8 +892,9 @@ cdef class Doc:
|
|||
Token.set_struct_attr(token, attr_name, attr_value)
|
||||
# Begin by setting all the head indices to absolute token positions
|
||||
# This is easier to work with for now than the offsets
|
||||
# Before thinking of something simpler, beware the case where a dependency
|
||||
# bridges over the entity. Here the alignment of the tokens changes.
|
||||
# Before thinking of something simpler, beware the case where a
|
||||
# dependency bridges over the entity. Here the alignment of the
|
||||
# tokens changes.
|
||||
span_root = span.root.i
|
||||
token.dep = span.root.dep
|
||||
# We update token.lex after keeping span root and dep, since
|
||||
|
@ -932,8 +943,9 @@ cdef class Doc:
|
|||
>>> trees = doc.print_tree()
|
||||
>>> trees[1]
|
||||
{'modifiers': [
|
||||
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
|
||||
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
|
||||
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
|
||||
'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
|
||||
'lemma': 'Alice'},
|
||||
{'modifiers': [
|
||||
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
|
||||
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
|
||||
|
@ -1008,7 +1020,7 @@ def pickle_doc(doc):
|
|||
|
||||
def unpickle_doc(vocab, hooks_and_data, bytes_data):
|
||||
user_data, doc_hooks, span_hooks, token_hooks = dill.loads(hooks_and_data)
|
||||
|
||||
|
||||
doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data,
|
||||
exclude='user_data')
|
||||
doc.user_hooks.update(doc_hooks)
|
||||
|
@ -1018,4 +1030,3 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data):
|
|||
|
||||
|
||||
copy_reg.pickle(Doc, pickle_doc, unpickle_doc)
|
||||
|
||||
|
|
|
@ -43,8 +43,8 @@ def POS_tree(root, light=False, flat=False):
|
|||
|
||||
|
||||
def parse_tree(doc, light=False, flat=False):
|
||||
"""Makes a copy of the doc, then construct a syntactic parse tree, similar to
|
||||
the one used in displaCy. Generates the POS tree for all sentences in a doc.
|
||||
"""Make a copy of the doc and construct a syntactic parse tree similar to
|
||||
displaCy. Generates the POS tree for all sentences in a doc.
|
||||
|
||||
doc (Doc): The doc for parsing.
|
||||
RETURNS (dict): The parse tree.
|
||||
|
@ -66,8 +66,9 @@ def parse_tree(doc, light=False, flat=False):
|
|||
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
|
||||
'POS_fine': 'VBD', 'lemma': 'eat'}
|
||||
"""
|
||||
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
|
||||
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
|
||||
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
|
||||
doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]))
|
||||
merge_ents(doc_clone) # merge the entities into single tokens first
|
||||
return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents]
|
||||
return [POS_tree(sent.root, light=light, flat=flat)
|
||||
for sent in doc_clone.sents]
|
||||
|
|
|
@ -35,15 +35,16 @@ cdef class Span:
|
|||
def has_extension(cls, name):
|
||||
return name in Underscore.span_extensions
|
||||
|
||||
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
|
||||
vector_norm=None):
|
||||
def __cinit__(self, Doc doc, int start, int end, attr_t label=0,
|
||||
vector=None, vector_norm=None):
|
||||
"""Create a `Span` object from the slice `doc[start : end]`.
|
||||
|
||||
doc (Doc): The parent document.
|
||||
start (int): The index of the first token of the span.
|
||||
end (int): The index of the first token after the span.
|
||||
label (uint64): A label to attach to the Span, e.g. for named entities.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation
|
||||
of the span.
|
||||
RETURNS (Span): The newly constructed object.
|
||||
"""
|
||||
if not (0 <= start <= end <= len(doc)):
|
||||
|
@ -127,14 +128,17 @@ cdef class Span:
|
|||
|
||||
@property
|
||||
def _(self):
|
||||
"""User space for adding custom attribute extensions."""
|
||||
return Underscore(Underscore.span_extensions, self,
|
||||
start=self.start_char, end=self.end_char)
|
||||
|
||||
def as_doc(self):
|
||||
'''Create a Doc object view of the Span's data.
|
||||
# TODO: fix
|
||||
"""Create a `Doc` object view of the Span's data. This is mostly
|
||||
useful for C-typed interfaces.
|
||||
|
||||
This is mostly useful for C-typed interfaces.
|
||||
'''
|
||||
RETURNS (Doc): The `Doc` view of the span.
|
||||
"""
|
||||
cdef Doc doc = Doc(self.doc.vocab)
|
||||
doc.length = self.end-self.start
|
||||
doc.c = &self.doc.c[self.start]
|
||||
|
@ -162,7 +166,8 @@ cdef class Span:
|
|||
attributes are inherited from the syntactic root token of the span.
|
||||
RETURNS (Token): The newly merged token.
|
||||
"""
|
||||
return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
|
||||
return self.doc.merge(self.start_char, self.end_char, *args,
|
||||
**attributes)
|
||||
|
||||
def similarity(self, other):
|
||||
"""Make a semantic similarity estimate. The default estimate is cosine
|
||||
|
@ -179,24 +184,19 @@ cdef class Span:
|
|||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
|
||||
def get_lca_matrix(self):
|
||||
'''
|
||||
Calculates the lowest common ancestor matrix
|
||||
for a given Spacy span.
|
||||
Returns LCA matrix containing the integer index
|
||||
of the ancestor, or -1 if no common ancestor is
|
||||
found (ex if span excludes a necessary ancestor).
|
||||
Apologies about the recursion, but the
|
||||
impact on performance is negligible given
|
||||
the natural limitations on the depth of a typical human sentence.
|
||||
'''
|
||||
|
||||
"""Calculates the lowest common ancestor matrix for a given `Span`.
|
||||
Returns LCA matrix containing the integer index of the ancestor, or -1
|
||||
if no common ancestor is found (ex if span excludes a necessary
|
||||
ancestor). Apologies about the recursion, but the impact on
|
||||
performance is negligible given the natural limitations on the depth
|
||||
of a typical human sentence.
|
||||
"""
|
||||
def __pairwise_lca(token_j, token_k, lca_matrix, margins):
|
||||
offset = margins[0]
|
||||
token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k
|
||||
token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j
|
||||
token_j_i = token_j.i - offset
|
||||
token_k_i = token_k.i - offset
|
||||
|
||||
if lca_matrix[token_j_i][token_k_i] != -2:
|
||||
return lca_matrix[token_j_i][token_k_i]
|
||||
elif token_j == token_k:
|
||||
|
@ -209,23 +209,19 @@ cdef class Span:
|
|||
lca_index = -1
|
||||
else:
|
||||
lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins)
|
||||
|
||||
lca_matrix[token_j_i][token_k_i] = lca_index
|
||||
lca_matrix[token_k_i][token_j_i] = lca_index
|
||||
|
||||
return lca_index
|
||||
|
||||
lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32)
|
||||
lca_matrix.fill(-2)
|
||||
margins = [self.start, self.end]
|
||||
|
||||
for j in range(len(self)):
|
||||
token_j = self[j]
|
||||
for k in range(len(self)):
|
||||
token_k = self[k]
|
||||
lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins)
|
||||
lca_matrix[k][j] = lca_matrix[j][k]
|
||||
|
||||
return lca_matrix
|
||||
|
||||
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||
|
@ -266,10 +262,7 @@ cdef class Span:
|
|||
self.end = end + 1
|
||||
|
||||
property sent:
|
||||
"""The sentence span that this span is a part of.
|
||||
|
||||
RETURNS (Span): The sentence span that the span is a part of.
|
||||
"""
|
||||
"""RETURNS (Span): The sentence span that the span is a part of."""
|
||||
def __get__(self):
|
||||
if 'sent' in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks['sent'](self)
|
||||
|
@ -282,13 +275,10 @@ cdef class Span:
|
|||
n += 1
|
||||
if n >= self.doc.length:
|
||||
raise RuntimeError
|
||||
return self.doc[root.l_edge : root.r_edge + 1]
|
||||
return self.doc[root.l_edge:root.r_edge + 1]
|
||||
|
||||
property has_vector:
|
||||
"""A boolean value indicating whether a word vector is associated with
|
||||
the object.
|
||||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'has_vector' in self.doc.user_span_hooks:
|
||||
|
@ -310,10 +300,7 @@ cdef class Span:
|
|||
return self._vector
|
||||
|
||||
property vector_norm:
|
||||
"""The L2 norm of the document's vector representation.
|
||||
|
||||
RETURNS (float): The L2 norm of the vector representation.
|
||||
"""
|
||||
"""RETURNS (float): The L2 norm of the vector representation."""
|
||||
def __get__(self):
|
||||
if 'vector_norm' in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks['vector'](self)
|
||||
|
@ -327,7 +314,9 @@ cdef class Span:
|
|||
return self._vector_norm
|
||||
|
||||
property sentiment:
|
||||
# TODO: docstring
|
||||
"""RETURNS (float): A scalar value indicating the positivity or
|
||||
negativity of the span.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'sentiment' in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks['sentiment'](self)
|
||||
|
@ -335,10 +324,7 @@ cdef class Span:
|
|||
return sum([token.sentiment for token in self]) / len(self)
|
||||
|
||||
property text:
|
||||
"""A unicode representation of the span text.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the span.
|
||||
"""
|
||||
"""RETURNS (unicode): The original verbatim text of the span."""
|
||||
def __get__(self):
|
||||
text = self.text_with_ws
|
||||
if self[-1].whitespace_:
|
||||
|
@ -349,7 +335,8 @@ cdef class Span:
|
|||
"""The text content of the span with a trailing whitespace character if
|
||||
the last token has one.
|
||||
|
||||
RETURNS (unicode): The text content of the span (with trailing whitespace).
|
||||
RETURNS (unicode): The text content of the span (with trailing
|
||||
whitespace).
|
||||
"""
|
||||
def __get__(self):
|
||||
return u''.join([t.text_with_ws for t in self])
|
||||
|
@ -358,7 +345,8 @@ cdef class Span:
|
|||
"""Yields base noun-phrase `Span` objects, if the document has been
|
||||
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
|
||||
phrase that does not permit other NPs to be nested within it – so no
|
||||
NP-level coordination, no prepositional phrases, and no relative clauses.
|
||||
NP-level coordination, no prepositional phrases, and no relative
|
||||
clauses.
|
||||
|
||||
YIELDS (Span): Base noun-phrase `Span` objects
|
||||
"""
|
||||
|
@ -366,12 +354,14 @@ cdef class Span:
|
|||
if not self.doc.is_parsed:
|
||||
raise ValueError(
|
||||
"noun_chunks requires the dependency parse, which "
|
||||
"requires data to be installed. For more info, see the "
|
||||
"requires a statistical model to be installed and loaded. "
|
||||
"For more info, see the "
|
||||
"documentation: \n%s\n" % about.__docs_models__)
|
||||
# Accumulate the result before beginning to iterate over it. This prevents
|
||||
# the tokenisation from being changed out from under us during the iteration.
|
||||
# The tricky thing here is that Span accepts its tokenisation changing,
|
||||
# so it's okay once we have the Span objects. See Issue #375
|
||||
# Accumulate the result before beginning to iterate over it. This
|
||||
# prevents the tokenisation from being changed out from under us
|
||||
# during the iteration. The tricky thing here is that Span accepts
|
||||
# its tokenisation changing, so it's okay once we have the Span
|
||||
# objects. See Issue #375
|
||||
spans = []
|
||||
cdef attr_t label
|
||||
for start, end, label in self.doc.noun_chunks_iterator(self):
|
||||
|
@ -385,9 +375,9 @@ cdef class Span:
|
|||
|
||||
RETURNS (Token): The root token.
|
||||
|
||||
EXAMPLE: The root token has the shortest path to the root of the sentence
|
||||
(or is the root itself). If multiple words are equally high in the
|
||||
tree, the first word is taken. For example:
|
||||
EXAMPLE: The root token has the shortest path to the root of the
|
||||
sentence (or is the root itself). If multiple words are equally
|
||||
high in the tree, the first word is taken. For example:
|
||||
|
||||
>>> toks = nlp(u'I like New York in Autumn.')
|
||||
|
||||
|
@ -437,11 +427,11 @@ cdef class Span:
|
|||
if self.doc.c[i].head == 0:
|
||||
return self.doc[i]
|
||||
# If we don't have a sentence root, we do something that's not so
|
||||
# algorithmically clever, but I think should be quite fast, especially
|
||||
# for short spans.
|
||||
# algorithmically clever, but I think should be quite fast,
|
||||
# especially for short spans.
|
||||
# For each word, we count the path length, and arg min this measure.
|
||||
# We could use better tree logic to save steps here...But I think this
|
||||
# should be okay.
|
||||
# We could use better tree logic to save steps here...But I
|
||||
# think this should be okay.
|
||||
cdef int current_best = self.doc.length
|
||||
cdef int root = -1
|
||||
for i in range(self.start, self.end):
|
||||
|
@ -463,7 +453,7 @@ cdef class Span:
|
|||
YIELDS (Token): A left-child of a token of the span.
|
||||
"""
|
||||
def __get__(self):
|
||||
for token in reversed(self): # Reverse, so we get the tokens in order
|
||||
for token in reversed(self): # Reverse, so we get tokens in order
|
||||
for left in token.lefts:
|
||||
if left.i < self.start:
|
||||
yield left
|
||||
|
@ -480,6 +470,22 @@ cdef class Span:
|
|||
if right.i >= self.end:
|
||||
yield right
|
||||
|
||||
property n_lefts:
|
||||
"""RETURNS (int): The number of leftward immediate children of the
|
||||
span, in the syntactic dependency parse.
|
||||
"""
|
||||
# TODO: implement
|
||||
def __get__(self):
|
||||
raise NotImplementedError
|
||||
|
||||
property n_rights:
|
||||
"""RETURNS (int): The number of rightward immediate children of the
|
||||
span, in the syntactic dependency parse.
|
||||
"""
|
||||
# TODO: implement
|
||||
def __get__(self):
|
||||
raise NotImplementedError
|
||||
|
||||
property subtree:
|
||||
"""Tokens that descend from tokens in the span, but fall outside it.
|
||||
|
||||
|
@ -493,66 +499,55 @@ cdef class Span:
|
|||
yield from word.subtree
|
||||
|
||||
property ent_id:
|
||||
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
|
||||
|
||||
RETURNS (uint64): The entity ID.
|
||||
"""
|
||||
"""RETURNS (uint64): The entity ID."""
|
||||
def __get__(self):
|
||||
return self.root.ent_id
|
||||
|
||||
def __set__(self, hash_t key):
|
||||
# TODO
|
||||
raise NotImplementedError(
|
||||
"Can't yet set ent_id from Span. Vote for this feature on the issue "
|
||||
"tracker: http://github.com/explosion/spaCy/issues")
|
||||
"Can't yet set ent_id from Span. Vote for this feature on "
|
||||
"the issue tracker: http://github.com/explosion/spaCy/issues")
|
||||
|
||||
property ent_id_:
|
||||
"""A (string) entity ID. Usually assigned by patterns in the `Matcher`.
|
||||
|
||||
RETURNS (unicode): The entity ID.
|
||||
"""
|
||||
"""RETURNS (unicode): The (string) entity ID."""
|
||||
def __get__(self):
|
||||
return self.root.ent_id_
|
||||
|
||||
def __set__(self, hash_t key):
|
||||
# TODO
|
||||
raise NotImplementedError(
|
||||
"Can't yet set ent_id_ from Span. Vote for this feature on the issue "
|
||||
"tracker: http://github.com/explosion/spaCy/issues")
|
||||
"Can't yet set ent_id_ from Span. Vote for this feature on the "
|
||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||
|
||||
property orth_:
|
||||
# TODO: docstring
|
||||
"""Verbatim text content (identical to Span.text). Exists mostly for
|
||||
consistency with other attributes.
|
||||
|
||||
RETURNS (unicode): The span's text."""
|
||||
def __get__(self):
|
||||
return ''.join([t.string for t in self]).strip()
|
||||
return ''.join([t.orth_ for t in self]).strip()
|
||||
|
||||
property lemma_:
|
||||
"""The span's lemma.
|
||||
|
||||
RETURNS (unicode): The span's lemma.
|
||||
"""
|
||||
"""RETURNS (unicode): The span's lemma."""
|
||||
def __get__(self):
|
||||
return ' '.join([t.lemma_ for t in self]).strip()
|
||||
|
||||
property upper_:
|
||||
# TODO: docstring
|
||||
"""Deprecated. Use Span.text.upper() instead."""
|
||||
def __get__(self):
|
||||
return ''.join([t.string.upper() for t in self]).strip()
|
||||
return ''.join([t.text_with_ws.upper() for t in self]).strip()
|
||||
|
||||
property lower_:
|
||||
# TODO: docstring
|
||||
"""Deprecated. Use Span.text.lower() instead."""
|
||||
def __get__(self):
|
||||
return ''.join([t.string.lower() for t in self]).strip()
|
||||
return ''.join([t.text_with_ws.lower() for t in self]).strip()
|
||||
|
||||
property string:
|
||||
# TODO: docstring
|
||||
"""Deprecated: Use Span.text_with_ws instead."""
|
||||
def __get__(self):
|
||||
return ''.join([t.string for t in self])
|
||||
return ''.join([t.text_with_ws for t in self])
|
||||
|
||||
property label_:
|
||||
"""The span's label.
|
||||
|
||||
RETURNS (unicode): The span's label.
|
||||
"""
|
||||
"""RETURNS (unicode): The span's label."""
|
||||
def __get__(self):
|
||||
return self.doc.vocab.strings[self.label]
|
||||
|
||||
|
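To illustrate the string-view properties touched in this hunk (which now build on `Token.text_with_ws`), a short sketch with a blank vocab; the token texts are arbitrary:

from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=[u'New', u'York'], spaces=[True, False])
span = doc[0:2]
print(span.text)             # 'New York'
print(span.text_with_ws)     # 'New York' (no trailing space: last token has none)
print(span.lower_, span.upper_)   # 'new york' 'NEW YORK'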
@ -570,7 +565,8 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
|
|||
n += 1
|
||||
if n >= sent_length:
|
||||
raise RuntimeError(
|
||||
"Array bounds exceeded while searching for root word. This likely "
|
||||
"means the parse tree is in an invalid state. Please report this "
|
||||
"issue here: http://github.com/explosion/spaCy/issues")
|
||||
"Array bounds exceeded while searching for root word. This "
|
||||
"likely means the parse tree is in an invalid state. Please "
|
||||
"report this issue here: "
|
||||
"http://github.com/explosion/spaCy/issues")
|
||||
return n
|
||||
|
|
|
@ -14,17 +14,18 @@ from ..typedefs cimport hash_t
|
|||
from ..lexeme cimport Lexeme
|
||||
from .. import parts_of_speech
|
||||
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
|
||||
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from ..attrs cimport LEMMA, POS, TAG, DEP
|
||||
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
|
||||
from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL
|
||||
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
|
||||
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
|
||||
from ..compat import is_config
|
||||
from .. import about
|
||||
from .underscore import Underscore
|
||||
|
||||
|
||||
cdef class Token:
|
||||
"""An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
|
||||
"""An individual token – i.e. a word, punctuation symbol, whitespace,
|
||||
etc."""
|
||||
@classmethod
|
||||
def set_extension(cls, name, default=None, method=None,
|
||||
getter=None, setter=None):
|
||||
|
@ -144,37 +145,33 @@ cdef class Token:
|
|||
return self.doc.user_token_hooks['similarity'](self)
|
||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||
return 0.0
|
||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
return (numpy.dot(self.vector, other.vector) /
|
||||
(self.vector_norm * other.vector_norm))
|
||||
|
||||
property lex_id:
|
||||
"""ID of the token's lexical type.
|
||||
|
||||
RETURNS (int): ID of the token's lexical type."""
|
||||
"""RETURNS (int): Sequential ID of the token's lexical type."""
|
||||
def __get__(self):
|
||||
return self.c.lex.id
|
||||
|
||||
property rank:
|
||||
# TODO: add docstring
|
||||
"""RETURNS (int): Sequential ID of the token's lexical type, used to
|
||||
index into tables, e.g. for word vectors."""
|
||||
def __get__(self):
|
||||
return self.c.lex.id
|
||||
|
||||
property string:
|
||||
"""Deprecated: Use Token.text_with_ws instead."""
|
||||
def __get__(self):
|
||||
return self.text_with_ws
|
||||
|
||||
property text:
|
||||
"""A unicode representation of the token text.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the token.
|
||||
"""
|
||||
"""RETURNS (unicode): The original verbatim text of the token."""
|
||||
def __get__(self):
|
||||
return self.orth_
|
||||
|
||||
property text_with_ws:
|
||||
"""The text content of the token with a trailing whitespace character if
|
||||
it has one.
|
||||
|
||||
RETURNS (unicode): The text content of the span (with trailing whitespace).
|
||||
"""RETURNS (unicode): The text content of the span (with trailing
|
||||
whitespace).
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
|
||||
|
@ -184,74 +181,104 @@ cdef class Token:
|
|||
return orth
|
||||
|
||||
property prob:
|
||||
"""RETURNS (float): Smoothed log probability estimate of token type."""
|
||||
def __get__(self):
|
||||
return self.c.lex.prob
|
||||
|
||||
property sentiment:
|
||||
"""RETURNS (float): A scalar value indicating the positivity or
|
||||
negativity of the token."""
|
||||
def __get__(self):
|
||||
if 'sentiment' in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks['sentiment'](self)
|
||||
return self.c.lex.sentiment
|
||||
|
||||
property lang:
|
||||
"""RETURNS (uint64): ID of the language of the parent document's
|
||||
vocabulary.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.lex.lang
|
||||
|
||||
property idx:
|
||||
"""RETURNS (int): The character offset of the token within the parent
|
||||
document.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.idx
|
||||
|
||||
property cluster:
|
||||
"""RETURNS (int): Brown cluster ID."""
|
||||
def __get__(self):
|
||||
return self.c.lex.cluster
|
||||
|
||||
property orth:
|
||||
"""RETURNS (uint64): ID of the verbatim text content."""
|
||||
def __get__(self):
|
||||
return self.c.lex.orth
|
||||
|
||||
property lower:
|
||||
"""RETURNS (uint64): ID of the lowercase token text."""
|
||||
def __get__(self):
|
||||
return self.c.lex.lower
|
||||
|
||||
property norm:
|
||||
"""RETURNS (uint64): ID of the token's norm, i.e. a normalised form of
|
||||
the token text. Usually set in the language's tokenizer exceptions
|
||||
or norm exceptions.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.lex.norm
|
||||
|
||||
property shape:
|
||||
"""RETURNS (uint64): ID of the token's shape, a transform of the
|
||||
token's string, to show orthographic features (e.g. "Xxxx", "dd").
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.lex.shape
|
||||
|
||||
property prefix:
|
||||
"""RETURNS (uint64): ID of a length-N substring from the start of the
|
||||
token. Defaults to `N=1`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.lex.prefix
|
||||
|
||||
property suffix:
|
||||
"""RETURNS (uint64): ID of a length-N substring from the end of the
|
||||
token. Defaults to `N=3`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.lex.suffix
|
||||
|
||||
property lemma:
|
||||
"""Base form of the word, with no inflectional suffixes.
|
||||
|
||||
RETURNS (uint64): Token lemma.
|
||||
"""RETURNS (uint64): ID of the base form of the word, with no
|
||||
inflectional suffixes.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.lemma
|
||||
|
||||
def __set__(self, attr_t lemma):
|
||||
self.c.lemma = lemma
|
||||
|
||||
property pos:
|
||||
"""RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
|
||||
def __get__(self):
|
||||
return self.c.pos
|
||||
|
||||
property tag:
|
||||
"""RETURNS (uint64): ID of fine-grained part-of-speech tag."""
|
||||
def __get__(self):
|
||||
return self.c.tag
|
||||
|
||||
def __set__(self, attr_t tag):
|
||||
self.vocab.morphology.assign_tag(self.c, tag)
|
||||
|
||||
property dep:
|
||||
"""RETURNS (uint64): ID of syntactic dependency label."""
|
||||
def __get__(self):
|
||||
return self.c.dep
|
||||
|
||||
def __set__(self, attr_t label):
|
||||
self.c.dep = label
|
||||
|
||||
|
@ -292,23 +319,29 @@ cdef class Token:
|
|||
return numpy.sqrt((vector ** 2).sum())
|
||||
|
||||
property n_lefts:
|
||||
"""RETURNS (int): The number of leftward immediate children of the
|
||||
word, in the syntactic dependency parse.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.l_kids
|
||||
|
||||
property n_rights:
|
||||
"""RETURNS (int): The number of rightward immediate children of the
|
||||
word, in the syntactic dependency parse.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.r_kids
|
||||
|
||||
property sent_start:
|
||||
# TODO: fix and document
|
||||
def __get__(self):
|
||||
return self.c.sent_start
|
||||
|
||||
def __set__(self, value):
|
||||
if self.doc.is_parsed:
|
||||
raise ValueError(
|
||||
'Refusing to write to token.sent_start if its document is parsed, '
|
||||
'because this may cause inconsistent state. '
|
||||
'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.')
|
||||
"Refusing to write to token.sent_start if its document "
|
||||
"is parsed, because this may cause inconsistent state.")
|
||||
if value is None:
|
||||
self.c.sent_start = 0
|
||||
elif value is True:
|
||||
|
@ -316,15 +349,16 @@ cdef class Token:
|
|||
elif value is False:
|
||||
self.c.sent_start = -1
|
||||
else:
|
||||
raise ValueError("Invalid value for token.sent_start -- must be one of "
|
||||
"None, True, False")
|
||||
raise ValueError("Invalid value for token.sent_start. Must be "
|
||||
"one of: None, True, False")
|
||||
|
||||
property lefts:
|
||||
"""The leftward immediate children of the word, in the syntactic
|
||||
dependency parse.
|
||||
|
||||
YIELDS (Token): A left-child of the token.
|
||||
"""
|
||||
def __get__(self):
|
||||
"""
|
||||
The leftward immediate children of the word, in the syntactic
|
||||
dependency parse.
|
||||
"""
|
||||
cdef int nr_iter = 0
|
||||
cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
|
||||
while ptr < self.c:
|
||||
|
@ -334,15 +368,16 @@ cdef class Token:
|
|||
nr_iter += 1
|
||||
# This is ugly, but it's a way to guard out infinite loops
|
||||
if nr_iter >= 10000000:
|
||||
raise RuntimeError(
|
||||
"Possibly infinite loop encountered while looking for token.lefts")
|
||||
raise RuntimeError("Possibly infinite loop encountered "
|
||||
"while looking for token.lefts")
|
||||
|
||||
property rights:
|
||||
"""The rightward immediate children of the word, in the syntactic
|
||||
dependency parse.
|
||||
|
||||
YIELDS (Token): A right-child of the token.
|
||||
"""
|
||||
def __get__(self):
|
||||
"""
|
||||
The rightward immediate children of the word, in the syntactic
|
||||
dependency parse.
|
||||
"""
|
||||
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
|
||||
tokens = []
|
||||
cdef int nr_iter = 0
|
||||
|
@ -352,27 +387,26 @@ cdef class Token:
|
|||
ptr -= 1
|
||||
nr_iter += 1
|
||||
if nr_iter >= 10000000:
|
||||
raise RuntimeError(
|
||||
"Possibly infinite loop encountered while looking for token.rights")
|
||||
raise RuntimeError("Possibly infinite loop encountered "
|
||||
"while looking for token.rights")
|
||||
tokens.reverse()
|
||||
for t in tokens:
|
||||
yield t
|
||||
|
||||
property children:
|
||||
"""
|
||||
A sequence of the token's immediate syntactic children.
|
||||
"""A sequence of the token's immediate syntactic children.
|
||||
|
||||
Yields: Token A child token such that child.head==self
|
||||
YIELDS (Token): A child token such that child.head==self
|
||||
"""
|
||||
def __get__(self):
|
||||
yield from self.lefts
|
||||
yield from self.rights
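`children` simply chains `lefts` and `rights`, so left dependents come first. A usage sketch, assuming a hypothetical `nlp` pipeline with a parser and the usual parse of the example sentence:

doc = nlp(u'I like New York in Autumn.')
like = doc[1]
for child in like.children:
    # Typically yields 'I' (left dependent) before 'York', 'in' and '.' (right dependents).
    print(child.text, child.dep_)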
|
||||
|
||||
property subtree:
|
||||
"""
|
||||
A sequence of all the token's syntactic descendents.
|
||||
"""A sequence of all the token's syntactic descendents.
|
||||
|
||||
Yields: Token A descendent token such that self.is_ancestor(descendent)
|
||||
YIELDS (Token): A descendant token such that
|
||||
`self.is_ancestor(descendant)`.
|
||||
"""
|
||||
def __get__(self):
|
||||
for word in self.lefts:
|
||||
|
@ -422,18 +456,17 @@ cdef class Token:
|
|||
"""
|
||||
if self.doc is not descendant.doc:
|
||||
return False
|
||||
return any( ancestor.i == self.i for ancestor in descendant.ancestors )
|
||||
return any(ancestor.i == self.i for ancestor in descendant.ancestors)
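So `is_ancestor` walks the other token's `ancestors` chain and compares token indices, after checking that both tokens belong to the same `Doc`. A usage sketch (hypothetical parsed `doc`; the asserts assume the usual parse):

doc = nlp(u'I like New York')
assert doc[1].is_ancestor(doc[3])        # 'like' heads the tree that contains 'York'
other = nlp(u'Completely unrelated text')
assert not doc[1].is_ancestor(other[0])  # tokens from different Docs are never related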
|
||||
|
||||
property head:
|
||||
"""The syntactic parent, or "governor", of this token.
|
||||
|
||||
RETURNS (Token): The token head.
|
||||
RETURNS (Token): The token predicted by the parser to be the head of
|
||||
the current token.
|
||||
"""
|
||||
def __get__(self):
|
||||
"""The token predicted by the parser to be the head of the current
|
||||
token.
|
||||
"""
|
||||
return self.doc[self.i + self.c.head]
|
||||
|
||||
def __set__(self, Token new_head):
|
||||
# this function sets the head of self to new_head
|
||||
# and updates the counters for left/right dependents
|
||||
|
@ -453,16 +486,18 @@ cdef class Token:
|
|||
cdef Token anc, child
|
||||
|
||||
# update number of deps of old head
|
||||
if self.c.head > 0: # left dependent
|
||||
if self.c.head > 0: # left dependent
|
||||
old_head.c.l_kids -= 1
|
||||
if self.c.l_edge == old_head.c.l_edge:
|
||||
# the token dominates the left edge so the left edge of the head
|
||||
# may change when the token is reattached
|
||||
# it may not change if the new head is a descendant of the current head
|
||||
# the token dominates the left edge so the left edge of
|
||||
# the head may change when the token is reattached, it may
|
||||
# not change if the new head is a descendant of the current
|
||||
# head
|
||||
|
||||
new_edge = self.c.l_edge
|
||||
# the new l_edge is the left-most l_edge on any of the other dependents
|
||||
# where the l_edge is left of the head, otherwise it is the head
|
||||
# the new l_edge is the left-most l_edge on any of the
|
||||
# other dependents where the l_edge is left of the head,
|
||||
# otherwise it is the head
|
||||
if not is_desc:
|
||||
new_edge = old_head.i
|
||||
for child in old_head.children:
|
||||
|
@ -472,14 +507,15 @@ cdef class Token:
|
|||
new_edge = child.c.l_edge
|
||||
old_head.c.l_edge = new_edge
|
||||
|
||||
# walk up the tree from old_head and assign new l_edge to ancestors
|
||||
# until an ancestor already has an l_edge that's further left
|
||||
# walk up the tree from old_head and assign new l_edge to
|
||||
# ancestors until an ancestor already has an l_edge that's
|
||||
# further left
|
||||
for anc in old_head.ancestors:
|
||||
if anc.c.l_edge <= new_edge:
|
||||
break
|
||||
anc.c.l_edge = new_edge
|
||||
|
||||
elif self.c.head < 0: # right dependent
|
||||
elif self.c.head < 0: # right dependent
|
||||
old_head.c.r_kids -= 1
|
||||
# do the same thing as for l_edge
|
||||
if self.c.r_edge == old_head.c.r_edge:
|
||||
|
@ -500,7 +536,7 @@ cdef class Token:
|
|||
anc.c.r_edge = new_edge
|
||||
|
||||
# update number of deps of new head
|
||||
if rel_newhead_i > 0: # left dependent
|
||||
if rel_newhead_i > 0: # left dependent
|
||||
new_head.c.l_kids += 1
|
||||
# walk up the tree from new head and set l_edge to self.l_edge
|
||||
# until you hit a token with an l_edge further to the left
|
||||
|
@ -511,7 +547,7 @@ cdef class Token:
|
|||
break
|
||||
anc.c.l_edge = self.c.l_edge
|
||||
|
||||
elif rel_newhead_i < 0: # right dependent
|
||||
elif rel_newhead_i < 0: # right dependent
|
||||
new_head.c.r_kids += 1
|
||||
# do the same as for l_edge
|
||||
if self.c.r_edge > new_head.c.r_edge:
|
||||
|
@ -542,12 +578,10 @@ cdef class Token:
|
|||
yield from word.conjuncts
|
||||
|
||||
property ent_type:
|
||||
"""Named entity type.
|
||||
|
||||
RETURNS (uint64): Named entity type.
|
||||
"""
|
||||
"""RETURNS (uint64): Named entity type."""
|
||||
def __get__(self):
|
||||
return self.c.ent_type
|
||||
|
||||
def __set__(self, ent_type):
|
||||
self.c.ent_type = ent_type
|
||||
|
||||
|
@ -561,19 +595,17 @@ cdef class Token:
|
|||
return self.c.ent_iob
|
||||
|
||||
property ent_type_:
|
||||
"""Named entity type.
|
||||
|
||||
RETURNS (unicode): Named entity type.
|
||||
"""
|
||||
"""RETURNS (unicode): Named entity type."""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.ent_type]
|
||||
|
||||
def __set__(self, ent_type):
|
||||
self.c.ent_type = self.vocab.strings.add(ent_type)
|
||||
|
||||
property ent_iob_:
|
||||
"""IOB code of named entity tag. "B" means the token begins an entity,
|
||||
"I" means it is inside an entity, "O" means it is outside an entity, and
|
||||
"" means no entity tag is set.
|
||||
"I" means it is inside an entity, "O" means it is outside an entity,
|
||||
and "" means no entity tag is set.
|
||||
|
||||
RETURNS (unicode): IOB code of named entity tag.
|
||||
"""
|
||||
|
@ -582,10 +614,8 @@ cdef class Token:
|
|||
return iob_strings[self.c.ent_iob]
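A sketch of how the IOB codes read in practice, assuming a hypothetical pipeline with an entity recognizer (the exact entity spans depend on the model):

doc = nlp(u'Facebook was founded in California')
for token in doc:
    # e.g. 'Facebook' -> 'B', 'was'/'founded'/'in' -> 'O', 'California' -> 'B'
    print(token.text, token.ent_iob_, token.ent_type_)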
|
||||
|
||||
property ent_id:
|
||||
"""ID of the entity the token is an instance of, if any. Usually
|
||||
assigned by patterns in the Matcher.
|
||||
|
||||
RETURNS (uint64): ID of the entity.
|
||||
"""RETURNS (uint64): ID of the entity the token is an instance of,
|
||||
if any.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.ent_id
|
||||
|
@ -594,10 +624,8 @@ cdef class Token:
|
|||
self.c.ent_id = key
|
||||
|
||||
property ent_id_:
|
||||
"""ID of the entity the token is an instance of, if any. Usually
|
||||
assigned by patterns in the Matcher.
|
||||
|
||||
RETURNS (unicode): ID of the entity.
|
||||
"""RETURNS (unicode): ID of the entity the token is an instance of,
|
||||
if any.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.ent_id]
|
||||
|
@ -606,107 +634,192 @@ cdef class Token:
|
|||
self.c.ent_id = self.vocab.strings.add(name)
|
||||
|
||||
property whitespace_:
|
||||
"""RETURNS (unicode): The trailing whitespace character, if present.
|
||||
"""
|
||||
def __get__(self):
|
||||
return ' ' if self.c.spacy else ''
|
||||
|
||||
property orth_:
|
||||
"""RETURNS (unicode): Verbatim text content (identical to
|
||||
`Token.text`). Exists mostly for consistency with the other
|
||||
attributes.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.orth]
|
||||
|
||||
property lower_:
|
||||
"""RETURNS (unicode): The lowercase token text. Equivalent to
|
||||
`Token.text.lower()`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.lower]
|
||||
|
||||
property norm_:
|
||||
"""RETURNS (unicode): The token's norm, i.e. a normalised form of the
|
||||
token text. Usually set in the language's tokenizer exceptions or
|
||||
norm exceptions.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.norm]
|
||||
|
||||
property shape_:
|
||||
"""RETURNS (unicode): Transform of the tokens's string, to show
|
||||
orthographic features. For example, "Xxxx" or "dd".
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.shape]
|
||||
|
||||
property prefix_:
|
||||
"""RETURNS (unicode): A length-N substring from the start of the token.
|
||||
Defaults to `N=1`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.prefix]
|
||||
|
||||
property suffix_:
|
||||
"""RETURNS (unicode): A length-N substring from the end of the token.
|
||||
Defaults to `N=3`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.suffix]
|
||||
|
||||
property lang_:
|
||||
"""RETURNS (unicode): Language of the parent document's vocabulary,
|
||||
e.g. 'en'.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lex.lang]
|
||||
|
||||
property lemma_:
|
||||
"""Base form of the word, with no inflectional suffixes.
|
||||
|
||||
RETURNS (unicode): Token lemma.
|
||||
"""RETURNS (unicode): The token lemma, i.e. the base form of the word,
|
||||
with no inflectional suffixes.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lemma]
|
||||
|
||||
def __set__(self, unicode lemma_):
|
||||
self.c.lemma = self.vocab.strings.add(lemma_)
|
||||
|
||||
property pos_:
|
||||
"""RETURNS (unicode): Coarse-grained part-of-speech tag."""
|
||||
def __get__(self):
|
||||
return parts_of_speech.NAMES[self.c.pos]
|
||||
|
||||
property tag_:
|
||||
"""RETURNS (unicode): Fine-grained part-of-speech tag."""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.tag]
|
||||
|
||||
def __set__(self, tag):
|
||||
self.tag = self.vocab.strings.add(tag)
|
||||
|
||||
property dep_:
|
||||
"""RETURNS (unicode): The syntactic dependency label."""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.dep]
|
||||
|
||||
def __set__(self, unicode label):
|
||||
self.c.dep = self.vocab.strings.add(label)
|
||||
|
||||
property is_oov:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
|
||||
"""RETURNS (bool): Whether the token is out-of-vocabulary."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_OOV)
|
||||
|
||||
property is_stop:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP)
|
||||
"""RETURNS (bool): Whether the token is a stop word, i.e. part of a
|
||||
"stop list" defined by the language data.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_STOP)
|
||||
|
||||
property is_alpha:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
|
||||
"""RETURNS (bool): Whether the token consists of alpha characters.
|
||||
Equivalent to `token.text.isalpha()`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_ALPHA)
|
||||
|
||||
property is_ascii:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
|
||||
"""RETURNS (bool): Whether the token consists of ASCII characters.
|
||||
Equivalent to `all(ord(c) < 128 for c in token.text)`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_ASCII)
|
||||
|
||||
property is_digit:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
|
||||
"""RETURNS (bool): Whether the token consists of digits. Equivalent to
|
||||
`token.text.isdigit()`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_DIGIT)
|
||||
|
||||
property is_lower:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
|
||||
"""RETURNS (bool): Whether the token is in lowercase. Equivalent to
|
||||
`token.text.islower()`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_LOWER)
|
||||
|
||||
property is_upper:
|
||||
"""RETURNS (bool): Whether the token is in uppercase. Equivalent to
|
||||
`token.text.isupper()`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_UPPER)
|
||||
|
||||
property is_title:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
|
||||
"""RETURNS (bool): Whether the token is in titlecase. Equivalent to
|
||||
`token.text.istitle()`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_TITLE)
|
||||
|
||||
property is_punct:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
|
||||
"""RETURNS (bool): Whether the token is punctuation."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_PUNCT)
|
||||
|
||||
property is_space:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
|
||||
"""RETURNS (bool): Whether the token consists of whitespace characters.
|
||||
Equivalent to `token.text.isspace()`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
|
||||
|
||||
property is_bracket:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
|
||||
"""RETURNS (bool): Whether the token is a bracket."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
|
||||
|
||||
property is_quote:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
|
||||
"""RETURNS (bool): Whether the token is a quotation mark."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
|
||||
|
||||
property is_left_punct:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
|
||||
"""RETURNS (bool): Whether the token is a left punctuation mark."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
|
||||
|
||||
property is_right_punct:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
|
||||
"""RETURNS (bool): Whether the token is a left punctuation mark."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
|
||||
|
||||
property like_url:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
|
||||
"""RETURNS (bool): Whether the token resembles a URL."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
|
||||
|
||||
property like_num:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
|
||||
"""RETURNS (bool): Whether the token resembles a number, e.g. "10.9",
|
||||
"10", "ten", etc.
|
||||
"""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, LIKE_NUM)
|
||||
|
||||
property like_email:
|
||||
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
|
||||
"""RETURNS (bool): Whether the token resembles an email address."""
|
||||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL)
|
||||
|
|
|
@ -1,5 +1,9 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import functools
|
||||
|
||||
|
||||
class Underscore(object):
|
||||
doc_extensions = {}
|
||||
span_extensions = {}
|
||||
|
|
|
|
|
@ -10,25 +10,27 @@ from pathlib import Path
|
|||
import sys
|
||||
import textwrap
|
||||
import random
|
||||
import numpy
|
||||
import io
|
||||
import dill
|
||||
from collections import OrderedDict
|
||||
from thinc.neural._classes.model import Model
|
||||
import functools
|
||||
|
||||
from .symbols import ORTH
|
||||
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
|
||||
from .compat import import_file
|
||||
|
||||
import msgpack
|
||||
import msgpack_numpy
|
||||
msgpack_numpy.patch()
|
||||
import ujson
|
||||
|
||||
from .symbols import ORTH
|
||||
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
|
||||
from .compat import copy_array, normalize_string_keys, getattr_, import_file
|
||||
|
||||
|
||||
LANGUAGES = {}
|
||||
_data_path = Path(__file__).parent / 'data'
|
||||
_PRINT_ENV = False
|
||||
|
||||
|
||||
def set_env_log(value):
|
||||
global _PRINT_ENV
|
||||
_PRINT_ENV = value
|
||||
|
||||
|
||||
def get_lang_class(lang):
|
||||
|
@ -38,11 +40,12 @@ def get_lang_class(lang):
|
|||
RETURNS (Language): Language class.
|
||||
"""
|
||||
global LANGUAGES
|
||||
if not lang in LANGUAGES:
|
||||
if lang not in LANGUAGES:
|
||||
try:
|
||||
module = importlib.import_module('.lang.%s' % lang, 'spacy')
|
||||
except ImportError:
|
||||
raise ImportError("Can't import language %s from spacy.lang." %lang)
|
||||
msg = "Can't import language %s from spacy.lang."
|
||||
raise ImportError(msg % lang)
|
||||
LANGUAGES[lang] = getattr(module, module.__all__[0])
|
||||
return LANGUAGES[lang]
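The class is imported lazily from `spacy.lang.<lang>` and cached in `LANGUAGES`. A usage sketch:

cls = get_lang_class('en')   # resolves spacy.lang.en, presumably returning its English class
nlp = cls()                  # blank pipeline for that language
# An unknown code such as get_lang_class('xx_unknown') raises the ImportError built above.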
|
||||
|
||||
|
@ -100,14 +103,14 @@ def load_model(name, **overrides):
|
|||
data_path = get_data_path()
|
||||
if not data_path or not data_path.exists():
|
||||
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
|
||||
if isinstance(name, basestring_):
|
||||
if name in set([d.name for d in data_path.iterdir()]): # in data dir / shortcut
|
||||
if isinstance(name, basestring_): # in data dir / shortcut
|
||||
if name in set([d.name for d in data_path.iterdir()]):
|
||||
return load_model_from_link(name, **overrides)
|
||||
if is_package(name): # installed as package
|
||||
if is_package(name): # installed as package
|
||||
return load_model_from_package(name, **overrides)
|
||||
if Path(name).exists(): # path to model data directory
|
||||
if Path(name).exists(): # path to model data directory
|
||||
return load_model_from_path(Path(name), **overrides)
|
||||
elif hasattr(name, 'exists'): # Path or Path-like to model data
|
||||
elif hasattr(name, 'exists'): # Path or Path-like to model data
|
||||
return load_model_from_path(name, **overrides)
|
||||
raise IOError("Can't find model '%s'" % name)
|
||||
|
||||
|
@ -120,7 +123,7 @@ def load_model_from_link(name, **overrides):
|
|||
except AttributeError:
|
||||
raise IOError(
|
||||
"Cant' load '%s'. If you're using a shortcut link, make sure it "
|
||||
"points to a valid model package (not just a data directory)." % name)
|
||||
"points to a valid package (not just a data directory)." % name)
|
||||
return cls.load(**overrides)
|
||||
|
||||
|
||||
|
@ -164,7 +167,8 @@ def load_model_from_init_py(init_file, **overrides):
|
|||
data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
|
||||
data_path = model_path / data_dir
|
||||
if not model_path.exists():
|
||||
raise ValueError("Can't find model directory: %s" % path2str(data_path))
|
||||
msg = "Can't find model directory: %s"
|
||||
raise ValueError(msg % path2str(data_path))
|
||||
return load_model_from_path(data_path, meta, **overrides)
|
||||
|
||||
|
||||
|
@ -176,14 +180,16 @@ def get_model_meta(path):
|
|||
"""
|
||||
model_path = ensure_path(path)
|
||||
if not model_path.exists():
|
||||
raise ValueError("Can't find model directory: %s" % path2str(model_path))
|
||||
msg = "Can't find model directory: %s"
|
||||
raise ValueError(msg % path2str(model_path))
|
||||
meta_path = model_path / 'meta.json'
|
||||
if not meta_path.is_file():
|
||||
raise IOError("Could not read meta.json from %s" % meta_path)
|
||||
meta = read_json(meta_path)
|
||||
for setting in ['lang', 'name', 'version']:
|
||||
if setting not in meta or not meta[setting]:
|
||||
raise ValueError("No valid '%s' setting found in model meta.json" % setting)
|
||||
msg = "No valid '%s' setting found in model meta.json"
|
||||
raise ValueError(msg % setting)
|
||||
return meta
|
||||
|
||||
|
||||
|
@ -240,7 +246,7 @@ def get_async(stream, numpy_array):
|
|||
return numpy_array
|
||||
else:
|
||||
array = cupy.ndarray(numpy_array.shape, order='C',
|
||||
dtype=numpy_array.dtype)
|
||||
dtype=numpy_array.dtype)
|
||||
array.set(numpy_array, stream=stream)
|
||||
return array
|
||||
|
||||
|
@ -274,12 +280,6 @@ def itershuffle(iterable, bufsize=1000):
|
|||
raise StopIteration
|
||||
|
||||
|
||||
_PRINT_ENV = False
|
||||
def set_env_log(value):
|
||||
global _PRINT_ENV
|
||||
_PRINT_ENV = value
|
||||
|
||||
|
||||
def env_opt(name, default=None):
|
||||
if type(default) is float:
|
||||
type_convert = float
|
||||
|
@ -305,17 +305,20 @@ def read_regex(path):
|
|||
path = ensure_path(path)
|
||||
with path.open() as file_:
|
||||
entries = file_.read().split('\n')
|
||||
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
|
||||
expression = '|'.join(['^' + re.escape(piece)
|
||||
for piece in entries if piece.strip()])
|
||||
return re.compile(expression)
|
||||
|
||||
|
||||
def compile_prefix_regex(entries):
|
||||
if '(' in entries:
|
||||
# Handle deprecated data
|
||||
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
|
||||
expression = '|'.join(['^' + re.escape(piece)
|
||||
for piece in entries if piece.strip()])
|
||||
return re.compile(expression)
|
||||
else:
|
||||
expression = '|'.join(['^' + piece for piece in entries if piece.strip()])
|
||||
expression = '|'.join(['^' + piece
|
||||
for piece in entries if piece.strip()])
|
||||
return re.compile(expression)
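Both branches build a single alternation anchored at the start of the string; the deprecated branch (and `read_regex` above) additionally escape each entry. A small sketch of the resulting pattern with a toy prefix list (not the real language data):

import re

entries = ['(', '[', '"', "'"]
expression = '|'.join(['^' + re.escape(piece)
                       for piece in entries if piece.strip()])
prefix_re = re.compile(expression)
assert prefix_re.match('"hello').group(0) == '"'   # the tokenizer splits this off as a prefix
assert prefix_re.match('hello') is None            # no prefix to split off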
|
||||
|
||||
|
||||
|
@ -359,16 +362,15 @@ def update_exc(base_exceptions, *addition_dicts):
|
|||
exc = dict(base_exceptions)
|
||||
for additions in addition_dicts:
|
||||
for orth, token_attrs in additions.items():
|
||||
if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs):
|
||||
msg = "Invalid value for ORTH in exception: key='%s', orths='%s'"
|
||||
if not all(isinstance(attr[ORTH], unicode_)
|
||||
for attr in token_attrs):
|
||||
msg = "Invalid ORTH value in exception: key='%s', orths='%s'"
|
||||
raise ValueError(msg % (orth, token_attrs))
|
||||
described_orth = ''.join(attr[ORTH] for attr in token_attrs)
|
||||
if orth != described_orth:
|
||||
raise ValueError("Invalid tokenizer exception: ORTH values "
|
||||
"combined don't match original string. "
|
||||
"key='%s', orths='%s'" % (orth, described_orth))
|
||||
# overlap = set(exc.keys()).intersection(set(additions))
|
||||
# assert not overlap, overlap
|
||||
msg = ("Invalid tokenizer exception: ORTH values combined "
|
||||
"don't match original string. key='%s', orths='%s'")
|
||||
raise ValueError(msg % (orth, described_orth))
|
||||
exc.update(additions)
|
||||
exc = expand_exc(exc, "'", "’")
|
||||
return exc
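The checks require every ORTH value to be a unicode string and the ORTH pieces to concatenate back to the exception key. A sketch of a well-formed addition (toy exception, not real language data; assumes `update_exc` and `ORTH` are imported from `spacy.util` and `spacy.symbols`):

from spacy.symbols import ORTH
from spacy.util import update_exc

base = {}
additions = {u'dont': [{ORTH: u'do'}, {ORTH: u'nt'}]}    # u'do' + u'nt' == u'dont'
exc = update_exc(base, additions)
# A mismatch such as [{ORTH: u'do'}, {ORTH: u"n't"}] under the key u'dont'
# would raise the "ORTH values combined don't match" ValueError above.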
|
||||
|
@ -401,17 +403,15 @@ def normalize_slice(length, start, stop, step=None):
|
|||
raise ValueError("Stepped slices not supported in Span objects."
|
||||
"Try: list(tokens)[start:stop:step] instead.")
|
||||
if start is None:
|
||||
start = 0
|
||||
start = 0
|
||||
elif start < 0:
|
||||
start += length
|
||||
start += length
|
||||
start = min(length, max(0, start))
|
||||
|
||||
if stop is None:
|
||||
stop = length
|
||||
stop = length
|
||||
elif stop < 0:
|
||||
stop += length
|
||||
stop += length
|
||||
stop = min(length, max(start, stop))
|
||||
|
||||
assert 0 <= start <= stop <= length
|
||||
return start, stop
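A worked example of the clamping behaviour for a sequence of length 5:

assert normalize_slice(5, None, None) == (0, 5)   # full slice
assert normalize_slice(5, -2, None) == (3, 5)     # negative start wraps to 3
assert normalize_slice(5, 2, 100) == (2, 5)       # stop is clamped to the length
assert normalize_slice(5, 4, 2) == (4, 4)         # stop never precedes start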
|
||||
|
||||
|
@ -428,7 +428,7 @@ def compounding(start, stop, compound):
|
|||
>>> assert next(sizes) == 1.5 * 1.5
|
||||
"""
|
||||
def clip(value):
|
||||
return max(value, stop) if (start>stop) else min(value, stop)
|
||||
return max(value, stop) if (start > stop) else min(value, stop)
|
||||
curr = float(start)
|
||||
while True:
|
||||
yield clip(curr)
|
||||
|
@ -438,7 +438,7 @@ def compounding(start, stop, compound):
|
|||
def decaying(start, stop, decay):
|
||||
"""Yield an infinite series of linearly decaying values."""
|
||||
def clip(value):
|
||||
return max(value, stop) if (start>stop) else min(value, stop)
|
||||
return max(value, stop) if (start > stop) else min(value, stop)
|
||||
nr_upd = 1.
|
||||
while True:
|
||||
yield clip(start * 1./(1. + decay * nr_upd))
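With `nr_upd` counting 1, 2, 3, ... (the increment falls outside this hunk), the schedule is `start / (1 + decay * nr_upd)`, clipped so it never passes `stop`. A short worked example:

lr = decaying(1.0, 0.1, 0.5)
assert abs(next(lr) - 1.0 / 1.5) < 1e-12   # nr_upd = 1
assert abs(next(lr) - 1.0 / 2.0) < 1e-12   # nr_upd = 2
assert abs(next(lr) - 1.0 / 2.5) < 1e-12   # nr_upd = 3
# ... and because start > stop, clip() keeps every value >= 0.1.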
|
||||
|
@ -530,17 +530,19 @@ def print_markdown(data, title=None):
|
|||
|
||||
if isinstance(data, dict):
|
||||
data = list(data.items())
|
||||
markdown = ["* **{}:** {}".format(l, unicode_(v)) for l, v in data if not excl_value(v)]
|
||||
markdown = ["* **{}:** {}".format(l, unicode_(v))
|
||||
for l, v in data if not excl_value(v)]
|
||||
if title:
|
||||
print("\n## {}".format(title))
|
||||
print('\n{}\n'.format('\n'.join(markdown)))
|
||||
|
||||
|
||||
def prints(*texts, **kwargs):
|
||||
"""Print formatted message (manual ANSI escape sequences to avoid dependency)
|
||||
"""Print formatted message (manual ANSI escape sequences to avoid
|
||||
dependency)
|
||||
|
||||
*texts (unicode): Texts to print. Each argument is rendered as paragraph.
|
||||
**kwargs: 'title' becomes coloured headline. 'exits'=True performs sys exit.
|
||||
**kwargs: 'title' becomes coloured headline. exits=True performs sys exit.
|
||||
"""
|
||||
exits = kwargs.get('exits', None)
|
||||
title = kwargs.get('title', None)
|
||||
|
@ -570,7 +572,8 @@ def _wrap(text, wrap_max=80, indent=4):
|
|||
|
||||
def minify_html(html):
|
||||
"""Perform a template-specific, rudimentary HTML minification for displaCy.
|
||||
Disclaimer: NOT a general-purpose solution, only removes indentation/newlines.
|
||||
Disclaimer: NOT a general-purpose solution, only removes indentation and
|
||||
newlines.
|
||||
|
||||
html (unicode): Markup to minify.
|
||||
RETURNS (unicode): "Minified" HTML.
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
from libc.stdint cimport int32_t, uint64_t
|
||||
|
||||
import numpy
|
||||
from collections import OrderedDict
|
||||
import msgpack
|
||||
|
@ -9,23 +10,20 @@ cimport numpy as np
|
|||
from thinc.neural.util import get_array_module
|
||||
from thinc.neural._classes.model import Model
|
||||
|
||||
from .typedefs cimport attr_t
|
||||
from .strings cimport StringStore
|
||||
from . import util
|
||||
from .compat import basestring_, path2str
|
||||
from . import util
|
||||
|
||||
|
||||
cdef class Vectors:
|
||||
'''Store, save and load word vectors.
|
||||
"""Store, save and load word vectors.
|
||||
|
||||
Vectors data is kept in the vectors.data attribute, which should be an
|
||||
instance of numpy.ndarray (for CPU vectors)
|
||||
or cupy.ndarray (for GPU vectors).
|
||||
|
||||
vectors.key2row is a dictionary mapping word hashes to rows
|
||||
in the vectors.data table. The array `vectors.keys` keeps
|
||||
the keys in order, such that keys[vectors.key2row[key]] == key.
|
||||
'''
|
||||
instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
|
||||
(for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
|
||||
rows in the vectors.data table. The array `vectors.keys` keeps the keys in
|
||||
order, such that `keys[vectors.key2row[key]] == key`.
|
||||
"""
|
||||
cdef public object data
|
||||
cdef readonly StringStore strings
|
||||
cdef public object key2row
|
||||
|
@ -33,6 +31,16 @@ cdef class Vectors:
|
|||
cdef public int i
|
||||
|
||||
def __init__(self, strings, width=0, data=None):
|
||||
"""Create a new vector store. To keep the vector table empty, pass
|
||||
`width=0`. You can also create the vector table and add vectors one by
|
||||
one, or set the vector values directly on initialisation.
|
||||
|
||||
strings (StringStore or list): List of strings or StringStore that maps
|
||||
strings to hash values, and vice versa.
|
||||
width (int): Number of dimensions.
|
||||
data (numpy.ndarray): The vector data.
|
||||
RETURNS (Vectors): The newly created object.
|
||||
"""
|
||||
if isinstance(strings, StringStore):
|
||||
self.strings = strings
|
||||
else:
|
||||
|
@ -55,11 +63,13 @@ cdef class Vectors:
|
|||
return (Vectors, (self.strings, self.data))
|
||||
|
||||
def __getitem__(self, key):
|
||||
'''Get a vector by key. If key is a string, it is hashed
|
||||
to an integer ID using the vectors.strings table.
|
||||
"""Get a vector by key. If key is a string, it is hashed to an integer
|
||||
ID using the vectors.strings table. If the integer key is not found in
|
||||
the table, a KeyError is raised.
|
||||
|
||||
If the integer key is not found in the table, a KeyError is raised.
|
||||
'''
|
||||
key (unicode / int): The key to get the vector for.
|
||||
RETURNS (numpy.ndarray): The vector for the key.
|
||||
"""
|
||||
if isinstance(key, basestring):
|
||||
key = self.strings[key]
|
||||
i = self.key2row[key]
|
||||
|
@ -69,30 +79,47 @@ cdef class Vectors:
|
|||
return self.data[i]
|
||||
|
||||
def __setitem__(self, key, vector):
|
||||
'''Set a vector for the given key. If key is a string, it is hashed
|
||||
"""Set a vector for the given key. If key is a string, it is hashed
|
||||
to an integer ID using the vectors.strings table.
|
||||
'''
|
||||
|
||||
key (unicode / int): The key to set the vector for.
|
||||
vector (numpy.ndarray): The vector to set.
|
||||
"""
|
||||
if isinstance(key, basestring):
|
||||
key = self.strings.add(key)
|
||||
i = self.key2row[key]
|
||||
self.data[i] = vector
|
||||
|
||||
def __iter__(self):
|
||||
'''Yield vectors from the table.'''
|
||||
"""Yield vectors from the table.
|
||||
|
||||
YIELDS (numpy.ndarray): A vector.
|
||||
"""
|
||||
yield from self.data
|
||||
|
||||
def __len__(self):
|
||||
'''Return the number of vectors that have been assigned.'''
|
||||
"""Return the number of vectors that have been assigned.
|
||||
|
||||
RETURNS (int): The number of vectors in the data.
|
||||
"""
|
||||
return self.i
|
||||
|
||||
def __contains__(self, key):
|
||||
'''Check whether a key has a vector entry in the table.'''
|
||||
"""Check whether a key has a vector entry in the table.
|
||||
|
||||
key (unicode / int): The key to check.
|
||||
RETURNS (bool): Whether the key has a vector entry.
|
||||
"""
|
||||
if isinstance(key, basestring_):
|
||||
key = self.strings[key]
|
||||
return key in self.key2row
|
||||
|
||||
def add(self, key, vector=None):
|
||||
'''Add a key to the table, optionally setting a vector value as well.'''
|
||||
"""Add a key to the table, optionally setting a vector value as well.
|
||||
|
||||
key (unicode / int): The key to add.
|
||||
vector (numpy.ndarray): An optional vector to add.
|
||||
"""
|
||||
if isinstance(key, basestring_):
|
||||
key = self.strings.add(key)
|
||||
if key not in self.key2row:
|
||||
|
@ -110,24 +137,36 @@ cdef class Vectors:
|
|||
return i
|
||||
|
||||
def items(self):
|
||||
'''Iterate over (string key, vector) pairs, in order.'''
|
||||
"""Iterate over `(string key, vector)` pairs, in order.
|
||||
|
||||
YIELDS (tuple): A key/vector pair.
|
||||
"""
|
||||
for i, key in enumerate(self.keys):
|
||||
string = self.strings[key]
|
||||
yield string, self.data[i]
|
||||
|
||||
@property
|
||||
def shape(self):
|
||||
"""Get `(rows, dims)` tuples of number of rows and number of dimensions
|
||||
in the vector table.
|
||||
|
||||
RETURNS (tuple): A `(rows, dims)` pair.
|
||||
"""
|
||||
return self.data.shape
|
||||
|
||||
def most_similar(self, key):
|
||||
# TODO: implement
|
||||
raise NotImplementedError
|
||||
|
||||
def from_glove(self, path):
|
||||
'''Load GloVe vectors from a directory. Assumes binary format,
|
||||
"""Load GloVe vectors from a directory. Assumes binary format,
|
||||
that the vocab is in a vocab.txt, and that vectors are named
|
||||
vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
|
||||
vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
|
||||
By default GloVe outputs 64-bit vectors.'''
|
||||
By default GloVe outputs 64-bit vectors.
|
||||
|
||||
path (unicode / Path): The path to load the GloVe vectors from.
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
for name in path.iterdir():
|
||||
if name.parts[-1].startswith('vectors'):
|
||||
|
@ -150,9 +189,15 @@ cdef class Vectors:
|
|||
self.data
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode / Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Either a string or a Path-like object.
|
||||
"""
|
||||
xp = get_array_module(self.data)
|
||||
if xp is numpy:
|
||||
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
|
||||
save_array = lambda arr, file_: xp.save(file_, arr,
|
||||
allow_pickle=False)
|
||||
else:
|
||||
save_array = lambda arr, file_: xp.save(file_, arr)
|
||||
serializers = OrderedDict((
|
||||
|
@ -162,6 +207,12 @@ cdef class Vectors:
|
|||
return util.to_disk(path, serializers, exclude)
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode / Path): Directory path, string or Path-like object.
|
||||
RETURNS (Vectors): The modified object.
|
||||
"""
|
||||
def load_keys(path):
|
||||
if path.exists():
|
||||
self.keys = numpy.load(path2str(path))
|
||||
|
@ -182,6 +233,11 @@ cdef class Vectors:
|
|||
return self
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `Vectors` object.
|
||||
"""
|
||||
def serialize_weights():
|
||||
if hasattr(self.data, 'to_bytes'):
|
||||
return self.data.to_bytes()
|
||||
|
@ -194,6 +250,12 @@ cdef class Vectors:
|
|||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def from_bytes(self, data, **exclude):
|
||||
"""Load state from a binary string.
|
||||
|
||||
data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (Vectors): The `Vectors` object.
|
||||
"""
|
||||
def deserialize_weights(b):
|
||||
if hasattr(self.data, 'from_bytes'):
|
||||
self.data.from_bytes()
|
||||
|
|
119 spacy/vocab.pyx
|
@ -1,32 +1,24 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import ujson
|
||||
import re
|
||||
import numpy
|
||||
import dill
|
||||
|
||||
from libc.string cimport memset, memcpy
|
||||
from libc.stdint cimport int32_t
|
||||
from libc.math cimport sqrt
|
||||
from cymem.cymem cimport Address
|
||||
from collections import OrderedDict
|
||||
from .lexeme cimport EMPTY_LEXEME
|
||||
from .lexeme cimport Lexeme
|
||||
from .strings cimport hash_string
|
||||
from .typedefs cimport attr_t
|
||||
from .tokens.token cimport Token
|
||||
from .attrs cimport PROB, LANG
|
||||
from .attrs cimport PROB, LANG, ORTH, TAG
|
||||
from .structs cimport SerializedLexemeC
|
||||
|
||||
from .compat import copy_reg, pickle, basestring_
|
||||
from .compat import copy_reg, basestring_
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .attrs import intify_attrs
|
||||
from .vectors import Vectors
|
||||
from . import util
|
||||
from . import attrs
|
||||
from . import symbols
|
||||
from ._ml import link_vectors_to_models
|
||||
from . import util
|
||||
|
||||
|
||||
cdef class Vocab:
|
||||
|
@ -35,23 +27,22 @@ cdef class Vocab:
|
|||
C-data that is shared between `Doc` objects.
|
||||
"""
|
||||
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
||||
strings=tuple(), **deprecated_kwargs):
|
||||
strings=tuple(), **deprecated_kwargs):
|
||||
"""Create the vocabulary.
|
||||
|
||||
lex_attr_getters (dict): A dictionary mapping attribute IDs to functions
|
||||
to compute them. Defaults to `None`.
|
||||
tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained
|
||||
lex_attr_getters (dict): A dictionary mapping attribute IDs to
|
||||
functions to compute them. Defaults to `None`.
|
||||
tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained
|
||||
parts-of-speech, and optionally morphological attributes.
|
||||
lemmatizer (object): A lemmatizer. Defaults to `None`.
|
||||
strings (StringStore): StringStore that maps strings to integers, and
|
||||
vice versa.
|
||||
RETURNS (Vocab): The newly constructed vocab object.
|
||||
RETURNS (Vocab): The newly constructed object.
|
||||
"""
|
||||
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
||||
tag_map = tag_map if tag_map is not None else {}
|
||||
if lemmatizer in (None, True, False):
|
||||
lemmatizer = Lemmatizer({}, {}, {})
|
||||
|
||||
self.mem = Pool()
|
||||
self._by_hash = PreshMap()
|
||||
self._by_orth = PreshMap()
|
||||
|
@ -83,19 +74,20 @@ cdef class Vocab:
|
|||
|
||||
The flag_getter function will be called over the words currently in the
|
||||
vocab, and then applied to new words as they occur. You'll then be able
|
||||
to access the flag value on each token, using token.check_flag(flag_id).
|
||||
to access the flag value on each token using token.check_flag(flag_id).
|
||||
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
|
||||
`Token.check_flag`.
|
||||
|
||||
flag_getter (callable): A function `f(unicode) -> bool`, to get the flag
|
||||
value.
|
||||
flag_getter (callable): A function `f(unicode) -> bool`, to get the
|
||||
flag value.
|
||||
flag_id (int): An integer between 1 and 63 (inclusive), specifying
|
||||
the bit at which the flag will be stored. If -1, the lowest
|
||||
available bit will be chosen.
|
||||
RETURNS (int): The integer ID by which the flag value can be checked.
|
||||
|
||||
EXAMPLE:
|
||||
>>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'dislaCy'])
|
||||
>>> my_product_getter = lambda text: text in ['spaCy', 'displaCy']
|
||||
>>> MY_PRODUCT = nlp.vocab.add_flag(my_product_getter)
|
||||
>>> doc = nlp(u'I like spaCy')
|
||||
>>> assert doc[2].check_flag(MY_PRODUCT) == True
|
||||
"""
|
||||
|
@ -106,9 +98,10 @@ cdef class Vocab:
|
|||
break
|
||||
else:
|
||||
raise ValueError(
|
||||
"Cannot find empty bit for new lexical flag. All bits between "
|
||||
"0 and 63 are occupied. You can replace one by specifying the "
|
||||
"flag_id explicitly, e.g. nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA")
|
||||
"Cannot find empty bit for new lexical flag. All bits "
|
||||
"between 0 and 63 are occupied. You can replace one by "
|
||||
"specifying the flag_id explicitly, e.g. "
|
||||
"`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
|
||||
elif flag_id >= 64 or flag_id < 1:
|
||||
raise ValueError(
|
||||
"Invalid value for flag_id: %d. Flag IDs must be between "
|
||||
|
@ -119,9 +112,9 @@ cdef class Vocab:
|
|||
return flag_id
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
||||
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
|
||||
`Lexeme` if necessary using memory acquired from the given pool. If the
|
||||
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||
"""
|
||||
if string == u'':
|
||||
return &EMPTY_LEXEME
|
||||
|
@ -138,9 +131,9 @@ cdef class Vocab:
|
|||
return self._new_lexeme(mem, string)
|
||||
|
||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
|
||||
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
|
||||
`Lexeme` if necessary using memory acquired from the given pool. If the
|
||||
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||
"""
|
||||
if orth == 0:
|
||||
return &EMPTY_LEXEME
|
||||
|
@ -202,8 +195,8 @@ cdef class Vocab:
|
|||
for orth, addr in self._by_orth.items():
|
||||
yield Lexeme(self, orth)
|
||||
|
||||
def __getitem__(self, id_or_string):
|
||||
"""Retrieve a lexeme, given an int ID or a unicode string. If a
|
||||
def __getitem__(self, id_or_string):
|
||||
"""Retrieve a lexeme, given an int ID or a unicode string. If a
|
||||
previously unseen unicode string is given, a new lexeme is created and
|
||||
stored.
|
||||
|
||||
|
@ -228,13 +221,14 @@ cdef class Vocab:
|
|||
cdef int i
|
||||
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
||||
for i, props in enumerate(substrings):
|
||||
props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True)
|
||||
props = intify_attrs(props, strings_map=self.strings,
|
||||
_do_deprecated=True)
|
||||
token = &tokens[i]
|
||||
# Set the special tokens up to have arbitrary attributes
|
||||
lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
|
||||
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
|
||||
token.lex = lex
|
||||
if attrs.TAG in props:
|
||||
self.morphology.assign_tag(token, props[attrs.TAG])
|
||||
if TAG in props:
|
||||
self.morphology.assign_tag(token, props[TAG])
|
||||
for attr_id, value in props.items():
|
||||
Token.set_struct_attr(token, attr_id, value)
|
||||
Lexeme.set_struct_attr(lex, attr_id, value)
|
||||
|
@ -253,16 +247,13 @@ cdef class Vocab:
|
|||
self.vectors = Vectors(self.strings, width=new_dim)
|
||||
|
||||
def get_vector(self, orth):
|
||||
"""Retrieve a vector for a word in the vocabulary.
|
||||
"""Retrieve a vector for a word in the vocabulary. Words can be looked
|
||||
up by string or int ID. If no vectors data is loaded, ValueError is
|
||||
raised.
|
||||
|
||||
Words can be looked up by string or int ID.
|
||||
|
||||
RETURNS:
|
||||
A word vector. Size and shape determined by the
|
||||
vocab.vectors instance. Usually, a numpy ndarray
|
||||
of shape (300,) and dtype float32.
|
||||
|
||||
RAISES: If no vectors data is loaded, ValueError is raised.
|
||||
RETURNS (numpy.ndarray): A word vector. Size
|
||||
and shape determined by the `vocab.vectors` instance. Usually, a
|
||||
numpy ndarray of shape (300,) and dtype float32.
|
||||
"""
|
||||
if isinstance(orth, basestring_):
|
||||
orth = self.strings.add(orth)
|
||||
|
@ -272,21 +263,16 @@ cdef class Vocab:
|
|||
return numpy.zeros((self.vectors_length,), dtype='f')
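A usage sketch of the vector accessors defined here and just below (`set_vector`, `has_vector`), assuming a hypothetical `vocab` whose vectors are 300-dimensional:

import numpy

vector = vocab.get_vector(u'apple')        # look up by string; int hashes work too
assert vector.shape == (300,)              # all zeros if 'apple' has no entry
vocab.set_vector(u'cherry', numpy.ones((300,), dtype='f'))
assert vocab.has_vector(u'cherry')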
|
||||
|
||||
def set_vector(self, orth, vector):
|
||||
"""Set a vector for a word in the vocabulary.
|
||||
|
||||
Words can be referenced by string or int ID.
|
||||
|
||||
RETURNS:
|
||||
None
|
||||
"""Set a vector for a word in the vocabulary. Words can be referenced
|
||||
by string or int ID.
|
||||
"""
|
||||
if not isinstance(orth, basestring_):
|
||||
orth = self.strings[orth]
|
||||
self.vectors.add(orth, vector=vector)
|
||||
|
||||
def has_vector(self, orth):
|
||||
"""Check whether a word has a vector. Returns False if no
|
||||
vectors have been loaded. Words can be looked up by string
|
||||
or int ID."""
|
||||
"""Check whether a word has a vector. Returns False if no vectors have
|
||||
been loaded. Words can be looked up by string or int ID."""
|
||||
if isinstance(orth, basestring_):
|
||||
orth = self.strings.add(orth)
|
||||
return orth in self.vectors
|
||||
|
@ -295,7 +281,7 @@ cdef class Vocab:
|
|||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
if not path.exists():
|
||||
|
@ -420,16 +406,13 @@ def pickle_vocab(vocab):
|
|||
length = vocab.length
|
||||
data_dir = vocab.data_dir
|
||||
lex_attr_getters = dill.dumps(vocab.lex_attr_getters)
|
||||
|
||||
lexemes_data = vocab.lexemes_to_bytes()
|
||||
|
||||
return (unpickle_vocab,
|
||||
(sstore, morph, data_dir, lex_attr_getters,
|
||||
lexemes_data, length))
|
||||
(sstore, morph, data_dir, lex_attr_getters, lexemes_data, length))
|
||||
|
||||
|
||||
def unpickle_vocab(sstore, morphology, data_dir,
|
||||
lex_attr_getters, bytes lexemes_data, int length):
|
||||
lex_attr_getters, bytes lexemes_data, int length):
|
||||
cdef Vocab vocab = Vocab()
|
||||
vocab.length = length
|
||||
vocab.strings = sstore
|
||||
|
@ -449,12 +432,10 @@ class LookupError(Exception):
|
|||
@classmethod
|
||||
def mismatched_strings(cls, id_, id_string, original_string):
|
||||
return cls(
|
||||
"Error fetching a Lexeme from the Vocab. When looking up a string, "
|
||||
"the lexeme returned had an orth ID that did not match the query string. "
|
||||
"This means that the cached lexeme structs are mismatched to the "
|
||||
"string encoding table. The mismatched:\n"
|
||||
"Query string: {query}\n"
|
||||
"Orth cached: {orth_str}\n"
|
||||
"ID of orth: {orth_id}".format(
|
||||
query=repr(original_string), orth_str=repr(id_string), orth_id=id_)
|
||||
)
|
||||
"Error fetching a Lexeme from the Vocab. When looking up a "
|
||||
"string, the lexeme returned had an orth ID that did not match "
|
||||
"the query string. This means that the cached lexeme structs are "
|
||||
"mismatched to the string encoding table. The mismatched:\n"
|
||||
"Query string: {}\n"
|
||||
"Orth cached: {}\n"
|
||||
"Orth ID: {}".format(repr(original_string), repr(id_string), id_))
|
||||
|
|
|
@ -134,11 +134,12 @@ p
|
|||
p
|
||||
| Convert files into spaCy's #[+a("/api/annotation#json-input") JSON format]
|
||||
| for use with the #[code train] command and other experiment management
|
||||
| functions. The right converter is chosen based on the file extension of
|
||||
| the input file. Currently only supports #[code .conllu].
|
||||
| functions. The converter can be specified on the command line, or
|
||||
| chosen based on the file extension of the input file.
|
||||
|
||||
+code(false, "bash", "$", false, false, true).
|
||||
spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
|
||||
spacy convert [input_file] [output_dir] [--converter] [--n-sents]
|
||||
[--morphology]
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
|
@ -151,6 +152,11 @@ p
|
|||
+cell positional
|
||||
+cell Output directory for converted JSON file.
|
||||
|
||||
+row
|
||||
+cell #[code converter], #[code -c]
|
||||
+cell option
|
||||
+cell #[+tag-new(2)] Name of converter to use (see below).
|
||||
|
||||
+row
|
||||
+cell #[code --n-sents], #[code -n]
|
||||
+cell option
|
||||
|
@ -166,6 +172,25 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
p The following converters are available:
|
||||
|
||||
+table(["ID", "Description"])
|
||||
+row
|
||||
+cell #[code auto]
|
||||
+cell Automatically pick converter based on file extension (default).
|
||||
|
||||
+row
|
||||
+cell #[code conllu], #[code conll]
|
||||
+cell Universal Dependencies #[code .conllu] or #[code .conll] format.
|
||||
|
||||
+row
|
||||
+cell #[code ner]
|
||||
+cell Tab-based named entity recognition format.
|
||||
|
||||
+row
|
||||
+cell #[code iob]
|
||||
+cell IOB named entity recognition format.
|
||||
|
||||
+h(3, "train") Train
|
||||
|
||||
p
|
||||
|
|
|
@ -332,6 +332,26 @@ p
|
|||
+cell dict
|
||||
+cell A dictionary mapping attributes to integer counts.
|
||||
|
||||
+h(2, "get_lca_matrix") Doc.get_lca_matrix
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Calculates the lowest common ancestor matrix for a given #[code Doc].
|
||||
| Returns LCA matrix containing the integer index of the ancestor, or
|
||||
| #[code -1] if no common ancestor is found, e.g. if span excludes a
|
||||
| necessary ancestor.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u"This is a test")
|
||||
matrix = doc.get_lca_matrix()
|
||||
# array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell The lowest common ancestor matrix of the #[code Doc].
|
||||
|
||||
+h(2, "to_array") Doc.to_array
|
||||
+tag method
|
||||
|
||||
|
@ -764,3 +784,10 @@ p
|
|||
+cell
|
||||
| A dictionary that allows customisation of properties of
|
||||
| #[code Span] children.
|
||||
|
||||
+row
|
||||
+cell #[code _]
|
||||
+cell #[code Underscore]
|
||||
+cell
|
||||
| User space for adding custom
|
||||
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].
|
||||
|
|
|
@ -157,27 +157,61 @@ p The L2 norm of the lexeme's vector representation.
|
|||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell
|
||||
+cell The lexeme's vocabulary.
|
||||
|
||||
+row
|
||||
+cell #[code text]
|
||||
+cell unicode
|
||||
+cell Verbatim text content.
|
||||
|
||||
+row
|
||||
+cell #[code orth]
|
||||
+cell int
|
||||
+cell ID of the verbatim text content.
|
||||
|
||||
+row
|
||||
+cell #[code orth_]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Verbatim text content (identical to #[code Lexeme.text]). Exists
|
||||
| mostly for consistency with the other attributes.
|
||||
|
||||
+row
|
||||
+cell #[code lex_id]
|
||||
+cell int
|
||||
+cell ID of the lexeme's lexical type.
|
||||
|
||||
+row
|
||||
+cell #[code rank]
|
||||
+cell int
|
||||
+cell
|
||||
| Sequential ID of the lexeme's lexical type, used to index into
|
||||
| tables, e.g. for word vectors.
|
||||
|
||||
+row
|
||||
+cell #[code flags]
|
||||
+cell int
|
||||
+cell Container of the lexeme's binary flags.
|
||||
|
||||
+row
|
||||
+cell #[code norm]
|
||||
+cell int
|
||||
+cell The lexeme's norm, i.e. a normalised form of the lexeme text.
|
||||
|
||||
+row
|
||||
+cell #[code norm_]
|
||||
+cell unicode
|
||||
+cell The lexeme's norm, i.e. a normalised form of the lexeme text.
|
||||
|
||||
+row
|
||||
+cell #[code lower]
|
||||
+cell int
|
||||
+cell Lower-case form of the word.
|
||||
+cell Lowercase form of the word.
|
||||
|
||||
+row
|
||||
+cell #[code lower_]
|
||||
+cell unicode
|
||||
+cell Lower-case form of the word.
|
||||
+cell Lowercase form of the word.
|
||||
|
||||
+row
|
||||
+cell #[code shape]
|
||||
|
@ -192,22 +226,30 @@ p The L2 norm of the lexeme's vector representation.
|
|||
+row
|
||||
+cell #[code prefix]
|
||||
+cell int
|
||||
+cell Length-N substring from the start of the word. Defaults to #[code N=1].
|
||||
+cell
|
||||
| Length-N substring from the start of the word. Defaults to
|
||||
| #[code N=1].
|
||||
|
||||
+row
|
||||
+cell #[code prefix_]
|
||||
+cell unicode
|
||||
+cell Length-N substring from the start of the word. Defaults to #[code N=1].
|
||||
+cell
|
||||
| Length-N substring from the start of the word. Defaults to
|
||||
| #[code N=1].
|
||||
|
||||
+row
|
||||
+cell #[code suffix]
|
||||
+cell int
|
||||
+cell Length-N substring from the end of the word. Defaults to #[code N=3].
|
||||
+cell
|
||||
| Length-N substring from the end of the word. Defaults to
|
||||
| #[code N=3].
|
||||
|
||||
+row
|
||||
+cell #[code suffix_]
|
||||
+cell unicode
|
||||
+cell Length-N substring from the start of the word. Defaults to #[code N=3].
|
||||
+cell
|
||||
| Length-N substring from the end of the word. Defaults to
|
||||
| #[code N=3].
|
||||
|
||||
+row
|
||||
+cell #[code is_alpha]
|
||||
|
@ -237,6 +279,13 @@ p The L2 norm of the lexeme's vector representation.
|
|||
| Is the lexeme in lowercase? Equivalent to
|
||||
| #[code lexeme.text.islower()].
|
||||
|
||||
+row
|
||||
+cell #[code is_upper]
|
||||
+cell bool
|
||||
+cell
|
||||
| Is the lexeme in uppercase? Equivalent to
|
||||
| #[code lexeme.text.isupper()].
|
||||
|
||||
+row
|
||||
+cell #[code is_title]
|
||||
+cell bool
|
||||
|
@ -249,6 +298,16 @@ p The L2 norm of the lexeme's vector representation.
|
|||
+cell bool
|
||||
+cell Is the lexeme punctuation?
|
||||
|
||||
+row
|
||||
+cell #[code is_left_punct]
|
||||
+cell bool
|
||||
+cell Is the lexeme a left punctuation mark, e.g. #[code (]?
|
||||
|
||||
+row
|
||||
+cell #[code is_right_punct]
|
||||
+cell bool
|
||||
+cell Is the lexeme a right punctuation mark, e.g. #[code )]?
|
||||
|
||||
+row
|
||||
+cell #[code is_space]
|
||||
+cell bool
|
||||
|
@ -256,6 +315,16 @@ p The L2 norm of the lexeme's vector representation.
|
|||
| Does the lexeme consist of whitespace characters? Equivalent to
|
||||
| #[code lexeme.text.isspace()].
|
||||
|
||||
+row
|
||||
+cell #[code is_bracket]
|
||||
+cell bool
|
||||
+cell Is the lexeme a bracket?
|
||||
|
||||
+row
|
||||
+cell #[code is_quote]
|
||||
+cell bool
|
||||
+cell Is the lexeme a quotation mark?
|
||||
|
||||
+row
|
||||
+cell #[code like_url]
|
||||
+cell bool
|
||||
|
@ -285,6 +354,7 @@ p The L2 norm of the lexeme's vector representation.
|
|||
+cell #[code lang]
|
||||
+cell int
|
||||
+cell Language of the parent vocabulary.
|
||||
|
||||
+row
|
||||
+cell #[code lang_]
|
||||
+cell unicode
|
||||
|
@ -293,9 +363,16 @@ p The L2 norm of the lexeme's vector representation.
|
|||
+row
|
||||
+cell #[code prob]
|
||||
+cell float
|
||||
+cell Smoothed log probability estimate of lexeme's type.
|
||||
+cell Smoothed log probability estimate of the lexeme's type.
|
||||
|
||||
+row
|
||||
+cell #[code cluster]
|
||||
+cell int
|
||||
+cell Brown cluster ID.
|
||||
|
||||
+row
|
||||
+cell #[code sentiment]
|
||||
+cell float
|
||||
+cell A scalar value indicating the positivity or negativity of the lexeme.
|
||||
+cell
|
||||
| A scalar value indicating the positivity or negativity of the
|
||||
| lexeme.
|
||||
|
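p
| For example, assuming a loaded English pipeline #[code nlp], the lexical
| attributes above can be inspected on a #[code Lexeme] pulled from the
| vocab. This is only a sketch; the exact values follow from the defaults
| #[code N=1] for the prefix and #[code N=3] for the suffix.

+aside-code("Example").
apple = nlp.vocab[u'apple']              # assumes an English pipeline loaded as nlp
assert apple.orth_ == u'apple'
assert apple.prefix_ == u'a'             # length-1 substring from the start
assert apple.suffix_ == u'ple'           # length-3 substring from the end
assert apple.is_alpha and apple.is_lower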
|
|
@ -248,6 +248,28 @@ p
|
|||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "get_lca_matrix") Span.get_lca_matrix
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Calculates the lowest common ancestor matrix for a given #[code Span].
|
||||
| Returns an LCA matrix containing the integer index of the ancestor, or
|
||||
| #[code -1] if no common ancestor is found, e.g. if the span excludes a
|
||||
| necessary ancestor.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn')
|
||||
span = doc[1:4]
|
||||
matrix = span.get_lca_matrix()
|
||||
# array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell The lowest common ancestor matrix of the #[code Span].
|
||||
|
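p
| As a rough illustration, and assuming (as the example above suggests) that
| the returned indices are positions within the span rather than the
| document, the matrix can be read like this:

+aside-code("Example").
doc = nlp(u'I like New York in Autumn')
span = doc[1:4]                    # "like New York"
matrix = span.get_lca_matrix()
lca = int(matrix[1, 2])            # LCA of "New" and "York"
assert span[lca].text == u'York'   # "New" attaches to "York" in the parse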
||||
|
||||
+h(2, "to_array") Span.to_array
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
@ -347,7 +369,7 @@ p
|
|||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p Tokens that are to the left of the span, whose head is within the span.
|
||||
p Tokens that are to the left of the span, whose heads are within the span.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
|
@ -364,7 +386,7 @@ p Tokens that are to the left of the span, whose head is within the span.
|
|||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p Tokens that are to the right of the span, whose head is within the span.
|
||||
p Tokens that are to the right of the span, whose heads are within the span.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
|
@ -377,6 +399,42 @@ p Tokens that are to the right of the span, whose head is within the span.
|
|||
+cell #[code Token]
|
||||
+cell A right-child of a token of the span.
|
||||
|
||||
+h(2, "n_lefts") Span.n_lefts
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| The number of tokens that are to the left of the span, whose heads are
|
||||
| within the span.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
assert doc[3:7].n_lefts == 1
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of left-child tokens.
|
||||
|
||||
+h(2, "n_rights") Span.n_rights
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| The number of tokens that are to the right of the span, whose heads are
|
||||
| within the span.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
assert doc[2:4].n_rights == 1
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of right-child tokens.
|
||||
|
||||
+h(2, "subtree") Span.subtree
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
@ -495,6 +553,18 @@ p
|
|||
| The text content of the span with a trailing whitespace character
|
||||
| if the last token has one.
|
||||
|
||||
+row
|
||||
+cell #[code orth]
|
||||
+cell int
|
||||
+cell ID of the verbatim text content.
|
||||
|
||||
+row
|
||||
+cell #[code orth_]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Verbatim text content (identical to #[code Span.text]). Exists
|
||||
| mostly for consistency with the other attributes.
|
||||
|
||||
+row
|
||||
+cell #[code label]
|
||||
+cell int
|
||||
|
@ -519,3 +589,17 @@ p
|
|||
+cell #[code ent_id_]
|
||||
+cell unicode
|
||||
+cell The string ID of the named entity the span is an instance of.
|
||||
|
||||
+row
|
||||
+cell #[code sentiment]
|
||||
+cell float
|
||||
+cell
|
||||
| A scalar value indicating the positivity or negativity of the
|
||||
| span.
|
||||
|
||||
+row
|
||||
+cell #[code _]
|
||||
+cell #[code Underscore]
|
||||
+cell
|
||||
| User space for adding custom
|
||||
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].
|
||||
|
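p
| One way to populate the #[code _] space is via #[code Span.set_extension].
| A minimal sketch with a hypothetical #[code 'is_city'] extension:

+aside-code("Example").
from spacy.tokens import Span
Span.set_extension('is_city', default=False)  # hypothetical extension name
doc = nlp(u'I like New York in Autumn.')
new_york = doc[2:4]
new_york._.is_city = True
assert new_york._.is_city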
|
|
@ -302,6 +302,80 @@ p A sequence of the token's immediate syntactic children.
|
|||
+cell #[code Token]
|
||||
+cell A child token such that #[code child.head==self].
|
||||
|
||||
+h(2, "lefts") Token.lefts
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| The leftward immediate children of the word, in the syntactic dependency
|
||||
| parse.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
lefts = [t.text for t in doc[3].lefts]
|
||||
assert lefts == [u'New']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row("foot")
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A left-child of the token.
|
||||
|
||||
+h(2, "rights") Token.rights
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| The rightward immediate children of the word, in the syntactic
|
||||
| dependency parse.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
rights = [t.text for t in doc[3].rights]
|
||||
assert rights == [u'in']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row("foot")
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A right-child of the token.
|
||||
|
||||
+h(2, "n_lefts") Token.n_lefts
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| The number of leftward immediate children of the word, in the syntactic
|
||||
| dependency parse.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
assert doc[3].n_lefts == 1
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of left-child tokens.
|
||||
|
||||
+h(2, "n_rights") Token.n_rights
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| The number of rightward immediate children of the word, in the syntactic
|
||||
| dependency parse.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
assert doc[3].n_rights == 1
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of right-child tokens.
|
||||
|
||||
+h(2, "subtree") Token.subtree
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
@ -489,15 +563,35 @@ p The L2 norm of the token's vector representation.
|
|||
+cell unicode
|
||||
+cell Base form of the token, with no inflectional suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code norm]
|
||||
+cell int
|
||||
+cell
|
||||
| The token's norm, i.e. a normalised form of the token text.
|
||||
| Usually set in the language's
|
||||
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
|
||||
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
|
||||
|
||||
+row
|
||||
+cell #[code norm_]
|
||||
+cell unicode
|
||||
+cell
|
||||
| The token's norm, i.e. a normalised form of the token text.
|
||||
| Usually set in the language's
|
||||
| #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or
|
||||
| #[+a("/usage/adding-languages#norm-exceptions") norm exceptions].
|
||||
|
||||
+row
|
||||
+cell #[code lower]
|
||||
+cell int
|
||||
+cell Lower-case form of the token.
|
||||
+cell Lowercase form of the token.
|
||||
|
||||
+row
|
||||
+cell #[code lower_]
|
||||
+cell unicode
|
||||
+cell Lower-case form of the token.
|
||||
+cell
|
||||
| Lowercase form of the token text. Equivalent to
|
||||
| #[code Token.text.lower()].
|
||||
|
||||
+row
|
||||
+cell #[code shape]
|
||||
|
@ -537,7 +631,9 @@ p The L2 norm of the token's vector representation.
|
|||
+row
|
||||
+cell #[code suffix_]
|
||||
+cell unicode
|
||||
+cell Length-N substring from the end of the token. Defaults to #[code N=3].
|
||||
+cell
|
||||
| Length-N substring from the end of the token. Defaults to
|
||||
| #[code N=3].
|
||||
|
||||
+row
|
||||
+cell #[code is_alpha]
|
||||
|
@ -672,6 +768,7 @@ p The L2 norm of the token's vector representation.
|
|||
+cell #[code lang]
|
||||
+cell int
|
||||
+cell Language of the parent document's vocabulary.
|
||||
|
||||
+row
|
||||
+cell #[code lang_]
|
||||
+cell unicode
|
||||
|
@ -690,9 +787,30 @@ p The L2 norm of the token's vector representation.
|
|||
+row
|
||||
+cell #[code sentiment]
|
||||
+cell float
|
||||
+cell A scalar value indicating the positivity or negativity of the token.
|
||||
+cell
|
||||
| A scalar value indicating the positivity or negativity of the
|
||||
| token.
|
||||
|
||||
+row
|
||||
+cell #[code lex_id]
|
||||
+cell int
|
||||
+cell ID of the token's lexical type.
|
||||
+cell Sequential ID of the token's lexical type.
|
||||
|
||||
+row
|
||||
+cell #[code rank]
|
||||
+cell int
|
||||
+cell
|
||||
| Sequential ID of the token's lexical type, used to index into
|
||||
| tables, e.g. for word vectors.
|
||||
|
||||
+row
|
||||
+cell #[code cluster]
|
||||
+cell int
|
||||
+cell Brown cluster ID.
|
||||
|
||||
+row
|
||||
+cell #[code _]
|
||||
+cell #[code Underscore]
|
||||
+cell
|
||||
| User space for adding custom
|
||||
| #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions].
|
||||
|
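p
| As a rough illustration, assuming a loaded English pipeline #[code nlp]
| whose tokenizer exceptions normalise #[code 'm] to #[code am]:

+aside-code("Example").
doc = nlp(u"I'm OVER there")
assert doc[2].lower_ == u'over'  # equivalent to doc[2].text.lower()
print(doc[1].norm_)              # typically u'am', via the tokenizer/norm exceptions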
|
|
@ -36,12 +36,14 @@ p
|
|||
| that maps strings to hash values, and vice versa.
|
||||
|
||||
+row
|
||||
+cell #[code data]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell #[code width]
|
||||
+cell int
|
||||
+cell Number of dimensions.
|
||||
|
||||
+row
|
||||
+cell #[code width]
|
||||
+cell Number of dimensions.
|
||||
+cell #[code data]
|
||||
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell The vector data.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
|
@ -208,7 +210,7 @@ p
|
|||
+row("foot")
|
||||
+cell returns
|
||||
+cell tuple
|
||||
+cell #[code (rows, dims)] pairs.
|
||||
+cell A #[code (rows, dims)] pair.
|
||||
|
||||
+h(2, "from_glove") Vectors.from_glove
|
||||
+tag method
|
||||
|
@ -238,11 +240,16 @@ p Save the current state to a directory.
|
|||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell unicode / #[code Path]
|
||||
+cell
|
||||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being saved.
|
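p
| For example, assuming the table is available as #[code nlp.vocab.vectors]
| (the path below is purely illustrative):

+aside-code("Example").
vectors = nlp.vocab.vectors           # assumes vectors are stored on the vocab
vectors.to_disk('/tmp/word_vectors')  # creates the directory if needed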
||||
|
||||
+h(2, "from_disk") Vectors.from_disk
|
||||
+tag method
|
||||
|
||||
|
@ -255,7 +262,7 @@ p Loads state from a directory. Modifies the object in place and returns it.
|
|||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell unicode / #[code Path]
|
||||
+cell
|
||||
| A path to a directory. Paths may be either strings or
|
||||
| #[code Path]-like objects.
|
||||
|
@ -297,7 +304,7 @@ p Load state from a binary string.
|
|||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code bytes_data]
|
||||
+cell #[code data]
|
||||
+cell bytes
|
||||
+cell The data to load from.
|
||||
|
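p
| A round-trip sketch, assuming an existing #[code vectors] object such as
| #[code nlp.vocab.vectors]:

+aside-code("Example").
vectors_bytes = vectors.to_bytes()   # serialise the current state
vectors.from_bytes(vectors_bytes)    # load the data back into the same object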
||||
|
|
|
@ -111,11 +111,13 @@ p
|
|||
|
||||
p
|
||||
| A few more convenience attributes are provided for iterating around the
|
||||
| local tree from the token. The #[code .lefts] and #[code .rights]
|
||||
| attributes provide sequences of syntactic children that occur before and
|
||||
| after the token. Both sequences are in sentences order. There are also
|
||||
| two integer-typed attributes, #[code .n_rights] and #[code .n_lefts],
|
||||
| that give the number of left and right children.
|
||||
| local tree from the token. The #[+api("token#lefts") #[code Token.lefts]]
|
||||
| and #[+api("token#rights") #[code Token.rights]] attributes provide
|
||||
| sequences of syntactic children that occur before and after the token.
|
||||
| Both sequences are in sentence order. There are also two integer-typed
|
||||
| attributes, #[+api("token#n_rights") #[code Token.n_rights]] and
|
||||
| #[+api("token#n_lefts") #[code Token.n_lefts]], that give the number of
|
||||
| left and right children.
|
||||
|
||||
+code.
|
||||
doc = nlp(u'bright red apples on the tree')
|
||||
|
@ -126,10 +128,11 @@ p
|
|||
|
||||
p
|
||||
| You can get a whole phrase by its syntactic head using the
|
||||
| #[code .subtree] attribute. This returns an ordered sequence of tokens.
|
||||
| You can walk up the tree with the #[code .ancestors] attribute, and
|
||||
| check dominance with the #[+api("token#is_ancestor") #[code .is_ancestor()]]
|
||||
| method.
|
||||
| #[+api("token#subtree") #[code Token.subtree]] attribute. This returns an
|
||||
| ordered sequence of tokens. You can walk up the tree with the
|
||||
| #[+api("token#ancestors") #[code Token.ancestors]] attribute, and
|
||||
| check dominance with
|
||||
| #[+api("token#is_ancestor") #[code Token.is_ancestor()]].
|
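p
| For instance, assuming the default English model attaches "on the tree"
| to "apples" in this fragment, the local tree can be explored like this:

+code.
doc = nlp(u'bright red apples on the tree')
apples = doc[2]
print([t.text for t in apples.subtree])    # the phrase headed by "apples"
print([t.text for t in doc[5].ancestors])  # heads above "tree", e.g. "on", "apples"
assert apples.is_ancestor(doc[5])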
||||
|
||||
+aside("Projective vs. non-projective")
|
||||
| For the #[+a("/models/en") default English model], the
|
||||
|
|