diff --git a/bin/get_freqs.py b/bin/get_freqs.py deleted file mode 100755 index 54d90ef8c..000000000 --- a/bin/get_freqs.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python - -from __future__ import unicode_literals, print_function - -import plac -import joblib -from os import path -import os -import bz2 -import ujson -from preshed.counter import PreshCounter -from joblib import Parallel, delayed -import io - -from spacy.en import English -from spacy.strings import StringStore -from spacy.attrs import ORTH -from spacy.tokenizer import Tokenizer -from spacy.vocab import Vocab - - -def iter_comments(loc): - with bz2.BZ2File(loc) as file_: - for line in file_: - yield ujson.loads(line) - - -def count_freqs(input_loc, output_loc): - print(output_loc) - vocab = English.default_vocab(get_lex_attr=None) - tokenizer = Tokenizer.from_dir(vocab, - path.join(English.default_data_dir(), 'tokenizer')) - - counts = PreshCounter() - for json_comment in iter_comments(input_loc): - doc = tokenizer(json_comment['body']) - doc.count_by(ORTH, counts=counts) - - with io.open(output_loc, 'w', 'utf8') as file_: - for orth, freq in counts: - string = tokenizer.vocab.strings[orth] - if not string.isspace(): - file_.write('%d\t%s\n' % (freq, string)) - - -def parallelize(func, iterator, n_jobs): - Parallel(n_jobs=n_jobs)(delayed(func)(*item) for item in iterator) - - -def merge_counts(locs, out_loc): - string_map = StringStore() - counts = PreshCounter() - for loc in locs: - with io.open(loc, 'r', encoding='utf8') as file_: - for line in file_: - freq, word = line.strip().split('\t', 1) - orth = string_map[word] - counts.inc(orth, int(freq)) - with io.open(out_loc, 'w', encoding='utf8') as file_: - for orth, count in counts: - string = string_map[orth] - file_.write('%d\t%s\n' % (count, string)) - - -@plac.annotations( - input_loc=("Location of input file list"), - freqs_dir=("Directory for frequency files"), - output_loc=("Location for output file"), - n_jobs=("Number of workers", "option", "n", int), - skip_existing=("Skip inputs where an output file exists", "flag", "s", bool), -) -def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False): - tasks = [] - outputs = [] - for input_path in open(input_loc): - input_path = input_path.strip() - if not input_path: - continue - filename = input_path.split('/')[-1] - output_path = path.join(freqs_dir, filename.replace('bz2', 'freq')) - outputs.append(output_path) - if not path.exists(output_path) or not skip_existing: - tasks.append((input_path, output_path)) - - if tasks: - parallelize(count_freqs, tasks, n_jobs) - - print("Merge") - merge_counts(outputs, output_loc) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/munge_ewtb.py b/bin/munge_ewtb.py deleted file mode 100755 index 4e21ceb07..000000000 --- a/bin/munge_ewtb.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python -from __future__ import unicode_literals - -from xml.etree import cElementTree as ElementTree -import json -import re - -import plac -from pathlib import Path -from os import path - - -escaped_tokens = { - '-LRB-': '(', - '-RRB-': ')', - '-LSB-': '[', - '-RSB-': ']', - '-LCB-': '{', - '-RCB-': '}', -} - -def read_parses(parse_loc): - offset = 0 - doc = [] - for parse in open(str(parse_loc) + '.dep').read().strip().split('\n\n'): - parse = _adjust_token_ids(parse, offset) - offset += len(parse.split('\n')) - doc.append(parse) - return doc - -def _adjust_token_ids(parse, offset): - output = [] - for line in parse.split('\n'): - pieces = line.split() - pieces[0] 
= str(int(pieces[0]) + offset) - pieces[5] = str(int(pieces[5]) + offset) if pieces[5] != '0' else '0' - output.append('\t'.join(pieces)) - return '\n'.join(output) - - -def _fmt_doc(filename, paras): - return {'id': filename, 'paragraphs': [_fmt_para(*para) for para in paras]} - - -def _fmt_para(raw, sents): - return {'raw': raw, 'sentences': [_fmt_sent(sent) for sent in sents]} - - -def _fmt_sent(sent): - return { - 'tokens': [_fmt_token(*t.split()) for t in sent.strip().split('\n')], - 'brackets': []} - - -def _fmt_token(id_, word, hyph, pos, ner, head, dep, blank1, blank2, blank3): - head = int(head) - 1 - id_ = int(id_) - 1 - head = (head - id_) if head != -1 else 0 - return {'id': id_, 'orth': word, 'tag': pos, 'dep': dep, 'head': head} - - -tags_re = re.compile(r'<[\w\?/][^>]+>') -def main(out_dir, ewtb_dir='/usr/local/data/eng_web_tbk'): - ewtb_dir = Path(ewtb_dir) - out_dir = Path(out_dir) - if not out_dir.exists(): - out_dir.mkdir() - for genre_dir in ewtb_dir.joinpath('data').iterdir(): - #if 'answers' in str(genre_dir): continue - parse_dir = genre_dir.joinpath('penntree') - docs = [] - for source_loc in genre_dir.joinpath('source').joinpath('source_original').iterdir(): - filename = source_loc.parts[-1].replace('.sgm.sgm', '') - filename = filename.replace('.xml', '') - filename = filename.replace('.txt', '') - parse_loc = parse_dir.joinpath(filename + '.xml.tree') - parses = read_parses(parse_loc) - source = source_loc.open().read().strip() - if 'answers' in str(genre_dir): - source = tags_re.sub('', source).strip() - docs.append(_fmt_doc(filename, [[source, parses]])) - - out_loc = out_dir.joinpath(genre_dir.parts[-1] + '.json') - with open(str(out_loc), 'w') as out_file: - out_file.write(json.dumps(docs, indent=4)) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/ner_tag.py b/bin/ner_tag.py deleted file mode 100644 index f990f21a1..000000000 --- a/bin/ner_tag.py +++ /dev/null @@ -1,32 +0,0 @@ -import io -import plac - -from spacy.en import English - - -def main(text_loc): - with io.open(text_loc, 'r', encoding='utf8') as file_: - text = file_.read() - NLU = English() - for paragraph in text.split('\n\n'): - tokens = NLU(paragraph) - - ent_starts = {} - ent_ends = {} - for span in tokens.ents: - ent_starts[span.start] = span.label_ - ent_ends[span.end] = span.label_ - - output = [] - for token in tokens: - if token.i in ent_starts: - output.append('<%s>' % ent_starts[token.i]) - output.append(token.orth_) - if (token.i+1) in ent_ends: - output.append('</%s>' % ent_ends[token.i+1]) - output.append('\n\n') - print ' '.join(output) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/parser/conll_train.py b/bin/parser/conll_train.py deleted file mode 100755 index 8075dcd8a..000000000 --- a/bin/parser/conll_train.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python -from __future__ import division -from __future__ import unicode_literals - -import os -from os import path -import shutil -import io -import random -import time -import gzip - -import plac -import cProfile -import pstats - -import spacy.util -from spacy.en import English -from spacy.gold import GoldParse - -from spacy.syntax.util import Config -from spacy.syntax.arc_eager import ArcEager -from spacy.syntax.parser import Parser -from spacy.scorer import Scorer -from spacy.tagger import Tagger - -# Last updated for spaCy v0.97 - - -def read_conll(file_): - """Read a standard CoNLL/MALT-style format""" - sents = [] - for sent_str in file_.read().strip().split('\n\n'): - ids = [] - words =
[] - heads = [] - labels = [] - tags = [] - for i, line in enumerate(sent_str.split('\n')): - word, pos_string, head_idx, label = _parse_line(line) - words.append(word) - if head_idx < 0: - head_idx = i - ids.append(i) - heads.append(head_idx) - labels.append(label) - tags.append(pos_string) - text = ' '.join(words) - annot = (ids, words, tags, heads, labels, ['O'] * len(ids)) - sents.append((None, [(annot, [])])) - return sents - - -def _parse_line(line): - pieces = line.split() - if len(pieces) == 4: - word, pos, head_idx, label = pieces - head_idx = int(head_idx) - elif len(pieces) == 15: - id_ = int(pieces[0].split('_')[-1]) - word = pieces[1] - pos = pieces[4] - head_idx = int(pieces[8])-1 - label = pieces[10] - else: - id_ = int(pieces[0].split('_')[-1]) - word = pieces[1] - pos = pieces[4] - head_idx = int(pieces[6])-1 - label = pieces[7] - if head_idx == 0: - label = 'ROOT' - return word, pos, head_idx, label - - -def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.parser(tokens) - gold = GoldParse(tokens, annot_tuples, make_projective=False) - scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct')) - - -def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0, - gold_preproc=False, force_gold=False): - dep_model_dir = path.join(model_dir, 'deps') - pos_model_dir = path.join(model_dir, 'pos') - if path.exists(dep_model_dir): - shutil.rmtree(dep_model_dir) - if path.exists(pos_model_dir): - shutil.rmtree(pos_model_dir) - os.mkdir(dep_model_dir) - os.mkdir(pos_model_dir) - - Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, - labels=ArcEager.get_labels(gold_tuples)) - - nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) - nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates()) - nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager) - - print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") - for itn in range(n_iter): - scorer = Scorer() - loss = 0 - for _, sents in gold_tuples: - for annot_tuples, _ in sents: - if len(annot_tuples[1]) == 1: - continue - - score_model(scorer, nlp, None, annot_tuples, verbose=False) - - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - gold = GoldParse(tokens, annot_tuples, make_projective=True) - if not gold.is_projective: - raise Exception( - "Non-projective sentence in training, after we should " - "have enforced projectivity: %s" % annot_tuples - ) - - loss += nlp.parser.train(tokens, gold) - nlp.tagger.train(tokens, gold.tags) - random.shuffle(gold_tuples) - print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, - scorer.tags_acc, scorer.token_acc)) - print('end training') - nlp.end_training(model_dir) - print('done') - - -@plac.annotations( - train_loc=("Location of CoNLL 09 formatted training file"), - dev_loc=("Location of CoNLL 09 formatted development file"), - model_dir=("Location of output model directory"), - eval_only=("Skip training, and only evaluate", "flag", "e", bool), - n_iter=("Number of training iterations", "option", "i", int), -) -def main(train_loc, dev_loc, model_dir, n_iter=15): - with io.open(train_loc, 'r', encoding='utf8') as file_: - train_sents = read_conll(file_) - if not eval_only: - train(English, train_sents, model_dir, n_iter=n_iter) - nlp = English(data_dir=model_dir) - dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8')) - scorer = Scorer() - for _, sents 
in dev_sents: - for annot_tuples, _ in sents: - score_model(scorer, nlp, None, annot_tuples) - print('TOK', 100-scorer.token_acc) - print('POS', scorer.tags_acc) - print('UAS', scorer.uas) - print('LAS', scorer.las) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/parser/train.py b/bin/parser/train.py deleted file mode 100755 index 26b545b6d..000000000 --- a/bin/parser/train.py +++ /dev/null @@ -1,187 +0,0 @@ -#!/usr/bin/env python -from __future__ import division -from __future__ import unicode_literals -from __future__ import print_function - -import os -from os import path -import shutil -import io -import random - -import plac -import re - -import spacy.util - -from spacy.syntax.util import Config -from spacy.gold import read_json_file -from spacy.gold import GoldParse -from spacy.gold import merge_sents - -from spacy.scorer import Scorer - -from spacy.syntax.arc_eager import ArcEager -from spacy.syntax.ner import BiluoPushDown -from spacy.tagger import Tagger -from spacy.syntax.parser import Parser -from spacy.syntax.nonproj import PseudoProjectivity - - -def _corrupt(c, noise_level): - if random.random() >= noise_level: - return c - elif c == ' ': - return '\n' - elif c == '\n': - return ' ' - elif c in ['.', "'", "!", "?"]: - return '' - else: - return c.lower() - - -def add_noise(orig, noise_level): - if random.random() >= noise_level: - return orig - elif type(orig) == list: - corrupted = [_corrupt(word, noise_level) for word in orig] - corrupted = [w for w in corrupted if w] - return corrupted - else: - return ''.join(_corrupt(c, noise_level) for c in orig) - - -def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False): - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - else: - tokens = nlp.tokenizer(raw_text) - nlp.tagger(tokens) - nlp.entity(tokens) - nlp.parser(tokens) - gold = GoldParse(tokens, annot_tuples) - scorer.score(tokens, gold, verbose=verbose) - - -def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg, - n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0): - print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %") - format_str = '{:d}\t{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}' - with Language.train(model_dir, train_data, - tagger_cfg, parser_cfg, entity_cfg) as trainer: - loss = 0 - for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=gold_preproc, - augment_data=None)): - for doc, gold in epoch: - trainer.update(doc, gold) - dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc) - print(format_str.format(itn, trainer.nlp.parser.model.nr_weight, - trainer.nlp.parser.model.nr_active_feat, **dev_scores.scores)) - - -def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, - beam_width=None, cand_preproc=None): - print("Load parser", model_dir) - nlp = Language(path=model_dir) - if nlp.lang == 'de': - nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string]) - if beam_width is not None: - nlp.parser.cfg.beam_width = beam_width - scorer = Scorer() - for raw_text, sents in gold_tuples: - if gold_preproc: - raw_text = None - else: - sents = merge_sents(sents) - for annot_tuples, brackets in sents: - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.parser(tokens) - nlp.entity(tokens) - else: - tokens = nlp(raw_text) - gold = GoldParse.from_annot_tuples(tokens, annot_tuples) - scorer.score(tokens, gold, 
verbose=verbose) - return scorer - - -def write_parses(Language, dev_loc, model_dir, out_loc): - nlp = Language(data_dir=model_dir) - gold_tuples = read_json_file(dev_loc) - scorer = Scorer() - out_file = io.open(out_loc, 'w', 'utf8') - for raw_text, sents in gold_tuples: - sents = _merge_sents(sents) - for annot_tuples, brackets in sents: - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.entity(tokens) - nlp.parser(tokens) - else: - tokens = nlp(raw_text) - #gold = GoldParse(tokens, annot_tuples) - #scorer.score(tokens, gold, verbose=False) - for sent in tokens.sents: - for t in sent: - if not t.is_space: - out_file.write( - '%d\t%s\t%s\t%s\t%s\n' % (t.i, t.orth_, t.tag_, t.head.orth_, t.dep_) - ) - out_file.write('\n') - - -@plac.annotations( - language=("The language to train", "positional", None, str, ['en','de', 'zh']), - train_loc=("Location of training file or directory"), - dev_loc=("Location of development file or directory"), - model_dir=("Location of output model directory",), - eval_only=("Skip training, and only evaluate", "flag", "e", bool), - corruption_level=("Amount of noise to add to training data", "option", "c", float), - gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool), - out_loc=("Out location", "option", "o", str), - n_sents=("Number of training sentences", "option", "n", int), - n_iter=("Number of training iterations", "option", "i", int), - verbose=("Verbose error reporting", "flag", "v", bool), - debug=("Debug mode", "flag", "d", bool), - pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool), - L1=("L1 regularization penalty", "option", "L", float), -) -def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, - debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False, - L1=1e-6): - parser_cfg = dict(locals()) - tagger_cfg = dict(locals()) - entity_cfg = dict(locals()) - - lang = spacy.util.get_lang_class(language) - - parser_cfg['features'] = lang.Defaults.parser_features - entity_cfg['features'] = lang.Defaults.entity_features - - if not eval_only: - gold_train = list(read_json_file(train_loc)) - gold_dev = list(read_json_file(dev_loc)) - if n_sents > 0: - gold_train = gold_train[:n_sents] - train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg, - n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level, - n_iter=n_iter) - if out_loc: - write_parses(lang, dev_loc, model_dir, out_loc) - scorer = evaluate(lang, list(read_json_file(dev_loc)), - model_dir, gold_preproc=gold_preproc, verbose=verbose) - print('TOK', scorer.token_acc) - print('POS', scorer.tags_acc) - print('UAS', scorer.uas) - print('LAS', scorer.las) - - print('NER P', scorer.ents_p) - print('NER R', scorer.ents_r) - print('NER F', scorer.ents_f) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py deleted file mode 100644 index 53ef906d5..000000000 --- a/bin/parser/train_ud.py +++ /dev/null @@ -1,201 +0,0 @@ -from __future__ import unicode_literals, print_function -import plac -import json -import random -import pathlib - -from spacy.tokens import Doc -from spacy.syntax.nonproj import PseudoProjectivity -from spacy.language import Language -from spacy.gold import GoldParse -from spacy.tagger import Tagger -from spacy.pipeline import DependencyParser, TokenVectorEncoder -from spacy.syntax.parser import 
get_templates -from spacy.syntax.arc_eager import ArcEager -from spacy.scorer import Scorer -from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP -import spacy.attrs -import io -from thinc.neural.ops import CupyOps -from thinc.neural import Model -from spacy.es import Spanish -from spacy.attrs import POS - - -from thinc.neural import Model - - -try: - import cupy - from thinc.neural.ops import CupyOps -except: - cupy = None - - -def read_conllx(loc, n=0): - with io.open(loc, 'r', encoding='utf8') as file_: - text = file_.read() - i = 0 - for sent in text.strip().split('\n\n'): - lines = sent.strip().split('\n') - if lines: - while lines[0].startswith('#'): - lines.pop(0) - tokens = [] - for line in lines: - id_, word, lemma, pos, tag, morph, head, dep, _1, \ - _2 = line.split('\t') - if '-' in id_ or '.' in id_: - continue - try: - id_ = int(id_) - 1 - head = (int(head) - 1) if head != '0' else id_ - dep = 'ROOT' if dep == 'root' else dep #'unlabelled' - tag = pos+'__'+dep+'__'+morph - Spanish.Defaults.tag_map[tag] = {POS: pos} - tokens.append((id_, word, tag, head, dep, 'O')) - except: - raise - tuples = [list(t) for t in zip(*tokens)] - yield (None, [[tuples, []]]) - i += 1 - if n >= 1 and i >= n: - break - - -def score_model(vocab, encoder, parser, Xs, ys, verbose=False): - scorer = Scorer() - correct = 0. - total = 0. - for doc, gold in zip(Xs, ys): - doc = Doc(vocab, words=[w.text for w in doc]) - encoder(doc) - parser(doc) - PseudoProjectivity.deprojectivize(doc) - scorer.score(doc, gold, verbose=verbose) - for token, tag in zip(doc, gold.tags): - if '_' in token.tag_: - univ_guess, _ = token.tag_.split('_', 1) - else: - univ_guess = '' - univ_truth, _ = tag.split('_', 1) - correct += univ_guess == univ_truth - total += 1 - return scorer - - -def organize_data(vocab, train_sents): - Xs = [] - ys = [] - for _, doc_sents in train_sents: - for (ids, words, tags, heads, deps, ner), _ in doc_sents: - doc = Doc(vocab, words=words) - gold = GoldParse(doc, tags=tags, heads=heads, deps=deps) - Xs.append(doc) - ys.append(gold) - return Xs, ys - - -def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None): - LangClass = spacy.util.get_lang_class(lang_name) - train_sents = list(read_conllx(train_loc)) - dev_sents = list(read_conllx(dev_loc)) - train_sents = PseudoProjectivity.preprocess_training_data(train_sents) - - actions = ArcEager.get_actions(gold_parses=train_sents) - features = get_templates('basic') - - model_dir = pathlib.Path(model_dir) - if not model_dir.exists(): - model_dir.mkdir() - if not (model_dir / 'deps').exists(): - (model_dir / 'deps').mkdir() - if not (model_dir / 'pos').exists(): - (model_dir / 'pos').mkdir() - with (model_dir / 'deps' / 'config.json').open('wb') as file_: - file_.write( - json.dumps( - {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8')) - - vocab = LangClass.Defaults.create_vocab() - if not (model_dir / 'vocab').exists(): - (model_dir / 'vocab').mkdir() - else: - if (model_dir / 'vocab' / 'strings.json').exists(): - with (model_dir / 'vocab' / 'strings.json').open() as file_: - vocab.strings.load(file_) - if (model_dir / 'vocab' / 'lexemes.bin').exists(): - vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin') - - if clusters_loc is not None: - clusters_loc = pathlib.Path(clusters_loc) - with clusters_loc.open() as file_: - for line in file_: - try: - cluster, word, freq = line.split() - except ValueError: - continue - lex = vocab[word] - lex.cluster = int(cluster[::-1], 2) - # Populate vocab - 
for _, doc_sents in train_sents: - for (ids, words, tags, heads, deps, ner), _ in doc_sents: - for word in words: - _ = vocab[word] - for dep in deps: - _ = vocab[dep] - for tag in tags: - _ = vocab[tag] - if vocab.morphology.tag_map: - for tag in tags: - vocab.morphology.tag_map[tag] = {POS: tag.split('__', 1)[0]} - tagger = Tagger(vocab) - encoder = TokenVectorEncoder(vocab, width=64) - parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0) - - Xs, ys = organize_data(vocab, train_sents) - dev_Xs, dev_ys = organize_data(vocab, dev_sents) - with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer): - docs = list(Xs) - for doc in docs: - encoder(doc) - nn_loss = [0.] - def track_progress(): - with encoder.tagger.use_params(optimizer.averages): - with parser.model.use_params(optimizer.averages): - scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys) - itn = len(nn_loss) - print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc)) - nn_loss.append(0.) - track_progress() - trainer.each_epoch.append(track_progress) - trainer.batch_size = 24 - trainer.nb_epoch = 40 - for docs, golds in trainer.iterate(Xs, ys, progress_bar=True): - docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs] - tokvecs, upd_tokvecs = encoder.begin_update(docs) - for doc, tokvec in zip(docs, tokvecs): - doc.tensor = tokvec - d_tokvecs = parser.update(docs, golds, sgd=optimizer) - upd_tokvecs(d_tokvecs, sgd=optimizer) - encoder.update(docs, golds, sgd=optimizer) - nlp = LangClass(vocab=vocab, parser=parser) - scorer = score_model(vocab, encoder, parser, read_conllx(dev_loc)) - print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc)) - #nlp.end_training(model_dir) - #scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc)) - #print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc)) - - -if __name__ == '__main__': - import cProfile - import pstats - if 1: - plac.call(main) - else: - cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") - s = pstats.Stats("Profile.prof") - s.strip_dirs().sort_stats("time").print_stats() - - - plac.call(main) diff --git a/bin/prepare_treebank.py b/bin/prepare_treebank.py deleted file mode 100644 index f9f4eec21..000000000 --- a/bin/prepare_treebank.py +++ /dev/null @@ -1,194 +0,0 @@ -"""Convert OntoNotes into a json format. 
- -doc: { - id: string, - paragraphs: [{ - raw: string, - sents: [int], - tokens: [{ - start: int, - tag: string, - head: int, - dep: string}], - ner: [{ - start: int, - end: int, - label: string}], - brackets: [{ - start: int, - end: int, - label: string}]}]} - -Consumes output of spacy/munge/align_raw.py -""" -from __future__ import unicode_literals -import plac -import json -from os import path -import os -import re -import io -from collections import defaultdict - -from spacy.munge import read_ptb -from spacy.munge import read_conll -from spacy.munge import read_ner - - -def _iter_raw_files(raw_loc): - files = json.load(open(raw_loc)) - for f in files: - yield f - - -def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text): - ptb_sents = read_ptb.split(ptb_text) - dep_sents = read_conll.split(dep_text) - if len(ptb_sents) != len(dep_sents): - return None - if ner_text is not None: - ner_sents = read_ner.split(ner_text) - else: - ner_sents = [None] * len(ptb_sents) - - i = 0 - doc = {'id': file_id} - if raw_paras is None: - doc['paragraphs'] = [format_para(None, ptb_sents, dep_sents, ner_sents)] - #for ptb_sent, dep_sent, ner_sent in zip(ptb_sents, dep_sents, ner_sents): - # doc['paragraphs'].append(format_para(None, [ptb_sent], [dep_sent], [ner_sent])) - else: - doc['paragraphs'] = [] - for raw_sents in raw_paras: - para = format_para( - ' '.join(raw_sents).replace('', ''), - ptb_sents[i:i+len(raw_sents)], - dep_sents[i:i+len(raw_sents)], - ner_sents[i:i+len(raw_sents)]) - if para['sentences']: - doc['paragraphs'].append(para) - i += len(raw_sents) - return doc - - -def format_para(raw_text, ptb_sents, dep_sents, ner_sents): - para = {'raw': raw_text, 'sentences': []} - offset = 0 - assert len(ptb_sents) == len(dep_sents) == len(ner_sents) - for ptb_text, dep_text, ner_text in zip(ptb_sents, dep_sents, ner_sents): - _, deps = read_conll.parse(dep_text, strip_bad_periods=True) - if deps and 'VERB' in [t['tag'] for t in deps]: - continue - if ner_text is not None: - _, ner = read_ner.parse(ner_text, strip_bad_periods=True) - else: - ner = ['-' for _ in deps] - _, brackets = read_ptb.parse(ptb_text, strip_bad_periods=True) - # Necessary because the ClearNLP converter deletes EDITED words. 
- if len(ner) != len(deps): - ner = ['-' for _ in deps] - para['sentences'].append(format_sentence(deps, ner, brackets)) - return para - - -def format_sentence(deps, ner, brackets): - sent = {'tokens': [], 'brackets': []} - for token_id, (token, token_ent) in enumerate(zip(deps, ner)): - sent['tokens'].append(format_token(token_id, token, token_ent)) - - for label, start, end in brackets: - if start != end: - sent['brackets'].append({ - 'label': label, - 'first': start, - 'last': (end-1)}) - return sent - - -def format_token(token_id, token, ner): - assert token_id == token['id'] - head = (token['head'] - token_id) if token['head'] != -1 else 0 - return { - 'id': token_id, - 'orth': token['word'], - 'tag': token['tag'], - 'head': head, - 'dep': token['dep'], - 'ner': ner} - - -def read_file(*pieces): - loc = path.join(*pieces) - if not path.exists(loc): - return None - else: - return io.open(loc, 'r', encoding='utf8').read().strip() - - -def get_file_names(section_dir, subsection): - filenames = [] - for fn in os.listdir(path.join(section_dir, subsection)): - filenames.append(fn.rsplit('.', 1)[0]) - return list(sorted(set(filenames))) - - -def read_wsj_with_source(onto_dir, raw_dir): - # Now do WSJ, with source alignment - onto_dir = path.join(onto_dir, 'data', 'english', 'annotations', 'nw', 'wsj') - docs = {} - for i in range(25): - section = str(i) if i >= 10 else ('0' + str(i)) - raw_loc = path.join(raw_dir, 'wsj%s.json' % section) - for j, (filename, raw_paras) in enumerate(_iter_raw_files(raw_loc)): - if section == '00': - j += 1 - if section == '04' and filename == '55': - continue - ptb = read_file(onto_dir, section, '%s.parse' % filename) - dep = read_file(onto_dir, section, '%s.parse.dep' % filename) - ner = read_file(onto_dir, section, '%s.name' % filename) - if ptb is not None and dep is not None: - docs[filename] = format_doc(filename, raw_paras, ptb, dep, ner) - return docs - - -def get_doc(onto_dir, file_path, wsj_docs): - filename = file_path.rsplit('/', 1)[1] - if filename in wsj_docs: - return wsj_docs[filename] - else: - ptb = read_file(onto_dir, file_path + '.parse') - dep = read_file(onto_dir, file_path + '.parse.dep') - ner = read_file(onto_dir, file_path + '.name') - if ptb is not None and dep is not None: - return format_doc(filename, None, ptb, dep, ner) - else: - return None - - -def read_ids(loc): - return open(loc).read().strip().split('\n') - - -def main(onto_dir, raw_dir, out_dir): - wsj_docs = read_wsj_with_source(onto_dir, raw_dir) - - for partition in ('train', 'test', 'development'): - ids = read_ids(path.join(onto_dir, '%s.id' % partition)) - docs_by_genre = defaultdict(list) - for file_path in ids: - doc = get_doc(onto_dir, file_path, wsj_docs) - if doc is not None: - genre = file_path.split('/')[3] - docs_by_genre[genre].append(doc) - part_dir = path.join(out_dir, partition) - if not path.exists(part_dir): - os.mkdir(part_dir) - for genre, docs in sorted(docs_by_genre.items()): - out_loc = path.join(part_dir, genre + '.json') - with open(out_loc, 'w') as file_: - json.dump(docs, file_, indent=4) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/prepare_vecs.py b/bin/prepare_vecs.py deleted file mode 100644 index b55dafee3..000000000 --- a/bin/prepare_vecs.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Read a vector file, and prepare it as binary data, for easy consumption""" - -import plac - -from spacy.vocab import write_binary_vectors - - -def main(in_loc, out_loc): - write_binary_vectors(in_loc, out_loc) - - -if __name__ == '__main__': - 
plac.call(main) diff --git a/bin/tagger/train.py b/bin/tagger/train.py deleted file mode 100755 index 9cd8cc011..000000000 --- a/bin/tagger/train.py +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env python -from __future__ import division -from __future__ import unicode_literals -from __future__ import print_function - -import os -from os import path -import shutil -import codecs -import random - -import plac -import re - -import spacy.util -from spacy.en import English - -from spacy.tagger import Tagger - -from spacy.syntax.util import Config -from spacy.gold import read_json_file -from spacy.gold import GoldParse - -from spacy.scorer import Scorer - - -def score_model(scorer, nlp, raw_text, annot_tuples): - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - else: - tokens = nlp.tokenizer(raw_text) - nlp.tagger(tokens) - gold = GoldParse(tokens, annot_tuples) - scorer.score(tokens, gold) - - -def _merge_sents(sents): - m_deps = [[], [], [], [], [], []] - m_brackets = [] - i = 0 - for (ids, words, tags, heads, labels, ner), brackets in sents: - m_deps[0].extend(id_ + i for id_ in ids) - m_deps[1].extend(words) - m_deps[2].extend(tags) - m_deps[3].extend(head + i for head in heads) - m_deps[4].extend(labels) - m_deps[5].extend(ner) - m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) - i += len(ids) - return [(m_deps, m_brackets)] - - -def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', - seed=0, gold_preproc=False, n_sents=0, corruption_level=0, - beam_width=1, verbose=False, - use_orig_arc_eager=False): - if n_sents > 0: - gold_tuples = gold_tuples[:n_sents] - - templates = Tagger.default_templates() - nlp = Language(data_dir=model_dir, tagger=False) - nlp.tagger = Tagger.blank(nlp.vocab, templates) - - print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") - for itn in range(n_iter): - scorer = Scorer() - loss = 0 - for raw_text, sents in gold_tuples: - if gold_preproc: - raw_text = None - else: - sents = _merge_sents(sents) - for annot_tuples, ctnt in sents: - words = annot_tuples[1] - gold_tags = annot_tuples[2] - score_model(scorer, nlp, raw_text, annot_tuples) - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(words) - else: - tokens = nlp.tokenizer(raw_text) - loss += nlp.tagger.train(tokens, gold_tags) - random.shuffle(gold_tuples) - print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, - scorer.tags_acc, - scorer.token_acc)) - nlp.end_training(model_dir) - -def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, - beam_width=None): - nlp = Language(data_dir=model_dir) - if beam_width is not None: - nlp.parser.cfg.beam_width = beam_width - scorer = Scorer() - for raw_text, sents in gold_tuples: - if gold_preproc: - raw_text = None - else: - sents = _merge_sents(sents) - for annot_tuples, brackets in sents: - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.entity(tokens) - nlp.parser(tokens) - else: - tokens = nlp(raw_text, merge_mwes=False) - gold = GoldParse(tokens, annot_tuples) - scorer.score(tokens, gold, verbose=verbose) - return scorer - - -def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): - nlp = Language(data_dir=model_dir) - if beam_width is not None: - nlp.parser.cfg.beam_width = beam_width - gold_tuples = read_json_file(dev_loc) - scorer = Scorer() - out_file = codecs.open(out_loc, 'w', 'utf8') - for raw_text, sents in gold_tuples: - sents = 
_merge_sents(sents) - for annot_tuples, brackets in sents: - if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.entity(tokens) - nlp.parser(tokens) - else: - tokens = nlp(raw_text, merge_mwes=False) - gold = GoldParse(tokens, annot_tuples) - scorer.score(tokens, gold, verbose=False) - for t in tokens: - out_file.write( - '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_) - ) - return scorer - - -@plac.annotations( - train_loc=("Location of training file or directory"), - dev_loc=("Location of development file or directory"), - model_dir=("Location of output model directory",), - eval_only=("Skip training, and only evaluate", "flag", "e", bool), - corruption_level=("Amount of noise to add to training data", "option", "c", float), - gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool), - out_loc=("Out location", "option", "o", str), - n_sents=("Number of training sentences", "option", "n", int), - n_iter=("Number of training iterations", "option", "i", int), - verbose=("Verbose error reporting", "flag", "v", bool), - debug=("Debug mode", "flag", "d", bool), -) -def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, - debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False): - if not eval_only: - gold_train = list(read_json_file(train_loc)) - train(English, gold_train, model_dir, - feat_set='basic' if not debug else 'debug', - gold_preproc=gold_preproc, n_sents=n_sents, - corruption_level=corruption_level, n_iter=n_iter, - verbose=verbose) - #if out_loc: - # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) - scorer = evaluate(English, list(read_json_file(dev_loc)), - model_dir, gold_preproc=gold_preproc, verbose=verbose) - print('TOK', scorer.token_acc) - print('POS', scorer.tags_acc) - print('UAS', scorer.uas) - print('LAS', scorer.las) - - print('NER P', scorer.ents_p) - print('NER R', scorer.ents_r) - print('NER F', scorer.ents_f) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/tagger/train_german_tagger.py b/bin/tagger/train_german_tagger.py deleted file mode 100644 index 4927a6e9a..000000000 --- a/bin/tagger/train_german_tagger.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env python -from __future__ import division -from __future__ import unicode_literals - -import os -from os import path -import shutil -import io -import random -import time -import gzip -import ujson - -import plac -import cProfile -import pstats - -import spacy.util -from spacy.de import German -from spacy.gold import GoldParse -from spacy.tagger import Tagger -from spacy.scorer import PRFScore - -from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags -from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags -from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags -from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags -from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS - - -def default_templates(): - return spacy.tagger.Tagger.default_templates() - -def default_templates_without_clusters(): - return ( - (W_orth,), - (P1_lemma, P1_pos), - (P2_lemma, P2_pos), - (N1_orth,), - (N2_orth,), - - (W_suffix,), - (W_prefix,), - - (P1_pos,), - (P2_pos,), - (P1_pos, P2_pos), - 
(P1_pos, W_orth), - (P1_suffix,), - (N1_suffix,), - - (W_shape,), - - (W_flags,), - (N1_flags,), - (N2_flags,), - (P1_flags,), - (P2_flags,), - ) - - -def make_tagger(vocab, templates): - model = spacy.tagger.TaggerModel(templates) - return spacy.tagger.Tagger(vocab,model) - - -def read_conll(file_): - def sentences(): - words, tags = [], [] - for line in file_: - line = line.strip() - if line: - word, tag = line.split('\t')[1::3][:2] # get column 1 and 4 (CoNLL09) - words.append(word) - tags.append(tag) - elif words: - yield words, tags - words, tags = [], [] - if words: - yield words, tags - return [ s for s in sentences() ] - - -def score_model(score, nlp, words, gold_tags): - tokens = nlp.tokenizer.tokens_from_list(words) - assert(len(tokens) == len(gold_tags)) - nlp.tagger(tokens) - - for token, gold_tag in zip(tokens,gold_tags): - score.score_set(set([token.tag_]),set([gold_tag])) - - -def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21): - # make shuffling deterministic - random.seed(seed) - - # set up directory for model - pos_model_dir = path.join(model_dir, 'pos') - if path.exists(pos_model_dir): - shutil.rmtree(pos_model_dir) - os.mkdir(pos_model_dir) - - nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) - nlp.tagger = make_tagger(nlp.vocab,default_templates()) - - print("Itn.\ttrain acc %\tdev acc %") - for itn in range(n_iter): - # train on train set - #train_acc = PRFScore() - correct, total = 0., 0. - for words, gold_tags in train_sents: - tokens = nlp.tokenizer.tokens_from_list(words) - correct += nlp.tagger.train(tokens, gold_tags) - total += len(words) - train_acc = correct/total - - # test on dev set - dev_acc = PRFScore() - for words, gold_tags in dev_sents: - score_model(dev_acc, nlp, words, gold_tags) - - random.shuffle(train_sents) - print('%d:\t%6.2f\t%6.2f' % (itn, 100*train_acc, 100*dev_acc.precision)) - - - print('end training') - nlp.end_training(model_dir) - print('done') - - -@plac.annotations( - train_loc=("Location of CoNLL 09 formatted training file"), - dev_loc=("Location of CoNLL 09 formatted development file"), - model_dir=("Location of output model directory"), - eval_only=("Skip training, and only evaluate", "flag", "e", bool), - n_iter=("Number of training iterations", "option", "i", int), -) -def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15): - # training - if not eval_only: - with io.open(train_loc, 'r', encoding='utf8') as trainfile_, \ - io.open(dev_loc, 'r', encoding='utf8') as devfile_: - train_sents = read_conll(trainfile_) - dev_sents = read_conll(devfile_) - train(German, train_sents, dev_sents, model_dir, n_iter=n_iter) - - # testing - with io.open(dev_loc, 'r', encoding='utf8') as file_: - dev_sents = read_conll(file_) - nlp = German(data_dir=model_dir) - - dev_acc = PRFScore() - for words, gold_tags in dev_sents: - score_model(dev_acc, nlp, words, gold_tags) - - print('POS: %6.2f %%' % (100*dev_acc.precision)) - - -if __name__ == '__main__': - plac.call(main) diff --git a/setup.py b/setup.py index 9fb4970da..37bfd0495 100755 --- a/setup.py +++ b/setup.py @@ -24,7 +24,6 @@ MOD_NAMES = [ 'spacy.vocab', 'spacy.attrs', 'spacy.morphology', - 'spacy.tagger', 'spacy.pipeline', 'spacy.syntax.stateclass', 'spacy.syntax._state', diff --git a/spacy/__init__.py b/spacy/__init__.py index ba2479106..9acc566ad 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -3,8 +3,6 @@ from __future__ import unicode_literals from .cli.info import info as cli_info from .glossary import explain 
-from .deprecated import resolve_load_name -#from .about import __version__ from .about import __version__ from . import util diff --git a/spacy/__main__.py b/spacy/__main__.py index 99d6b116c..48460c9e3 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import print_function # NB! This breaks in plac on Python 2!! -#from __future__ import unicode_literals +# from __future__ import unicode_literals if __name__ == '__main__': import plac diff --git a/spacy/_ml.py b/spacy/_ml.py index b60851fda..de89e04d0 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -1,49 +1,42 @@ -import ujson -from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU +# coding: utf8 +from __future__ import unicode_literals + +import numpy +from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu from thinc.i2v import HashEmbed, StaticVectors from thinc.t2t import ExtractWindow, ParametricAttention -from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool +from thinc.t2v import Pooling, sum_pool from thinc.misc import Residual -from thinc.misc import BatchNorm as BN from thinc.misc import LayerNorm as LN - from thinc.api import add, layerize, chain, clone, concatenate, with_flatten -from thinc.api import FeatureExtracter, with_getitem -from thinc.api import uniqued, wrap, flatten_add_lengths, noop - +from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths +from thinc.api import uniqued, wrap, noop from thinc.linear.linear import LinearModel from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module, copy_array from thinc.neural._lsuv import svd_orthonormal -import random -import cytoolz - from thinc import describe from thinc.describe import Dimension, Synapses, Biases, Gradient from thinc.neural._classes.affine import _set_dimensions_if_needed import thinc.extra.load_nlp from thinc.neural._lsuv import svd_orthonormal -from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER -from .tokens.doc import Doc +from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE from . import util -import numpy -import io - -# TODO: Unset this once we don't want to support models previous models. -import thinc.neural._classes.layernorm -thinc.neural._classes.layernorm.set_compat_six_eight(False) VECTORS_KEY = 'spacy_pretrained_vectors' + @layerize def _flatten_add_lengths(seqs, pad=0, drop=0.): ops = Model.ops lengths = ops.asarray([len(seq) for seq in seqs], dtype='i') + def finish_update(d_X, sgd=None): return ops.unflatten(d_X, lengths, pad=pad) + X = ops.flatten(seqs, pad=pad) return (X, lengths), finish_update @@ -57,33 +50,14 @@ def _logistic(X, drop=0.): X = xp.minimum(X, 10., X) X = xp.maximum(X, -10., X) Y = 1. / (1. + xp.exp(-X)) + def logistic_bwd(dY, sgd=None): dX = dY * (Y * (1-Y)) return dX + return Y, logistic_bwd -@layerize -def add_tuples(X, drop=0.): - """Give inputs of sequence pairs, where each sequence is (vals, length), - sum the values, returning a single sequence. - - If input is: - ((vals1, length), (vals2, length) - Output is: - (vals1+vals2, length) - - vals are a single tensor for the whole batch. 
- """ - (vals1, length1), (vals2, length2) = X - assert length1 == length2 - - def add_tuples_bwd(dY, sgd=None): - return (dY, dY) - - return (vals1+vals2, length), add_tuples_bwd - - def _zero_init(model): def _zero_init_impl(self, X, y): self.W.fill(0) @@ -111,13 +85,11 @@ def _preprocess_doc(docs, drop=0.): nO=Dimension("Output size"), nP=Dimension("Maxout pieces"), W=Synapses("Weights matrix", - lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI) if obj.nP >= 2 - else (obj.nF, obj.nO, obj.nI)), + lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)), b=Biases("Bias vector", - lambda obj: (obj.nO, obj.nP) if obj.nP >= 2 else (obj.nO,)), + lambda obj: (obj.nO, obj.nP)), d_W=Gradient("W"), - d_b=Gradient("b") -) + d_b=Gradient("b")) class PrecomputableAffine(Model): def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs): Model.__init__(self, **kwargs) @@ -203,89 +175,6 @@ class PrecomputableAffine(Model): break -# Thinc's Embed class is a bit broken atm, so drop this here. -from thinc import describe -from thinc.neural._classes.embed import _uniform_init - - -@describe.attributes( - nV=describe.Dimension("Number of vectors"), - nO=describe.Dimension("Size of output"), - vectors=describe.Weights("Embedding table", - lambda obj: (obj.nV, obj.nO), - _uniform_init(-0.1, 0.1) - ), - d_vectors=describe.Gradient("vectors") -) -class Embed(Model): - name = 'embed' - - def __init__(self, nO, nV=None, **kwargs): - if nV is not None: - nV += 1 - Model.__init__(self, **kwargs) - if 'name' in kwargs: - self.name = kwargs['name'] - self.column = kwargs.get('column', 0) - self.nO = nO - self.nV = nV - - def predict(self, ids): - if ids.ndim == 2: - ids = ids[:, self.column] - return self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f') - - def begin_update(self, ids, drop=0.): - if ids.ndim == 2: - ids = ids[:, self.column] - vectors = self.ops.xp.ascontiguousarray(self.vectors[ids], dtype='f') - def backprop_embed(d_vectors, sgd=None): - n_vectors = d_vectors.shape[0] - self.ops.scatter_add(self.d_vectors, ids, d_vectors) - if sgd is not None: - sgd(self._mem.weights, self._mem.gradient, key=self.id) - return None - return vectors, backprop_embed - - -def HistoryFeatures(nr_class, hist_size=8, nr_dim=8): - '''Wrap a model, adding features representing action history.''' - if hist_size == 0: - return layerize(noop()) - embed_tables = [Embed(nr_dim, nr_class, column=i, name='embed%d') - for i in range(hist_size)] - embed = chain(concatenate(*embed_tables), - LN(Maxout(hist_size*nr_dim, hist_size*nr_dim))) - ops = embed.ops - def add_history_fwd(vectors_hists, drop=0.): - vectors, hist_ids = vectors_hists - hist_feats, bp_hists = embed.begin_update(hist_ids, drop=drop) - outputs = ops.xp.hstack((vectors, hist_feats)) - - def add_history_bwd(d_outputs, sgd=None): - d_vectors = d_outputs[:, :vectors.shape[1]] - d_hists = d_outputs[:, vectors.shape[1]:] - bp_hists(d_hists, sgd=sgd) - return embed.ops.xp.ascontiguousarray(d_vectors) - return outputs, add_history_bwd - return wrap(add_history_fwd, embed) - - -def drop_layer(layer, factor=2.): - def drop_layer_fwd(X, drop=0.): - if drop <= 0.: - return layer.begin_update(X, drop=drop) - else: - coinflip = layer.ops.xp.random.random() - if (coinflip / factor) >= drop: - return layer.begin_update(X, drop=drop) - else: - return X, lambda dX, sgd=None: dX - - model = wrap(drop_layer_fwd, layer) - model.predict = layer - return model - def link_vectors_to_models(vocab): vectors = vocab.vectors ops = Model.ops @@ -299,16 +188,21 @@ def link_vectors_to_models(vocab): 
# (unideal, I know) thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data + def Tok2Vec(width, embed_size, **kwargs): pretrained_dims = kwargs.get('pretrained_dims', 0) cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2) cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] - with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add, - '*': reapply}): - norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm') - prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix') - suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix') - shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape') + with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, + '+': add, '*': reapply}): + norm = HashEmbed(width, embed_size, column=cols.index(NORM), + name='embed_norm') + prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), + name='embed_prefix') + suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), + name='embed_suffix') + shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), + name='embed_shape') if pretrained_dims is not None and pretrained_dims >= 1: glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID)) @@ -320,7 +214,6 @@ def Tok2Vec(width, embed_size, **kwargs): (norm | prefix | suffix | shape) >> LN(Maxout(width, width*4, pieces=3)), column=5) - convolution = Residual( ExtractWindow(nW=1) >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces)) @@ -344,6 +237,7 @@ def reapply(layer, n_times): Y, backprop = layer.begin_update(X, drop=drop) X = Y backprops.append(backprop) + def reapply_bwd(dY, sgd=None): dX = None for backprop in reversed(backprops): @@ -353,6 +247,7 @@ def reapply(layer, n_times): else: dX += dY return dX + return Y, reapply_bwd return wrap(reapply_fwd, layer) @@ -367,13 +262,14 @@ def _divide_array(X, size): parts = [] index = 0 while index < len(X): - parts.append(X[index : index + size]) + parts.append(X[index:index + size]) index += size return parts def get_col(idx): assert idx >= 0, idx + def forward(X, drop=0.): assert idx >= 0, idx if isinstance(X, numpy.ndarray): @@ -381,30 +277,28 @@ def get_col(idx): else: ops = CupyOps() output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype) + def backward(y, sgd=None): assert idx >= 0, idx dX = ops.allocate(X.shape) dX[:, idx] += y return dX + return output, backward + return layerize(forward) -def zero_init(model): - def _hook(self, X, y=None): - self.W.fill(0) - model.on_data_hooks.append(_hook) - return model - - def doc2feats(cols=None): if cols is None: cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] + def forward(docs, drop=0.): feats = [] for doc in docs: feats.append(doc.to_array(cols)) return feats, None + model = layerize(forward) model.cols = cols return model @@ -418,28 +312,14 @@ def print_shape(prefix): @layerize def get_token_vectors(tokens_attrs_vectors, drop=0.): - ops = Model.ops tokens, attrs, vectors = tokens_attrs_vectors + def backward(d_output, sgd=None): return (tokens, d_output) + return vectors, backward -@layerize -def flatten(seqs, drop=0.): - if isinstance(seqs[0], numpy.ndarray): - ops = NumpyOps() - elif hasattr(CupyOps.xp, 'ndarray') and isinstance(seqs[0], CupyOps.xp.ndarray): - ops = CupyOps() - else: - raise ValueError("Unable to flatten sequence of type %s" % type(seqs[0])) - lengths = [len(seq) for seq in seqs] - def finish_update(d_X, sgd=None): - return ops.unflatten(d_X, lengths) - X = 
ops.xp.vstack(seqs) - return X, finish_update - - @layerize def logistic(X, drop=0.): xp = get_array_module(X) @@ -449,9 +329,11 @@ def logistic(X, drop=0.): X = xp.minimum(X, 10., X) X = xp.maximum(X, -10., X) Y = 1. / (1. + xp.exp(-X)) + def logistic_bwd(dY, sgd=None): dX = dY * (Y * (1-Y)) return dX + return Y, logistic_bwd @@ -461,6 +343,7 @@ def zero_init(model): model.on_data_hooks.append(_zero_init_impl) return model + @layerize def preprocess_doc(docs, drop=0.): keys = [doc.to_array([LOWER]) for doc in docs] @@ -501,8 +384,6 @@ def build_tagger_model(nr_class, **cfg): @layerize def SpacyVectors(docs, drop=0.): - xp = get_array_module(docs[0].vocab.vectors.data) - width = docs[0].vocab.vectors.data.shape[1] batch = [] for doc in docs: indices = numpy.zeros((len(doc),), dtype='i') @@ -525,9 +406,7 @@ def build_text_classifier(nr_class, width=64, **cfg): model = ( SpacyVectors >> flatten_add_lengths - >> with_getitem(0, - Affine(width, pretrained_dims) - ) + >> with_getitem(0, Affine(width, pretrained_dims)) >> ParametricAttention(width) >> Pooling(sum_pool) >> Residual(ReLu(width, width)) ** 2 @@ -536,7 +415,6 @@ def build_text_classifier(nr_class, width=64, **cfg): ) return model - lower = HashEmbed(width, nr_vector, column=1) prefix = HashEmbed(width//2, nr_vector, column=2) suffix = HashEmbed(width//2, nr_vector, column=3) @@ -594,33 +472,40 @@ def build_text_classifier(nr_class, width=64, **cfg): model.lsuv = False return model + @layerize def flatten(seqs, drop=0.): ops = Model.ops lengths = ops.asarray([len(seq) for seq in seqs], dtype='i') + def finish_update(d_X, sgd=None): return ops.unflatten(d_X, lengths, pad=0) + X = ops.flatten(seqs, pad=0) return X, finish_update -def concatenate_lists(*layers, **kwargs): # pragma: no cover - '''Compose two or more models `f`, `g`, etc, such that their outputs are +def concatenate_lists(*layers, **kwargs): # pragma: no cover + """Compose two or more models `f`, `g`, etc, such that their outputs are concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))` - ''' + """ if not layers: return noop() drop_factor = kwargs.get('drop_factor', 1.0) ops = layers[0].ops layers = [chain(layer, flatten) for layer in layers] concat = concatenate(*layers) + def concatenate_lists_fwd(Xs, drop=0.): drop *= drop_factor lengths = ops.asarray([len(X) for X in Xs], dtype='i') flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop) ys = ops.unflatten(flat_y, lengths) + def concatenate_lists_bwd(d_ys, sgd=None): return bp_flat_y(ops.flatten(d_ys), sgd=sgd) + return ys, concatenate_lists_bwd + model = wrap(concatenate_lists_fwd, concat) return model diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 8efd9e189..8113ffebe 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -101,17 +101,12 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): """ Normalize a dictionary of attributes, converting them to ints. - Arguments: - stringy_attrs (dict): - Dictionary keyed by attribute string names. Values can be ints or strings. - - strings_map (StringStore): - Defaults to None. If provided, encodes string values into ints. - - Returns: - inty_attrs (dict): - Attributes dictionary with keys and optionally values converted to - ints. + stringy_attrs (dict): Dictionary keyed by attribute string names. Values + can be ints or strings. + strings_map (StringStore): Defaults to None. If provided, encodes string + values into ints. + RETURNS (dict): Attributes dictionary with keys and optionally values + converted to ints. 
""" inty_attrs = {} if _do_deprecated: diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index d9a812a15..ad17844a1 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -7,10 +7,9 @@ from pathlib import Path from .converters import conllu2json, iob2json, conll_ner2json from ..util import prints -# Converters are matched by file extension. To add a converter, add a new entry -# to this dict with the file extension mapped to the converter function imported -# from /converters. - +# Converters are matched by file extension. To add a converter, add a new +# entry to this dict with the file extension mapped to the converter function +# imported from /converters. CONVERTERS = { 'conllu': conllu2json, 'conll': conllu2json, @@ -24,8 +23,7 @@ CONVERTERS = { output_dir=("output directory for converted file", "positional", None, str), n_sents=("Number of sentences per doc", "option", "n", int), converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str), - morphology=("Enable appending morphology to tags", "flag", "m", bool) -) + morphology=("Enable appending morphology to tags", "flag", "m", bool)) def convert(cmd, input_file, output_dir, n_sents=1, morphology=False, converter='auto'): """ @@ -40,7 +38,7 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False, prints(output_path, title="Output directory not found", exits=1) if converter == 'auto': converter = input_path.suffix[1:] - if not converter in CONVERTERS: + if converter not in CONVERTERS: prints("Can't find converter for %s" % converter, title="Unknown format", exits=1) func = CONVERTERS[converter] diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py index e3bd82e7e..fb2979652 100644 --- a/spacy/cli/converters/conll_ner2json.py +++ b/spacy/cli/converters/conll_ner2json.py @@ -8,7 +8,8 @@ from ...gold import iob_to_biluo def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False): """ - Convert files in the CoNLL-2003 NER format into JSON format for use with train cli. + Convert files in the CoNLL-2003 NER format into JSON format for use with + train cli. """ docs = read_conll_ner(input_path) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 28ae07865..0d3f11153 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -13,10 +13,9 @@ from .. import about @plac.annotations( - model=("model to download (shortcut or model name)", "positional", None, str), + model=("model to download, shortcut or name)", "positional", None, str), direct=("force direct download. Needs model name with version and won't " - "perform compatibility check", "flag", "d", bool) -) + "perform compatibility check", "flag", "d", bool)) def download(cmd, model, direct=False): """ Download compatible model from default download path using pip. 
Model @@ -30,21 +29,25 @@ def download(cmd, model, direct=False): model_name = shortcuts.get(model, model) compatibility = get_compatibility() version = get_version(model_name, compatibility) - dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version)) + dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, + v=version)) if dl == 0: try: # Get package path here because link uses - # pip.get_installed_distributions() to check if model is a package, - # which fails if model was just installed via subprocess + # pip.get_installed_distributions() to check if model is a + # package, which fails if model was just installed via + # subprocess package_path = get_package_path(model_name) - link(None, model_name, model, force=True, model_path=package_path) + link(None, model_name, model, force=True, + model_path=package_path) except: - # Dirty, but since spacy.download and the auto-linking is mostly - # a convenience wrapper, it's best to show a success message and - # loading instructions, even if linking fails. - prints("Creating a shortcut link for 'en' didn't work (maybe you " - "don't have admin permissions?), but you can still load " - "the model via its full package name:", + # Dirty, but since spacy.download and the auto-linking is + # mostly a convenience wrapper, it's best to show a success + # message and loading instructions, even if linking fails. + prints( + "Creating a shortcut link for 'en' didn't work (maybe " + "you don't have admin permissions?), but you can still " + "load the model via its full package name:", "nlp = spacy.load('%s')" % model_name, title="Download successful") @@ -52,9 +55,10 @@ def download(cmd, model, direct=False): def get_json(url, desc): r = requests.get(url) if r.status_code != 200: - prints("Couldn't fetch %s. Please find a model for your spaCy installation " - "(v%s), and download it manually." % (desc, about.__version__), - about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1) + msg = ("Couldn't fetch %s. Please find a model for your spaCy " + "installation (v%s), and download it manually.") + prints(msg % (desc, about.__version__), about.__docs_models__, + title="Server error (%d)" % r.status_code, exits=1) return r.json() @@ -71,13 +75,13 @@ def get_compatibility(): def get_version(model, comp): if model not in comp: version = about.__version__ - prints("No compatible model found for '%s' (spaCy v%s)." % (model, version), - title="Compatibility error", exits=1) + msg = "No compatible model found for '%s' (spaCy v%s)." 
+ prints(msg % (model, version), title="Compatibility error", exits=1) return comp[model][0] def download_model(filename): download_url = about.__download_url__ + '/' + filename - return subprocess.call([sys.executable, '-m', - 'pip', 'install', '--no-cache-dir', download_url], - env=os.environ.copy()) + return subprocess.call( + [sys.executable, '-m', 'pip', 'install', '--no-cache-dir', + download_url], env=os.environ.copy()) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 29e30b7d2..d4d54d8aa 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -2,27 +2,15 @@ from __future__ import unicode_literals, division, print_function import plac -import json -from collections import defaultdict -import cytoolz -from pathlib import Path -import dill -import tqdm -from thinc.neural._classes.model import Model -from thinc.neural.optimizers import linear_decay from timeit import default_timer as timer import random import numpy.random -from ..tokens.doc import Doc -from ..scorer import Scorer -from ..gold import GoldParse, merge_sents -from ..gold import GoldCorpus, minibatch +from ..gold import GoldCorpus from ..util import prints from .. import util -from .. import about from .. import displacy -from ..compat import json_dumps + random.seed(0) numpy.random.seed(0) @@ -30,17 +18,18 @@ numpy.random.seed(0) @plac.annotations( model=("Model name or path", "positional", None, str), - data_path=("Location of JSON-formatted evaluation data", "positional", None, str), + data_path=("Location of JSON-formatted evaluation data", "positional", + None, str), gold_preproc=("Use gold preprocessing", "flag", "G", bool), gpu_id=("Use GPU", "option", "g", int), - displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str), - displacy_limit=("Limit of parses to render as HTML", "option", "dl", int) -) + displacy_path=("Directory to output rendered parses as HTML", "option", + "dp", str), + displacy_limit=("Limit of parses to render as HTML", "option", "dl", int)) def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None, displacy_limit=25): """ - Evaluate a model. To render a sample of parses in a HTML file, set an output - directory as the displacy_path argument. + Evaluate a model. To render a sample of parses in a HTML file, set an + output directory as the displacy_path argument. 
""" if gpu_id >= 0: util.use_gpu(gpu_id) @@ -50,7 +39,8 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False, if not data_path.exists(): prints(data_path, title="Evaluation data not found", exits=1) if displacy_path and not displacy_path.exists(): - prints(displacy_path, title="Visualization output directory not found", exits=1) + prints(displacy_path, title="Visualization output directory not found", + exits=1) corpus = GoldCorpus(data_path, data_path) nlp = util.load_model(model) dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) @@ -64,12 +54,14 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False, docs, golds = zip(*dev_docs) render_deps = 'parser' in nlp.meta.get('pipeline', []) render_ents = 'ner' in nlp.meta.get('pipeline', []) - render_parses(docs, displacy_path, model_name=model, limit=displacy_limit, - deps=render_deps, ents=render_ents) - prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit) + render_parses(docs, displacy_path, model_name=model, + limit=displacy_limit, deps=render_deps, ents=render_ents) + msg = "Generated %s parses as HTML" % displacy_limit + prints(displacy_path, title=msg) -def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True): +def render_parses(docs, output_path, model_name='', limit=250, deps=True, + ents=True): docs[0].user_data['title'] = model_name if ents: with (output_path / 'entities.html').open('w') as file_: @@ -77,7 +69,8 @@ def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=T file_.write(html) if deps: with (output_path / 'parses.html').open('w') as file_: - html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True}) + html = displacy.render(docs[:limit], style='dep', page=True, + options={'compact': True}) file_.write(html) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 5d45b271c..3636494fb 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -12,8 +12,7 @@ from .. import util @plac.annotations( model=("optional: shortcut link of model", "positional", None, str), - markdown=("generate Markdown for GitHub issues", "flag", "md", str) -) + markdown=("generate Markdown for GitHub issues", "flag", "md", str)) def info(cmd, model=None, markdown=False): """Print info about spaCy installation. If a model shortcut link is speficied as an argument, print model information. Flag --markdown diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 5b333dae5..cfbc97e3e 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -12,8 +12,7 @@ from .. import util @plac.annotations( origin=("package name or local path to model", "positional", None, str), link_name=("name of shortuct link to create", "positional", None, str), - force=("force overwriting of existing link", "flag", "f", bool) -) + force=("force overwriting of existing link", "flag", "f", bool)) def link(cmd, origin, link_name, force=False, model_path=None): """ Create a symlink for models within the spacy/data directory. Accepts @@ -46,8 +45,9 @@ def link(cmd, origin, link_name, force=False, model_path=None): # This is quite dirty, but just making sure other errors are caught. prints("Creating a symlink in spacy/data failed. Make sure you have " "the required permissions and try re-running the command as " - "admin, or use a virtualenv. You can still import the model as a " - "module and call its load() method, or create the symlink manually.", + "admin, or use a virtualenv. 
You can still import the model as " + "a module and call its load() method, or create the symlink " + "manually.", "%s --> %s" % (path2str(model_path), path2str(link_path)), title="Error: Couldn't link model to '%s'" % link_name) raise diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 6b0811459..d1984fe65 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -16,10 +16,12 @@ from .. import about input_dir=("directory with model data", "positional", None, str), output_dir=("output parent directory", "positional", None, str), meta_path=("path to meta.json", "option", "m", str), - create_meta=("create meta.json, even if one exists in directory", "flag", "c", bool), - force=("force overwriting of existing folder in output directory", "flag", "f", bool) -) -def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force=False): + create_meta=("create meta.json, even if one exists in directory", "flag", + "c", bool), + force=("force overwriting of existing folder in output directory", "flag", + "f", bool)) +def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, + force=False): """ Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified @@ -52,13 +54,15 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force package_path = main_path / model_name create_dirs(package_path, force) - shutil.copytree(path2str(input_path), path2str(package_path / model_name_v)) + shutil.copytree(path2str(input_path), + path2str(package_path / model_name_v)) create_file(main_path / 'meta.json', json_dumps(meta)) create_file(main_path / 'setup.py', template_setup) create_file(main_path / 'MANIFEST.in', template_manifest) create_file(package_path / '__init__.py', template_init) - prints(main_path, "To build the package, run `python setup.py sdist` in this " - "directory.", title="Successfully created package '%s'" % model_name_v) + prints(main_path, "To build the package, run `python setup.py sdist` in " + "this directory.", + title="Successfully created package '%s'" % model_name_v) def create_dirs(package_path, force): @@ -66,9 +70,10 @@ def create_dirs(package_path, force): if force: shutil.rmtree(path2str(package_path)) else: - prints(package_path, "Please delete the directory and try again, or " - "use the --force flag to overwrite existing directories.", - title="Package directory already exists", exits=1) + prints(package_path, "Please delete the directory and try again, " + "or use the --force flag to overwrite existing " + "directories.", title="Package directory already exists", + exits=1) Path.mkdir(package_path, parents=True) @@ -82,7 +87,8 @@ def generate_meta(model_path): settings = [('lang', 'Model language', 'en'), ('name', 'Model name', 'model'), ('version', 'Model version', '0.0.0'), - ('spacy_version', 'Required spaCy version', '>=%s,<3.0.0' % about.__version__), + ('spacy_version', 'Required spaCy version', + '>=%s,<3.0.0' % about.__version__), ('description', 'Model description', False), ('author', 'Author', False), ('email', 'Author email', False), diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index db6fc5b41..a394989d0 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -27,15 +27,15 @@ def read_inputs(loc): @plac.annotations( lang=("model/language", "positional", None, str), - inputs=("Location of input file", "positional", None, read_inputs) -) + inputs=("Location of input file", "positional", None, 
read_inputs)) def profile(cmd, lang, inputs=None): """ Profile a spaCy pipeline, to find out which functions take the most time. """ - nlp = spacy.load(lang) + nlp = spacy.load(lang) texts = list(cytoolz.take(10000, inputs)) - cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof") + cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), + "Profile.prof") s = pstats.Stats("Profile.prof") s.strip_dirs().sort_stats("time").print_stats() diff --git a/spacy/cli/train.py b/spacy/cli/train.py index da398751c..fb96e6c05 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -2,21 +2,14 @@ from __future__ import unicode_literals, division, print_function import plac -import json -from collections import defaultdict -import cytoolz from pathlib import Path import dill import tqdm from thinc.neural._classes.model import Model -from thinc.neural.optimizers import linear_decay from timeit import default_timer as timer import random import numpy.random -from ..tokens.doc import Doc -from ..scorer import Scorer -from ..gold import GoldParse, merge_sents from ..gold import GoldCorpus, minibatch from ..util import prints from .. import util @@ -31,8 +24,10 @@ numpy.random.seed(0) @plac.annotations( lang=("model language", "positional", None, str), output_dir=("output directory to store model in", "positional", None, str), - train_data=("location of JSON-formatted training data", "positional", None, str), - dev_data=("location of JSON-formatted development data (optional)", "positional", None, str), + train_data=("location of JSON-formatted training data", "positional", + None, str), + dev_data=("location of JSON-formatted development data (optional)", + "positional", None, str), n_iter=("number of iterations", "option", "n", int), n_sents=("number of sentences", "option", "ns", int), use_gpu=("Use GPU", "option", "g", int), @@ -42,11 +37,12 @@ numpy.random.seed(0) no_entities=("Don't train NER", "flag", "N", bool), gold_preproc=("Use gold preprocessing", "flag", "G", bool), version=("Model version", "option", "V", str), - meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path) -) + meta_path=("Optional path to meta.json. All relevant properties will be " + "overwritten.", "option", "m", Path)) def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, - use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False, - gold_preproc=False, version="0.0.0", meta_path=None): + use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, + no_entities=False, gold_preproc=False, version="0.0.0", + meta_path=None): """ Train a model. Expects data in spaCy's JSON format. """ @@ -72,9 +68,12 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, meta.setdefault('name', 'unnamed') pipeline = ['tagger', 'parser', 'ner'] - if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger') - if no_parser and 'parser' in pipeline: pipeline.remove('parser') - if no_entities and 'ner' in pipeline: pipeline.remove('ner') + if no_tagger and 'tagger' in pipeline: + pipeline.remove('tagger') + if no_parser and 'parser' in pipeline: + pipeline.remove('parser') + if no_entities and 'ner' in pipeline: + pipeline.remove('ner') # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. 
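The comment that closes the hunk above notes that dropout and batch size are consumed as generators rather than fixed numbers. A minimal sketch of that idea follows; the helper names and schedule values here are illustrative assumptions, not the ones the CLI actually uses.

    def decaying(start, stop, decay):
        # Yield values that fall from `start` towards `stop`, never below it.
        value = start
        while True:
            yield max(value, stop)
            value -= decay

    def compounding(start, stop, compound):
        # Yield values that grow geometrically from `start`, capped at `stop`.
        value = start
        while True:
            yield min(value, stop)
            value *= compound

    dropout_rates = decaying(0.4, 0.2, 1e-4)   # starts high, decays sharply
    batch_sizes = compounding(1., 16., 1.001)  # starts small, grows each step
    dropout = next(dropout_rates)
    batch_size = int(next(batch_sizes))

The training loop can then draw a fresh value from each schedule per batch, which forces the optimizer to explore early on and improves throughput later.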
@@ -139,7 +138,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, scorer = nlp_loaded.evaluate(dev_docs) end_time = timer() cpu_wps = nwords/(end_time-start_time) - acc_loc =(output_path / ('model%d' % i) / 'accuracy.json') + acc_loc = (output_path / ('model%d' % i) / 'accuracy.json') with acc_loc.open('w') as file_: file_.write(json_dumps(scorer.scores)) meta_loc = output_path / ('model%d' % i) / 'meta.json' @@ -157,7 +156,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, with meta_loc.open('w') as file_: file_.write(json_dumps(meta)) util.set_env_log(True) - print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps) + print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, + gpu_wps=gpu_wps) finally: print("Saving model...") try: diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index c1f992ed6..1c645a554 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -1,5 +1,5 @@ # coding: utf8 -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function import requests import pkg_resources @@ -29,8 +29,10 @@ def validate(cmd): model_links = get_model_links(current_compat) model_pkgs = get_model_pkgs(current_compat, all_models) incompat_links = {l for l, d in model_links.items() if not d['compat']} - incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']} - incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']]) + incompat_models = {d['name'] for _, d in model_pkgs.items() + if not d['compat']} + incompat_models.update([d['name'] for _, d in model_links.items() + if not d['compat']]) na_models = [m for m in incompat_models if m not in current_compat] update_models = [m for m in incompat_models if m in current_compat] @@ -90,7 +92,6 @@ def get_model_pkgs(compat, all_models): def get_model_row(compat, name, data, type='package'): - tpl_row = ' {:<10}' + (' {:<20}' * 4) tpl_red = '\x1b[38;5;1m{}\x1b[0m' tpl_green = '\x1b[38;5;2m{}\x1b[0m' if data['compat']: @@ -110,7 +111,8 @@ def get_row(*args): def is_model_path(model_path): exclude = ['cache', 'pycache', '__pycache__'] name = model_path.parts[-1] - return model_path.is_dir() and name not in exclude and not name.startswith('.') + return (model_path.is_dir() and name not in exclude + and not name.startswith('.')) def is_compat(compat, name, version): @@ -118,6 +120,7 @@ def is_compat(compat, name, version): def reformat_version(version): + """Hack to reformat old versions ending on '-alpha' to match pip format.""" if version.endswith('-alpha'): return version.replace('-alpha', 'a0') return version.replace('-alpha', 'a') diff --git a/spacy/compat.py b/spacy/compat.py index 8dd3d6b03..7cd06e545 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -91,15 +91,15 @@ def symlink_to(orig, dest): def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): - return ((python2 == None or python2 == is_python2) and - (python3 == None or python3 == is_python3) and - (windows == None or windows == is_windows) and - (linux == None or linux == is_linux) and - (osx == None or osx == is_osx)) + return ((python2 is None or python2 == is_python2) and + (python3 is None or python3 == is_python3) and + (windows is None or windows == is_windows) and + (linux is None or linux == is_linux) and + (osx is None or osx == is_osx)) def normalize_string_keys(old): - '''Given a dictionary, make sure keys are unicode strings, not bytes.''' + """Given a dictionary, make 
sure keys are unicode strings, not bytes.""" new = {} for key, value in old.items(): if isinstance(key, bytes_): diff --git a/spacy/deprecated.py b/spacy/deprecated.py index ad52bfe24..a1143474a 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -24,7 +24,7 @@ def depr_model_download(lang): def resolve_load_name(name, **overrides): - """Resolve model loading if deprecated path kwarg is specified in overrides. + """Resolve model loading if deprecated path kwarg in overrides. name (unicode): Name of model to load. **overrides: Overrides specified in spacy.load(). @@ -32,8 +32,9 @@ def resolve_load_name(name, **overrides): """ if overrides.get('path') not in (None, False, True): name = overrides.get('path') - prints("To load a model from a path, you can now use the first argument. " - "The model meta is used to load the required Language class.", - "OLD: spacy.load('en', path='/some/path')", "NEW: spacy.load('/some/path')", + prints("To load a model from a path, you can now use the first " + "argument. The model meta is used to load the Language class.", + "OLD: spacy.load('en', path='/some/path')", + "NEW: spacy.load('/some/path')", title="Warning: deprecated argument 'path'") return name diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 7c479f94c..e160c31b6 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -12,7 +12,7 @@ IS_JUPYTER = is_in_jupyter() def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, - options={}, manual=False): + options={}, manual=False): """Render displaCy visualisation. docs (list or Doc): Document(s) to visualise. @@ -21,7 +21,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, minify (bool): Minify HTML markup. jupyter (bool): Experimental, use Jupyter's `display()` to output markup. options (dict): Visualiser-specific options, e.g. colors. - manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts. + manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. RETURNS (unicode): Rendered HTML markup. """ factories = {'dep': (DependencyRenderer, parse_deps), @@ -35,7 +35,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, parsed = [converter(doc, options) for doc in docs] if not manual else docs _html['parsed'] = renderer.render(parsed, page=page, minify=minify).strip() html = _html['parsed'] - if jupyter: # return HTML rendered by IPython display() + if jupyter: # return HTML rendered by IPython display() from IPython.core.display import display, HTML return display(HTML(html)) return html @@ -50,13 +50,15 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False, page (bool): Render markup as full HTML page. minify (bool): Minify HTML markup. options (dict): Visualiser-specific options, e.g. colors. - manual (bool): Don't parse `Doc` and instead, expect a dict or list of dicts. + manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. port (int): Port to serve visualisation. """ from wsgiref import simple_server - render(docs, style=style, page=page, minify=minify, options=options, manual=manual) + render(docs, style=style, page=page, minify=minify, options=options, + manual=manual) httpd = simple_server.make_server('0.0.0.0', port, app) - prints("Using the '%s' visualizer" % style, title="Serving on port %d..." % port) + prints("Using the '%s' visualizer" % style, + title="Serving on port %d..." 
% port) try: httpd.serve_forever() except KeyboardInterrupt: @@ -67,7 +69,8 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False, def app(environ, start_response): # headers and status need to be bytes in Python 2, see #1227 - headers = [(b_to_str(b'Content-type'), b_to_str(b'text/html; charset=utf-8'))] + headers = [(b_to_str(b'Content-type'), + b_to_str(b'text/html; charset=utf-8'))] start_response(b_to_str(b'200 OK'), headers) res = _html['parsed'].encode(encoding='utf-8') return [res] @@ -89,9 +92,9 @@ def parse_deps(orig_doc, options={}): end = word.i + 1 while end < len(doc) and doc[end].is_punct: end += 1 - span = doc[start : end] + span = doc[start:end] spans.append((span.start_char, span.end_char, word.tag_, - word.lemma_, word.ent_type_)) + word.lemma_, word.ent_type_)) for span_props in spans: doc.merge(*span_props) words = [{'text': w.text, 'tag': w.tag_} for w in doc] @@ -113,6 +116,7 @@ def parse_ents(doc, options={}): RETURNS (dict): Generated entities keyed by text (original text) and ents. """ ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_} - for ent in doc.ents] - title = doc.user_data.get('title', None) if hasattr(doc, 'user_data') else None + for ent in doc.ents] + title = (doc.user_data.get('title', None) + if hasattr(doc, 'user_data') else None) return {'text': doc.text, 'ents': ents, 'title': title} diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 1050ffa87..4a494591c 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -14,13 +14,15 @@ class DependencyRenderer(object): """Initialise dependency renderer. options (dict): Visualiser-specific options (compact, word_spacing, - arrow_spacing, arrow_width, arrow_stroke, distance, - offset_x, color, bg, font) + arrow_spacing, arrow_width, arrow_stroke, distance, offset_x, + color, bg, font) """ self.compact = options.get('compact', False) self.word_spacing = options.get('word_spacing', 45) - self.arrow_spacing = options.get('arrow_spacing', 12 if self.compact else 20) - self.arrow_width = options.get('arrow_width', 6 if self.compact else 10) + self.arrow_spacing = options.get('arrow_spacing', + 12 if self.compact else 20) + self.arrow_width = options.get('arrow_width', + 6 if self.compact else 10) self.arrow_stroke = options.get('arrow_stroke', 2) self.distance = options.get('distance', 150 if self.compact else 175) self.offset_x = options.get('offset_x', 50) @@ -39,7 +41,8 @@ class DependencyRenderer(object): rendered = [self.render_svg(i, p['words'], p['arcs']) for i, p in enumerate(parsed)] if page: - content = ''.join([TPL_FIGURE.format(content=svg) for svg in rendered]) + content = ''.join([TPL_FIGURE.format(content=svg) + for svg in rendered]) markup = TPL_PAGE.format(content=content) else: markup = ''.join(rendered) @@ -63,12 +66,13 @@ class DependencyRenderer(object): self.id = render_id words = [self.render_word(w['text'], w['tag'], i) for i, w in enumerate(words)] - arcs = [self.render_arrow(a['label'], a['start'], a['end'], a['dir'], i) + arcs = [self.render_arrow(a['label'], a['start'], + a['end'], a['dir'], i) for i, a in enumerate(arcs)] content = ''.join(words) + ''.join(arcs) - return TPL_DEP_SVG.format(id=self.id, width=self.width, height=self.height, - color=self.color, bg=self.bg, font=self.font, - content=content) + return TPL_DEP_SVG.format(id=self.id, width=self.width, + height=self.height, color=self.color, + bg=self.bg, font=self.font, content=content) def render_word(self, text, tag, i): """Render 
individual word. @@ -96,7 +100,7 @@ class DependencyRenderer(object): x_start = self.offset_x+start*self.distance+self.arrow_spacing y = self.offset_y x_end = (self.offset_x+(end-start)*self.distance+start*self.distance - -self.arrow_spacing*(self.highest_level-level)/4) + - self.arrow_spacing*(self.highest_level-level)/4) y_curve = self.offset_y-level*self.distance/2 if self.compact: y_curve = self.offset_y-level*self.distance/6 @@ -133,8 +137,10 @@ class DependencyRenderer(object): if direction is 'left': pos1, pos2, pos3 = (x, x-self.arrow_width+2, x+self.arrow_width-2) else: - pos1, pos2, pos3 = (end, end+self.arrow_width-2, end-self.arrow_width+2) - arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3, y-self.arrow_width) + pos1, pos2, pos3 = (end, end+self.arrow_width-2, + end-self.arrow_width+2) + arrowhead = (pos1, y+2, pos2, y-self.arrow_width, pos3, + y-self.arrow_width) return "M{},{} L{},{} {},{}".format(*arrowhead) def get_levels(self, arcs): @@ -159,9 +165,10 @@ class EntityRenderer(object): """ colors = {'ORG': '#7aecec', 'PRODUCT': '#bfeeb7', 'GPE': '#feca74', 'LOC': '#ff9561', 'PERSON': '#aa9cfc', 'NORP': '#c887fb', - 'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LANGUAGE': '#ff8197', - 'WORK_OF_ART': '#f0d0ff', 'DATE': '#bfe1d9', 'TIME': '#bfe1d9', - 'MONEY': '#e4e7d2', 'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2', + 'FACILITY': '#9cc9cc', 'EVENT': '#ffeb80', 'LAW': '#ff8197', + 'LANGUAGE': '#ff8197', 'WORK_OF_ART': '#f0d0ff', + 'DATE': '#bfe1d9', 'TIME': '#bfe1d9', 'MONEY': '#e4e7d2', + 'QUANTITY': '#e4e7d2', 'ORDINAL': '#e4e7d2', 'CARDINAL': '#e4e7d2', 'PERCENT': '#e4e7d2'} colors.update(options.get('colors', {})) self.default_color = '#ddd' @@ -176,9 +183,11 @@ class EntityRenderer(object): minify (bool): Minify HTML markup. RETURNS (unicode): Rendered HTML markup. """ - rendered = [self.render_ents(p['text'], p['ents'], p.get('title', None)) for p in parsed] + rendered = [self.render_ents(p['text'], p['ents'], + p.get('title', None)) for p in parsed] if page: - docs = ''.join([TPL_FIGURE.format(content=doc) for doc in rendered]) + docs = ''.join([TPL_FIGURE.format(content=doc) + for doc in rendered]) markup = TPL_PAGE.format(content=docs) else: markup = ''.join(rendered) diff --git a/spacy/glossary.py b/spacy/glossary.py index fd74d85e7..78e61f8a7 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -264,7 +264,6 @@ GLOSSARY = { 'nk': 'noun kernel element', 'nmc': 'numerical component', 'oa': 'accusative object', - 'oa': 'second accusative object', 'oc': 'clausal object', 'og': 'genitive object', 'op': 'prepositional object', diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 5729af667..5adef7bf7 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -2,7 +2,6 @@ # coding: utf8 from __future__ import unicode_literals, print_function -import io import re import ujson import random @@ -10,9 +9,8 @@ import cytoolz import itertools from .syntax import nonproj -from .util import ensure_path -from . import util from .tokens import Doc +from . 
import util def tags_to_entities(tags): @@ -54,7 +52,8 @@ def merge_sents(sents): m_deps[3].extend(head + i for head in heads) m_deps[4].extend(labels) m_deps[5].extend(ner) - m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) + m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) + for b in brackets) i += len(ids) return [(m_deps, m_brackets)] @@ -80,6 +79,8 @@ def align(cand_words, gold_words): punct_re = re.compile(r'\W') + + def _min_edit_path(cand_words, gold_words): cdef: Pool mem @@ -98,9 +99,9 @@ def _min_edit_path(cand_words, gold_words): mem = Pool() n_cand = len(cand_words) n_gold = len(gold_words) - # Levenshtein distance, except we need the history, and we may want different - # costs. - # Mark operations with a string, and score the history using _edit_cost. + # Levenshtein distance, except we need the history, and we may want + # different costs. Mark operations with a string, and score the history + # using _edit_cost. previous_row = [] prev_costs = mem.alloc(n_gold + 1, sizeof(int)) curr_costs = mem.alloc(n_gold + 1, sizeof(int)) @@ -144,9 +145,9 @@ def _min_edit_path(cand_words, gold_words): def minibatch(items, size=8): - '''Iterate over batches of items. `size` may be an iterator, + """Iterate over batches of items. `size` may be an iterator, so that batch-size can vary on each step. - ''' + """ if isinstance(size, int): size_ = itertools.repeat(8) else: @@ -168,6 +169,7 @@ class GoldCorpus(object): train_path (unicode or Path): File or directory of training data. dev_path (unicode or Path): File or directory of development data. + RETURNS (GoldCorpus): The newly created object. """ self.train_path = util.ensure_path(train_path) self.dev_path = util.ensure_path(dev_path) @@ -213,7 +215,7 @@ class GoldCorpus(object): train_tuples = self.train_tuples if projectivize: train_tuples = nonproj.preprocess_training_data( - self.train_tuples, label_freq_cutoff=100) + self.train_tuples, label_freq_cutoff=100) random.shuffle(train_tuples) gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, max_length=max_length, @@ -222,7 +224,6 @@ class GoldCorpus(object): def dev_docs(self, nlp, gold_preproc=False): gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc) - #gold_docs = nlp.preprocess_gold(gold_docs) yield from gold_docs @classmethod @@ -233,7 +234,6 @@ class GoldCorpus(object): raw_text = None else: paragraph_tuples = merge_sents(paragraph_tuples) - docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=noise_level) golds = cls._make_golds(docs, paragraph_tuples) @@ -248,17 +248,20 @@ class GoldCorpus(object): raw_text = add_noise(raw_text, noise_level) return [nlp.make_doc(raw_text)] else: - return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level)) - for (sent_tuples, brackets) in paragraph_tuples] + return [Doc(nlp.vocab, + words=add_noise(sent_tuples[1], noise_level)) + for (sent_tuples, brackets) in paragraph_tuples] @classmethod def _make_golds(cls, docs, paragraph_tuples): assert len(docs) == len(paragraph_tuples) if len(docs) == 1: - return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])] + return [GoldParse.from_annot_tuples(docs[0], + paragraph_tuples[0][0])] else: return [GoldParse.from_annot_tuples(doc, sent_tuples) - for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)] + for doc, (sent_tuples, brackets) + in zip(docs, paragraph_tuples)] @staticmethod def walk_corpus(path): @@ -305,7 +308,7 @@ def _corrupt(c, noise_level): def 
read_json_file(loc, docs_filter=None, limit=None): - loc = ensure_path(loc) + loc = util.ensure_path(loc) if loc.is_dir(): for filename in loc.iterdir(): yield from read_json_file(loc / filename, limit=limit) @@ -330,16 +333,16 @@ def read_json_file(loc, docs_filter=None, limit=None): for i, token in enumerate(sent['tokens']): words.append(token['orth']) ids.append(i) - tags.append(token.get('tag','-')) - heads.append(token.get('head',0) + i) - labels.append(token.get('dep','')) + tags.append(token.get('tag', '-')) + heads.append(token.get('head', 0) + i) + labels.append(token.get('dep', '')) # Ensure ROOT label is case-insensitive if labels[-1].lower() == 'root': labels[-1] = 'ROOT' ner.append(token.get('ner', '-')) sents.append([ [ids, words, tags, heads, labels, ner], - sent.get('brackets', [])]) + sent.get('brackets', [])]) if sents: yield [paragraph.get('raw', None), sents] @@ -382,19 +385,21 @@ cdef class GoldParse: @classmethod def from_annot_tuples(cls, doc, annot_tuples, make_projective=False): _, words, tags, heads, deps, entities = annot_tuples - return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities, - make_projective=make_projective) + return cls(doc, words=words, tags=tags, heads=heads, deps=deps, + entities=entities, make_projective=make_projective) - def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None, - deps=None, entities=None, make_projective=False, + def __init__(self, doc, annot_tuples=None, words=None, tags=None, + heads=None, deps=None, entities=None, make_projective=False, cats=None): """Create a GoldParse. doc (Doc): The document the annotations refer to. words (iterable): A sequence of unicode word strings. tags (iterable): A sequence of strings, representing tag annotations. - heads (iterable): A sequence of integers, representing syntactic head offsets. - deps (iterable): A sequence of strings, representing the syntactic relation types. + heads (iterable): A sequence of integers, representing syntactic + head offsets. + deps (iterable): A sequence of strings, representing the syntactic + relation types. entities (iterable): A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. @@ -404,9 +409,10 @@ cdef class GoldParse: document (usually a sentence). Unlike entity annotations, label annotations can overlap, i.e. a single word can be covered by multiple labelled spans. The TextCategorizer component expects - true examples of a label to have the value 1.0, and negative examples - of a label to have the value 0.0. Labels not in the dictionary are - treated as missing -- the gradient for those labels will be zero. + true examples of a label to have the value 1.0, and negative + examples of a label to have the value 0.0. Labels not in the + dictionary are treated as missing - the gradient for those labels + will be zero. RETURNS (GoldParse): The newly constructed object. 
""" if words is None: @@ -470,11 +476,11 @@ cdef class GoldParse: self.ner[i] = entities[gold_i] cycle = nonproj.contains_cycle(self.heads) - if cycle != None: + if cycle is not None: raise Exception("Cycle found: %s" % cycle) if make_projective: - proj_heads,_ = nonproj.projectivize(self.heads, self.labels) + proj_heads, _ = nonproj.projectivize(self.heads, self.labels) self.heads = proj_heads def __len__(self): @@ -497,20 +503,19 @@ cdef class GoldParse: def biluo_tags_from_offsets(doc, entities, missing='O'): - """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out - scheme (BILUO). + """Encode labelled spans into per-token tags, using the + Begin/In/Last/Unit/Out scheme (BILUO). doc (Doc): The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. - entities (iterable): A sequence of `(start, end, label)` triples. `start` and - `end` should be character-offset integers denoting the slice into the - original string. - + entities (iterable): A sequence of `(start, end, label)` triples. `start` + and `end` should be character-offset integers denoting the slice into + the original string. RETURNS (list): A list of unicode strings, describing the tags. Each tag string will be of the form either "", "O" or "{action}-{label}", where action is one of "B", "I", "L", "U". The string "-" is used where the - entity offsets don't align with the tokenization in the `Doc` object. The - training algorithm will view these as missing values. "O" denotes a + entity offsets don't align with the tokenization in the `Doc` object. + The training algorithm will view these as missing values. "O" denotes a non-entity token. "B" denotes the beginning of a multi-token entity, "I" the inside of an entity of three or more tokens, and "L" the end of an entity of two or more tokens. "U" denotes a single-token entity. 
diff --git a/spacy/language.py b/spacy/language.py index 959fee916..05546cde4 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,31 +1,28 @@ # coding: utf8 from __future__ import absolute_import, unicode_literals -from contextlib import contextmanager -import copy -from thinc.neural import Model import random import ujson -from collections import OrderedDict import itertools import weakref import functools -import tqdm +from collections import OrderedDict +from contextlib import contextmanager +from copy import copy +from thinc.neural import Model +from thinc.neural.optimizers import Adam from .tokenizer import Tokenizer from .vocab import Vocab -from .tagger import Tagger from .lemmatizer import Lemmatizer - -from .pipeline import DependencyParser, Tensorizer, Tagger -from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer - -from .compat import Optimizer -from .compat import json_dumps, izip, copy_reg +from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer +from .pipeline import SimilarityHook, TextCategorizer +from .compat import json_dumps, izip from .scorer import Scorer from ._ml import link_vectors_to_models from .attrs import IS_STOP -from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES +from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES +from .lang.punctuation import TOKENIZER_INFIXES from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .lang.lex_attrs import LEX_ATTRS, is_stop @@ -57,16 +54,18 @@ class BaseDefaults(object): def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions token_match = cls.token_match - prefix_search = util.compile_prefix_regex(cls.prefixes).search \ - if cls.prefixes else None - suffix_search = util.compile_suffix_regex(cls.suffixes).search \ - if cls.suffixes else None - infix_finditer = util.compile_infix_regex(cls.infixes).finditer \ - if cls.infixes else None + prefix_search = (util.compile_prefix_regex(cls.prefixes).search + if cls.prefixes else None) + suffix_search = (util.compile_suffix_regex(cls.suffixes).search + if cls.suffixes else None) + infix_finditer = (util.compile_infix_regex(cls.infixes).finditer + if cls.infixes else None) vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) return Tokenizer(vocab, rules=rules, - prefix_search=prefix_search, suffix_search=suffix_search, - infix_finditer=infix_finditer, token_match=token_match) + prefix_search=prefix_search, + suffix_search=suffix_search, + infix_finditer=infix_finditer, + token_match=token_match) pipe_names = ['tensorizer', 'tagger', 'parser', 'ner'] token_match = TOKEN_MATCH @@ -98,7 +97,7 @@ class Language(object): factories = { 'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), - 'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg), + 'tensorizer': lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg), 'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg), 'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg), 'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg), @@ -218,14 +217,14 @@ class Language(object): def add_pipe(self, component, name=None, before=None, after=None, first=None, last=None): """Add a component to the processing pipeline. Valid components are - callables that take a `Doc` object, modify it and return it. Only one of - before, after, first or last can be set. Default behaviour is "last". + callables that take a `Doc` object, modify it and return it. 
Only one + of before/after/first/last can be set. Default behaviour is "last". component (callable): The pipeline component. name (unicode): Name of pipeline component. Overwrites existing component.name attribute if available. If no name is set and the component exposes no name attribute, component.__name__ is - used. An error is raised if the name already exists in the pipeline. + used. An error is raised if a name already exists in the pipeline. before (unicode): Component name to insert component directly before. after (unicode): Component name to insert component directly after. first (bool): Insert component first / not first in the pipeline. @@ -240,7 +239,8 @@ class Language(object): name = component.name elif hasattr(component, '__name__'): name = component.__name__ - elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'): + elif (hasattr(component, '__class__') and + hasattr(component.__class__, '__name__')): name = component.__class__.__name__ else: name = repr(component) @@ -269,7 +269,7 @@ class Language(object): `name in nlp.pipe_names`. name (unicode): Name of the component. - RETURNS (bool): Whether a component of that name exists in the pipeline. + RETURNS (bool): Whether a component of the name exists in the pipeline. """ return name in self.pipe_names @@ -332,15 +332,12 @@ class Language(object): return doc def disable_pipes(self, *names): - '''Disable one or more pipeline components. - - If used as a context manager, the pipeline will be restored to the initial - state at the end of the block. Otherwise, a DisabledPipes object is - returned, that has a `.restore()` method you can use to undo your - changes. + """Disable one or more pipeline components. If used as a context + manager, the pipeline will be restored to the initial state at the end + of the block. Otherwise, a DisabledPipes object is returned, that has + a `.restore()` method you can use to undo your changes. EXAMPLE: - >>> nlp.add_pipe('parser') >>> nlp.add_pipe('tagger') >>> with nlp.disable_pipes('parser', 'tagger'): @@ -351,7 +348,7 @@ class Language(object): >>> assert not nlp.has_pipe('parser') >>> disabled.restore() >>> assert nlp.has_pipe('parser') - ''' + """ return DisabledPipes(self, *names) def make_doc(self, text): @@ -367,14 +364,14 @@ class Language(object): RETURNS (dict): Results from the update. EXAMPLE: - >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer): + >>> with nlp.begin_training(gold) as (trainer, optimizer): >>> for epoch in trainer.epochs(gold): >>> for docs, golds in epoch: >>> state = nlp.update(docs, golds, sgd=optimizer) """ if len(docs) != len(golds): raise IndexError("Update expects same number of docs and golds " - "Got: %d, %d" % (len(docs), len(golds))) + "Got: %d, %d" % (len(docs), len(golds))) if len(docs) == 0: return if sgd is None: @@ -382,8 +379,10 @@ class Language(object): self._optimizer = Adam(Model.ops, 0.001) sgd = self._optimizer grads = {} + def get_grads(W, dW, key=None): grads[key] = (W, dW) + pipes = list(self.pipeline) random.shuffle(pipes) for name, proc in pipes: @@ -420,8 +419,8 @@ class Language(object): eps = util.env_opt('optimizer_eps', 1e-08) L2 = util.env_opt('L2_penalty', 1e-6) max_grad_norm = util.env_opt('grad_norm_clip', 1.) 
- self._optimizer = Optimizer(Model.ops, learn_rate, L2=L2, beta1=beta1, - beta2=beta2, eps=eps) + self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, + beta2=beta2, eps=eps) self._optimizer.max_grad_norm = max_grad_norm self._optimizer.device = device return self._optimizer @@ -460,8 +459,8 @@ class Language(object): eps = util.env_opt('optimizer_eps', 1e-08) L2 = util.env_opt('L2_penalty', 1e-6) max_grad_norm = util.env_opt('grad_norm_clip', 1.) - self._optimizer = Optimizer(Model.ops, learn_rate, L2=L2, beta1=beta1, - beta2=beta2, eps=eps) + self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, + beta2=beta2, eps=eps) self._optimizer.max_grad_norm = max_grad_norm self._optimizer.device = device return self._optimizer @@ -512,17 +511,17 @@ class Language(object): pass def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000, - disable=[]): - """Process texts as a stream, and yield `Doc` objects in order. Supports - GIL-free multi-threading. + disable=[]): + """Process texts as a stream, and yield `Doc` objects in order. + Supports GIL-free multi-threading. texts (iterator): A sequence of texts to process. as_tuples (bool): If set to True, inputs should be a sequence of (text, context) tuples. Output will then be a sequence of (doc, context) tuples. Defaults to False. - n_threads (int): The number of worker threads to use. If -1, OpenMP will - decide how many to use at run time. Default is 2. + n_threads (int): The number of worker threads to use. If -1, OpenMP + will decide how many to use at run time. Default is 2. batch_size (int): The number of texts to buffer. disable (list): Names of the pipeline components to disable. YIELDS (Doc): Documents in the order of the original text. @@ -546,7 +545,8 @@ class Language(object): if name in disable: continue if hasattr(proc, 'pipe'): - docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size) + docs = proc.pipe(docs, n_threads=n_threads, + batch_size=batch_size) else: # Apply the function, but yield the doc docs = _pipe(proc, docs) @@ -583,7 +583,7 @@ class Language(object): will include the model. path (unicode or Path): A path to a directory, which will be created if - it doesn't exist. Paths may be either strings or `Path`-like objects. + it doesn't exist. Paths may be strings or `Path`-like objects. disable (list): Names of pipeline components to disable and prevent from being saved. @@ -649,7 +649,7 @@ class Language(object): serializers = OrderedDict(( ('vocab', lambda: self.vocab.to_bytes()), ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)), - ('meta', lambda: ujson.dumps(self.meta)) + ('meta', lambda: json_dumps(self.meta)) )) for i, (name, proc) in enumerate(self.pipeline): if name in disable: @@ -682,14 +682,14 @@ class Language(object): class DisabledPipes(list): - '''Manager for temporary pipeline disabling.''' + """Manager for temporary pipeline disabling.""" def __init__(self, nlp, *names): self.nlp = nlp self.names = names # Important! Not deep copy -- we just want the container (but we also # want to support people providing arbitrarily typed nlp.pipeline # objects.) 
- self.original_pipeline = copy.copy(nlp.pipeline) + self.original_pipeline = copy(nlp.pipeline) list.__init__(self) self.extend(nlp.remove_pipe(name) for name in names) @@ -702,7 +702,8 @@ class DisabledPipes(list): def restore(self): '''Restore the pipeline to its state when DisabledPipes was created.''' current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline - unexpected = [name for name, pipe in current if not self.nlp.has_pipe(name)] + unexpected = [name for name, pipe in current + if not self.nlp.has_pipe(name)] if unexpected: # Don't change the pipeline if we're raising an error. self.nlp.pipeline = current diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index f3327a1d7..40cd995e2 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -43,16 +43,15 @@ class Lemmatizer(object): morphology = {} if morphology is None else morphology others = [key for key in morphology if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')] - true_morph_key = morphology.get('morph', 0) if univ_pos == 'noun' and morphology.get('Number') == 'sing': return True elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf': return True # This maps 'VBP' to base form -- probably just need 'IS_BASE' # morphology - elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \ - morphology.get('Tense') == 'pres' and \ - morphology.get('Number') is None and \ + elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and + morphology.get('Tense') == 'pres' and + morphology.get('Number') is None and not others): return True elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': @@ -89,9 +88,6 @@ class Lemmatizer(object): def lemmatize(string, index, exceptions, rules): string = string.lower() forms = [] - # TODO: Is this correct? See discussion in Issue #435. - #if string in index: - # forms.append(string) forms.extend(exceptions.get(string, [])) oov_forms = [] if not forms: diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index f0f5c6398..88748af33 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -2,27 +2,17 @@ # coding: utf8 from __future__ import unicode_literals, print_function -from libc.math cimport sqrt -from cpython.ref cimport Py_INCREF -from cymem.cymem cimport Pool -from murmurhash.mrmr cimport hash64 - # Compiler crashes on memory view coercion without this. Should report bug. from cython.view cimport array as cvarray cimport numpy as np np.import_array() - from libc.string cimport memset import numpy from .typedefs cimport attr_t, flags_t from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP -from .attrs cimport IS_BRACKET -from .attrs cimport IS_QUOTE -from .attrs cimport IS_LEFT_PUNCT -from .attrs cimport IS_RIGHT_PUNCT -from .attrs cimport IS_OOV +from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV from . import about @@ -32,8 +22,8 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) cdef class Lexeme: """An entry in the vocabulary. A `Lexeme` has no string context – it's a word-type, as opposed to a word token. It therefore has no part-of-speech - tag, dependency parse, or lemma (lemmatization depends on the part-of-speech - tag). + tag, dependency parse, or lemma (lemmatization depends on the + part-of-speech tag). """ def __init__(self, Vocab vocab, attr_t orth): """Create a Lexeme object. 
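For readers skimming the long block of property docstrings that follows, here is a small sketch of the lexeme attributes involved; the blank English vocab and the example word are assumptions picked to make the assertions self-evident.

    from spacy.lang.en import English

    nlp = English()
    apples = nlp.vocab[u'apples']     # looking up a string returns a Lexeme
    assert apples.is_alpha and apples.is_lower
    assert not apples.like_num and not apples.is_punct
    assert apples.prefix_ == u'a'     # length-1 substring from the start
    assert apples.suffix_ == u'les'   # length-3 substring from the end
    assert apples.shape_ == u'xxxx'   # orthographic shape, runs capped at 4

The flag values (is_alpha, is_lower, like_num and so on) come from the language's lexical attribute functions, so they are available without loading a statistical model.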
@@ -60,17 +50,17 @@ cdef class Lexeme: else: a = 0 b = 1 - if op == 2: # == + if op == 2: # == return a == b - elif op == 3: # != + elif op == 3: # != return a != b - elif op == 0: # < + elif op == 0: # < return a < b - elif op == 1: # <= + elif op == 1: # <= return a <= b - elif op == 4: # > + elif op == 4: # > return a > b - elif op == 5: # >= + elif op == 5: # >= return a >= b else: raise NotImplementedError(op) @@ -104,7 +94,8 @@ cdef class Lexeme: """ if self.vector_norm == 0 or other.vector_norm == 0: return 0.0 - return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + return (numpy.dot(self.vector, other.vector) / + (self.vector_norm * other.vector_norm)) def to_bytes(self): lex_data = Lexeme.c_to_bytes(self.c) @@ -130,19 +121,13 @@ cdef class Lexeme: self.orth = self.c.orth property has_vector: - """A boolean value indicating whether a word vector is associated with - the object. - - RETURNS (bool): Whether a word vector is associated with the object. + """RETURNS (bool): Whether a word vector is associated with the object. """ def __get__(self): return self.vocab.has_vector(self.c.orth) property vector_norm: - """The L2 norm of the lexeme's vector representation. - - RETURNS (float): The L2 norm of the vector representation. - """ + """RETURNS (float): The L2 norm of the vector representation.""" def __get__(self): vector = self.vector return numpy.sqrt((vector**2).sum()) @@ -169,149 +154,320 @@ cdef class Lexeme: self.vocab.set_vector(self.c.orth, vector) property rank: + """RETURNS (unicode): Sequential ID of the lexemes's lexical type, used + to index into tables, e.g. for word vectors.""" def __get__(self): return self.c.id + def __set__(self, value): self.c.id = value property sentiment: + """RETURNS (float): A scalar value indicating the positivity or + negativity of the lexeme.""" def __get__(self): return self.c.sentiment + def __set__(self, float sentiment): self.c.sentiment = sentiment property orth_: + """RETURNS (unicode): The original verbatim text of the lexeme + (identical to `Lexeme.text`). Exists mostly for consistency with + the other attributes.""" def __get__(self): return self.vocab.strings[self.c.orth] property text: - """A unicode representation of the token text. - - RETURNS (unicode): The original verbatim text of the token. - """ + """RETURNS (unicode): The original verbatim text of the lexeme.""" def __get__(self): return self.orth_ property lower: - def __get__(self): return self.c.lower - def __set__(self, attr_t x): self.c.lower = x + """RETURNS (unicode): Lowercase form of the lexeme.""" + def __get__(self): + return self.c.lower + + def __set__(self, attr_t x): + self.c.lower = x property norm: - def __get__(self): return self.c.norm - def __set__(self, attr_t x): self.c.norm = x + """RETURNS (uint64): The lexemes's norm, i.e. a normalised form of the + lexeme text. + """ + def __get__(self): + return self.c.norm + + def __set__(self, attr_t x): + self.c.norm = x property shape: - def __get__(self): return self.c.shape - def __set__(self, attr_t x): self.c.shape = x + """RETURNS (uint64): Transform of the word's string, to show + orthographic features. + """ + def __get__(self): + return self.c.shape + + def __set__(self, attr_t x): + self.c.shape = x property prefix: - def __get__(self): return self.c.prefix - def __set__(self, attr_t x): self.c.prefix = x + """RETURNS (uint64): Length-N substring from the start of the word. + Defaults to `N=1`. 
+ """ + def __get__(self): + return self.c.prefix + + def __set__(self, attr_t x): + self.c.prefix = x property suffix: - def __get__(self): return self.c.suffix - def __set__(self, attr_t x): self.c.suffix = x + """RETURNS (uint64): Length-N substring from the end of the word. + Defaults to `N=3`. + """ + def __get__(self): + return self.c.suffix + + def __set__(self, attr_t x): + self.c.suffix = x property cluster: - def __get__(self): return self.c.cluster - def __set__(self, attr_t x): self.c.cluster = x + """RETURNS (int): Brown cluster ID.""" + def __get__(self): + return self.c.cluster + + def __set__(self, attr_t x): + self.c.cluster = x property lang: - def __get__(self): return self.c.lang - def __set__(self, attr_t x): self.c.lang = x + """RETURNS (uint64): Language of the parent vocabulary.""" + def __get__(self): + return self.c.lang + + def __set__(self, attr_t x): + self.c.lang = x property prob: - def __get__(self): return self.c.prob - def __set__(self, float x): self.c.prob = x + """RETURNS (float): Smoothed log probability estimate of the lexeme's + type.""" + def __get__(self): + return self.c.prob + + def __set__(self, float x): + self.c.prob = x property lower_: - def __get__(self): return self.vocab.strings[self.c.lower] - def __set__(self, unicode x): self.c.lower = self.vocab.strings.add(x) + """RETURNS (unicode): Lowercase form of the word.""" + def __get__(self): + return self.vocab.strings[self.c.lower] + + def __set__(self, unicode x): + self.c.lower = self.vocab.strings.add(x) property norm_: - def __get__(self): return self.vocab.strings[self.c.norm] - def __set__(self, unicode x): self.c.norm = self.vocab.strings.add(x) + """RETURNS (unicode): The lexemes's norm, i.e. a normalised form of the + lexeme text. + """ + def __get__(self): + return self.vocab.strings[self.c.norm] + + def __set__(self, unicode x): + self.c.norm = self.vocab.strings.add(x) property shape_: - def __get__(self): return self.vocab.strings[self.c.shape] - def __set__(self, unicode x): self.c.shape = self.vocab.strings.add(x) + """RETURNS (unicode): Transform of the word's string, to show + orthographic features. + """ + def __get__(self): + return self.vocab.strings[self.c.shape] + + def __set__(self, unicode x): + self.c.shape = self.vocab.strings.add(x) property prefix_: - def __get__(self): return self.vocab.strings[self.c.prefix] - def __set__(self, unicode x): self.c.prefix = self.vocab.strings.add(x) + """RETURNS (unicode): Length-N substring from the start of the word. + Defaults to `N=1`. + """ + def __get__(self): + return self.vocab.strings[self.c.prefix] + + def __set__(self, unicode x): + self.c.prefix = self.vocab.strings.add(x) property suffix_: - def __get__(self): return self.vocab.strings[self.c.suffix] - def __set__(self, unicode x): self.c.suffix = self.vocab.strings.add(x) + """RETURNS (unicode): Length-N substring from the end of the word. + Defaults to `N=3`. 
+ """ + def __get__(self): + return self.vocab.strings[self.c.suffix] + + def __set__(self, unicode x): + self.c.suffix = self.vocab.strings.add(x) property lang_: - def __get__(self): return self.vocab.strings[self.c.lang] - def __set__(self, unicode x): self.c.lang = self.vocab.strings.add(x) + """RETURNS (unicode): Language of the parent vocabulary.""" + def __get__(self): + return self.vocab.strings[self.c.lang] + + def __set__(self, unicode x): + self.c.lang = self.vocab.strings.add(x) property flags: - def __get__(self): return self.c.flags - def __set__(self, flags_t x): self.c.flags = x + """RETURNS (uint64): Container of the lexeme's binary flags.""" + def __get__(self): + return self.c.flags + + def __set__(self, flags_t x): + self.c.flags = x property is_oov: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV) - def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x) + """RETURNS (bool): Whether the lexeme is out-of-vocabulary.""" + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_OOV) + + def __set__(self, attr_t x): + Lexeme.c_set_flag(self.c, IS_OOV, x) property is_stop: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_STOP, x) + """RETURNS (bool): Whether the lexeme is a stop word.""" + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_STOP) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_STOP, x) property is_alpha: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_ALPHA) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ALPHA, x) + """RETURNS (bool): Whether the lexeme consists of alphanumeric + characters. Equivalent to `lexeme.text.isalpha()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_ALPHA) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_ALPHA, x) property is_ascii: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_ASCII) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ASCII, x) + """RETURNS (bool): Whether the lexeme consists of ASCII characters. + Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_ASCII) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_ASCII, x) property is_digit: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_DIGIT) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_DIGIT, x) + """RETURNS (bool): Whether the lexeme consists of digits. Equivalent + to `lexeme.text.isdigit()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_DIGIT) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_DIGIT, x) property is_lower: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_LOWER) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LOWER, x) + """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to + `lexeme.text.islower()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_LOWER) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_LOWER, x) + + property is_upper: + """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to + `lexeme.text.isupper()`. 
+ """ + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_UPPER) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_UPPER, x) property is_title: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_TITLE) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_TITLE, x) + """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to + `lexeme.text.istitle()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_TITLE) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_TITLE, x) property is_punct: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_PUNCT) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_PUNCT, x) + """RETURNS (bool): Whether the lexeme is punctuation.""" + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_PUNCT) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_PUNCT, x) property is_space: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x) + """RETURNS (bool): Whether the lexeme consist of whitespace characters. + Equivalent to `lexeme.text.isspace()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_SPACE) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_SPACE, x) property is_bracket: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x) + """RETURNS (bool): Whether the lexeme is a bracket.""" + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_BRACKET) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_BRACKET, x) property is_quote: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x) + """RETURNS (bool): Whether the lexeme is a quotation mark.""" + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_QUOTE) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_QUOTE, x) property is_left_punct: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) + """RETURNS (bool): Whether the lexeme is left punctuation, e.g. ).""" + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) property is_right_punct: - def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) + """RETURNS (bool): Whether the lexeme is right punctuation, e.g. ).""" + def __get__(self): + return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) property like_url: - def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x) + """RETURNS (bool): Whether the lexeme resembles a URL.""" + def __get__(self): + return Lexeme.c_check_flag(self.c, LIKE_URL) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_URL, x) property like_num: - def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_NUM) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_NUM, x) + """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9", + "10", "ten", etc. 
+ """ + def __get__(self): + return Lexeme.c_check_flag(self.c, LIKE_NUM) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_NUM, x) property like_email: - def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_EMAIL) - def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_EMAIL, x) + """RETURNS (bool): Whether the lexeme resembles an email address.""" + def __get__(self): + return Lexeme.c_check_flag(self.c, LIKE_EMAIL) + + def __set__(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_EMAIL, x) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 401405c14..a6b02ba2c 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -4,12 +4,6 @@ from __future__ import unicode_literals import ujson - -from .typedefs cimport attr_t -from .typedefs cimport hash_t -from .attrs cimport attr_id_t -from .structs cimport TokenC - from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from libcpp.vector cimport vector @@ -17,14 +11,15 @@ from libcpp.pair cimport pair from murmurhash.mrmr cimport hash64 from libc.stdint cimport int32_t -from .attrs cimport ID, NULL_ATTR, ENT_TYPE -from . import attrs -from .tokens.doc cimport get_token_attr -from .tokens.doc cimport Doc +from .typedefs cimport attr_t +from .typedefs cimport hash_t +from .structs cimport TokenC +from .tokens.doc cimport Doc, get_token_attr from .vocab cimport Vocab +from .attrs import IDS +from .attrs cimport attr_id_t, ID, NULL_ATTR from .attrs import FLAG61 as U_ENT - from .attrs import FLAG60 as B2_ENT from .attrs import FLAG59 as B3_ENT from .attrs import FLAG58 as B4_ENT @@ -34,7 +29,6 @@ from .attrs import FLAG55 as B7_ENT from .attrs import FLAG54 as B8_ENT from .attrs import FLAG53 as B9_ENT from .attrs import FLAG52 as B10_ENT - from .attrs import FLAG51 as I3_ENT from .attrs import FLAG50 as I4_ENT from .attrs import FLAG49 as I5_ENT @@ -43,7 +37,6 @@ from .attrs import FLAG47 as I7_ENT from .attrs import FLAG46 as I8_ENT from .attrs import FLAG45 as I9_ENT from .attrs import FLAG44 as I10_ENT - from .attrs import FLAG43 as L2_ENT from .attrs import FLAG42 as L3_ENT from .attrs import FLAG41 as L4_ENT @@ -153,7 +146,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil: def _convert_strings(token_specs, string_store): # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS), - '?': (ZERO_ONE,), '1': (ONE,)} + '?': (ZERO_ONE,), '1': (ONE,)} tokens = [] op = ONE for spec in token_specs: @@ -168,10 +161,10 @@ def _convert_strings(token_specs, string_store): if value in operators: ops = operators[value] else: - raise KeyError( - "Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys()))) + msg = "Unknown operator '%s'. Options: %s" + raise KeyError(msg % (value, ', '.join(operators.keys()))) if isinstance(attr, basestring): - attr = attrs.IDS.get(attr.upper()) + attr = IDS.get(attr.upper()) if isinstance(value, basestring): value = string_store.add(value) if isinstance(value, bool): @@ -186,7 +179,7 @@ def _convert_strings(token_specs, string_store): def merge_phrase(matcher, doc, i, matches): """Callback to merge a phrase on match.""" ent_id, label, start, end = matches[i] - span = doc[start : end] + span = doc[start:end] span.merge(ent_type=label, ent_id=ent_id) @@ -233,13 +226,13 @@ cdef class Matcher: return self._normalize_key(key) in self._patterns def add(self, key, on_match, *patterns): - """Add a match-rule to the matcher. 
A match-rule consists of: an ID key, - an on_match callback, and one or more patterns. + """Add a match-rule to the matcher. A match-rule consists of: an ID + key, an on_match callback, and one or more patterns. If the key exists, the patterns are appended to the previous ones, and - the previous on_match callback is replaced. The `on_match` callback will - receive the arguments `(matcher, doc, i, matches)`. You can also set - `on_match` to `None` to not perform any actions. + the previous on_match callback is replaced. The `on_match` callback + will receive the arguments `(matcher, doc, i, matches)`. You can also + set `on_match` to `None` to not perform any actions. A pattern consists of one or more `token_specs`, where a `token_spec` is a dictionary mapping attribute IDs to values, and optionally a @@ -253,8 +246,8 @@ cdef class Matcher: The + and * operators are usually interpretted "greedily", i.e. longer matches are returned where possible. However, if you specify two '+' and '*' patterns in a row and their matches overlap, the first - operator will behave non-greedily. This quirk in the semantics - makes the matcher more efficient, by avoiding the need for back-tracking. + operator will behave non-greedily. This quirk in the semantics makes + the matcher more efficient, by avoiding the need for back-tracking. key (unicode): The match ID. on_match (callable): Callback executed on match. @@ -268,7 +261,6 @@ cdef class Matcher: key = self._normalize_key(key) self._patterns.setdefault(key, []) self._callbacks[key] = on_match - for pattern in patterns: specs = _convert_strings(pattern, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, key, specs)) @@ -315,9 +307,9 @@ cdef class Matcher: """Match a stream of documents, yielding them in turn. docs (iterable): A stream of documents. - batch_size (int): The number of documents to accumulate into a working set. + batch_size (int): Number of documents to accumulate into a working set. n_threads (int): The number of threads with which to work on the buffer - in parallel, if the `Matcher` implementation supports multi-threading. + in parallel, if the implementation supports multi-threading. YIELDS (Doc): Documents, in order. """ for doc in docs: @@ -325,7 +317,7 @@ cdef class Matcher: yield doc def __call__(self, Doc doc): - """Find all token sequences matching the supplied patterns on the `Doc`. + """Find all token sequences matching the supplied pattern. doc (Doc): The document to match over. RETURNS (list): A list of `(key, start, end)` tuples, @@ -342,8 +334,8 @@ cdef class Matcher: for token_i in range(doc.length): token = &doc.c[token_i] q = 0 - # Go over the open matches, extending or finalizing if able. Otherwise, - # we over-write them (q doesn't advance) + # Go over the open matches, extending or finalizing if able. + # Otherwise, we over-write them (q doesn't advance) for state in partials: action = get_action(state.second, token) if action == PANIC: @@ -356,8 +348,8 @@ cdef class Matcher: if action == REPEAT: # Leave the state in the queue, and advance to next slot - # (i.e. we don't overwrite -- we want to greedily match more - # pattern. + # (i.e. we don't overwrite -- we want to greedily match + # more pattern. q += 1 elif action == REJECT: pass @@ -366,8 +358,8 @@ cdef class Matcher: partials[q].second += 1 q += 1 elif action in (ACCEPT, ACCEPT_PREV): - # TODO: What to do about patterns starting with ZERO? Need to - # adjust the start position. + # TODO: What to do about patterns starting with ZERO? 
Need + # to adjust the start position. start = state.first end = token_i+1 if action == ACCEPT else token_i ent_id = state.second[1].attrs[0].value @@ -388,8 +380,8 @@ cdef class Matcher: state.second = pattern partials.push_back(state) elif action == ADVANCE: - # TODO: What to do about patterns starting with ZERO? Need to - # adjust the start position. + # TODO: What to do about patterns starting with ZERO? Need + # to adjust the start position. state.first = token_i state.second = pattern + 1 partials.push_back(state) @@ -413,7 +405,6 @@ cdef class Matcher: on_match = self._callbacks.get(ent_id) if on_match is not None: on_match(self, doc, i, matches) - # TODO: only return (match_id, start, end) return matches def _normalize_key(self, key): @@ -441,7 +432,8 @@ def get_bilou(length): elif length == 8: return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] elif length == 9: - return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT] + return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, + L9_ENT] elif length == 10: return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, L10_ENT] @@ -454,10 +446,8 @@ cdef class PhraseMatcher: cdef Vocab vocab cdef Matcher matcher cdef PreshMap phrase_ids - cdef int max_length cdef attr_t* _phrase_key - cdef public object _callbacks cdef public object _patterns @@ -470,7 +460,8 @@ cdef class PhraseMatcher: self.phrase_ids = PreshMap() abstract_patterns = [] for length in range(1, max_length): - abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) + abstract_patterns.append([{tag: True} + for tag in get_bilou(length)]) self.matcher.add('Candidate', None, *abstract_patterns) self._callbacks = {} @@ -496,8 +487,8 @@ cdef class PhraseMatcher: return (self.__class__, (self.vocab,), None, None) def add(self, key, on_match, *docs): - """Add a match-rule to the matcher. A match-rule consists of: an ID key, - an on_match callback, and one or more patterns. + """Add a match-rule to the matcher. A match-rule consists of: an ID + key, an on_match callback, and one or more patterns. key (unicode): The match ID. on_match (callable): Callback executed on match. @@ -513,7 +504,6 @@ cdef class PhraseMatcher: raise ValueError(msg % (len(doc), self.max_length)) cdef hash_t ent_id = self.matcher._normalize_key(key) self._callbacks[ent_id] = on_match - cdef int length cdef int i cdef hash_t phrase_hash @@ -553,9 +543,9 @@ cdef class PhraseMatcher: """Match a stream of documents, yielding them in turn. docs (iterable): A stream of documents. - batch_size (int): The number of documents to accumulate into a working set. + batch_size (int): Number of documents to accumulate into a working set. n_threads (int): The number of threads with which to work on the buffer - in parallel, if the `Matcher` implementation supports multi-threading. + in parallel, if the implementation supports multi-threading. YIELDS (Doc): Documents, in order. 
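
As a concrete companion to the `Matcher.add` docstring above, here is a hedged usage sketch: one pattern with an optional token (`'OP': '?'`) and an `on_match` callback that receives `(matcher, doc, i, matches)`, exactly as described. The rule name and text are arbitrary, and it assumes an installed English model as in the earlier examples.

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')     # assumption: this model is installed
matcher = Matcher(nlp.vocab)

def on_match(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    print('matched:', doc[start:end].text)

# "hello", optionally followed by punctuation, then "world"
pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'world'}]
matcher.add('HelloWorld', on_match, pattern)

doc = nlp(u'Hello, world! Hello world!')
matches = matcher(doc)    # [(match_id, start, end), ...]; on_match fires per match

`PhraseMatcher.add`, further down in this hunk, has the same `(key, on_match, *patterns)` shape, but takes `Doc` objects instead of token-spec lists.
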
""" for doc in stream: @@ -569,7 +559,8 @@ cdef class PhraseMatcher: self._phrase_key[i] = 0 for i, j in enumerate(range(start, end)): self._phrase_key[i] = doc.c[j].lex.orth - cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) + cdef hash_t key = hash64(self._phrase_key, + self.max_length * sizeof(attr_t), 0) ent_id = self.phrase_ids.get(key) if ent_id == 0: return None diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 91befaa1b..b3989839d 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -4,17 +4,15 @@ from __future__ import unicode_literals from libc.string cimport memset -from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE from .attrs cimport POS, IS_SPACE +from .attrs import LEMMA, intify_attrs +from .parts_of_speech cimport SPACE from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme -from .attrs import LEMMA, intify_attrs def _normalize_props(props): - """ - Transform deprecated string keys to correct names. - """ + """Transform deprecated string keys to correct names.""" out = {} for key, value in props.items(): if key == POS: @@ -77,7 +75,8 @@ cdef class Morphology: cdef int assign_untagged(self, TokenC* token) except -1: """Set morphological attributes on a token without a POS tag. Uses the lemmatizer's lookup() method, which looks up the string in the - table provided by the language data as lemma_lookup (if available).""" + table provided by the language data as lemma_lookup (if available). + """ if token.lemma == 0: orth_str = self.strings[token.lex.orth] lemma = self.lemmatizer.lookup(orth_str) @@ -95,11 +94,10 @@ cdef class Morphology: cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: if tag_id > self.n_tags: raise ValueError("Unknown tag ID: %s" % tag_id) - # TODO: It's pretty arbitrary to put this logic here. I guess the justification - # is that this is where the specific word and the tag interact. Still, - # we should have a better way to enforce this rule, or figure out why - # the statistical model fails. - # Related to Issue #220 + # TODO: It's pretty arbitrary to put this logic here. I guess the + # justification is that this is where the specific word and the tag + # interact. Still, we should have a better way to enforce this rule, or + # figure out why the statistical model fails. Related to Issue #220 if Lexeme.c_check_flag(token.lex, IS_SPACE): tag_id = self.reverse_index[self.strings.add('_SP')] rich_tag = self.rich_tags[tag_id] @@ -123,14 +121,13 @@ cdef class Morphology: else: flags[0] &= ~(one << flag_id) - def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False): - """ - Add a special-case rule to the morphological analyser. Tokens whose + def add_special_case(self, unicode tag_str, unicode orth_str, attrs, + force=False): + """Add a special-case rule to the morphological analyser. Tokens whose tag and orth match the rule will receive the specified properties. - Arguments: - tag (unicode): The part-of-speech tag to key the exception. - orth (unicode): The word-form to key the exception. + tag (unicode): The part-of-speech tag to key the exception. + orth (unicode): The word-form to key the exception. """ self.exc[(tag_str, orth_str)] = dict(attrs) tag = self.strings.add(tag_str) @@ -144,10 +141,9 @@ cdef class Morphology: elif force: memset(cached, 0, sizeof(cached[0])) else: - msg = ("Conflicting morphology exception for (%s, %s). 
Use force=True " - "to overwrite.") - msg = msg % (tag_str, orth_str) - raise ValueError(msg) + raise ValueError( + "Conflicting morphology exception for (%s, %s). Use " + "force=True to overwrite." % (tag_str, orth_str)) cached.tag = rich_tag # TODO: Refactor this to take arbitrary attributes. @@ -218,7 +214,7 @@ IDS = { "Definite_two": Definite_two, "Definite_def": Definite_def, "Definite_red": Definite_red, - "Definite_cons": Definite_cons, # U20 + "Definite_cons": Definite_cons, # U20 "Definite_ind": Definite_ind, "Degree_cmp": Degree_cmp, "Degree_comp": Degree_comp, @@ -227,7 +223,7 @@ IDS = { "Degree_sup": Degree_sup, "Degree_abs": Degree_abs, "Degree_com": Degree_com, - "Degree_dim ": Degree_dim, # du + "Degree_dim ": Degree_dim, # du "Gender_com": Gender_com, "Gender_fem": Gender_fem, "Gender_masc": Gender_masc, @@ -242,15 +238,15 @@ IDS = { "Negative_neg": Negative_neg, "Negative_pos": Negative_pos, "Negative_yes": Negative_yes, - "Polarity_neg": Polarity_neg, # U20 - "Polarity_pos": Polarity_pos, # U20 + "Polarity_neg": Polarity_neg, # U20 + "Polarity_pos": Polarity_pos, # U20 "Number_com": Number_com, "Number_dual": Number_dual, "Number_none": Number_none, "Number_plur": Number_plur, "Number_sing": Number_sing, - "Number_ptan ": Number_ptan, # bg - "Number_count ": Number_count, # bg + "Number_ptan ": Number_ptan, # bg + "Number_count ": Number_count, # bg "NumType_card": NumType_card, "NumType_dist": NumType_dist, "NumType_frac": NumType_frac, @@ -276,7 +272,7 @@ IDS = { "PronType_rel": PronType_rel, "PronType_tot": PronType_tot, "PronType_clit": PronType_clit, - "PronType_exc ": PronType_exc, # es, ca, it, fa, + "PronType_exc ": PronType_exc, # es, ca, it, fa, "Reflex_yes": Reflex_yes, "Tense_fut": Tense_fut, "Tense_imp": Tense_imp, @@ -292,19 +288,19 @@ IDS = { "VerbForm_partPres": VerbForm_partPres, "VerbForm_sup": VerbForm_sup, "VerbForm_trans": VerbForm_trans, - "VerbForm_conv": VerbForm_conv, # U20 - "VerbForm_gdv ": VerbForm_gdv, # la, + "VerbForm_conv": VerbForm_conv, # U20 + "VerbForm_gdv ": VerbForm_gdv, # la, "Voice_act": Voice_act, "Voice_cau": Voice_cau, "Voice_pass": Voice_pass, - "Voice_mid ": Voice_mid, # gkc, - "Voice_int ": Voice_int, # hb, - "Abbr_yes ": Abbr_yes, # cz, fi, sl, U, - "AdpType_prep ": AdpType_prep, # cz, U, - "AdpType_post ": AdpType_post, # U, - "AdpType_voc ": AdpType_voc, # cz, - "AdpType_comprep ": AdpType_comprep, # cz, - "AdpType_circ ": AdpType_circ, # U, + "Voice_mid ": Voice_mid, # gkc, + "Voice_int ": Voice_int, # hb, + "Abbr_yes ": Abbr_yes, # cz, fi, sl, U, + "AdpType_prep ": AdpType_prep, # cz, U, + "AdpType_post ": AdpType_post, # U, + "AdpType_voc ": AdpType_voc, # cz, + "AdpType_comprep ": AdpType_comprep, # cz, + "AdpType_circ ": AdpType_circ, # U, "AdvType_man": AdvType_man, "AdvType_loc": AdvType_loc, "AdvType_tim": AdvType_tim, @@ -314,122 +310,122 @@ IDS = { "AdvType_sta": AdvType_sta, "AdvType_ex": AdvType_ex, "AdvType_adadj": AdvType_adadj, - "ConjType_oper ": ConjType_oper, # cz, U, - "ConjType_comp ": ConjType_comp, # cz, U, - "Connegative_yes ": Connegative_yes, # fi, - "Derivation_minen ": Derivation_minen, # fi, - "Derivation_sti ": Derivation_sti, # fi, - "Derivation_inen ": Derivation_inen, # fi, - "Derivation_lainen ": Derivation_lainen, # fi, - "Derivation_ja ": Derivation_ja, # fi, - "Derivation_ton ": Derivation_ton, # fi, - "Derivation_vs ": Derivation_vs, # fi, - "Derivation_ttain ": Derivation_ttain, # fi, - "Derivation_ttaa ": Derivation_ttaa, # fi, - "Echo_rdp ": Echo_rdp, # U, - "Echo_ech ": Echo_ech, # 
U, - "Foreign_foreign ": Foreign_foreign, # cz, fi, U, - "Foreign_fscript ": Foreign_fscript, # cz, fi, U, - "Foreign_tscript ": Foreign_tscript, # cz, U, - "Foreign_yes ": Foreign_yes, # sl, - "Gender_dat_masc ": Gender_dat_masc, # bq, U, - "Gender_dat_fem ": Gender_dat_fem, # bq, U, - "Gender_erg_masc ": Gender_erg_masc, # bq, - "Gender_erg_fem ": Gender_erg_fem, # bq, - "Gender_psor_masc ": Gender_psor_masc, # cz, sl, U, - "Gender_psor_fem ": Gender_psor_fem, # cz, sl, U, - "Gender_psor_neut ": Gender_psor_neut, # sl, - "Hyph_yes ": Hyph_yes, # cz, U, - "InfForm_one ": InfForm_one, # fi, - "InfForm_two ": InfForm_two, # fi, - "InfForm_three ": InfForm_three, # fi, - "NameType_geo ": NameType_geo, # U, cz, - "NameType_prs ": NameType_prs, # U, cz, - "NameType_giv ": NameType_giv, # U, cz, - "NameType_sur ": NameType_sur, # U, cz, - "NameType_nat ": NameType_nat, # U, cz, - "NameType_com ": NameType_com, # U, cz, - "NameType_pro ": NameType_pro, # U, cz, - "NameType_oth ": NameType_oth, # U, cz, - "NounType_com ": NounType_com, # U, - "NounType_prop ": NounType_prop, # U, - "NounType_class ": NounType_class, # U, - "Number_abs_sing ": Number_abs_sing, # bq, U, - "Number_abs_plur ": Number_abs_plur, # bq, U, - "Number_dat_sing ": Number_dat_sing, # bq, U, - "Number_dat_plur ": Number_dat_plur, # bq, U, - "Number_erg_sing ": Number_erg_sing, # bq, U, - "Number_erg_plur ": Number_erg_plur, # bq, U, - "Number_psee_sing ": Number_psee_sing, # U, - "Number_psee_plur ": Number_psee_plur, # U, - "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, - "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, - "NumForm_digit ": NumForm_digit, # cz, sl, U, - "NumForm_roman ": NumForm_roman, # cz, sl, U, - "NumForm_word ": NumForm_word, # cz, sl, U, - "NumValue_one ": NumValue_one, # cz, U, - "NumValue_two ": NumValue_two, # cz, U, - "NumValue_three ": NumValue_three, # cz, U, - "PartForm_pres ": PartForm_pres, # fi, - "PartForm_past ": PartForm_past, # fi, - "PartForm_agt ": PartForm_agt, # fi, - "PartForm_neg ": PartForm_neg, # fi, - "PartType_mod ": PartType_mod, # U, - "PartType_emp ": PartType_emp, # U, - "PartType_res ": PartType_res, # U, - "PartType_inf ": PartType_inf, # U, - "PartType_vbp ": PartType_vbp, # U, - "Person_abs_one ": Person_abs_one, # bq, U, - "Person_abs_two ": Person_abs_two, # bq, U, - "Person_abs_three ": Person_abs_three, # bq, U, - "Person_dat_one ": Person_dat_one, # bq, U, - "Person_dat_two ": Person_dat_two, # bq, U, - "Person_dat_three ": Person_dat_three, # bq, U, - "Person_erg_one ": Person_erg_one, # bq, U, - "Person_erg_two ": Person_erg_two, # bq, U, - "Person_erg_three ": Person_erg_three, # bq, U, - "Person_psor_one ": Person_psor_one, # fi, U, - "Person_psor_two ": Person_psor_two, # fi, U, - "Person_psor_three ": Person_psor_three, # fi, U, - "Polite_inf ": Polite_inf, # bq, U, - "Polite_pol ": Polite_pol, # bq, U, - "Polite_abs_inf ": Polite_abs_inf, # bq, U, - "Polite_abs_pol ": Polite_abs_pol, # bq, U, - "Polite_erg_inf ": Polite_erg_inf, # bq, U, - "Polite_erg_pol ": Polite_erg_pol, # bq, U, - "Polite_dat_inf ": Polite_dat_inf, # bq, U, - "Polite_dat_pol ": Polite_dat_pol, # bq, U, - "Prefix_yes ": Prefix_yes, # U, - "PrepCase_npr ": PrepCase_npr, # cz, - "PrepCase_pre ": PrepCase_pre, # U, - "PunctSide_ini ": PunctSide_ini, # U, - "PunctSide_fin ": PunctSide_fin, # U, - "PunctType_peri ": PunctType_peri, # U, - "PunctType_qest ": PunctType_qest, # U, - "PunctType_excl ": PunctType_excl, # U, - "PunctType_quot ": PunctType_quot, # U, - "PunctType_brck 
": PunctType_brck, # U, - "PunctType_comm ": PunctType_comm, # U, - "PunctType_colo ": PunctType_colo, # U, - "PunctType_semi ": PunctType_semi, # U, - "PunctType_dash ": PunctType_dash, # U, - "Style_arch ": Style_arch, # cz, fi, U, - "Style_rare ": Style_rare, # cz, fi, U, - "Style_poet ": Style_poet, # cz, U, - "Style_norm ": Style_norm, # cz, U, - "Style_coll ": Style_coll, # cz, U, - "Style_vrnc ": Style_vrnc, # cz, U, - "Style_sing ": Style_sing, # cz, U, - "Style_expr ": Style_expr, # cz, U, - "Style_derg ": Style_derg, # cz, U, - "Style_vulg ": Style_vulg, # cz, U, - "Style_yes ": Style_yes, # fi, U, - "StyleVariant_styleShort ": StyleVariant_styleShort, # cz, - "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl, - "VerbType_aux ": VerbType_aux, # U, - "VerbType_cop ": VerbType_cop, # U, - "VerbType_mod ": VerbType_mod, # U, - "VerbType_light ": VerbType_light, # U, + "ConjType_oper ": ConjType_oper, # cz, U, + "ConjType_comp ": ConjType_comp, # cz, U, + "Connegative_yes ": Connegative_yes, # fi, + "Derivation_minen ": Derivation_minen, # fi, + "Derivation_sti ": Derivation_sti, # fi, + "Derivation_inen ": Derivation_inen, # fi, + "Derivation_lainen ": Derivation_lainen, # fi, + "Derivation_ja ": Derivation_ja, # fi, + "Derivation_ton ": Derivation_ton, # fi, + "Derivation_vs ": Derivation_vs, # fi, + "Derivation_ttain ": Derivation_ttain, # fi, + "Derivation_ttaa ": Derivation_ttaa, # fi, + "Echo_rdp ": Echo_rdp, # U, + "Echo_ech ": Echo_ech, # U, + "Foreign_foreign ": Foreign_foreign, # cz, fi, U, + "Foreign_fscript ": Foreign_fscript, # cz, fi, U, + "Foreign_tscript ": Foreign_tscript, # cz, U, + "Foreign_yes ": Foreign_yes, # sl, + "Gender_dat_masc ": Gender_dat_masc, # bq, U, + "Gender_dat_fem ": Gender_dat_fem, # bq, U, + "Gender_erg_masc ": Gender_erg_masc, # bq, + "Gender_erg_fem ": Gender_erg_fem, # bq, + "Gender_psor_masc ": Gender_psor_masc, # cz, sl, U, + "Gender_psor_fem ": Gender_psor_fem, # cz, sl, U, + "Gender_psor_neut ": Gender_psor_neut, # sl, + "Hyph_yes ": Hyph_yes, # cz, U, + "InfForm_one ": InfForm_one, # fi, + "InfForm_two ": InfForm_two, # fi, + "InfForm_three ": InfForm_three, # fi, + "NameType_geo ": NameType_geo, # U, cz, + "NameType_prs ": NameType_prs, # U, cz, + "NameType_giv ": NameType_giv, # U, cz, + "NameType_sur ": NameType_sur, # U, cz, + "NameType_nat ": NameType_nat, # U, cz, + "NameType_com ": NameType_com, # U, cz, + "NameType_pro ": NameType_pro, # U, cz, + "NameType_oth ": NameType_oth, # U, cz, + "NounType_com ": NounType_com, # U, + "NounType_prop ": NounType_prop, # U, + "NounType_class ": NounType_class, # U, + "Number_abs_sing ": Number_abs_sing, # bq, U, + "Number_abs_plur ": Number_abs_plur, # bq, U, + "Number_dat_sing ": Number_dat_sing, # bq, U, + "Number_dat_plur ": Number_dat_plur, # bq, U, + "Number_erg_sing ": Number_erg_sing, # bq, U, + "Number_erg_plur ": Number_erg_plur, # bq, U, + "Number_psee_sing ": Number_psee_sing, # U, + "Number_psee_plur ": Number_psee_plur, # U, + "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U, + "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U, + "NumForm_digit ": NumForm_digit, # cz, sl, U, + "NumForm_roman ": NumForm_roman, # cz, sl, U, + "NumForm_word ": NumForm_word, # cz, sl, U, + "NumValue_one ": NumValue_one, # cz, U, + "NumValue_two ": NumValue_two, # cz, U, + "NumValue_three ": NumValue_three, # cz, U, + "PartForm_pres ": PartForm_pres, # fi, + "PartForm_past ": PartForm_past, # fi, + "PartForm_agt ": PartForm_agt, # fi, + "PartForm_neg ": PartForm_neg, # fi, + 
"PartType_mod ": PartType_mod, # U, + "PartType_emp ": PartType_emp, # U, + "PartType_res ": PartType_res, # U, + "PartType_inf ": PartType_inf, # U, + "PartType_vbp ": PartType_vbp, # U, + "Person_abs_one ": Person_abs_one, # bq, U, + "Person_abs_two ": Person_abs_two, # bq, U, + "Person_abs_three ": Person_abs_three, # bq, U, + "Person_dat_one ": Person_dat_one, # bq, U, + "Person_dat_two ": Person_dat_two, # bq, U, + "Person_dat_three ": Person_dat_three, # bq, U, + "Person_erg_one ": Person_erg_one, # bq, U, + "Person_erg_two ": Person_erg_two, # bq, U, + "Person_erg_three ": Person_erg_three, # bq, U, + "Person_psor_one ": Person_psor_one, # fi, U, + "Person_psor_two ": Person_psor_two, # fi, U, + "Person_psor_three ": Person_psor_three, # fi, U, + "Polite_inf ": Polite_inf, # bq, U, + "Polite_pol ": Polite_pol, # bq, U, + "Polite_abs_inf ": Polite_abs_inf, # bq, U, + "Polite_abs_pol ": Polite_abs_pol, # bq, U, + "Polite_erg_inf ": Polite_erg_inf, # bq, U, + "Polite_erg_pol ": Polite_erg_pol, # bq, U, + "Polite_dat_inf ": Polite_dat_inf, # bq, U, + "Polite_dat_pol ": Polite_dat_pol, # bq, U, + "Prefix_yes ": Prefix_yes, # U, + "PrepCase_npr ": PrepCase_npr, # cz, + "PrepCase_pre ": PrepCase_pre, # U, + "PunctSide_ini ": PunctSide_ini, # U, + "PunctSide_fin ": PunctSide_fin, # U, + "PunctType_peri ": PunctType_peri, # U, + "PunctType_qest ": PunctType_qest, # U, + "PunctType_excl ": PunctType_excl, # U, + "PunctType_quot ": PunctType_quot, # U, + "PunctType_brck ": PunctType_brck, # U, + "PunctType_comm ": PunctType_comm, # U, + "PunctType_colo ": PunctType_colo, # U, + "PunctType_semi ": PunctType_semi, # U, + "PunctType_dash ": PunctType_dash, # U, + "Style_arch ": Style_arch, # cz, fi, U, + "Style_rare ": Style_rare, # cz, fi, U, + "Style_poet ": Style_poet, # cz, U, + "Style_norm ": Style_norm, # cz, U, + "Style_coll ": Style_coll, # cz, U, + "Style_vrnc ": Style_vrnc, # cz, U, + "Style_sing ": Style_sing, # cz, U, + "Style_expr ": Style_expr, # cz, U, + "Style_derg ": Style_derg, # cz, U, + "Style_vulg ": Style_vulg, # cz, U, + "Style_yes ": Style_yes, # fi, U, + "StyleVariant_styleShort ": StyleVariant_styleShort, # cz, + "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl, + "VerbType_aux ": VerbType_aux, # U, + "VerbType_cop ": VerbType_cop, # U, + "VerbType_mod ": VerbType_mod, # U, + "VerbType_light ": VerbType_light, # U, } diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 38d5959b6..3925a6738 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -8,7 +8,7 @@ IDS = { "ADP": ADP, "ADV": ADV, "AUX": AUX, - "CONJ": CONJ, # U20 + "CONJ": CONJ, # U20 "CCONJ": CCONJ, "DET": DET, "INTJ": INTJ, diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 089fef4e8..842e27069 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -3,26 +3,17 @@ # coding: utf8 from __future__ import unicode_literals -from thinc.api import chain, layerize, with_getitem import numpy cimport numpy as np import cytoolz -import util from collections import OrderedDict import ujson import msgpack -from thinc.api import add, layerize, chain, clone, concatenate, with_flatten -from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU -from thinc.i2v import HashEmbed -from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool -from thinc.t2t import ExtractWindow, ParametricAttention -from thinc.misc import Residual -from thinc.misc import BatchNorm as BN -from thinc.misc import LayerNorm as LN - +from thinc.api import chain +from thinc.v2v import 
Softmax +from thinc.t2v import Pooling, max_pool, mean_pool from thinc.neural.util import to_categorical - from thinc.neural._classes.difference import Siamese, CauchySimilarity from .tokens.doc cimport Doc @@ -30,29 +21,23 @@ from .syntax.nn_parser cimport Parser from .syntax import nonproj from .syntax.ner cimport BiluoPushDown from .syntax.arc_eager cimport ArcEager -from .tagger import Tagger -from .syntax.stateclass cimport StateClass -from .gold cimport GoldParse from .morphology cimport Morphology from .vocab cimport Vocab from .syntax import nonproj from .compat import json_dumps -from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS -from ._ml import Tok2Vec, flatten -from ._ml import build_text_classifier, build_tagger_model -from ._ml import link_vectors_to_models +from .attrs import POS from .parts_of_speech import X +from ._ml import Tok2Vec, build_text_classifier, build_tagger_model +from ._ml import link_vectors_to_models +from . import util class SentenceSegmenter(object): """A simple spaCy hook, to allow custom sentence boundary detection logic - (that doesn't require the dependency parse). - - To change the sentence boundary detection strategy, pass a generator - function `strategy` on initialization, or assign a new strategy to - the .strategy attribute. - + (that doesn't require the dependency parse). To change the sentence + boundary detection strategy, pass a generator function `strategy` on + initialization, or assign a new strategy to the .strategy attribute. Sentence detection strategies should be generators that take `Doc` objects and yield `Span` objects for each sentence. """ @@ -74,16 +59,20 @@ class SentenceSegmenter(object): seen_period = False for i, word in enumerate(doc): if seen_period and not word.is_punct: - yield doc[start : word.i] + yield doc[start:word.i] start = word.i seen_period = False elif word.text in ['.', '!', '?']: seen_period = True if start < len(doc): - yield doc[start : len(doc)] + yield doc[start:len(doc)] class Pipe(object): + """This class is not instantiated directly. Components inherit from it, and + it defines the interface that components should follow to function as + components in a spaCy analysis pipeline. + """ name = None @classmethod @@ -149,8 +138,7 @@ class Pipe(object): link_vectors_to_models(self.vocab) def use_params(self, params): - """Modify the pipe's model, to use the given parameter values. - """ + """Modify the pipe's model, to use the given parameter values.""" with self.model.use_params(params): yield @@ -235,8 +223,8 @@ class Tensorizer(Pipe): """Construct a new statistical model. Weights are not allocated on initialisation. - vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab` - instance with the `Doc` objects it will process. + vocab (Vocab): A `Vocab` instance. The model must share the same + `Vocab` instance with the `Doc` objects it will process. model (Model): A `Model` instance or `True` allocate one later. **cfg: Config parameters. @@ -280,7 +268,7 @@ class Tensorizer(Pipe): """Return a single tensor for a batch of documents. docs (iterable): A sequence of `Doc` objects. - RETURNS (object): Vector representations for each token in the documents. + RETURNS (object): Vector representations for each token in the docs. """ tokvecs = self.model(docs) return tokvecs @@ -289,7 +277,7 @@ class Tensorizer(Pipe): """Set the tensor attribute for a batch of documents. docs (iterable): A sequence of `Doc` objects. 
- tokvecs (object): Vector representation for each token in the documents. + tokvecs (object): Vector representation for each token in the docs. """ for doc, tokvecs in zip(docs, tokvecses): assert tokvecs.shape[0] == len(doc) @@ -328,12 +316,14 @@ class Tensorizer(Pipe): class Tagger(Pipe): name = 'tagger' + def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model self.cfg = dict(cfg) self.cfg.setdefault('cnn_maxout_pieces', 2) - self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1]) + self.cfg.setdefault('pretrained_dims', + self.vocab.vectors.data.shape[1]) def __call__(self, doc): tags = self.predict([doc]) @@ -353,8 +343,7 @@ class Tagger(Pipe): guesses = scores.argmax(axis=1) if not isinstance(guesses, numpy.ndarray): guesses = guesses.get() - guesses = self.model.ops.unflatten(guesses, - [len(d) for d in docs]) + guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs]) return guesses def set_annotations(self, docs, batch_tag_ids): @@ -387,8 +376,8 @@ class Tagger(Pipe): def get_loss(self, docs, golds, scores): scores = self.model.ops.flatten(scores) - tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)} - + tag_index = {tag: i + for i, tag in enumerate(self.vocab.morphology.tag_names)} cdef int idx = 0 correct = numpy.zeros((scores.shape[0],), dtype='i') guesses = scores.argmax(axis=1) @@ -443,17 +432,18 @@ class Tagger(Pipe): serialize['model'] = self.model.to_bytes serialize['vocab'] = self.vocab.to_bytes - serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map, - use_bin_type=True, - encoding='utf8') + serialize['tag_map'] = lambda: msgpack.dumps( + self.vocab.morphology.tag_map, use_bin_type=True, encoding='utf8') return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, **exclude): def load_model(b): if self.model is True: - token_vector_width = util.env_opt('token_vector_width', - self.cfg.get('token_vector_width', 128)) - self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) + token_vector_width = util.env_opt( + 'token_vector_width', + self.cfg.get('token_vector_width', 128)) + self.model = self.Model(self.vocab.morphology.n_tags, + **self.cfg) self.model.from_bytes(b) def load_tag_map(b): @@ -509,11 +499,11 @@ class Tagger(Pipe): class MultitaskObjective(Tagger): - '''Assist training of a parser or tagger, by training a side-objective. - - Experimental - ''' + """Experimental: Assist training of a parser or tagger, by training a + side-objective. + """ name = 'nn_labeller' + def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg): self.vocab = vocab self.model = model @@ -530,12 +520,12 @@ class MultitaskObjective(Tagger): elif hasattr(target, '__call__'): self.make_label = target else: - raise ValueError( - "MultitaskObjective target should be function or one of " - "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']") + raise ValueError("MultitaskObjective target should be function or " + "one of: dep, tag, ent, dep_tag_offset, ent_tag.") self.cfg = dict(cfg) self.cfg.setdefault('cnn_maxout_pieces', 2) - self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1]) + self.cfg.setdefault('pretrained_dims', + self.vocab.vectors.data.shape[1]) @property def labels(self): @@ -623,20 +613,19 @@ class MultitaskObjective(Tagger): class SimilarityHook(Pipe): """ - Experimental + Experimental: A pipeline component to install a hook for supervised + similarity into `Doc` objects. Requires a `Tensorizer` to pre-process + documents. 
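
The Cauchy-like similarity quoted in the docstring that continues just below is simple enough to restate in plain numpy. This is only a reference sketch with made-up vectors, not the pipeline component itself, which wraps the same idea in a Thinc model over document tensors:

import numpy

def cauchy_similarity(vec1, vec2, W):
    return 1. / (1. + (W * (vec1 - vec2) ** 2).sum())

vec1 = numpy.array([0.1, 0.4, 0.2])
vec2 = numpy.array([0.1, 0.3, 0.5])
W = numpy.ones((3,))                       # dimension weights, initialised to 1
print(cauchy_similarity(vec1, vec2, W))    # a value in (0, 1]
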
The similarity model can be any object obeying the Thinc `Model` + interface. By default, the model concatenates the elementwise mean and + elementwise max of the two tensors, and compares them using the + Cauchy-like similarity function from Chen (2013): - A pipeline component to install a hook for supervised similarity into - Doc objects. Requires a Tensorizer to pre-process documents. The similarity - model can be any object obeying the Thinc Model interface. By default, - the model concatenates the elementwise mean and elementwise max of the two - tensors, and compares them using the Cauchy-like similarity function - from Chen (2013): - - similarity = 1. / (1. + (W * (vec1-vec2)**2).sum()) + >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum()) Where W is a vector of dimension weights, initialized to 1. """ name = 'similarity' + def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model @@ -662,8 +651,7 @@ class SimilarityHook(Pipe): sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop) def begin_training(self, _=tuple(), pipeline=None): - """ - Allocate model, using width from tensorizer in pipeline. + """Allocate model, using width from tensorizer in pipeline. gold_tuples (iterable): Gold-standard training data. pipeline (list): The pipeline the model is part of. @@ -763,12 +751,14 @@ cdef class DependencyParser(Parser): for target in []: labeller = MultitaskObjective(self.vocab, target=target) tok2vec = self.model[0] - labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec) + labeller.begin_training(gold_tuples, pipeline=pipeline, + tok2vec=tok2vec) pipeline.append(labeller) self._multitasks.append(labeller) def __reduce__(self): - return (DependencyParser, (self.vocab, self.moves, self.model), None, None) + return (DependencyParser, (self.vocab, self.moves, self.model), + None, None) cdef class EntityRecognizer(Parser): @@ -781,12 +771,14 @@ cdef class EntityRecognizer(Parser): for target in []: labeller = MultitaskObjective(self.vocab, target=target) tok2vec = self.model[0] - labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec) + labeller.begin_training(gold_tuples, pipeline=pipeline, + tok2vec=tok2vec) pipeline.append(labeller) self._multitasks.append(labeller) def __reduce__(self): - return (EntityRecognizer, (self.vocab, self.moves, self.model), None, None) + return (EntityRecognizer, (self.vocab, self.moves, self.model), + None, None) __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer'] diff --git a/spacy/scorer.py b/spacy/scorer.py index b1ce3faa4..673df132c 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -74,18 +74,21 @@ class Scorer(object): @property def scores(self): return { - 'uas': self.uas, 'las': self.las, - 'ents_p': self.ents_p, 'ents_r': self.ents_r, 'ents_f': self.ents_f, + 'uas': self.uas, + 'las': self.las, + 'ents_p': self.ents_p, + 'ents_r': self.ents_r, + 'ents_f': self.ents_f, 'tags_acc': self.tags_acc, 'token_acc': self.token_acc } def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')): assert len(tokens) == len(gold) - gold_deps = set() gold_tags = set() - gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot])) + gold_ents = set(tags_to_entities([annot[-1] + for annot in gold.orig_annot])) for id_, word, tag, head, dep, ner in gold.orig_annot: gold_tags.add((id_, tag)) if dep not in (None, "") and dep.lower() not in punct_labels: diff --git a/spacy/strings.pyx b/spacy/strings.pyx index e6926a75d..647f140bb 100644 --- 
a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -4,19 +4,15 @@ from __future__ import unicode_literals, absolute_import cimport cython from libc.string cimport memcpy -from libc.stdint cimport uint64_t, uint32_t -from murmurhash.mrmr cimport hash64, hash32 -from preshed.maps cimport map_iter, key_t from libc.stdint cimport uint32_t +from murmurhash.mrmr cimport hash64, hash32 import ujson -import dill from .symbols import IDS as SYMBOLS_BY_STR from .symbols import NAMES as SYMBOLS_BY_INT - from .typedefs cimport hash_t -from . import util from .compat import json_dumps +from . import util cpdef hash_t hash_string(unicode string) except 0: @@ -195,7 +191,7 @@ cdef class StringStore: """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if - it doesn't exist. Paths may be either strings or `Path`-like objects. + it doesn't exist. Paths may be either strings or Path-like objects. """ path = util.ensure_path(path) strings = list(self) @@ -225,7 +221,7 @@ cdef class StringStore: **exclude: Named attributes to prevent from being serialized. RETURNS (bytes): The serialized form of the `StringStore` object. """ - return ujson.dumps(list(self)) + return json_dumps(list(self)) def from_bytes(self, bytes_data, **exclude): """Load state from a binary string. diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 0e0337b6e..56422771a 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -1,8 +1,8 @@ # coding: utf8 #cython: optimize.unpack_method_calls=False - from __future__ import unicode_literals + IDS = { "": NIL, "IS_ALPHA": IS_ALPHA, @@ -464,9 +464,11 @@ IDS = { "LAW": LAW } + def sort_nums(x): return x[1] + NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)] # Unfortunate hack here, to work around problem with long cpdef enum # (which is generating an enormous amount of C++ in Cython 0.24+) diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx index da4efefbc..54e72a0e8 100644 --- a/spacy/syntax/_beam_utils.pyx +++ b/spacy/syntax/_beam_utils.pyx @@ -2,7 +2,7 @@ # cython: profile=True cimport numpy as np import numpy -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF +from cpython.ref cimport PyObject, Py_XDECREF from thinc.extra.search cimport Beam from thinc.extra.search import MaxViolation from thinc.typedefs cimport hash_t, class_t @@ -11,7 +11,6 @@ from thinc.extra.search cimport MaxViolation from .transition_system cimport TransitionSystem, Transition from .stateclass cimport StateClass from ..gold cimport GoldParse -from ..tokens.doc cimport Doc # These are passed as callbacks to thinc.search.Beam @@ -50,7 +49,7 @@ cdef class ParserBeam(object): cdef public object dones def __init__(self, TransitionSystem moves, states, golds, - int width, float density): + int width, float density): self.moves = moves self.states = states self.golds = golds @@ -59,7 +58,8 @@ cdef class ParserBeam(object): cdef StateClass state, st for state in states: beam = Beam(self.moves.n_moves, width, density) - beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent) + beam.initialize(self.moves.init_beam_state, state.c.length, + state.c._sent) for i in range(beam.width): st = beam.at(i) st.c.offset = state.c.offset @@ -74,7 +74,8 @@ cdef class ParserBeam(object): @property def is_done(self): - return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams)) + return all(b.is_done or self.dones[i] + for i, b in enumerate(self.beams)) def __getitem__(self, i): return self.beams[i] @@ -126,7 +127,8 
@@ cdef class ParserBeam(object): for i in range(beam.size): state = beam.at(i) if not state.c.is_final(): - self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold) + self.moves.set_costs(beam.is_valid[i], beam.costs[i], + state, gold) if follow_gold: for j in range(beam.nr_class): if beam.costs[i][j] >= 1: @@ -146,7 +148,10 @@ def get_token_ids(states, int n_tokens): c_ids += ids.shape[1] return ids + nr_update = 0 + + def update_beam(TransitionSystem moves, int nr_feature, int max_steps, states, golds, state2vec, vec2scores, @@ -167,23 +172,27 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, if pbeam.is_done and gbeam.is_done: break # The beam maps let us find the right row in the flattened scores - # arrays for each state. States are identified by (example id, history). - # We keep a different beam map for each step (since we'll have a flat - # scores array for each step). The beam map will let us take the per-state - # losses, and compute the gradient for each (step, state, class). + # arrays for each state. States are identified by (example id, + # history). We keep a different beam map for each step (since we'll + # have a flat scores array for each step). The beam map will let us + # take the per-state losses, and compute the gradient for each (step, + # state, class). beam_maps.append({}) # Gather all states from the two beams in a list. Some stats may occur # in both beams. To figure out which beam each state belonged to, # we keep two lists of indices, p_indices and g_indices - states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update) + states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], + nr_update) if not states: break # Now that we have our flat list of states, feed them through the model token_ids = get_token_ids(states, nr_feature) vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) if hist_feats: - hists = numpy.asarray([st.history[:hist_feats] for st in states], dtype='i') - scores, bp_scores = vec2scores.begin_update((vectors, hists), drop=drop) + hists = numpy.asarray([st.history[:hist_feats] for st in states], + dtype='i') + scores, bp_scores = vec2scores.begin_update((vectors, hists), + drop=drop) else: scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) @@ -192,8 +201,10 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps, # Unpack the flat scores into lists for the two beams. The indices arrays # tell us which example and state the scores-row refers to. - p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices] - g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices] + p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') + for indices in p_indices] + g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') + for indices in g_indices] # Now advance the states in the beams. The gold beam is contrained to # to follow only gold analyses. pbeam.advance(p_scores) @@ -249,8 +260,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update): def get_gradient(nr_class, beam_maps, histories, losses): - """ - The global model assigns a loss to each parse. The beam scores + """The global model assigns a loss to each parse. The beam scores are additive, so the same gradient is applied to each action in the history. 
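
Before the docstring continues below, a stripped-down, framework-free sketch of that accumulation: each parse's loss is added to every (step, action) row of its history. The real `get_gradient` additionally routes rows through the per-step beam maps and skips zero or NaN losses; the class count and histories here are made up.

import numpy

def toy_beam_gradient(nr_class, histories, losses):
    """Spread each parse's loss over every (step, action) in its history."""
    nr_step = max(len(hist) for hist in histories)
    grads = [numpy.zeros((len(histories), nr_class), dtype='f')
             for _ in range(nr_step)]
    for row, (hist, loss) in enumerate(zip(histories, losses)):
        for step, action in enumerate(hist):
            grads[step][row, action] += loss    # same loss for every action taken
    return grads

# two candidate parses (action-id histories) with their losses
print(toy_beam_gradient(4, histories=[[0, 2, 1], [3, 1]], losses=[0.5, -0.5]))
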
This gives the gradient of a single *action* for a beam state -- so we have "the gradient of loss for taking @@ -270,7 +280,8 @@ def get_gradient(nr_class, beam_maps, histories, losses): if loss != 0.0 and not numpy.isnan(loss): nr_step = max(nr_step, len(hist)) for i in range(nr_step): - grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f')) + grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), + dtype='f')) assert len(histories) == len(losses) for eg_id, hists in enumerate(histories): for loss, hist in zip(losses[eg_id], hists): @@ -287,5 +298,3 @@ def get_gradient(nr_class, beam_maps, histories, losses): grads[j][i, clas] += loss key = key + tuple([clas]) return grads - - diff --git a/spacy/syntax/_state.pyx b/spacy/syntax/_state.pyx index 83c831f0b..e69de29bb 100644 --- a/spacy/syntax/_state.pyx +++ b/spacy/syntax/_state.pyx @@ -1 +0,0 @@ -# test diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 8adb8e52c..b3c9b5563 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -4,24 +4,16 @@ # coding: utf-8 from __future__ import unicode_literals -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF -import ctypes -from libc.stdint cimport uint32_t -from libc.string cimport memcpy +from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool from collections import OrderedDict from thinc.extra.search cimport Beam -import numpy from .stateclass cimport StateClass -from ._state cimport StateC, is_space_token +from ._state cimport StateC from .nonproj import is_nonproj_tree -from .transition_system cimport do_func_t, get_cost_func_t from .transition_system cimport move_cost_func_t, label_cost_func_t -from ..gold cimport GoldParse -from ..gold cimport GoldParseC -from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT -from ..lexeme cimport Lexeme +from ..gold cimport GoldParse, GoldParseC from ..structs cimport TokenC @@ -316,14 +308,13 @@ cdef class ArcEager(TransitionSystem): @classmethod def get_actions(cls, **kwargs): - actions = kwargs.get('actions', - OrderedDict(( - (SHIFT, ['']), - (REDUCE, ['']), - (RIGHT, []), - (LEFT, []), - (BREAK, ['ROOT']) - ))) + actions = kwargs.get('actions', OrderedDict(( + (SHIFT, ['']), + (REDUCE, ['']), + (RIGHT, []), + (LEFT, []), + (BREAK, ['ROOT'])) + )) seen_actions = set() for label in kwargs.get('left_labels', []): if label.upper() != 'ROOT': @@ -363,7 +354,8 @@ cdef class ArcEager(TransitionSystem): if gold.cand_to_gold[i] is None: continue if state.safe_get(i).dep: - predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep])) + predicted.add((i, state.H(i), + self.strings[state.safe_get(i).dep])) else: predicted.add((i, state.H(i), 'ROOT')) id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] @@ -381,7 +373,8 @@ cdef class ArcEager(TransitionSystem): if not self.has_gold(gold): return None for i in range(gold.length): - if gold.heads[i] is None or gold.labels[i] is None: # Missing values + # Missing values + if gold.heads[i] is None or gold.labels[i] is None: gold.c.heads[i] = i gold.c.has_dep[i] = False else: @@ -517,14 +510,15 @@ cdef class ArcEager(TransitionSystem): # Check projectivity --- leading cause if is_nonproj_tree(gold.heads): raise ValueError( - "Could not find a gold-standard action to supervise the dependency " - "parser.\n" - "Likely cause: the tree is non-projective (i.e. 
it has crossing " - "arcs -- see spacy/syntax/nonproj.pyx for definitions)\n" - "The ArcEager transition system only supports projective trees.\n" - "To learn non-projective representations, transform the data " - "before training and after parsing. Either pass make_projective=True " - "to the GoldParse class, or use PseudoProjectivity.preprocess_training_data") + "Could not find a gold-standard action to supervise the " + "dependency parser. Likely cause: the tree is " + "non-projective (i.e. it has crossing arcs -- see " + "spacy/syntax/nonproj.pyx for definitions). The ArcEager " + "transition system only supports projective trees. To " + "learn non-projective representations, transform the data " + "before training and after parsing. Either pass " + "make_projective=True to the GoldParse class, or use " + "spacy.syntax.nonproj.preprocess_training_data.") else: print(gold.orig_annot) print(gold.words) @@ -532,12 +526,10 @@ cdef class ArcEager(TransitionSystem): print(gold.labels) print(gold.sent_starts) raise ValueError( - "Could not find a gold-standard action to supervise the dependency " - "parser.\n" - "The GoldParse was projective.\n" - "The transition system has %d actions.\n" - "State at failure:\n" - "%s" % (self.n_moves, stcls.print_state(gold.words))) + "Could not find a gold-standard action to supervise the" + "dependency parser. The GoldParse was projective. The " + "transition system has %d actions. State at failure: %s" + % (self.n_moves, stcls.print_state(gold.words))) assert n_gold >= 1 def get_beam_annot(self, Beam beam): @@ -558,4 +550,3 @@ cdef class ArcEager(TransitionSystem): deps[j].setdefault(dep, 0.0) deps[j][dep] += prob return heads, deps - diff --git a/spacy/syntax/iterators.pxd b/spacy/syntax/iterators.pxd deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx deleted file mode 100644 index 557616d18..000000000 --- a/spacy/syntax/iterators.pyx +++ /dev/null @@ -1,144 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX - - -def english_noun_chunks(obj): - """ - Detect base noun phrases from a dependency parse. - Works on both Doc and Span. - """ - labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', - 'attr', 'ROOT'] - doc = obj.doc # Ensure works on both Doc and Span. 
- np_deps = [doc.vocab.strings.add(label) for label in labels] - conj = doc.vocab.strings.add('conj') - np_label = doc.vocab.strings.add('NP') - seen = set() - for i, word in enumerate(obj): - if word.pos not in (NOUN, PROPN, PRON): - continue - # Prevent nested chunks from being produced - if word.i in seen: - continue - if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.i+1)) - yield word.left_edge.i, word.i+1, np_label - elif word.dep == conj: - head = word.head - while head.dep == conj and head.head.i < head.i: - head = head.head - # If the head is an NP, and we're coordinated to it, we're an NP - if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.i+1)) - yield word.left_edge.i, word.i+1, np_label - - -# this iterator extracts spans headed by NOUNs starting from the left-most -# syntactic dependent until the NOUN itself -# for close apposition and measurement construction, the span is sometimes -# extended to the right of the NOUN -# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not -# just "eine Tasse", same for "das Thema Familie" -def german_noun_chunks(obj): - labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app'] - doc = obj.doc # Ensure works on both Doc and Span. - np_label = doc.vocab.strings.add('NP') - np_deps = set(doc.vocab.strings.add(label) for label in labels) - close_app = doc.vocab.strings.add('nk') - - rbracket = 0 - for i, word in enumerate(obj): - if i < rbracket: - continue - if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps: - rbracket = word.i+1 - # try to extend the span to the right - # to capture close apposition/measurement constructions - for rdep in doc[word.i].rights: - if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app: - rbracket = rdep.i+1 - yield word.left_edge.i, rbracket, np_label - - -def es_noun_chunks(obj): - doc = obj.doc - np_label = doc.vocab.strings['NP'] - left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed'] - right_labels = ['flat', 'fixed', 'compound', 'neg'] - stop_labels = ['punct'] - np_left_deps = [doc.vocab.strings[label] for label in left_labels] - np_right_deps = [doc.vocab.strings[label] for label in right_labels] - stop_deps = [doc.vocab.strings[label] for label in stop_labels] - - def next_token(token): - try: - return token.nbor() - except: - return None - - def noun_bounds(root): - def is_verb_token(token): - return token.pos in [VERB, AUX] - - left_bound = root - for token in reversed(list(root.lefts)): - if token.dep in np_left_deps: - left_bound = token - right_bound = root - for token in root.rights: - if (token.dep in np_right_deps): - left, right = noun_bounds(token) - if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps, - doc[left_bound.i: right.i])): - break - else: - right_bound = right - return left_bound, right_bound - - token = doc[0] - while token and token.i < len(doc): - if token.pos in [PROPN, NOUN, PRON]: - left, right = noun_bounds(token) - yield left.i, right.i+1, np_label - token = right - token = next_token(token) - - -def french_noun_chunks(obj): - labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss'] - doc = obj.doc # Ensure works on both Doc and Span. 
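
The per-language chunker functions being removed in this hunk (the French variant continues just below) implement the logic behind `Doc.noun_chunks`: each yields `(start, end, label)` triples that become `Span` objects labelled `NP`. For reference, the user-facing behaviour, assuming the `nlp` pipeline from the earlier examples includes a dependency parser:

doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')
for chunk in doc.noun_chunks:
    # each chunk is a Span labelled 'NP', rooted at the phrase head
    print(chunk.text, chunk.label_, chunk.root.dep_)
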
- np_deps = [doc.vocab.strings[label] for label in labels] - conj = doc.vocab.strings.add('conj') - np_label = doc.vocab.strings.add('NP') - seen = set() - for i, word in enumerate(obj): - if word.pos not in (NOUN, PROPN, PRON): - continue - # Prevent nested chunks from being produced - if word.i in seen: - continue - if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1)) - yield word.left_edge.i, word.right_edge.i+1, np_label - elif word.dep == conj: - head = word.head - while head.dep == conj and head.head.i < head.i: - head = head.head - # If the head is an NP, and we're coordinated to it, we're an NP - if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1)) - yield word.left_edge.i, word.right_edge.i+1, np_label - - -CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks, - 'es': es_noun_chunks, 'fr': french_noun_chunks} diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 5c4e42176..e2e242aea 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -4,17 +4,12 @@ from __future__ import unicode_literals from thinc.typedefs cimport weight_t from thinc.extra.search cimport Beam from collections import OrderedDict -import numpy -from thinc.neural.ops import NumpyOps from .stateclass cimport StateClass from ._state cimport StateC from .transition_system cimport Transition from .transition_system cimport do_func_t -from ..structs cimport TokenC, Entity -from ..gold cimport GoldParseC -from ..gold cimport GoldParse -from ..attrs cimport ENT_TYPE, ENT_IOB +from ..gold cimport GoldParseC, GoldParse cdef enum: @@ -69,15 +64,14 @@ cdef class BiluoPushDown(TransitionSystem): @classmethod def get_actions(cls, **kwargs): - actions = kwargs.get('actions', - OrderedDict(( - (MISSING, ['']), - (BEGIN, []), - (IN, []), - (LAST, []), - (UNIT, []), - (OUT, ['']) - ))) + actions = kwargs.get('actions', OrderedDict(( + (MISSING, ['']), + (BEGIN, []), + (IN, []), + (LAST, []), + (UNIT, []), + (OUT, ['']) + ))) seen_entities = set() for entity_type in kwargs.get('entity_types', []): if entity_type in seen_entities: @@ -160,7 +154,7 @@ cdef class BiluoPushDown(TransitionSystem): cdef Transition lookup_transition(self, object name) except *: cdef attr_t label - if name == '-' or name == None: + if name == '-' or name is None: return Transition(clas=0, move=MISSING, label=0, score=0) elif name == '!O': return Transition(clas=0, move=ISNT, label=0, score=0) @@ -328,8 +322,8 @@ cdef class In: return False elif preset_ent_iob == 3: return False - # TODO: Is this quite right? - # I think it's supposed to be ensuring the gazetteer matches are maintained + # TODO: Is this quite right? 
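The moves above encode the BILUO tagging scheme: Begin, In and Last for multi-token entities, Unit for single-token entities and Out for non-entity tokens. A generic sketch of the scheme itself (not spaCy's own conversion helper), mapping token-level entity offsets to BILUO tags:

def biluo_tags(n_tokens, entities):
    # entities: list of (start_token, end_token_exclusive, label) triples
    tags = ['O'] * n_tokens
    for start, end, label in entities:
        if end - start == 1:
            tags[start] = 'U-' + label
        else:
            tags[start] = 'B-' + label
            for i in range(start + 1, end - 1):
                tags[i] = 'I-' + label
            tags[end - 1] = 'L-' + label
    return tags

print(biluo_tags(6, [(1, 3, 'PERSON'), (4, 5, 'GPE')]))
# ['O', 'B-PERSON', 'L-PERSON', 'O', 'U-GPE', 'O']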
I think it's supposed to be ensuring the + # gazetteer matches are maintained elif st.B_(1).ent_iob != preset_ent_iob: return False # Don't allow entities to extend across sentence boundaries @@ -354,10 +348,12 @@ cdef class In: if g_act == MISSING: return 0 elif g_act == BEGIN: - # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk) + # I, Gold B --> True + # (P of bad open entity sunk, R of this entity sunk) return 0 elif g_act == IN: - # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk) + # I, Gold I --> True + # (label forced by prev, if mismatch, P and R both sunk) return 0 elif g_act == LAST: # I, Gold L --> True iff this entity sunk and next tag == O @@ -505,11 +501,3 @@ cdef class Out: return 1 else: return 1 - - -class OracleError(Exception): - pass - - -class UnknownMove(Exception): - pass diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 84b8e348f..1aa4443d0 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -4,79 +4,56 @@ # coding: utf-8 from __future__ import unicode_literals, print_function -from collections import Counter, OrderedDict +from collections import OrderedDict import ujson import json -import contextlib import numpy - -from libc.math cimport exp -cimport cython cimport cython.parallel import cytoolz -import dill - import numpy.random cimport numpy as np - -from libcpp.vector cimport vector -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF +from cpython.ref cimport PyObject, Py_XDECREF from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno -from libc.stdint cimport uint32_t, uint64_t -from libc.string cimport memset, memcpy -from libc.stdlib cimport malloc, calloc, free -from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t -from thinc.linear.avgtron cimport AveragedPerceptron -from thinc.linalg cimport Vec, VecVec -from thinc.structs cimport SparseArrayC, FeatureC, ExampleC -from thinc.extra.eg cimport Example +from libc.math cimport exp +from libcpp.vector cimport vector +from libc.string cimport memset +from libc.stdlib cimport calloc, free +from cymem.cymem cimport Pool +from thinc.typedefs cimport weight_t, class_t, hash_t from thinc.extra.search cimport Beam - -from cymem.cymem cimport Pool, Address -from murmurhash.mrmr cimport hash64 -from preshed.maps cimport MapStruct -from preshed.maps cimport map_get - -from thinc.api import layerize, chain, clone, with_flatten -from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU +from thinc.api import chain, clone +from thinc.v2v import Model, Maxout, Affine from thinc.misc import LayerNorm - -from thinc.neural.ops import NumpyOps, CupyOps +from thinc.neural.ops import CupyOps from thinc.neural.util import get_array_module +from thinc.linalg cimport Vec, VecVec -from .. import util -from ..util import get_async, get_cuda_stream -from .._ml import zero_init, PrecomputableAffine -from .._ml import Tok2Vec, doc2feats -from .._ml import Residual, drop_layer, flatten +from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten from .._ml import link_vectors_to_models -from .._ml import HistoryFeatures from ..compat import json_dumps, copy_array - +from ..tokens.doc cimport Doc +from ..gold cimport GoldParse +from .. import util from .stateclass cimport StateClass from ._state cimport StateC -from . 
import nonproj -from .transition_system import OracleError -from .transition_system cimport TransitionSystem, Transition -from ..structs cimport TokenC -from ..tokens.doc cimport Doc -from ..strings cimport StringStore -from ..gold cimport GoldParse -from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG -from . import _beam_utils +from .transition_system cimport Transition +from . import _beam_utils, nonproj def get_templates(*args, **kwargs): return [] + DEBUG = False + + def set_debug(val): global DEBUG DEBUG = val cdef class precompute_hiddens: - '''Allow a model to be "primed" by pre-computing input features in bulk. + """Allow a model to be "primed" by pre-computing input features in bulk. This is used for the parser, where we want to take a batch of documents, and compute vectors for each (token, position) pair. These vectors can then @@ -91,7 +68,7 @@ cdef class precompute_hiddens: so we can save the factor k. This also gives a nice CPU/GPU division: we can do all our hard maths up front, packed into large multiplications, and do the hard-to-program parsing on the CPU. - ''' + """ cdef int nF, nO, nP cdef bint _is_synchronized cdef public object ops @@ -101,7 +78,8 @@ cdef class precompute_hiddens: cdef object _cuda_stream cdef object _bp_hiddens - def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, drop=0.): + def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, + drop=0.): gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop) cdef np.ndarray cached if not isinstance(gpu_cached, numpy.ndarray): @@ -122,8 +100,7 @@ cdef class precompute_hiddens: self._bp_hiddens = bp_features cdef const float* get_feat_weights(self) except NULL: - if not self._is_synchronized \ - and self._cuda_stream is not None: + if not self._is_synchronized and self._cuda_stream is not None: self._cuda_stream.synchronize() self._is_synchronized = True return self._cached.data @@ -248,10 +225,10 @@ cdef class Parser: depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1)) if depth != 1: raise ValueError("Currently parser depth is hard-coded to 1.") - parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2)) - #if parser_maxout_pieces != 2: - # raise ValueError("Currently parser_maxout_pieces is hard-coded to 2") - token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128)) + parser_maxout_pieces = util.env_opt('parser_maxout_pieces', + cfg.get('maxout_pieces', 2)) + token_vector_width = util.env_opt('token_vector_width', + cfg.get('token_vector_width', 128)) hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200)) embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000)) hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0)) @@ -289,23 +266,19 @@ cdef class Parser: return (tok2vec, lower, upper), cfg def __init__(self, Vocab vocab, moves=True, model=True, **cfg): - """ - Create a Parser. + """Create a Parser. - Arguments: - vocab (Vocab): - The vocabulary object. Must be shared with documents to be processed. - The value is set to the .vocab attribute. - moves (TransitionSystem): - Defines how the parse-state is created, updated and evaluated. - The value is set to the .moves attribute unless True (default), - in which case a new instance is created with Parser.Moves(). - model (object): - Defines how the parse-state is created, updated and evaluated. 
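The precompute_hiddens docstring above describes a gather-and-sum trick: because the lower layer's weights are fixed while decoding, each token's contribution to every feature slot can be computed once for the whole batch, and scoring a parser state then reduces to summing a handful of cached rows. A standalone numpy sketch of that idea (shapes and names are illustrative, maxout pieces are omitted, and this is not the actual spaCy implementation):

import numpy as np

nF, nO, d, nr_tok = 13, 64, 128, 50     # feature slots, hidden units, vector width, tokens
W = np.random.randn(nF, d, nO)          # fixed lower-layer weights, one block per slot
tokvecs = np.random.randn(nr_tok, d)    # one vector per token in the batch

# Pre-compute every token's contribution to every feature slot once...
cached = np.einsum('td,fdo->tfo', tokvecs, W)          # (nr_tok, nF, nO)

# ...then a state's hidden layer is a gather-and-sum over its nF context tokens.
state_token_ids = np.random.randint(0, nr_tok, size=nF)
hidden = cached[state_token_ids, np.arange(nF)].sum(axis=0)

# Same result as multiplying the gathered token vectors through W directly.
direct = sum(tokvecs[t] @ W[f] for f, t in enumerate(state_token_ids))
assert np.allclose(hidden, direct)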
- The value is set to the .model attribute unless True (default), - in which case a new instance is created with Parser.Model(). - **cfg: - Arbitrary configuration parameters. Set to the .cfg attribute + vocab (Vocab): The vocabulary object. Must be shared with documents + to be processed. The value is set to the `.vocab` attribute. + moves (TransitionSystem): Defines how the parse-state is created, + updated and evaluated. The value is set to the .moves attribute + unless True (default), in which case a new instance is created with + `Parser.Moves()`. + model (object): Defines how the parse-state is created, updated and + evaluated. The value is set to the .model attribute unless True + (default), in which case a new instance is created with + `Parser.Model()`. + **cfg: Arbitrary configuration parameters. Set to the `.cfg` attribute """ self.vocab = vocab if moves is True: @@ -331,13 +304,10 @@ cdef class Parser: return (Parser, (self.vocab, self.moves, self.model), None, None) def __call__(self, Doc doc, beam_width=None, beam_density=None): - """ - Apply the parser or entity recognizer, setting the annotations onto the Doc object. + """Apply the parser or entity recognizer, setting the annotations onto + the `Doc` object. - Arguments: - doc (Doc): The document to be processed. - Returns: - None + doc (Doc): The document to be processed. """ if beam_width is None: beam_width = self.cfg.get('beam_width', 1) @@ -359,16 +329,13 @@ cdef class Parser: def pipe(self, docs, int batch_size=256, int n_threads=2, beam_width=None, beam_density=None): - """ - Process a stream of documents. + """Process a stream of documents. - Arguments: - stream: The sequence of documents to process. - batch_size (int): - The number of documents to accumulate into a working set. - n_threads (int): - The number of threads with which to work on the buffer in parallel. - Yields (Doc): Documents, in order. + stream: The sequence of documents to process. + batch_size (int): Number of documents to accumulate into a working set. + n_threads (int): The number of threads with which to work on the buffer + in parallel. + YIELDS (Doc): Documents, in order. 
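A minimal usage sketch for the pipe() method documented above, assuming `nlp`, `texts` and a loaded `parser` component exist (any other iterable of Doc objects works equally well as input):

docs = (nlp.make_doc(text) for text in texts)
for doc in parser.pipe(docs, batch_size=256, n_threads=2):
    print([(w.text, w.dep_, w.head.text) for w in doc])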
""" if beam_width is None: beam_width = self.cfg.get('beam_width', 1) @@ -385,8 +352,8 @@ cdef class Parser: parse_states = self.parse_batch(subbatch) beams = [] else: - beams = self.beam_parse(subbatch, - beam_width=beam_width, beam_density=beam_density) + beams = self.beam_parse(subbatch, beam_width=beam_width, + beam_density=beam_density) parse_states = [] for beam in beams: parse_states.append(beam.at(0)) @@ -406,9 +373,9 @@ cdef class Parser: if isinstance(docs, Doc): docs = [docs] - cuda_stream = get_cuda_stream() - (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, - 0.0) + cuda_stream = util.get_cuda_stream() + (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model( + docs, cuda_stream, 0.0) nr_state = len(docs) nr_class = self.moves.n_moves nr_dim = tokvecs.shape[1] @@ -422,7 +389,8 @@ cdef class Parser: feat_weights = state2vec.get_feat_weights() cdef int i - cdef np.ndarray hidden_weights = numpy.ascontiguousarray(vec2scores._layers[-1].W.T) + cdef np.ndarray hidden_weights = numpy.ascontiguousarray( + vec2scores._layers[-1].W.T) cdef np.ndarray hidden_bias = vec2scores._layers[-1].b hW = hidden_weights.data @@ -450,6 +418,7 @@ cdef class Parser: with gil: PyErr_SetFromErrno(MemoryError) PyErr_CheckSignals() + cdef float feature while not state.is_final(): state.set_context_tokens(token_ids, nr_feat) memset(vectors, 0, nr_hidden * nr_piece * sizeof(float)) @@ -489,9 +458,9 @@ cdef class Parser: cdef Doc doc cdef int nr_class = self.moves.n_moves cdef StateClass stcls, output - cuda_stream = get_cuda_stream() - (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, - 0.0) + cuda_stream = util.get_cuda_stream() + (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model( + docs, cuda_stream, 0.0) beams = [] cdef int offset = 0 cdef int j = 0 @@ -546,9 +515,7 @@ cdef class Parser: if isinstance(docs, Doc) and isinstance(golds, GoldParse): docs = [docs] golds = [golds] - - cuda_stream = get_cuda_stream() - + cuda_stream = util.get_cuda_stream() states, golds, max_steps = self._init_gold_batch(docs, golds) (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop) @@ -563,7 +530,6 @@ cdef class Parser: n_steps = 0 while todo: states, golds = zip(*todo) - token_ids = self.get_token_ids(states) vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0) if drop != 0: @@ -585,8 +551,8 @@ cdef class Parser: and not isinstance(token_ids, state2vec.ops.xp.ndarray): # Move token_ids and d_vector to GPU, asynchronously backprops.append(( - get_async(cuda_stream, token_ids), - get_async(cuda_stream, d_vector), + util.get_async(cuda_stream, token_ids), + util.get_async(cuda_stream, d_vector), bp_vector )) else: @@ -619,15 +585,13 @@ cdef class Parser: states = self.moves.init_batch(docs) for gold in golds: self.moves.preprocess_gold(gold) - - cuda_stream = get_cuda_stream() - (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop) - - states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500, - states, golds, - state2vec, vec2scores, - width, density, self.cfg.get('hist_size', 0), - drop=drop, losses=losses) + cuda_stream = util.get_cuda_stream() + (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model( + docs, cuda_stream, drop) + states_d_scores, backprops = _beam_utils.update_beam( + self.moves, self.nr_feature, 500, states, golds, state2vec, + vec2scores, width, density, 
self.cfg.get('hist_size', 0), + drop=drop, losses=losses) backprop_lower = [] cdef float batch_size = len(docs) for i, d_scores in enumerate(states_d_scores): @@ -639,13 +603,14 @@ cdef class Parser: if isinstance(self.model[0].ops, CupyOps) \ and not isinstance(ids, state2vec.ops.xp.ndarray): backprop_lower.append(( - get_async(cuda_stream, ids), - get_async(cuda_stream, d_vector), + util.get_async(cuda_stream, ids), + util.get_async(cuda_stream, d_vector), bp_vectors)) else: backprop_lower.append((ids, d_vector, bp_vectors)) d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) - self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream) + self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, + cuda_stream) def _init_gold_batch(self, whole_docs, whole_golds): """Make a square batch, of length equal to the shortest doc. A long @@ -796,7 +761,8 @@ cdef class Parser: def begin_training(self, gold_tuples, pipeline=None, **cfg): if 'model' in cfg: self.model = cfg['model'] - gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100) + gold_tuples = nonproj.preprocess_training_data(gold_tuples, + label_freq_cutoff=100) actions = self.moves.get_actions(gold_parses=gold_tuples) for action, labels in actions.items(): for label in labels: diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 499effcda..404f1bc90 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -1,39 +1,37 @@ # coding: utf-8 -""" -Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 +"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 for doing pseudo-projective parsing implementation uses the HEAD decoration scheme. """ from __future__ import unicode_literals + from copy import copy -from ..tokens.doc cimport Doc -from ..attrs import DEP, HEAD DELIMITER = '||' def ancestors(tokenid, heads): - # returns all words going from the word up the path to the root - # the path to root cannot be longer than the number of words in the sentence - # this function ends after at most len(heads) steps - # because it would otherwise loop indefinitely on cycles + # Returns all words going from the word up the path to the root. The path + # to root cannot be longer than the number of words in the sentence. This + # function ends after at most len(heads) steps, because it would otherwise + # loop indefinitely on cycles. head = tokenid cnt = 0 while heads[head] != head and cnt < len(heads): head = heads[head] cnt += 1 yield head - if head == None: + if head is None: break def contains_cycle(heads): - # in an acyclic tree, the path from each word following - # the head relation upwards always ends at the root node + # in an acyclic tree, the path from each word following the head relation + # upwards always ends at the root node for tokenid in range(len(heads)): seen = set([tokenid]) - for ancestor in ancestors(tokenid,heads): + for ancestor in ancestors(tokenid, heads): if ancestor in seen: return seen seen.add(ancestor) @@ -45,26 +43,26 @@ def is_nonproj_arc(tokenid, heads): # if there is a token k, h < k < d such that h is not # an ancestor of k. 
Same for h -> d, h > d head = heads[tokenid] - if head == tokenid: # root arcs cannot be non-projective + if head == tokenid: # root arcs cannot be non-projective return False - elif head == None: # unattached tokens cannot be non-projective + elif head is None: # unattached tokens cannot be non-projective return False start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head) - for k in range(start,end): - for ancestor in ancestors(k,heads): - if ancestor == None: # for unattached tokens/subtrees + for k in range(start, end): + for ancestor in ancestors(k, heads): + if ancestor is None: # for unattached tokens/subtrees break - elif ancestor == head: # normal case: k dominated by h + elif ancestor == head: # normal case: k dominated by h break - else: # head not in ancestors: d -> h is non-projective + else: # head not in ancestors: d -> h is non-projective return True return False def is_nonproj_tree(heads): # a tree is non-projective if at least one arc is non-projective - return any( is_nonproj_arc(word,heads) for word in range(len(heads)) ) + return any(is_nonproj_arc(word, heads) for word in range(len(heads))) def decompose(label): @@ -81,32 +79,32 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30): for raw_text, sents in gold_tuples: prepro_sents = [] for (ids, words, tags, heads, labels, iob), ctnts in sents: - proj_heads,deco_labels = projectivize(heads,labels) + proj_heads, deco_labels = projectivize(heads, labels) # set the label to ROOT for each root dependent - deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ] + deco_labels = ['ROOT' if head == i else deco_labels[i] + for i, head in enumerate(proj_heads)] # count label frequencies if label_freq_cutoff > 0: for label in deco_labels: if is_decorated(label): - freqs[label] = freqs.get(label,0) + 1 - prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts)) + freqs[label] = freqs.get(label, 0) + 1 + prepro_sents.append( + ((ids, words, tags, proj_heads, deco_labels, iob), ctnts)) preprocessed.append((raw_text, prepro_sents)) - if label_freq_cutoff > 0: - return _filter_labels(preprocessed,label_freq_cutoff,freqs) + return _filter_labels(preprocessed, label_freq_cutoff, freqs) return preprocessed def projectivize(heads, labels): - # use the algorithm by Nivre & Nilsson 2005 - # assumes heads to be a proper tree, i.e. connected and cycle-free - # returns a new pair (heads,labels) which encode - # a projective and decorated tree + # Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper + # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels) + # which encode a projective and decorated tree. proj_heads = copy(heads) smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) - if smallest_np_arc == None: # this sentence is already projective + if smallest_np_arc is None: # this sentence is already projective return proj_heads, copy(labels) - while smallest_np_arc != None: + while smallest_np_arc is not None: _lift(smallest_np_arc, proj_heads) smallest_np_arc = _get_smallest_nonproj_arc(proj_heads) deco_labels = _decorate(heads, proj_heads, labels) @@ -114,24 +112,26 @@ def projectivize(heads, labels): def deprojectivize(tokens): - # reattach arcs with decorated labels (following HEAD scheme) - # for each decorated arc X||Y, search top-down, left-to-right, - # breadth-first until hitting a Y then make this the new head + # Reattach arcs with decorated labels (following HEAD scheme). 
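Because these helpers operate on plain head-index lists, they are easy to exercise directly. A small worked example on the classic crossing-arc sentence "A hearing is scheduled on the issue today" (the head indices and dependency labels below are illustrative annotations, and the import path assumes the compiled module is accessible as spacy.syntax.nonproj):

from spacy.syntax.nonproj import is_nonproj_arc, is_nonproj_tree, projectivize

#          0      1            2      3       4       5      6       7
# tokens:  A      hearing      is     scheduled       on     the     issue     today
heads  = [ 1,     3,           3,     3,      1,      6,     4,      3]
labels = ['det', 'nsubjpass', 'aux', 'ROOT', 'prep', 'det', 'pobj', 'npadvmod']

print(is_nonproj_arc(4, heads))   # True: hearing -> on crosses scheduled -> today
print(is_nonproj_tree(heads))     # True

proj_heads, deco_labels = projectivize(heads, labels)
print(proj_heads)       # expected [1, 3, 3, 3, 3, 6, 4, 3]: "on" lifted to "scheduled"
print(deco_labels[4])   # expected 'prep||nsubjpass': decorated with the original head's label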
For each + # decorated arc X||Y, search top-down, left-to-right, breadth-first until + # hitting a Y then make this the new head. for token in tokens: if is_decorated(token.dep_): - newlabel,headlabel = decompose(token.dep_) - newhead = _find_new_head(token,headlabel) + newlabel, headlabel = decompose(token.dep_) + newhead = _find_new_head(token, headlabel) token.head = newhead token.dep_ = newlabel return tokens + def _decorate(heads, proj_heads, labels): # uses decoration scheme HEAD from Nivre & Nilsson 2005 assert(len(heads) == len(proj_heads) == len(labels)) deco_labels = [] - for tokenid,head in enumerate(heads): + for tokenid, head in enumerate(heads): if head != proj_heads[tokenid]: - deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head])) + deco_labels.append( + '%s%s%s' % (labels[tokenid], DELIMITER, labels[head])) else: deco_labels.append(labels[tokenid]) return deco_labels @@ -143,9 +143,9 @@ def _get_smallest_nonproj_arc(heads): # and ties are broken left to right smallest_size = float('inf') smallest_np_arc = None - for tokenid,head in enumerate(heads): + for tokenid, head in enumerate(heads): size = abs(tokenid-head) - if size < smallest_size and is_nonproj_arc(tokenid,heads): + if size < smallest_size and is_nonproj_arc(tokenid, heads): smallest_size = size smallest_np_arc = tokenid return smallest_np_arc @@ -168,8 +168,10 @@ def _find_new_head(token, headlabel): next_queue = [] for qtoken in queue: for child in qtoken.children: - if child.is_space: continue - if child == token: continue + if child.is_space: + continue + if child == token: + continue if child.dep_ == headlabel: return child next_queue.append(child) @@ -184,7 +186,10 @@ def _filter_labels(gold_tuples, cutoff, freqs): for raw_text, sents in gold_tuples: filtered_sents = [] for (ids, words, tags, heads, labels, iob), ctnts in sents: - filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ] - filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts)) + filtered_labels = [decompose(label)[0] + if freqs.get(label, cutoff) < cutoff + else label for label in labels] + filtered_sents.append( + ((ids, words, tags, heads, filtered_labels, iob), ctnts)) filtered.append((raw_text, filtered_sents)) return filtered diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index ddd1f558c..ea0ec77e5 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -2,17 +2,8 @@ # cython: infer_types=True from __future__ import unicode_literals -from libc.string cimport memcpy, memset -from libc.stdint cimport uint32_t, uint64_t import numpy -from ..vocab cimport EMPTY_LEXEME -from ..structs cimport Entity -from ..lexeme cimport Lexeme -from ..symbols cimport punct -from ..attrs cimport IS_SPACE -from ..attrs cimport attr_id_t -from ..tokens.token cimport Token from ..tokens.doc cimport Doc diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 922fdf97c..c351636c4 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -2,17 +2,17 @@ # coding: utf-8 from __future__ import unicode_literals -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF +from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool from thinc.typedefs cimport weight_t -from collections import defaultdict, OrderedDict +from collections import OrderedDict import ujson -from .. 
import util from ..structs cimport TokenC from .stateclass cimport StateClass -from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB from ..typedefs cimport attr_t +from ..compat import json_dumps +from .. import util cdef weight_t MIN_SCORE = -90000 @@ -136,11 +136,12 @@ cdef class TransitionSystem: print([gold.c.ner[i].clas for i in range(gold.length)]) print([gold.c.ner[i].move for i in range(gold.length)]) print([gold.c.ner[i].label for i in range(gold.length)]) - print("Self labels", [self.c[i].label for i in range(self.n_moves)]) + print("Self labels", + [self.c[i].label for i in range(self.n_moves)]) raise ValueError( "Could not find a gold-standard action to supervise " - "the entity recognizer\n" - "The transition system has %d actions." % (self.n_moves)) + "the entity recognizer. The transition system has " + "%d actions." % (self.n_moves)) def get_class_name(self, int clas): act = self.c[clas] @@ -149,7 +150,7 @@ cdef class TransitionSystem: def add_action(self, int action, label_name): cdef attr_t label_id if not isinstance(label_name, int) and \ - not isinstance(label_name, long): + not isinstance(label_name, long): label_id = self.strings.add(label_name) else: label_id = label_name @@ -186,7 +187,7 @@ cdef class TransitionSystem: 'name': self.move_name(trans.move, trans.label) }) serializers = { - 'transitions': lambda: ujson.dumps(transitions), + 'transitions': lambda: json_dumps(transitions), 'strings': lambda: self.strings.to_bytes() } return util.to_bytes(serializers, exclude) diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd deleted file mode 100644 index 6d2cef1f4..000000000 --- a/spacy/tagger.pxd +++ /dev/null @@ -1,17 +0,0 @@ -from thinc.linear.avgtron cimport AveragedPerceptron -from thinc.extra.eg cimport Example -from thinc.structs cimport ExampleC - -from .structs cimport TokenC -from .vocab cimport Vocab - - -cdef class TaggerModel(AveragedPerceptron): - cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except * - - -cdef class Tagger: - cdef readonly Vocab vocab - cdef readonly TaggerModel model - cdef public dict freqs - cdef public object cfg diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx deleted file mode 100644 index 0fadea15d..000000000 --- a/spacy/tagger.pyx +++ /dev/null @@ -1,253 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from collections import defaultdict - -from cymem.cymem cimport Pool -from thinc.typedefs cimport atom_t -from thinc.extra.eg cimport Example -from thinc.structs cimport ExampleC -from thinc.linear.avgtron cimport AveragedPerceptron -from thinc.linalg cimport VecVec - -from .tokens.doc cimport Doc -from .attrs cimport TAG -from .gold cimport GoldParse -from .attrs cimport * - - -cpdef enum: - P2_orth - P2_cluster - P2_shape - P2_prefix - P2_suffix - P2_pos - P2_lemma - P2_flags - - P1_orth - P1_cluster - P1_shape - P1_prefix - P1_suffix - P1_pos - P1_lemma - P1_flags - - W_orth - W_cluster - W_shape - W_prefix - W_suffix - W_pos - W_lemma - W_flags - - N1_orth - N1_cluster - N1_shape - N1_prefix - N1_suffix - N1_pos - N1_lemma - N1_flags - - N2_orth - N2_cluster - N2_shape - N2_prefix - N2_suffix - N2_pos - N2_lemma - N2_flags - - N_CONTEXT_FIELDS - - -cdef class TaggerModel(AveragedPerceptron): - def update(self, Example eg): - self.time += 1 - guess = eg.guess - best = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class) - if guess != best: - for feat in eg.c.features[:eg.c.nr_feat]: - self.update_weight(feat.key, best, -feat.value) - self.update_weight(feat.key, guess, 
feat.value) - - cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *: - _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2]) - _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1]) - _fill_from_token(&eg.atoms[W_orth], &tokens[i]) - _fill_from_token(&eg.atoms[N1_orth], &tokens[i+1]) - _fill_from_token(&eg.atoms[N2_orth], &tokens[i+2]) - - eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms) - - -cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: - context[0] = t.lex.lower - context[1] = t.lex.cluster - context[2] = t.lex.shape - context[3] = t.lex.prefix - context[4] = t.lex.suffix - context[5] = t.tag - context[6] = t.lemma - if t.lex.flags & (1 << IS_ALPHA): - context[7] = 1 - elif t.lex.flags & (1 << IS_PUNCT): - context[7] = 2 - elif t.lex.flags & (1 << LIKE_URL): - context[7] = 3 - elif t.lex.flags & (1 << LIKE_NUM): - context[7] = 4 - else: - context[7] = 0 - - -cdef class Tagger: - """Annotate part-of-speech tags on Doc objects.""" - - def __init__(self, Vocab vocab, TaggerModel model=None, **cfg): - """Create a Tagger. - - vocab (Vocab): The vocabulary object. Must be shared with documents to - be processed. - model (thinc.linear.AveragedPerceptron): The statistical model. - RETURNS (Tagger): The newly constructed object. - """ - if model is None: - model = TaggerModel(cfg.get('features', self.feature_templates), - L1=0.0) - self.vocab = vocab - self.model = model - self.model.l1_penalty = 0.0 - # TODO: Move this to tag map - self.freqs = {TAG: defaultdict(int)} - for tag in self.tag_names: - self.freqs[TAG][self.vocab.strings[tag]] = 1 - self.freqs[TAG][0] = 1 - self.cfg = cfg - - @property - def tag_names(self): - return self.vocab.morphology.tag_names - - def __reduce__(self): - return (self.__class__, (self.vocab, self.model), None, None) - - def tag_from_strings(self, Doc tokens, object tag_strs): - cdef int i - for i in range(tokens.length): - self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i]) - tokens.is_tagged = True - tokens._py_tokens = [None] * tokens.length - - def __call__(self, Doc tokens): - """Apply the tagger, setting the POS tags onto the Doc object. - - doc (Doc): The tokens to be tagged. - """ - if tokens.length == 0: - return 0 - - cdef Pool mem = Pool() - - cdef int i, tag - cdef Example eg = Example(nr_atom=N_CONTEXT_FIELDS, - nr_class=self.vocab.morphology.n_tags, - nr_feat=self.model.nr_feat) - for i in range(tokens.length): - if tokens.c[i].pos == 0: - self.model.set_featuresC(&eg.c, tokens.c, i) - self.model.set_scoresC(eg.c.scores, - eg.c.features, eg.c.nr_feat) - guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) - self.vocab.morphology.assign_tag_id(&tokens.c[i], guess) - eg.fill_scores(0, eg.c.nr_class) - tokens.is_tagged = True - tokens._py_tokens = [None] * tokens.length - - def pipe(self, stream, batch_size=1000, n_threads=2): - """Tag a stream of documents. - - Arguments: - stream: The sequence of documents to tag. - batch_size (int): The number of documents to accumulate into a working set. - n_threads (int): The number of threads with which to work on the buffer - in parallel, if the Matcher implementation supports multi-threading. - YIELDS (Doc): Documents, in order. - """ - for doc in stream: - self(doc) - yield doc - - def update(self, Doc tokens, GoldParse gold, itn=0): - """Update the statistical model, with tags supplied for the given document. - - doc (Doc): The document to update on. - gold (GoldParse): Manager for the gold-standard tags. 
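The TaggerModel.update logic above delegates the actual weight changes to thinc's averaged perceptron. As a plain-Python reminder of the underlying technique (textbook form only; thinc's sign conventions and the weight averaging are not reproduced here):

from collections import defaultdict

weights = defaultdict(lambda: defaultdict(float))   # feature -> class -> weight

def perceptron_update(features, guess, gold):
    # Only update when the prediction was wrong: reward the gold class and
    # penalise the guessed class, for every active feature.
    if guess == gold:
        return
    for feat in features:
        weights[feat][gold] += 1.0
        weights[feat][guess] -= 1.0

def predict(features, classes):
    scores = {c: sum(weights[f][c] for f in features) for c in classes}
    return max(scores, key=scores.get)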
- RETURNS (int): Number of tags predicted correctly. - """ - gold_tag_strs = gold.tags - assert len(tokens) == len(gold_tag_strs) - for tag in gold_tag_strs: - if tag != None and tag not in self.tag_names: - msg = ("Unrecognized gold tag: %s. tag_map.json must contain all " - "gold tags, to maintain coarse-grained mapping.") - raise ValueError(msg % tag) - golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs] - cdef int correct = 0 - cdef Pool mem = Pool() - cdef Example eg = Example( - nr_atom=N_CONTEXT_FIELDS, - nr_class=self.vocab.morphology.n_tags, - nr_feat=self.model.nr_feat) - for i in range(tokens.length): - self.model.set_featuresC(&eg.c, tokens.c, i) - eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ] - self.model.set_scoresC(eg.c.scores, - eg.c.features, eg.c.nr_feat) - self.model.update(eg) - - self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess) - - correct += eg.cost == 0 - self.freqs[TAG][tokens.c[i].tag] += 1 - eg.fill_scores(0, eg.c.nr_class) - eg.fill_costs(0, eg.c.nr_class) - tokens.is_tagged = True - tokens._py_tokens = [None] * tokens.length - return correct - - - feature_templates = ( - (W_orth,), - (P1_lemma, P1_pos), - (P2_lemma, P2_pos), - (N1_orth,), - (N2_orth,), - - (W_suffix,), - (W_prefix,), - - (P1_pos,), - (P2_pos,), - (P1_pos, P2_pos), - (P1_pos, W_orth), - (P1_suffix,), - (N1_suffix,), - - (W_shape,), - (W_cluster,), - (N1_cluster,), - (N2_cluster,), - (P1_cluster,), - (P2_cluster,), - - (W_flags,), - (N1_flags,), - (N2_flags,), - (P1_flags,), - (P2_flags,), - ) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index c3bceb106..9493452a1 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -40,6 +40,8 @@ def parser(vocab): def test_init_parser(parser): pass +# TODO: This is flakey, because it depends on what the parser first learns. +@pytest.mark.xfail def test_add_label(parser): doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) doc = parser(doc) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index e865c60dd..ef31a5d5c 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -8,12 +8,11 @@ from cython.operator cimport preincrement as preinc from cymem.cymem cimport Pool from preshed.maps cimport PreshMap import regex as re - -from .strings cimport hash_string -from . import util cimport cython from .tokens.doc cimport Doc +from .strings cimport hash_string +from . import util cdef class Tokenizer: @@ -21,7 +20,7 @@ cdef class Tokenizer: boundaries. """ def __init__(self, Vocab vocab, rules=None, prefix_search=None, - suffix_search=None, infix_finditer=None, token_match=None): + suffix_search=None, infix_finditer=None, token_match=None): """Create a `Tokenizer`, to create `Doc` objects given unicode text. vocab (Vocab): A storage container for lexical types. @@ -74,9 +73,8 @@ cdef class Tokenizer: RETURNS (Doc): A container for linguistic annotations. """ if len(string) >= (2 ** 30): - raise ValueError( - "String is too long: %d characters. Max is 2**30." % len(string) - ) + msg = "String is too long: %d characters. Max is 2**30." + raise ValueError(msg % len(string)) cdef int length = len(string) cdef Doc doc = Doc(self.vocab) if length == 0: @@ -122,8 +120,8 @@ cdef class Tokenizer: """Tokenize a stream of texts. texts: A sequence of unicode texts. - batch_size (int): The number of texts to accumulate in an internal buffer. 
- n_threads (int): The number of threads to use, if the implementation + batch_size (int): Number of texts to accumulate in an internal buffer. + n_threads (int): Number of threads to use, if the implementation supports multi-threading. The default tokenizer is single-threaded. YIELDS (Doc): A sequence of Doc objects, in order. """ @@ -232,8 +230,8 @@ cdef class Tokenizer: if not matches: tokens.push_back(self.vocab.get(tokens.mem, string), False) else: - # let's say we have dyn-o-mite-dave - # the regex finds the start and end positions of the hyphens + # let's say we have dyn-o-mite-dave - the regex finds the + # start and end positions of the hyphens start = 0 for match in matches: infix_start = match.start() @@ -293,8 +291,8 @@ cdef class Tokenizer: return list(self.infix_finditer(string)) def find_prefix(self, unicode string): - """Find the length of a prefix that should be segmented from the string, - or None if no prefix rules match. + """Find the length of a prefix that should be segmented from the + string, or None if no prefix rules match. string (unicode): The string to segment. RETURNS (int): The length of the prefix if present, otherwise `None`. @@ -305,8 +303,8 @@ cdef class Tokenizer: return (match.end() - match.start()) if match is not None else 0 def find_suffix(self, unicode string): - """Find the length of a suffix that should be segmented from the string, - or None if no suffix rules match. + """Find the length of a suffix that should be segmented from the + string, or None if no suffix rules match. string (unicode): The string to segment. Returns (int): The length of the suffix if present, otherwise `None`. @@ -326,8 +324,8 @@ cdef class Tokenizer: string (unicode): The string to specially tokenize. token_attrs (iterable): A sequence of dicts, where each dict describes - a token and its attributes. The `ORTH` fields of the attributes must - exactly match the string when they are concatenated. + a token and its attributes. The `ORTH` fields of the attributes + must exactly match the string when they are concatenated. """ substrings = list(substrings) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) @@ -343,7 +341,7 @@ cdef class Tokenizer: """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if - it doesn't exist. Paths may be either strings or `Path`-like objects. + it doesn't exist. Paths may be either strings or Path-like objects. 
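A minimal usage sketch for add_special_case(), assuming `tokenizer` is an existing Tokenizer instance (e.g. nlp.tokenizer); note that the ORTH values must concatenate back to exactly the original string, as the docstring above requires:

from spacy.attrs import ORTH

tokenizer.add_special_case(u"don't", [{ORTH: u"do"}, {ORTH: u"n't"}])
doc = tokenizer(u"I don't care")
print([t.text for t in doc])   # ['I', 'do', "n't", 'care']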
""" with path.open('wb') as file_: file_.write(self.to_bytes(**exclude)) diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py index bc3794126..b4815abd2 100644 --- a/spacy/tokens/__init__.py +++ b/spacy/tokens/__init__.py @@ -2,4 +2,4 @@ from .doc import Doc from .token import Token from .span import Span -__all__ = [Doc, Token, Span] +__all__ = ['Doc', 'Token', 'Span'] diff --git a/spacy/tokens/binder.pyx b/spacy/tokens/binder.pyx deleted file mode 100644 index 0ee168579..000000000 --- a/spacy/tokens/binder.pyx +++ /dev/null @@ -1,21 +0,0 @@ -cdef class Binder: - def __init__(self, *docs): - pass - - def __iter__(self): - pass - - def __reduce__(self): - pass - - def to_bytes(self): - pass - - def from_bytes(cls, data): - pass - - def to_disk(self): - pass - - def from_disk(self, path): - pass diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 1bd61b256..7a2e95e4b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -23,9 +23,9 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t from ..attrs import intify_attrs, IDS from ..attrs cimport attr_id_t -from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER -from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE -from ..attrs cimport SENT_START +from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER +from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB +from ..attrs cimport ENT_TYPE, SENT_START from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..util import normalize_slice from ..compat import is_config, copy_reg, pickle @@ -78,24 +78,25 @@ def _get_chunker(lang): cdef class Doc: """A sequence of Token objects. Access sentences and named entities, export - annotations to numpy arrays, losslessly serialize to compressed binary strings. - The `Doc` object holds an array of `TokenC` structs. The Python-level - `Token` and `Span` objects are views of this array, i.e. they don't own - the data themselves. + annotations to numpy arrays, losslessly serialize to compressed binary + strings. The `Doc` object holds an array of `TokenC` structs. The + Python-level `Token` and `Span` objects are views of this array, i.e. + they don't own the data themselves. EXAMPLE: Construction 1 >>> doc = nlp(u'Some text') Construction 2 >>> from spacy.tokens import Doc - >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False]) + >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], + spaces=[True, False, False]) """ @classmethod def set_extension(cls, name, default=None, method=None, getter=None, setter=None): nr_defined = sum(t is not None for t in (default, getter, setter, method)) assert nr_defined == 1 - Underscore.doc_extensions[name] = (default, method, getter, setter) + Underscore.doc_extensions[name] = (default, method, getter, setter) @classmethod def get_extension(cls, name): @@ -109,15 +110,14 @@ cdef class Doc: orths_and_spaces=None): """Create a Doc object. - vocab (Vocab): A vocabulary object, which must match any models you want - to use (e.g. tokenizer, parser, entity recognizer). + vocab (Vocab): A vocabulary object, which must match any models you + want to use (e.g. tokenizer, parser, entity recognizer). words (list or None): A list of unicode strings to add to the document as words. If `None`, defaults to empty list. spaces (list or None): A list of boolean values, of the same length as words. 
True means that the word is followed by a space, False means it is not. If `None`, defaults to `[True]*len(words)` user_data (dict or None): Optional extra data to attach to the Doc. - RETURNS (Doc): The newly constructed object. """ self.vocab = vocab @@ -153,10 +153,10 @@ cdef class Doc: spaces = [True] * len(words) elif len(spaces) != len(words): raise ValueError( - "Arguments 'words' and 'spaces' should be sequences of the " - "same length, or 'spaces' should be left default at None. " - "spaces should be a sequence of booleans, with True meaning " - "that the word owns a ' ' character following it.") + "Arguments 'words' and 'spaces' should be sequences of " + "the same length, or 'spaces' should be left default at " + "None. spaces should be a sequence of booleans, with True " + "meaning that the word owns a ' ' character following it.") orths_and_spaces = zip(words, spaces) if orths_and_spaces is not None: for orth_space in orths_and_spaces: @@ -166,7 +166,8 @@ cdef class Doc: elif isinstance(orth_space, bytes): raise ValueError( "orths_and_spaces expects either List(unicode) or " - "List((unicode, bool)). Got bytes instance: %s" % (str(orth_space))) + "List((unicode, bool)). " + "Got bytes instance: %s" % (str(orth_space))) else: orth, has_space = orth_space # Note that we pass self.mem here --- we have ownership, if LexemeC @@ -186,7 +187,8 @@ cdef class Doc: def __getitem__(self, object i): """Get a `Token` or `Span` object. - i (int or tuple) The index of the token, or the slice of the document to get. + i (int or tuple) The index of the token, or the slice of the document + to get. RETURNS (Token or Span): The token at `doc[i]]`, or the span at `doc[start : end]`. @@ -199,11 +201,11 @@ cdef class Doc: >>> doc[start : end]] Get a `Span` object, starting at position `start` and ending at position `end`, where `start` and `end` are token indices. For - instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4. - Stepped slices (e.g. `doc[start : end : step]`) are not supported, - as `Span` objects must be contiguous (cannot have gaps). You can use - negative indices and open-ended ranges, which have their normal - Python semantics. + instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and + 4. Stepped slices (e.g. `doc[start : end : step]`) are not + supported, as `Span` objects must be contiguous (cannot have gaps). + You can use negative indices and open-ended ranges, which have + their normal Python semantics. """ if isinstance(i, slice): start, stop = normalize_slice(len(self), i.start, i.stop, i.step) @@ -262,8 +264,10 @@ cdef class Doc: doc (Doc): The parent document. start (int): The index of the first character of the span. end (int): The index of the first character after the span. - label (uint64 or string): A label to attach to the Span, e.g. for named entities. - vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + label (uint64 or string): A label to attach to the Span, e.g. for + named entities. + vector (ndarray[ndim=1, dtype='float32']): A meaning representation of + the span. RETURNS (Span): The newly constructed object. 
""" if not isinstance(label, int): @@ -322,7 +326,8 @@ cdef class Doc: if self._vector is not None: return self._vector elif not len(self): - self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f') + self._vector = numpy.zeros((self.vocab.vectors_length,), + dtype='f') return self._vector elif self.has_vector: vector = numpy.zeros((self.vocab.vectors_length,), dtype='f') @@ -334,7 +339,8 @@ cdef class Doc: self._vector = self.tensor.mean(axis=0) return self._vector else: - return numpy.zeros((self.vocab.vectors_length,), dtype='float32') + return numpy.zeros((self.vocab.vectors_length,), + dtype='float32') def __set__(self, value): self._vector = value @@ -377,13 +383,14 @@ cdef class Doc: return self.text property ents: - """Iterate over the entities in the document. Yields named-entity `Span` - objects, if the entity recognizer has been applied to the document. + """Iterate over the entities in the document. Yields named-entity + `Span` objects, if the entity recognizer has been applied to the + document. YIELDS (Span): Entities in the document. - EXAMPLE: Iterate over the span to get individual Token objects, or access - the label: + EXAMPLE: Iterate over the span to get individual Token objects, + or access the label: >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') >>> ents = list(tokens.ents) @@ -419,7 +426,8 @@ cdef class Doc: def __set__(self, ents): # TODO: # 1. Allow negative matches - # 2. Ensure pre-set NERs are not over-written during statistical prediction + # 2. Ensure pre-set NERs are not over-written during statistical + # prediction # 3. Test basic data-driven ORTH gazetteer # 4. Test more nuanced date and currency regex cdef int i @@ -428,7 +436,7 @@ cdef class Doc: # At this point we don't know whether the NER has run over the # Doc. If the ent_iob is missing, leave it missing. if self.c[i].ent_iob != 0: - self.c[i].ent_iob = 2 # Means O. Non-O are set from ents. + self.c[i].ent_iob = 2 # Means O. Non-O are set from ents. cdef attr_t ent_type cdef int start, end for ent_info in ents: @@ -456,10 +464,11 @@ cdef class Doc: property noun_chunks: """Iterate over the base noun phrases in the document. Yields base - noun-phrase #[code Span] objects, if the document has been syntactically - parsed. A base noun phrase, or "NP chunk", is a noun phrase that does - not permit other NPs to be nested within it – so no NP-level - coordination, no prepositional phrases, and no relative clauses. + noun-phrase #[code Span] objects, if the document has been + syntactically parsed. A base noun phrase, or "NP chunk", is a noun + phrase that does not permit other NPs to be nested within it – so no + NP-level coordination, no prepositional phrases, and no relative + clauses. YIELDS (Span): Noun chunks in the document. """ @@ -467,12 +476,14 @@ cdef class Doc: if not self.is_parsed: raise ValueError( "noun_chunks requires the dependency parse, which " - "requires data to be installed. For more info, see the " + "requires a statistical model to be installed and loaded. " + "For more info, see the " "documentation: \n%s\n" % about.__docs_models__) - # Accumulate the result before beginning to iterate over it. This prevents - # the tokenisation from being changed out from under us during the iteration. - # The tricky thing here is that Span accepts its tokenisation changing, - # so it's okay once we have the Span objects. See Issue #375 + # Accumulate the result before beginning to iterate over it. 
This + # prevents the tokenisation from being changed out from under us + # during the iteration. The tricky thing here is that Span accepts + # its tokenisation changing, so it's okay once we have the Span + # objects. See Issue #375. spans = [] for start, end, label in self.noun_chunks_iterator(self): spans.append(Span(self, start, end, label=label)) @@ -497,8 +508,9 @@ cdef class Doc: if not self.is_parsed: raise ValueError( - "sentence boundary detection requires the dependency parse, which " - "requires data to be installed. For more info, see the " + "Sentence boundary detection requires the dependency " + "parse, which requires a statistical model to be " + "installed and loaded. For more info, see the " "documentation: \n%s\n" % about.__docs_models__) cdef int i start = 0 @@ -537,12 +549,11 @@ cdef class Doc: @cython.boundscheck(False) cpdef np.ndarray to_array(self, object py_attr_ids): """Export given token attributes to a numpy `ndarray`. - - If `attr_ids` is a sequence of M attributes, the output array will - be of shape `(N, M)`, where N is the length of the `Doc` - (in tokens). If `attr_ids` is a single attribute, the output shape will - be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) - or string name (e.g. 'LEMMA' or 'lemma'). + If `attr_ids` is a sequence of M attributes, the output array will be + of shape `(N, M)`, where N is the length of the `Doc` (in tokens). If + `attr_ids` is a single attribute, the output shape will be (N,). You + can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) or + string name (e.g. 'LEMMA' or 'lemma'). attr_ids (list[]): A list of attributes (int IDs or string names). RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row @@ -566,18 +577,19 @@ cdef class Doc: # Allow strings, e.g. 'lemma' or 'LEMMA' py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_) for id_ in py_attr_ids] - # Make an array from the attributes --- otherwise our inner loop is Python - # dict iteration. + # Make an array from the attributes --- otherwise our inner loop is + # Python dict iteration. attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) - output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64) + output = numpy.ndarray(shape=(self.length, len(attr_ids)), + dtype=numpy.uint64) for i in range(self.length): for j, feature in enumerate(attr_ids): output[i, j] = get_token_attr(&self.c[i], feature) # Handle 1d case return output if len(attr_ids) >= 2 else output.reshape((self.length,)) - - def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): + def count_by(self, attr_id_t attr_id, exclude=None, + PreshCounter counts=None): """Count the frequencies of a given attribute. Produces a dict of `{attribute (int): count (ints)}` frequencies, keyed by the values of the given attribute ID. @@ -641,13 +653,12 @@ cdef class Doc: def from_array(self, attrs, array): if SENT_START in attrs and HEAD in attrs: raise ValueError( - "Conflicting attributes specified in doc.from_array():\n" + "Conflicting attributes specified in doc.from_array(): " "(HEAD, SENT_START)\n" - "The HEAD attribute currently sets sentence boundaries implicitly,\n" - "based on the tree structure. 
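A short usage sketch for to_array() and count_by() as documented above, assuming `doc` is an already-processed Doc:

from spacy.attrs import LOWER, POS, ENT_TYPE, ORTH

arr = doc.to_array([LOWER, POS, ENT_TYPE])   # uint64 matrix of shape (len(doc), 3)
lowers = doc.to_array([LOWER])               # single attribute -> shape (len(doc),)

counts = doc.count_by(ORTH)                  # {orth_id: frequency}
for orth_id, freq in counts.items():
    print(doc.vocab.strings[orth_id], freq)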
This means the HEAD attribute would " - "potentially override the sentence boundaries set by SENT_START.\n" - "See https://github.com/spacy-io/spaCy/issues/235 for details and " - "workarounds, and to propose solutions.") + "The HEAD attribute currently sets sentence boundaries " + "implicitly, based on the tree structure. This means the HEAD " + "attribute would potentially override the sentence boundaries " + "set by SENT_START.") cdef int i, col cdef attr_id_t attr_id cdef TokenC* tokens = self.c @@ -675,18 +686,14 @@ cdef class Doc: return self def get_lca_matrix(self): - ''' - Calculates the lowest common ancestor matrix - for a given Spacy doc. - Returns LCA matrix containing the integer index - of the ancestor, or -1 if no common ancestor is - found (ex if span excludes a necessary ancestor). - Apologies about the recursion, but the - impact on performance is negligible given - the natural limitations on the depth of a typical human sentence. - ''' + """Calculates the lowest common ancestor matrix for a given `Doc`. + Returns LCA matrix containing the integer index of the ancestor, or -1 + if no common ancestor is found (ex if span excludes a necessary + ancestor). Apologies about the recursion, but the impact on + performance is negligible given the natural limitations on the depth + of a typical human sentence. + """ # Efficiency notes: - # # We can easily improve the performance here by iterating in Cython. # To loop over the tokens in Cython, the easiest way is: # for token in doc.c[:doc.c.length]: @@ -705,7 +712,8 @@ cdef class Doc: elif (token_j.head == token_j) and (token_k.head == token_k): lca_index = -1 else: - lca_index = __pairwise_lca(token_j.head, token_k.head, lca_matrix) + lca_index = __pairwise_lca(token_j.head, token_k.head, + lca_matrix) lca_matrix[token_j.i][token_k.i] = lca_index lca_matrix[token_k.i][token_j.i] = lca_index @@ -719,14 +727,13 @@ cdef class Doc: token_k = self[k] lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix) lca_matrix[k][j] = lca_matrix[j][k] - return lca_matrix def to_disk(self, path, **exclude): """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if - it doesn't exist. Paths may be either strings or `Path`-like objects. + it doesn't exist. Paths may be either strings or Path-like objects. """ with path.open('wb') as file_: file_.write(self.to_bytes(**exclude)) @@ -749,7 +756,7 @@ cdef class Doc: RETURNS (bytes): A losslessly serialized copy of the `Doc`, including all annotations. """ - array_head = [LENGTH,SPACY,TAG,LEMMA,HEAD,DEP,ENT_IOB,ENT_TYPE] + array_head = [LENGTH, SPACY, TAG, LEMMA, HEAD, DEP, ENT_IOB, ENT_TYPE] # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. As a best guess, we *know* that within # keys, we must have tuples. In values we just have to hope @@ -792,7 +799,8 @@ cdef class Doc: # keys, we must have tuples. In values we just have to hope # users don't mind getting a list instead of a tuple. 
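A minimal round-trip sketch for the byte serialization documented above: the bytes can be loaded back into a fresh Doc that shares the same Vocab (assuming `doc` already exists):

from spacy.tokens import Doc

data = doc.to_bytes()
new_doc = Doc(doc.vocab).from_bytes(data)
assert [t.text for t in new_doc] == [t.text for t in doc]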
if 'user_data' not in exclude and 'user_data_keys' in msg: - user_data_keys = msgpack.loads(msg['user_data_keys'], use_list=False) + user_data_keys = msgpack.loads(msg['user_data_keys'], + use_list=False) user_data_values = msgpack.loads(msg['user_data_values']) for key, value in zip(user_data_keys, user_data_values): self.user_data[key] = value @@ -819,14 +827,15 @@ cdef class Doc: return self def merge(self, int start_idx, int end_idx, *args, **attributes): - """Retokenize the document, such that the span at `doc.text[start_idx : end_idx]` - is merged into a single token. If `start_idx` and `end_idx `do not mark - start and end token boundaries, the document remains unchanged. + """Retokenize the document, such that the span at + `doc.text[start_idx : end_idx]` is merged into a single token. If + `start_idx` and `end_idx `do not mark start and end token boundaries, + the document remains unchanged. - start_idx (int): The character index of the start of the slice to merge. - end_idx (int): The character index after the end of the slice to merge. + start_idx (int): Character index of the start of the slice to merge. + end_idx (int): Character index after the end of the slice to merge. **attributes: Attributes to assign to the merged token. By default, - attributes are inherited from the syntactic root token of the span. + attributes are inherited from the syntactic root of the span. RETURNS (Token): The newly merged token, or `None` if the start and end indices did not fall at token boundaries. """ @@ -847,10 +856,11 @@ cdef class Doc: attributes[ENT_TYPE] = attributes['ent_type'] elif args: raise ValueError( - "Doc.merge received %d non-keyword arguments. " - "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). " + "Doc.merge received %d non-keyword arguments. Expected either " + "3 arguments (deprecated), or 0 (use keyword arguments). " "Arguments supplied:\n%s\n" - "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes))) + "Keyword arguments: %s\n" % (len(args), repr(args), + repr(attributes))) # More deprecated attribute handling =/ if 'label' in attributes: @@ -882,8 +892,9 @@ cdef class Doc: Token.set_struct_attr(token, attr_name, attr_value) # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets - # Before thinking of something simpler, beware the case where a dependency - # bridges over the entity. Here the alignment of the tokens changes. + # Before thinking of something simpler, beware the case where a + # dependency bridges over the entity. Here the alignment of the + # tokens changes. 
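A usage sketch for merge(), assuming a loaded `nlp` pipeline; the character offsets come from the Span, and the keyword attributes override what would otherwise be inherited from the span's syntactic root:

doc = nlp(u'I flew to New York City.')
span = doc[3:6]                                # "New York City"
doc.merge(span.start_char, span.end_char,
          tag=u'NNP', lemma=u'New York City', ent_type=u'GPE')
print([t.text for t in doc])                   # ['I', 'flew', 'to', 'New York City', '.']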
span_root = span.root.i token.dep = span.root.dep # We update token.lex after keeping span root and dep, since @@ -932,8 +943,9 @@ cdef class Doc: >>> trees = doc.print_tree() >>> trees[1] {'modifiers': [ - {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', - 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, + {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', + 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', + 'lemma': 'Alice'}, {'modifiers': [ {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], @@ -1008,7 +1020,7 @@ def pickle_doc(doc): def unpickle_doc(vocab, hooks_and_data, bytes_data): user_data, doc_hooks, span_hooks, token_hooks = dill.loads(hooks_and_data) - + doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude='user_data') doc.user_hooks.update(doc_hooks) @@ -1018,4 +1030,3 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data): copy_reg.pickle(Doc, pickle_doc, unpickle_doc) - diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py index 4bc7099d7..92b2cd84c 100644 --- a/spacy/tokens/printers.py +++ b/spacy/tokens/printers.py @@ -43,8 +43,8 @@ def POS_tree(root, light=False, flat=False): def parse_tree(doc, light=False, flat=False): - """Makes a copy of the doc, then construct a syntactic parse tree, similar to - the one used in displaCy. Generates the POS tree for all sentences in a doc. + """Make a copy of the doc and construct a syntactic parse tree similar to + displaCy. Generates the POS tree for all sentences in a doc. doc (Doc): The doc for parsing. RETURNS (dict): The parse tree. @@ -66,8 +66,9 @@ def parse_tree(doc, light=False, flat=False): 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'} """ - doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) + doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE])) merge_ents(doc_clone) # merge the entities into single tokens first - return [POS_tree(sent.root, light=light, flat=flat) for sent in doc_clone.sents] + return [POS_tree(sent.root, light=light, flat=flat) + for sent in doc_clone.sents] diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 963292fdb..efe511089 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -35,15 +35,16 @@ cdef class Span: def has_extension(cls, name): return name in Underscore.span_extensions - def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None, - vector_norm=None): + def __cinit__(self, Doc doc, int start, int end, attr_t label=0, + vector=None, vector_norm=None): """Create a `Span` object from the slice `doc[start : end]`. doc (Doc): The parent document. start (int): The index of the first token of the span. end (int): The index of the first token after the span. label (uint64): A label to attach to the Span, e.g. for named entities. - vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + vector (ndarray[ndim=1, dtype='float32']): A meaning representation + of the span. RETURNS (Span): The newly constructed object. """ if not (0 <= start <= end <= len(doc)): @@ -127,14 +128,17 @@ cdef class Span: @property def _(self): + """User space for adding custom attribute extensions.""" return Underscore(Underscore.span_extensions, self, start=self.start_char, end=self.end_char) def as_doc(self): - '''Create a Doc object view of the Span's data. 
+ # TODO: fix + """Create a `Doc` object view of the Span's data. This is mostly + useful for C-typed interfaces. - This is mostly useful for C-typed interfaces. - ''' + RETURNS (Doc): The `Doc` view of the span. + """ cdef Doc doc = Doc(self.doc.vocab) doc.length = self.end-self.start doc.c = &self.doc.c[self.start] @@ -162,7 +166,8 @@ cdef class Span: attributes are inherited from the syntactic root token of the span. RETURNS (Token): The newly merged token. """ - return self.doc.merge(self.start_char, self.end_char, *args, **attributes) + return self.doc.merge(self.start_char, self.end_char, *args, + **attributes) def similarity(self, other): """Make a semantic similarity estimate. The default estimate is cosine @@ -179,24 +184,19 @@ cdef class Span: return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) def get_lca_matrix(self): - ''' - Calculates the lowest common ancestor matrix - for a given Spacy span. - Returns LCA matrix containing the integer index - of the ancestor, or -1 if no common ancestor is - found (ex if span excludes a necessary ancestor). - Apologies about the recursion, but the - impact on performance is negligible given - the natural limitations on the depth of a typical human sentence. - ''' - + """Calculates the lowest common ancestor matrix for a given `Span`. + Returns LCA matrix containing the integer index of the ancestor, or -1 + if no common ancestor is found (ex if span excludes a necessary + ancestor). Apologies about the recursion, but the impact on + performance is negligible given the natural limitations on the depth + of a typical human sentence. + """ def __pairwise_lca(token_j, token_k, lca_matrix, margins): offset = margins[0] token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j token_j_i = token_j.i - offset token_k_i = token_k.i - offset - if lca_matrix[token_j_i][token_k_i] != -2: return lca_matrix[token_j_i][token_k_i] elif token_j == token_k: @@ -209,23 +209,19 @@ cdef class Span: lca_index = -1 else: lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins) - lca_matrix[token_j_i][token_k_i] = lca_index lca_matrix[token_k_i][token_j_i] = lca_index - return lca_index lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) lca_matrix.fill(-2) margins = [self.start, self.end] - for j in range(len(self)): token_j = self[j] for k in range(len(self)): token_k = self[k] lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins) lca_matrix[k][j] = lca_matrix[j][k] - return lca_matrix cpdef np.ndarray to_array(self, object py_attr_ids): @@ -266,10 +262,7 @@ cdef class Span: self.end = end + 1 property sent: - """The sentence span that this span is a part of. - - RETURNS (Span): The sentence span that the span is a part of. - """ + """RETURNS (Span): The sentence span that the span is a part of.""" def __get__(self): if 'sent' in self.doc.user_span_hooks: return self.doc.user_span_hooks['sent'](self) @@ -282,13 +275,10 @@ cdef class Span: n += 1 if n >= self.doc.length: raise RuntimeError - return self.doc[root.l_edge : root.r_edge + 1] + return self.doc[root.l_edge:root.r_edge + 1] property has_vector: - """A boolean value indicating whether a word vector is associated with - the object. - - RETURNS (bool): Whether a word vector is associated with the object. + """RETURNS (bool): Whether a word vector is associated with the object. 
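        EXAMPLE (an illustrative sketch; assumes a pipeline with word vectors
            loaded, e.g. the hypothetical `nlp = spacy.load('en_core_web_md')`):
            >>> doc = nlp(u'I like apples')
            >>> assert doc[2:3].has_vector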
""" def __get__(self): if 'has_vector' in self.doc.user_span_hooks: @@ -310,10 +300,7 @@ cdef class Span: return self._vector property vector_norm: - """The L2 norm of the document's vector representation. - - RETURNS (float): The L2 norm of the vector representation. - """ + """RETURNS (float): The L2 norm of the vector representation.""" def __get__(self): if 'vector_norm' in self.doc.user_span_hooks: return self.doc.user_span_hooks['vector'](self) @@ -327,7 +314,9 @@ cdef class Span: return self._vector_norm property sentiment: - # TODO: docstring + """RETURNS (float): A scalar value indicating the positivity or + negativity of the span. + """ def __get__(self): if 'sentiment' in self.doc.user_span_hooks: return self.doc.user_span_hooks['sentiment'](self) @@ -335,10 +324,7 @@ cdef class Span: return sum([token.sentiment for token in self]) / len(self) property text: - """A unicode representation of the span text. - - RETURNS (unicode): The original verbatim text of the span. - """ + """RETURNS (unicode): The original verbatim text of the span.""" def __get__(self): text = self.text_with_ws if self[-1].whitespace_: @@ -349,7 +335,8 @@ cdef class Span: """The text content of the span with a trailing whitespace character if the last token has one. - RETURNS (unicode): The text content of the span (with trailing whitespace). + RETURNS (unicode): The text content of the span (with trailing + whitespace). """ def __get__(self): return u''.join([t.text_with_ws for t in self]) @@ -358,7 +345,8 @@ cdef class Span: """Yields base noun-phrase `Span` objects, if the document has been syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it – so no - NP-level coordination, no prepositional phrases, and no relative clauses. + NP-level coordination, no prepositional phrases, and no relative + clauses. YIELDS (Span): Base noun-phrase `Span` objects """ @@ -366,12 +354,14 @@ cdef class Span: if not self.doc.is_parsed: raise ValueError( "noun_chunks requires the dependency parse, which " - "requires data to be installed. For more info, see the " + "requires a statistical model to be installed and loaded. " + "For more info, see the " "documentation: \n%s\n" % about.__docs_models__) - # Accumulate the result before beginning to iterate over it. This prevents - # the tokenisation from being changed out from under us during the iteration. - # The tricky thing here is that Span accepts its tokenisation changing, - # so it's okay once we have the Span objects. See Issue #375 + # Accumulate the result before beginning to iterate over it. This + # prevents the tokenisation from being changed out from under us + # during the iteration. The tricky thing here is that Span accepts + # its tokenisation changing, so it's okay once we have the Span + # objects. See Issue #375 spans = [] cdef attr_t label for start, end, label in self.doc.noun_chunks_iterator(self): @@ -385,9 +375,9 @@ cdef class Span: RETURNS (Token): The root token. - EXAMPLE: The root token has the shortest path to the root of the sentence - (or is the root itself). If multiple words are equally high in the - tree, the first word is taken. For example: + EXAMPLE: The root token has the shortest path to the root of the + sentence (or is the root itself). If multiple words are equally + high in the tree, the first word is taken. 
For example: >>> toks = nlp(u'I like New York in Autumn.') @@ -437,11 +427,11 @@ cdef class Span: if self.doc.c[i].head == 0: return self.doc[i] # If we don't have a sentence root, we do something that's not so - # algorithmically clever, but I think should be quite fast, especially - # for short spans. + # algorithmically clever, but I think should be quite fast, + # especially for short spans. # For each word, we count the path length, and arg min this measure. - # We could use better tree logic to save steps here...But I think this - # should be okay. + # We could use better tree logic to save steps here...But I + # think this should be okay. cdef int current_best = self.doc.length cdef int root = -1 for i in range(self.start, self.end): @@ -463,7 +453,7 @@ cdef class Span: YIELDS (Token):A left-child of a token of the span. """ def __get__(self): - for token in reversed(self): # Reverse, so we get the tokens in order + for token in reversed(self): # Reverse, so we get tokens in order for left in token.lefts: if left.i < self.start: yield left @@ -480,6 +470,22 @@ cdef class Span: if right.i >= self.end: yield right + property n_lefts: + """RETURNS (int): The number of leftward immediate children of the + span, in the syntactic dependency parse. + """ + # TODO: implement + def __get__(self): + raise NotImplementedError + + property n_rights: + """RETURNS (int): The number of rightward immediate children of the + span, in the syntactic dependency parse. + """ + # TODO: implement + def __get__(self): + raise NotImplementedError + property subtree: """Tokens that descend from tokens in the span, but fall outside it. @@ -493,66 +499,55 @@ cdef class Span: yield from word.subtree property ent_id: - """An (integer) entity ID. Usually assigned by patterns in the `Matcher`. - - RETURNS (uint64): The entity ID. - """ + """RETURNS (uint64): The entity ID.""" def __get__(self): return self.root.ent_id def __set__(self, hash_t key): - # TODO raise NotImplementedError( - "Can't yet set ent_id from Span. Vote for this feature on the issue " - "tracker: http://github.com/explosion/spaCy/issues") + "Can't yet set ent_id from Span. Vote for this feature on " + "the issue tracker: http://github.com/explosion/spaCy/issues") property ent_id_: - """A (string) entity ID. Usually assigned by patterns in the `Matcher`. - - RETURNS (unicode): The entity ID. - """ + """RETURNS (unicode): The (string) entity ID.""" def __get__(self): return self.root.ent_id_ def __set__(self, hash_t key): - # TODO raise NotImplementedError( - "Can't yet set ent_id_ from Span. Vote for this feature on the issue " - "tracker: http://github.com/explosion/spaCy/issues") + "Can't yet set ent_id_ from Span. Vote for this feature on the " + "issue tracker: http://github.com/explosion/spaCy/issues") property orth_: - # TODO: docstring + """Verbatim text content (identical to Span.text). Exists mostly for + consistency with other attributes. + + RETURNS (unicode): The span's text.""" def __get__(self): - return ''.join([t.string for t in self]).strip() + return ''.join([t.orth_ for t in self]).strip() property lemma_: - """The span's lemma. - - RETURNS (unicode): The span's lemma. - """ + """RETURNS (unicode): The span's lemma.""" def __get__(self): return ' '.join([t.lemma_ for t in self]).strip() property upper_: - # TODO: docstring + """Deprecated. 
Use Span.text.upper() instead.""" def __get__(self): - return ''.join([t.string.upper() for t in self]).strip() + return ''.join([t.text_with_ws.upper() for t in self]).strip() property lower_: - # TODO: docstring + """Deprecated. Use Span.text.lower() instead.""" def __get__(self): - return ''.join([t.string.lower() for t in self]).strip() + return ''.join([t.text_with_ws.lower() for t in self]).strip() property string: - # TODO: docstring + """Deprecated: Use Span.text_with_ws instead.""" def __get__(self): - return ''.join([t.string for t in self]) + return ''.join([t.text_with_ws for t in self]) property label_: - """The span's label. - - RETURNS (unicode): The span's label. - """ + """RETURNS (unicode): The span's label.""" def __get__(self): return self.doc.vocab.strings[self.label] @@ -570,7 +565,8 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: n += 1 if n >= sent_length: raise RuntimeError( - "Array bounds exceeded while searching for root word. This likely " - "means the parse tree is in an invalid state. Please report this " - "issue here: http://github.com/explosion/spaCy/issues") + "Array bounds exceeded while searching for root word. This " + "likely means the parse tree is in an invalid state. Please " + "report this issue here: " + "http://github.com/explosion/spaCy/issues") return n diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 514934ca7..fa07d0e9e 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -14,17 +14,18 @@ from ..typedefs cimport hash_t from ..lexeme cimport Lexeme from .. import parts_of_speech from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE -from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV -from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP -from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER -from ..attrs cimport LEMMA, POS, TAG, DEP +from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT +from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL +from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX +from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP from ..compat import is_config from .. import about from .underscore import Underscore cdef class Token: - """An individual token – i.e. a word, punctuation symbol, whitespace, etc.""" + """An individual token – i.e. a word, punctuation symbol, whitespace, + etc.""" @classmethod def set_extension(cls, name, default=None, method=None, getter=None, setter=None): @@ -144,37 +145,33 @@ cdef class Token: return self.doc.user_token_hooks['similarity'](self) if self.vector_norm == 0 or other.vector_norm == 0: return 0.0 - return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + return (numpy.dot(self.vector, other.vector) / + (self.vector_norm * other.vector_norm)) property lex_id: - """ID of the token's lexical type. - - RETURNS (int): ID of the token's lexical type.""" + """RETURNS (int): Sequential ID of the token's lexical type.""" def __get__(self): return self.c.lex.id property rank: - # TODO: add docstring + """RETURNS (int): Sequential ID of the token's lexical type, used to + index into tables, e.g. 
for word vectors.""" def __get__(self): return self.c.lex.id property string: + """Deprecated: Use Token.text_with_ws instead.""" def __get__(self): return self.text_with_ws property text: - """A unicode representation of the token text. - - RETURNS (unicode): The original verbatim text of the token. - """ + """RETURNS (unicode): The original verbatim text of the token.""" def __get__(self): return self.orth_ property text_with_ws: - """The text content of the token with a trailing whitespace character if - it has one. - - RETURNS (unicode): The text content of the span (with trailing whitespace). + """RETURNS (unicode): The text content of the span (with trailing + whitespace). """ def __get__(self): cdef unicode orth = self.vocab.strings[self.c.lex.orth] @@ -184,74 +181,104 @@ cdef class Token: return orth property prob: + """RETURNS (float): Smoothed log probability estimate of token type.""" def __get__(self): return self.c.lex.prob property sentiment: + """RETURNS (float): A scalar value indicating the positivity or + negativity of the token.""" def __get__(self): if 'sentiment' in self.doc.user_token_hooks: return self.doc.user_token_hooks['sentiment'](self) return self.c.lex.sentiment property lang: + """RETURNS (uint64): ID of the language of the parent document's + vocabulary. + """ def __get__(self): return self.c.lex.lang property idx: + """RETURNS (int): The character offset of the token within the parent + document. + """ def __get__(self): return self.c.idx property cluster: + """RETURNS (int): Brown cluster ID.""" def __get__(self): return self.c.lex.cluster property orth: + """RETURNS (uint64): ID of the verbatim text content.""" def __get__(self): return self.c.lex.orth property lower: + """RETURNS (uint64): ID of the lowercase token text.""" def __get__(self): return self.c.lex.lower property norm: + """RETURNS (uint64): ID of the token's norm, i.e. a normalised form of + the token text. Usually set in the language's tokenizer exceptions + or norm exceptions. + """ def __get__(self): return self.c.lex.norm property shape: + """RETURNS (uint64): ID of the token's shape, a transform of the + tokens's string, to show orthographic features (e.g. "Xxxx", "dd"). + """ def __get__(self): return self.c.lex.shape property prefix: + """RETURNS (uint64): ID of a length-N substring from the start of the + token. Defaults to `N=1`. + """ def __get__(self): return self.c.lex.prefix property suffix: + """RETURNS (uint64): ID of a length-N substring from the end of the + token. Defaults to `N=3`. + """ def __get__(self): return self.c.lex.suffix property lemma: - """Base form of the word, with no inflectional suffixes. - - RETURNS (uint64): Token lemma. + """RETURNS (uint64): ID of the base form of the word, with no + inflectional suffixes. 
""" def __get__(self): return self.c.lemma + def __set__(self, attr_t lemma): self.c.lemma = lemma property pos: + """RETURNS (uint64): ID of coarse-grained part-of-speech tag.""" def __get__(self): return self.c.pos property tag: + """RETURNS (uint64): ID of fine-grained part-of-speech tag.""" def __get__(self): return self.c.tag + def __set__(self, attr_t tag): self.vocab.morphology.assign_tag(self.c, tag) property dep: + """RETURNS (uint64): ID of syntactic dependency label.""" def __get__(self): return self.c.dep + def __set__(self, attr_t label): self.c.dep = label @@ -292,23 +319,29 @@ cdef class Token: return numpy.sqrt((vector ** 2).sum()) property n_lefts: + """RETURNS (int): The number of leftward immediate children of the + word, in the syntactic dependency parse. + """ def __get__(self): return self.c.l_kids property n_rights: + """RETURNS (int): The number of rightward immediate children of the + word, in the syntactic dependency parse. + """ def __get__(self): return self.c.r_kids property sent_start: + # TODO: fix and document def __get__(self): return self.c.sent_start def __set__(self, value): if self.doc.is_parsed: raise ValueError( - 'Refusing to write to token.sent_start if its document is parsed, ' - 'because this may cause inconsistent state. ' - 'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.') + "Refusing to write to token.sent_start if its document " + "is parsed, because this may cause inconsistent state.") if value is None: self.c.sent_start = 0 elif value is True: @@ -316,15 +349,16 @@ cdef class Token: elif value is False: self.c.sent_start = -1 else: - raise ValueError("Invalid value for token.sent_start -- must be one of " - "None, True, False") + raise ValueError("Invalid value for token.sent_start. Must be " + "one of: None, True, False") property lefts: + """The leftward immediate children of the word, in the syntactic + dependency parse. + + YIELDS (Token): A left-child of the token. + """ def __get__(self): - """ - The leftward immediate children of the word, in the syntactic - dependency parse. - """ cdef int nr_iter = 0 cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge) while ptr < self.c: @@ -334,15 +368,16 @@ cdef class Token: nr_iter += 1 # This is ugly, but it's a way to guard out infinite loops if nr_iter >= 10000000: - raise RuntimeError( - "Possibly infinite loop encountered while looking for token.lefts") + raise RuntimeError("Possibly infinite loop encountered " + "while looking for token.lefts") property rights: + """The rightward immediate children of the word, in the syntactic + dependency parse. + + YIELDS (Token): A right-child of the token. + """ def __get__(self): - """ - The rightward immediate children of the word, in the syntactic - dependency parse. - """ cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) tokens = [] cdef int nr_iter = 0 @@ -352,27 +387,26 @@ cdef class Token: ptr -= 1 nr_iter += 1 if nr_iter >= 10000000: - raise RuntimeError( - "Possibly infinite loop encountered while looking for token.rights") + raise RuntimeError("Possibly infinite loop encountered " + "while looking for token.rights") tokens.reverse() for t in tokens: yield t property children: - """ - A sequence of the token's immediate syntactic children. + """A sequence of the token's immediate syntactic children. 
- Yields: Token A child token such that child.head==self + YIELDS (Token): A child token such that child.head==self """ def __get__(self): yield from self.lefts yield from self.rights property subtree: - """ - A sequence of all the token's syntactic descendents. + """A sequence of all the token's syntactic descendents. - Yields: Token A descendent token such that self.is_ancestor(descendent) + YIELDS (Token): A descendent token such that + `self.is_ancestor(descendent)`. """ def __get__(self): for word in self.lefts: @@ -422,18 +456,17 @@ cdef class Token: """ if self.doc is not descendant.doc: return False - return any( ancestor.i == self.i for ancestor in descendant.ancestors ) + return any(ancestor.i == self.i for ancestor in descendant.ancestors) property head: """The syntactic parent, or "governor", of this token. - RETURNS (Token): The token head. + RETURNS (Token): The token predicted by the parser to be the head of + the current token. """ def __get__(self): - """The token predicted by the parser to be the head of the current - token. - """ return self.doc[self.i + self.c.head] + def __set__(self, Token new_head): # this function sets the head of self to new_head # and updates the counters for left/right dependents @@ -453,16 +486,18 @@ cdef class Token: cdef Token anc, child # update number of deps of old head - if self.c.head > 0: # left dependent + if self.c.head > 0: # left dependent old_head.c.l_kids -= 1 if self.c.l_edge == old_head.c.l_edge: - # the token dominates the left edge so the left edge of the head - # may change when the token is reattached - # it may not change if the new head is a descendant of the current head + # the token dominates the left edge so the left edge of + # the head may change when the token is reattached, it may + # not change if the new head is a descendant of the current + # head new_edge = self.c.l_edge - # the new l_edge is the left-most l_edge on any of the other dependents - # where the l_edge is left of the head, otherwise it is the head + # the new l_edge is the left-most l_edge on any of the + # other dependents where the l_edge is left of the head, + # otherwise it is the head if not is_desc: new_edge = old_head.i for child in old_head.children: @@ -472,14 +507,15 @@ cdef class Token: new_edge = child.c.l_edge old_head.c.l_edge = new_edge - # walk up the tree from old_head and assign new l_edge to ancestors - # until an ancestor already has an l_edge that's further left + # walk up the tree from old_head and assign new l_edge to + # ancestors until an ancestor already has an l_edge that's + # further left for anc in old_head.ancestors: if anc.c.l_edge <= new_edge: break anc.c.l_edge = new_edge - elif self.c.head < 0: # right dependent + elif self.c.head < 0: # right dependent old_head.c.r_kids -= 1 # do the same thing as for l_edge if self.c.r_edge == old_head.c.r_edge: @@ -500,7 +536,7 @@ cdef class Token: anc.c.r_edge = new_edge # update number of deps of new head - if rel_newhead_i > 0: # left dependent + if rel_newhead_i > 0: # left dependent new_head.c.l_kids += 1 # walk up the tree from new head and set l_edge to self.l_edge # until you hit a token with an l_edge further to the left @@ -511,7 +547,7 @@ cdef class Token: break anc.c.l_edge = self.c.l_edge - elif rel_newhead_i < 0: # right dependent + elif rel_newhead_i < 0: # right dependent new_head.c.r_kids += 1 # do the same as for l_edge if self.c.r_edge > new_head.c.r_edge: @@ -542,12 +578,10 @@ cdef class Token: yield from word.conjuncts property ent_type: - """Named entity 
type. - - RETURNS (uint64): Named entity type. - """ + """RETURNS (uint64): Named entity type.""" def __get__(self): return self.c.ent_type + def __set__(self, ent_type): self.c.ent_type = ent_type @@ -561,19 +595,17 @@ cdef class Token: return self.c.ent_iob property ent_type_: - """Named entity type. - - RETURNS (unicode): Named entity type. - """ + """RETURNS (unicode): Named entity type.""" def __get__(self): return self.vocab.strings[self.c.ent_type] + def __set__(self, ent_type): self.c.ent_type = self.vocab.strings.add(ent_type) property ent_iob_: """IOB code of named entity tag. "B" means the token begins an entity, - "I" means it is inside an entity, "O" means it is outside an entity, and - "" means no entity tag is set. + "I" means it is inside an entity, "O" means it is outside an entity, + and "" means no entity tag is set. RETURNS (unicode): IOB code of named entity tag. """ @@ -582,10 +614,8 @@ cdef class Token: return iob_strings[self.c.ent_iob] property ent_id: - """ID of the entity the token is an instance of, if any. Usually - assigned by patterns in the Matcher. - - RETURNS (uint64): ID of the entity. + """RETURNS (uint64): ID of the entity the token is an instance of, + if any. """ def __get__(self): return self.c.ent_id @@ -594,10 +624,8 @@ cdef class Token: self.c.ent_id = key property ent_id_: - """ID of the entity the token is an instance of, if any. Usually - assigned by patterns in the Matcher. - - RETURNS (unicode): ID of the entity. + """RETURNS (unicode): ID of the entity the token is an instance of, + if any. """ def __get__(self): return self.vocab.strings[self.c.ent_id] @@ -606,107 +634,192 @@ cdef class Token: self.c.ent_id = self.vocab.strings.add(name) property whitespace_: + """RETURNS (unicode): The trailing whitespace character, if present. + """ def __get__(self): return ' ' if self.c.spacy else '' property orth_: + """RETURNS (unicode): Verbatim text content (identical to + `Token.text`). Exists mostly for consistency with the other + attributes. + """ def __get__(self): return self.vocab.strings[self.c.lex.orth] property lower_: + """RETURNS (unicode): The lowercase token text. Equivalent to + `Token.text.lower()`. + """ def __get__(self): return self.vocab.strings[self.c.lex.lower] property norm_: + """RETURNS (unicode): The token's norm, i.e. a normalised form of the + token text. Usually set in the language's tokenizer exceptions or + norm exceptions. + """ def __get__(self): return self.vocab.strings[self.c.lex.norm] property shape_: + """RETURNS (unicode): Transform of the token's string, to show + orthographic features. For example, "Xxxx" or "dd". + """ def __get__(self): return self.vocab.strings[self.c.lex.shape] property prefix_: + """RETURNS (unicode): A length-N substring from the start of the token. + Defaults to `N=1`. + """ def __get__(self): return self.vocab.strings[self.c.lex.prefix] property suffix_: + """RETURNS (unicode): A length-N substring from the end of the token. + Defaults to `N=3`. + """ def __get__(self): return self.vocab.strings[self.c.lex.suffix] property lang_: + """RETURNS (unicode): Language of the parent document's vocabulary, + e.g. 'en'. + """ def __get__(self): return self.vocab.strings[self.c.lex.lang] property lemma_: - """Base form of the word, with no inflectional suffixes. - - RETURNS (unicode): Token lemma. + """RETURNS (unicode): The token lemma, i.e. the base form of the word, + with no inflectional suffixes.
""" def __get__(self): return self.vocab.strings[self.c.lemma] + def __set__(self, unicode lemma_): self.c.lemma = self.vocab.strings.add(lemma_) property pos_: + """RETURNS (unicode): Coarse-grained part-of-speech tag.""" def __get__(self): return parts_of_speech.NAMES[self.c.pos] property tag_: + """RETURNS (unicode): Fine-grained part-of-speech tag.""" def __get__(self): return self.vocab.strings[self.c.tag] + def __set__(self, tag): self.tag = self.vocab.strings.add(tag) property dep_: + """RETURNS (unicode): The syntactic dependency label.""" def __get__(self): return self.vocab.strings[self.c.dep] + def __set__(self, unicode label): self.c.dep = self.vocab.strings.add(label) property is_oov: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV) + """RETURNS (bool): Whether the token is out-of-vocabulary.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_OOV) property is_stop: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_STOP) + """RETURNS (bool): Whether the token is a stop word, i.e. part of a + "stop list" defined by the language data. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_STOP) property is_alpha: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ALPHA) + """RETURNS (bool): Whether the token consists of alpha characters. + Equivalent to `token.text.isalpha()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_ALPHA) property is_ascii: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_ASCII) + """RETURNS (bool): Whether the token consists of ASCII characters. + Equivalent to `all(ord(c) < 128 for c in token.text)`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_ASCII) property is_digit: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_DIGIT) + """RETURNS (bool): Whether the token consists of digits. Equivalent to + `token.text.isdigit()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_DIGIT) property is_lower: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LOWER) + """RETURNS (bool): Whether the token is in lowercase. Equivalent to + `token.text.islower()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_LOWER) + + property is_upper: + """RETURNS (bool): Whether the token is in uppercase. Equivalent to + `token.text.isupper()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_UPPER) property is_title: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_TITLE) + """RETURNS (bool): Whether the token is in titlecase. Equivalent to + `token.text.istitle()`. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_TITLE) property is_punct: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) + """RETURNS (bool): Whether the token is punctuation.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_PUNCT) property is_space: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE) + """RETURNS (bool): Whether the token consists of whitespace characters. + Equivalent to `token.text.isspace()`.
+ """ + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_SPACE) property is_bracket: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) + """RETURNS (bool): Whether the token is a bracket.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) property is_quote: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) + """RETURNS (bool): Whether the token is a quotation mark.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) property is_left_punct: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) + """RETURNS (bool): Whether the token is a left punctuation mark.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) property is_right_punct: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) + """RETURNS (bool): Whether the token is a right punctuation mark.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) property like_url: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL) + """RETURNS (bool): Whether the token resembles a URL.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, LIKE_URL) property like_num: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_NUM) + """RETURNS (bool): Whether the token resembles a number, e.g. "10.9", + "10", "ten", etc. + """ + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, LIKE_NUM) property like_email: - def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL) + """RETURNS (bool): Whether the token resembles an email address.""" + def __get__(self): + return Lexeme.c_check_flag(self.c.lex, LIKE_EMAIL) diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py index 6e782647b..d80f50685 100644 --- a/spacy/tokens/underscore.py +++ b/spacy/tokens/underscore.py @@ -1,5 +1,9 @@ +# coding: utf8 +from __future__ import unicode_literals + import functools + class Underscore(object): doc_extensions = {} span_extensions = {} diff --git a/spacy/typedefs.pyx b/spacy/typedefs.pyx index 8b1378917..e69de29bb 100644 --- a/spacy/typedefs.pyx +++ b/spacy/typedefs.pyx @@ -1 +0,0 @@ - diff --git a/spacy/util.py b/spacy/util.py index ca5a40f97..a45d43c47 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -10,25 +10,27 @@ from pathlib import Path import sys import textwrap import random -import numpy -import io -import dill from collections import OrderedDict from thinc.neural._classes.model import Model import functools +from .symbols import ORTH +from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_ +from .compat import import_file + import msgpack import msgpack_numpy msgpack_numpy.patch() -import ujson - -from .symbols import ORTH -from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_ -from .compat import copy_array, normalize_string_keys, getattr_, import_file LANGUAGES = {} _data_path = Path(__file__).parent / 'data' +_PRINT_ENV = False + + +def set_env_log(value): + global _PRINT_ENV + _PRINT_ENV = value def get_lang_class(lang): @@ -38,11 +40,12 @@ def get_lang_class(lang): RETURNS (Language): Language class. """ global LANGUAGES - if not lang in LANGUAGES: + if lang not in LANGUAGES: try: module = importlib.import_module('.lang.%s' % lang, 'spacy') except ImportError: - raise ImportError("Can't import language %s from spacy.lang." %lang) + msg = "Can't import language %s from spacy.lang."
+ raise ImportError(msg % lang) LANGUAGES[lang] = getattr(module, module.__all__[0]) return LANGUAGES[lang] @@ -100,14 +103,14 @@ def load_model(name, **overrides): data_path = get_data_path() if not data_path or not data_path.exists(): raise IOError("Can't find spaCy data path: %s" % path2str(data_path)) - if isinstance(name, basestring_): - if name in set([d.name for d in data_path.iterdir()]): # in data dir / shortcut + if isinstance(name, basestring_): # in data dir / shortcut + if name in set([d.name for d in data_path.iterdir()]): return load_model_from_link(name, **overrides) - if is_package(name): # installed as package + if is_package(name): # installed as package return load_model_from_package(name, **overrides) - if Path(name).exists(): # path to model data directory + if Path(name).exists(): # path to model data directory return load_model_from_path(Path(name), **overrides) - elif hasattr(name, 'exists'): # Path or Path-like to model data + elif hasattr(name, 'exists'): # Path or Path-like to model data return load_model_from_path(name, **overrides) raise IOError("Can't find model '%s'" % name) @@ -120,7 +123,7 @@ def load_model_from_link(name, **overrides): except AttributeError: raise IOError( "Cant' load '%s'. If you're using a shortcut link, make sure it " - "points to a valid model package (not just a data directory)." % name) + "points to a valid package (not just a data directory)." % name) return cls.load(**overrides) @@ -164,7 +167,8 @@ def load_model_from_init_py(init_file, **overrides): data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version']) data_path = model_path / data_dir if not model_path.exists(): - raise ValueError("Can't find model directory: %s" % path2str(data_path)) + msg = "Can't find model directory: %s" + raise ValueError(msg % path2str(data_path)) return load_model_from_path(data_path, meta, **overrides) @@ -176,14 +180,16 @@ def get_model_meta(path): """ model_path = ensure_path(path) if not model_path.exists(): - raise ValueError("Can't find model directory: %s" % path2str(model_path)) + msg = "Can't find model directory: %s" + raise ValueError(msg % path2str(model_path)) meta_path = model_path / 'meta.json' if not meta_path.is_file(): raise IOError("Could not read meta.json from %s" % meta_path) meta = read_json(meta_path) for setting in ['lang', 'name', 'version']: if setting not in meta or not meta[setting]: - raise ValueError("No valid '%s' setting found in model meta.json" % setting) + msg = "No valid '%s' setting found in model meta.json" + raise ValueError(msg % setting) return meta @@ -240,7 +246,7 @@ def get_async(stream, numpy_array): return numpy_array else: array = cupy.ndarray(numpy_array.shape, order='C', - dtype=numpy_array.dtype) + dtype=numpy_array.dtype) array.set(numpy_array, stream=stream) return array @@ -274,12 +280,6 @@ def itershuffle(iterable, bufsize=1000): raise StopIteration -_PRINT_ENV = False -def set_env_log(value): - global _PRINT_ENV - _PRINT_ENV = value - - def env_opt(name, default=None): if type(default) is float: type_convert = float @@ -305,17 +305,20 @@ def read_regex(path): path = ensure_path(path) with path.open() as file_: entries = file_.read().split('\n') - expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) + expression = '|'.join(['^' + re.escape(piece) + for piece in entries if piece.strip()]) return re.compile(expression) def compile_prefix_regex(entries): if '(' in entries: # Handle deprecated data - expression = '|'.join(['^' + re.escape(piece) for piece 
in entries if piece.strip()]) + expression = '|'.join(['^' + re.escape(piece) + for piece in entries if piece.strip()]) return re.compile(expression) else: - expression = '|'.join(['^' + piece for piece in entries if piece.strip()]) + expression = '|'.join(['^' + piece + for piece in entries if piece.strip()]) return re.compile(expression) @@ -359,16 +362,15 @@ def update_exc(base_exceptions, *addition_dicts): exc = dict(base_exceptions) for additions in addition_dicts: for orth, token_attrs in additions.items(): - if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs): - msg = "Invalid value for ORTH in exception: key='%s', orths='%s'" + if not all(isinstance(attr[ORTH], unicode_) + for attr in token_attrs): + msg = "Invalid ORTH value in exception: key='%s', orths='%s'" raise ValueError(msg % (orth, token_attrs)) described_orth = ''.join(attr[ORTH] for attr in token_attrs) if orth != described_orth: - raise ValueError("Invalid tokenizer exception: ORTH values " - "combined don't match original string. " - "key='%s', orths='%s'" % (orth, described_orth)) - # overlap = set(exc.keys()).intersection(set(additions)) - # assert not overlap, overlap + msg = ("Invalid tokenizer exception: ORTH values combined " + "don't match original string. key='%s', orths='%s'") + raise ValueError(msg % (orth, described_orth)) exc.update(additions) exc = expand_exc(exc, "'", "’") return exc @@ -401,17 +403,15 @@ def normalize_slice(length, start, stop, step=None): raise ValueError("Stepped slices not supported in Span objects." "Try: list(tokens)[start:stop:step] instead.") if start is None: - start = 0 + start = 0 elif start < 0: - start += length + start += length start = min(length, max(0, start)) - if stop is None: - stop = length + stop = length elif stop < 0: - stop += length + stop += length stop = min(length, max(start, stop)) - assert 0 <= start <= stop <= length return start, stop @@ -428,7 +428,7 @@ def compounding(start, stop, compound): >>> assert next(sizes) == 1.5 * 1.5 """ def clip(value): - return max(value, stop) if (start>stop) else min(value, stop) + return max(value, stop) if (start > stop) else min(value, stop) curr = float(start) while True: yield clip(curr) @@ -438,7 +438,7 @@ def compounding(start, stop, compound): def decaying(start, stop, decay): """Yield an infinite series of linearly decaying values.""" def clip(value): - return max(value, stop) if (start>stop) else min(value, stop) + return max(value, stop) if (start > stop) else min(value, stop) nr_upd = 1. while True: yield clip(start * 1./(1. + decay * nr_upd)) @@ -530,17 +530,19 @@ def print_markdown(data, title=None): if isinstance(data, dict): data = list(data.items()) - markdown = ["* **{}:** {}".format(l, unicode_(v)) for l, v in data if not excl_value(v)] + markdown = ["* **{}:** {}".format(l, unicode_(v)) + for l, v in data if not excl_value(v)] if title: print("\n## {}".format(title)) print('\n{}\n'.format('\n'.join(markdown))) def prints(*texts, **kwargs): - """Print formatted message (manual ANSI escape sequences to avoid dependency) + """Print formatted message (manual ANSI escape sequences to avoid + dependency) *texts (unicode): Texts to print. Each argument is rendered as paragraph. - **kwargs: 'title' becomes coloured headline. 'exits'=True performs sys exit. + **kwargs: 'title' becomes coloured headline. exits=True performs sys exit. 
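    EXAMPLE (an illustrative sketch):
        >>> prints('Created output directory.', 'Restart to apply changes.',
        ...        title='Setup complete')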
""" exits = kwargs.get('exits', None) title = kwargs.get('title', None) @@ -570,7 +572,8 @@ def _wrap(text, wrap_max=80, indent=4): def minify_html(html): """Perform a template-specific, rudimentary HTML minification for displaCy. - Disclaimer: NOT a general-purpose solution, only removes indentation/newlines. + Disclaimer: NOT a general-purpose solution, only removes indentation and + newlines. html (unicode): Markup to minify. RETURNS (unicode): "Minified" HTML. diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index fa5fcf624..155d7b9d2 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,5 +1,6 @@ +# coding: utf8 from __future__ import unicode_literals -from libc.stdint cimport int32_t, uint64_t + import numpy from collections import OrderedDict import msgpack @@ -9,23 +10,20 @@ cimport numpy as np from thinc.neural.util import get_array_module from thinc.neural._classes.model import Model -from .typedefs cimport attr_t from .strings cimport StringStore -from . import util from .compat import basestring_, path2str +from . import util cdef class Vectors: - '''Store, save and load word vectors. + """Store, save and load word vectors. Vectors data is kept in the vectors.data attribute, which should be an - instance of numpy.ndarray (for CPU vectors) - or cupy.ndarray (for GPU vectors). - - vectors.key2row is a dictionary mapping word hashes to rows - in the vectors.data table. The array `vectors.keys` keeps - the keys in order, such that keys[vectors.key2row[key]] == key. - ''' + instance of numpy.ndarray (for CPU vectors) or cupy.ndarray + (for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to + rows in the vectors.data table. The array `vectors.keys` keeps the keys in + order, such that `keys[vectors.key2row[key]] == key`. + """ cdef public object data cdef readonly StringStore strings cdef public object key2row @@ -33,6 +31,16 @@ cdef class Vectors: cdef public int i def __init__(self, strings, width=0, data=None): + """Create a new vector store. To keep the vector table empty, pass + `width=0`. You can also create the vector table and add vectors one by + one, or set the vector values directly on initialisation. + + strings (StringStore or list): List of strings or StringStore that maps + strings to hash values, and vice versa. + width (int): Number of dimensions. + data (numpy.ndarray): The vector data. + RETURNS (Vectors): The newly created object. + """ if isinstance(strings, StringStore): self.strings = strings else: @@ -55,11 +63,13 @@ cdef class Vectors: return (Vectors, (self.strings, self.data)) def __getitem__(self, key): - '''Get a vector by key. If key is a string, it is hashed - to an integer ID using the vectors.strings table. + """Get a vector by key. If key is a string, it is hashed to an integer + ID using the vectors.strings table. If the integer key is not found in + the table, a KeyError is raised. - If the integer key is not found in the table, a KeyError is raised. - ''' + key (unicode / int): The key to get the vector for. + RETURNS (numpy.ndarray): The vector for the key. + """ if isinstance(key, basestring): key = self.strings[key] i = self.key2row[key] @@ -69,30 +79,47 @@ cdef class Vectors: return self.data[i] def __setitem__(self, key, vector): - '''Set a vector for the given key. If key is a string, it is hashed + """Set a vector for the given key. If key is a string, it is hashed to an integer ID using the vectors.strings table. - ''' + + key (unicode / int): The key to set the vector for. 
+ vector (numpy.ndarray): The vector to set. + """ if isinstance(key, basestring): key = self.strings.add(key) i = self.key2row[key] self.data[i] = vector def __iter__(self): - '''Yield vectors from the table.''' + """Yield vectors from the table. + + YIELDS (numpy.ndarray): A vector. + """ yield from self.data def __len__(self): - '''Return the number of vectors that have been assigned.''' + """Return the number of vectors that have been assigned. + + RETURNS (int): The number of vectors in the data. + """ return self.i def __contains__(self, key): - '''Check whether a key has a vector entry in the table.''' + """Check whether a key has a vector entry in the table. + + key (unicode / int): The key to check. + RETURNS (bool): Whether the key has a vector entry. + """ if isinstance(key, basestring_): key = self.strings[key] return key in self.key2row def add(self, key, vector=None): - '''Add a key to the table, optionally setting a vector value as well.''' + """Add a key to the table, optionally setting a vector value as well. + + key (unicode / int): The key to add. + vector (numpy.ndarray): An optional vector to add. + """ if isinstance(key, basestring_): key = self.strings.add(key) if key not in self.key2row: @@ -110,24 +137,36 @@ cdef class Vectors: return i def items(self): - '''Iterate over (string key, vector) pairs, in order.''' + """Iterate over `(string key, vector)` pairs, in order. + + YIELDS (tuple): A key/vector pair. + """ for i, key in enumerate(self.keys): string = self.strings[key] yield string, self.data[i] @property def shape(self): + """Get `(rows, dims)` tuples of number of rows and number of dimensions + in the vector table. + + RETURNS (tuple): A `(rows, dims)` pair. + """ return self.data.shape def most_similar(self, key): + # TODO: implement raise NotImplementedError def from_glove(self, path): - '''Load GloVe vectors from a directory. Assumes binary format, + """Load GloVe vectors from a directory. Assumes binary format, that the vocab is in a vocab.txt, and that vectors are named vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32 vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc. - By default GloVe outputs 64-bit vectors.''' + By default GloVe outputs 64-bit vectors. + + path (unicode / Path): The path to load the GloVe vectors from. + """ path = util.ensure_path(path) for name in path.iterdir(): if name.parts[-1].startswith('vectors'): @@ -150,9 +189,15 @@ cdef class Vectors: self.data def to_disk(self, path, **exclude): + """Save the current state to a directory. + + path (unicode / Path): A path to a directory, which will be created if + it doesn't exists. Either a string or a Path-like object. + """ xp = get_array_module(self.data) if xp is numpy: - save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) + save_array = lambda arr, file_: xp.save(file_, arr, + allow_pickle=False) else: save_array = lambda arr, file_: xp.save(file_, arr) serializers = OrderedDict(( @@ -162,6 +207,12 @@ cdef class Vectors: return util.to_disk(path, serializers, exclude) def from_disk(self, path, **exclude): + """Loads state from a directory. Modifies the object in place and + returns it. + + path (unicode / Path): Directory path, string or Path-like object. + RETURNS (Vectors): The modified object. + """ def load_keys(path): if path.exists(): self.keys = numpy.load(path2str(path)) @@ -182,6 +233,11 @@ cdef class Vectors: return self def to_bytes(self, **exclude): + """Serialize the current state to a binary string. 
+ + **exclude: Named attributes to prevent from being serialized. + RETURNS (bytes): The serialized form of the `Vectors` object. + """ def serialize_weights(): if hasattr(self.data, 'to_bytes'): return self.data.to_bytes() @@ -194,6 +250,12 @@ cdef class Vectors: return util.to_bytes(serializers, exclude) def from_bytes(self, data, **exclude): + """Load state from a binary string. + + data (bytes): The data to load from. + **exclude: Named attributes to prevent from being loaded. + RETURNS (Vectors): The `Vectors` object. + """ def deserialize_weights(b): if hasattr(self.data, 'from_bytes'): self.data.from_bytes() diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 38286cb90..8b09d7ee7 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -1,32 +1,24 @@ # coding: utf8 from __future__ import unicode_literals -import ujson -import re import numpy import dill -from libc.string cimport memset, memcpy -from libc.stdint cimport int32_t -from libc.math cimport sqrt -from cymem.cymem cimport Address from collections import OrderedDict from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme from .strings cimport hash_string from .typedefs cimport attr_t from .tokens.token cimport Token -from .attrs cimport PROB, LANG +from .attrs cimport PROB, LANG, ORTH, TAG from .structs cimport SerializedLexemeC -from .compat import copy_reg, pickle, basestring_ +from .compat import copy_reg, basestring_ from .lemmatizer import Lemmatizer from .attrs import intify_attrs from .vectors import Vectors -from . import util -from . import attrs -from . import symbols from ._ml import link_vectors_to_models +from . import util cdef class Vocab: @@ -35,23 +27,22 @@ cdef class Vocab: C-data that is shared between `Doc` objects. """ def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, - strings=tuple(), **deprecated_kwargs): + strings=tuple(), **deprecated_kwargs): """Create the vocabulary. - lex_attr_getters (dict): A dictionary mapping attribute IDs to functions - to compute them. Defaults to `None`. - tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained + lex_attr_getters (dict): A dictionary mapping attribute IDs to + functions to compute them. Defaults to `None`. + tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. lemmatizer (object): A lemmatizer. Defaults to `None`. strings (StringStore): StringStore that maps strings to integers, and vice versa. - RETURNS (Vocab): The newly constructed vocab object. + RETURNS (Vocab): The newly constructed object. """ lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} tag_map = tag_map if tag_map is not None else {} if lemmatizer in (None, True, False): lemmatizer = Lemmatizer({}, {}, {}) - self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() @@ -83,19 +74,20 @@ cdef class Vocab: The flag_getter function will be called over the words currently in the vocab, and then applied to new words as they occur. You'll then be able - to access the flag value on each token, using token.check_flag(flag_id). + to access the flag value on each token using token.check_flag(flag_id). See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`, `Token.check_flag`. - flag_getter (callable): A function `f(unicode) -> bool`, to get the flag - value. + flag_getter (callable): A function `f(unicode) -> bool`, to get the + flag value. 
flag_id (int): An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If -1, the lowest available bit will be chosen. RETURNS (int): The integer ID by which the flag value can be checked. EXAMPLE: - >>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'dislaCy']) + >>> my_product_getter = lambda text: text in ['spaCy', 'dislaCy'] + >>> MY_PRODUCT = nlp.vocab.add_flag(my_product_getter) >>> doc = nlp(u'I like spaCy') >>> assert doc[2].check_flag(MY_PRODUCT) == True """ @@ -106,9 +98,10 @@ cdef class Vocab: break else: raise ValueError( - "Cannot find empty bit for new lexical flag. All bits between " - "0 and 63 are occupied. You can replace one by specifying the " - "flag_id explicitly, e.g. nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA") + "Cannot find empty bit for new lexical flag. All bits " + "between 0 and 63 are occupied. You can replace one by " + "specifying the flag_id explicitly, e.g. " + "`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.") elif flag_id >= 64 or flag_id < 1: raise ValueError( "Invalid value for flag_id: %d. Flag IDs must be between " @@ -119,9 +112,9 @@ cdef class Vocab: return flag_id cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: - """Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme` - if necessary, using memory acquired from the given pool. If the pool - is the lexicon's own memory, the lexeme is saved in the lexicon. + """Get a pointer to a `LexemeC` from the lexicon, creating a new + `Lexeme` if necessary using memory acquired from the given pool. If the + pool is the lexicon's own memory, the lexeme is saved in the lexicon. """ if string == u'': return &EMPTY_LEXEME @@ -138,9 +131,9 @@ cdef class Vocab: return self._new_lexeme(mem, string) cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: - """Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme` - if necessary, using memory acquired from the given pool. If the pool - is the lexicon's own memory, the lexeme is saved in the lexicon. + """Get a pointer to a `LexemeC` from the lexicon, creating a new + `Lexeme` if necessary using memory acquired from the given pool. If the + pool is the lexicon's own memory, the lexeme is saved in the lexicon. """ if orth == 0: return &EMPTY_LEXEME @@ -202,8 +195,8 @@ cdef class Vocab: for orth, addr in self._by_orth.items(): yield Lexeme(self, orth) - def __getitem__(self, id_or_string): - """Retrieve a lexeme, given an int ID or a unicode string. If a + def __getitem__(self, id_or_string): + """Retrieve a lexeme, given an int ID or a unicode string. If a previously unseen unicode string is given, a new lexeme is created and stored. 
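# Usage sketch for the lookup behaviour documented above. Assumes a loaded
# pipeline `nlp`; the word 'coffee' is an arbitrary example.
#     lexeme = nlp.vocab[u'coffee']                      # unicode string key
#     assert nlp.vocab[lexeme.orth].orth_ == u'coffee'   # integer ID key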
@@ -228,13 +221,14 @@ cdef class Vocab: cdef int i tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) for i, props in enumerate(substrings): - props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True) + props = intify_attrs(props, strings_map=self.strings, + _do_deprecated=True) token = &tokens[i] # Set the special tokens up to have arbitrary attributes - lex = self.get_by_orth(self.mem, props[attrs.ORTH]) + lex = self.get_by_orth(self.mem, props[ORTH]) token.lex = lex - if attrs.TAG in props: - self.morphology.assign_tag(token, props[attrs.TAG]) + if TAG in props: + self.morphology.assign_tag(token, props[TAG]) for attr_id, value in props.items(): Token.set_struct_attr(token, attr_id, value) Lexeme.set_struct_attr(lex, attr_id, value) @@ -253,16 +247,13 @@ cdef class Vocab: self.vectors = Vectors(self.strings, width=new_dim) def get_vector(self, orth): - """Retrieve a vector for a word in the vocabulary. + """Retrieve a vector for a word in the vocabulary. Words can be looked + up by string or int ID. If no vectors data is loaded, ValueError is + raised. - Words can be looked up by string or int ID. - - RETURNS: - A word vector. Size and shape determined by the - vocab.vectors instance. Usually, a numpy ndarray - of shape (300,) and dtype float32. - - RAISES: If no vectors data is loaded, ValueError is raised. + RETURNS (numpy.ndarray): A word vector. Size + and shape determined by the `vocab.vectors` instance. Usually, a + numpy ndarray of shape (300,) and dtype float32. """ if isinstance(orth, basestring_): orth = self.strings.add(orth) @@ -272,21 +263,16 @@ cdef class Vocab: return numpy.zeros((self.vectors_length,), dtype='f') def set_vector(self, orth, vector): - """Set a vector for a word in the vocabulary. - - Words can be referenced by string or int ID. - - RETURNS: - None + """Set a vector for a word in the vocabulary. Words can be referenced + by string or int ID. """ if not isinstance(orth, basestring_): orth = self.strings[orth] self.vectors.add(orth, vector=vector) def has_vector(self, orth): - """Check whether a word has a vector. Returns False if no - vectors have been loaded. Words can be looked up by string - or int ID.""" + """Check whether a word has a vector. Returns False if no vectors have + been loaded. Words can be looked up by string or int ID.""" if isinstance(orth, basestring_): orth = self.strings.add(orth) return orth in self.vectors @@ -295,7 +281,7 @@ cdef class Vocab: """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if - it doesn't exist. Paths may be either strings or `Path`-like objects. + it doesn't exist. Paths may be either strings or Path-like objects. """ path = util.ensure_path(path) if not path.exists(): @@ -420,16 +406,13 @@ def pickle_vocab(vocab): length = vocab.length data_dir = vocab.data_dir lex_attr_getters = dill.dumps(vocab.lex_attr_getters) - lexemes_data = vocab.lexemes_to_bytes() - return (unpickle_vocab, - (sstore, morph, data_dir, lex_attr_getters, - lexemes_data, length)) + (sstore, morph, data_dir, lex_attr_getters, lexemes_data, length)) def unpickle_vocab(sstore, morphology, data_dir, - lex_attr_getters, bytes lexemes_data, int length): + lex_attr_getters, bytes lexemes_data, int length): cdef Vocab vocab = Vocab() vocab.length = length vocab.strings = sstore @@ -449,12 +432,10 @@ class LookupError(Exception): @classmethod def mismatched_strings(cls, id_, id_string, original_string): return cls( - "Error fetching a Lexeme from the Vocab. 
When looking up a string, " - "the lexeme returned had an orth ID that did not match the query string. " - "This means that the cached lexeme structs are mismatched to the " - "string encoding table. The mismatched:\n" - "Query string: {query}\n" - "Orth cached: {orth_str}\n" - "ID of orth: {orth_id}".format( - query=repr(original_string), orth_str=repr(id_string), orth_id=id_) - ) + "Error fetching a Lexeme from the Vocab. When looking up a " + "string, the lexeme returned had an orth ID that did not match " + "the query string. This means that the cached lexeme structs are " + "mismatched to the string encoding table. The mismatched:\n" + "Query string: {}\n" + "Orth cached: {}\n" + "Orth ID: {}".format(repr(original_string), repr(id_string), id_)) diff --git a/website/api/_top-level/_cli.jade b/website/api/_top-level/_cli.jade index fc573e0ec..f19eb43d0 100644 --- a/website/api/_top-level/_cli.jade +++ b/website/api/_top-level/_cli.jade @@ -134,11 +134,12 @@ p p | Convert files into spaCy's #[+a("/api/annotation#json-input") JSON format] | for use with the #[code train] command and other experiment management - | functions. The right converter is chosen based on the file extension of - | the input file. Currently only supports #[code .conllu]. + | functions. The converter can be specified on the command line, or + | chosen based on the file extension of the input file. +code(false, "bash", "$", false, false, true). - spacy convert [input_file] [output_dir] [--n-sents] [--morphology] + spacy convert [input_file] [output_dir] [--converter] [--n-sents] + [--morphology] +table(["Argument", "Type", "Description"]) +row @@ -151,6 +152,11 @@ p +cell positional +cell Output directory for converted JSON file. + +row + +cell #[code converter], #[code -c] + +cell option + +cell #[+tag-new(2)] Name of converter to use (see below). + +row +cell #[code --n-sents], #[code -n] +cell option @@ -166,6 +172,25 @@ p +cell flag +cell Show help message and available arguments. +p The following converters are available: + ++table(["ID", "Description"]) + +row + +cell #[code auto] + +cell Automatically pick converter based on file extension (default). + + +row + +cell #[code conllu], #[code conll] + +cell Universal Dependencies #[code .conllu] or #[code .conll] format. + + +row + +cell #[code ner] + +cell Tab-based named entity recognition format. + + +row + +cell #[code iob] + +cell IOB named entity recognition format. + +h(3, "train") Train p diff --git a/website/api/doc.jade b/website/api/doc.jade index ceb564c7a..ac91ad427 100644 --- a/website/api/doc.jade +++ b/website/api/doc.jade @@ -332,6 +332,26 @@ p +cell dict +cell A dictionary mapping attributes to integer counts. ++h(2, "get_lca_matrix") Doc.get_lca_matrix + +tag method + +p + | Calculates the lowest common ancestor matrix for a given #[code Doc]. + | Returns LCA matrix containing the integer index of the ancestor, or + | #[code -1] if no common ancestor is found, e.g. if span excludes a + | necessary ancestor. + ++aside-code("Example"). + doc = nlp(u"This is a test") + matrix = doc.get_lca_matrix() + # array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32) + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']] + +cell The lowest common ancestor matrix of the #[code Doc]. + +h(2, "to_array") Doc.to_array +tag method @@ -764,3 +784,10 @@ p +cell | A dictionary that allows customisation of properties of | #[code Span] children. 
+ + +row + +cell #[code _] + +cell #[code Underscore] + +cell + | User space for adding custom + | #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions]. diff --git a/website/api/lexeme.jade b/website/api/lexeme.jade index dddefd2d7..86fa18730 100644 --- a/website/api/lexeme.jade +++ b/website/api/lexeme.jade @@ -157,27 +157,61 @@ p The L2 norm of the lexeme's vector representation. +row +cell #[code vocab] +cell #[code Vocab] - +cell + +cell The lexeme's vocabulary. +row +cell #[code text] +cell unicode +cell Verbatim text content. + +row + +cell #[code orth] + +cell int + +cell ID of the verbatim text content. + + +row + +cell #[code orth_] + +cell unicode + +cell + | Verbatim text content (identical to #[code Lexeme.text]). Existst + | mostly for consistency with the other attributes. + +row +cell #[code lex_id] +cell int +cell ID of the lexeme's lexical type. + +row + +cell #[code rank] + +cell int + +cell + | Sequential ID of the lexemes's lexical type, used to index into + | tables, e.g. for word vectors. + + +row + +cell #[code flags] + +cell int + +cell Container of the lexeme's binary flags. + + +row + +cell #[code norm] + +cell int + +cell The lexemes's norm, i.e. a normalised form of the lexeme text. + + +row + +cell #[code norm_] + +cell unicode + +cell The lexemes's norm, i.e. a normalised form of the lexeme text. + +row +cell #[code lower] +cell int - +cell Lower-case form of the word. + +cell Lowercase form of the word. +row +cell #[code lower_] +cell unicode - +cell Lower-case form of the word. + +cell Lowercase form of the word. +row +cell #[code shape] @@ -192,22 +226,30 @@ p The L2 norm of the lexeme's vector representation. +row +cell #[code prefix] +cell int - +cell Length-N substring from the start of the word. Defaults to #[code N=1]. + +cell + | Length-N substring from the start of the word. Defaults to + | #[code N=1]. +row +cell #[code prefix_] +cell unicode - +cell Length-N substring from the start of the word. Defaults to #[code N=1]. + +cell + | Length-N substring from the start of the word. Defaults to + | #[code N=1]. +row +cell #[code suffix] +cell int - +cell Length-N substring from the end of the word. Defaults to #[code N=3]. + +cell + | Length-N substring from the end of the word. Defaults to + | #[code N=3]. +row +cell #[code suffix_] +cell unicode - +cell Length-N substring from the start of the word. Defaults to #[code N=3]. + +cell + | Length-N substring from the start of the word. Defaults to + | #[code N=3]. +row +cell #[code is_alpha] @@ -237,6 +279,13 @@ p The L2 norm of the lexeme's vector representation. | Is the lexeme in lowercase? Equivalent to | #[code lexeme.text.islower()]. + +row + +cell #[code is_upper] + +cell bool + +cell + | Is the lexeme in uppercase? Equivalent to + | #[code lexeme.text.isupper()]. + +row +cell #[code is_title] +cell bool @@ -249,6 +298,16 @@ p The L2 norm of the lexeme's vector representation. +cell bool +cell Is the lexeme punctuation? + +row + +cell #[code is_left_punct] + +cell bool + +cell Is the lexeme a left punctuation mark, e.g. #[code (]? + + +row + +cell #[code is_right_punct] + +cell bool + +cell Is the lexeme a right punctuation mark, e.g. #[code )]? + +row +cell #[code is_space] +cell bool @@ -256,6 +315,16 @@ p The L2 norm of the lexeme's vector representation. | Does the lexeme consist of whitespace characters? Equivalent to | #[code lexeme.text.isspace()]. + +row + +cell #[code is_bracket] + +cell bool + +cell Is the lexeme a bracket? 
+ + +row + +cell #[code is_quote] + +cell bool + +cell Is the lexeme a quotation mark? + +row +cell #[code like_url] +cell bool @@ -285,6 +354,7 @@ p The L2 norm of the lexeme's vector representation. +cell #[code lang] +cell int +cell Language of the parent vocabulary. + +row +cell #[code lang_] +cell unicode @@ -293,9 +363,16 @@ p The L2 norm of the lexeme's vector representation. +row +cell #[code prob] +cell float - +cell Smoothed log probability estimate of lexeme's type. + +cell Smoothed log probability estimate of the lexeme's type. + + +row + +cell #[code cluster] + +cell int + +cell Brown cluster ID. +row +cell #[code sentiment] +cell float - +cell A scalar value indicating the positivity or negativity of the lexeme. + +cell + | A scalar value indicating the positivity or negativity of the + | lexeme. diff --git a/website/api/span.jade b/website/api/span.jade index 2a55409f1..266518076 100644 --- a/website/api/span.jade +++ b/website/api/span.jade @@ -248,6 +248,28 @@ p +cell float +cell A scalar similarity score. Higher is more similar. ++h(2, "get_lca_matrix") Span.get_lca_matrix + +tag method + +p + | Calculates the lowest common ancestor matrix for a given #[code Span]. + | Returns LCA matrix containing the integer index of the ancestor, or + | #[code -1] if no common ancestor is found, e.g. if span excludes a + | necessary ancestor. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn') + span = doc[1:4] + matrix = span.get_lca_matrix() + # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32) + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']] + +cell The lowest common ancestor matrix of the #[code Span]. + + +h(2, "to_array") Span.to_array +tag method +tag-new(2) @@ -347,7 +369,7 @@ p +tag property +tag-model("parse") -p Tokens that are to the left of the span, whose head is within the span. +p Tokens that are to the left of the span, whose heads are within the span. +aside-code("Example"). doc = nlp(u'I like New York in Autumn.') @@ -364,7 +386,7 @@ p Tokens that are to the left of the span, whose head is within the span. +tag property +tag-model("parse") -p Tokens that are to the right of the span, whose head is within the span. +p Tokens that are to the right of the span, whose heads are within the span. +aside-code("Example"). doc = nlp(u'I like New York in Autumn.') @@ -377,6 +399,42 @@ p Tokens that are to the right of the span, whose head is within the span. +cell #[code Token] +cell A right-child of a token of the span. ++h(2, "n_lefts") Span.n_lefts + +tag property + +tag-model("parse") + +p + | The number of tokens that are to the left of the span, whose heads are + | within the span. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + assert doc[3:7].n_lefts == 1 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of left-child tokens. + ++h(2, "n_rights") Span.n_rights + +tag property + +tag-model("parse") + +p + | The number of tokens that are to the right of the span, whose heads are + | within the span. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + assert doc[2:4].n_rights == 1 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of right-child tokens. 
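The new span-level `n_lefts`, `n_rights` and `get_lca_matrix` entries above mirror the token-level API. A short consolidated sketch, assuming a parsed English pipeline is loaded; the model name is an assumption, and the exact children printed depend on the parse.

# Sketch combining the Span attributes documented above. Requires a model
# with a dependency parser; the model name is an assumption.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'I like New York in Autumn.')
span = doc[1:4]  # "like New York"

# lefts/rights: tokens to the left/right of the span whose heads are
# inside the span; n_lefts/n_rights are their counts.
print([t.text for t in span.lefts])
print([t.text for t in span.rights])
print(span.n_lefts)
print(span.n_rights)

# Entry [i, j] is the index (within the span) of the lowest common
# ancestor of span[i] and span[j], or -1 if they share none in the span.
matrix = span.get_lca_matrix()
print(matrix.shape)  # (3, 3), dtype int32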
+ +h(2, "subtree") Span.subtree +tag property +tag-model("parse") @@ -495,6 +553,18 @@ p | The text content of the span with a trailing whitespace character | if the last token has one. + +row + +cell #[code orth] + +cell int + +cell ID of the verbatim text content. + + +row + +cell #[code orth_] + +cell unicode + +cell + | Verbatim text content (identical to #[code Span.text]). Existst + | mostly for consistency with the other attributes. + +row +cell #[code label] +cell int @@ -519,3 +589,17 @@ p +cell #[code ent_id_] +cell unicode +cell The string ID of the named entity the token is an instance of. + + +row + +cell #[code sentiment] + +cell float + +cell + | A scalar value indicating the positivity or negativity of the + | span. + + +row + +cell #[code _] + +cell #[code Underscore] + +cell + | User space for adding custom + | #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions]. diff --git a/website/api/token.jade b/website/api/token.jade index 4062594b4..f8fa15fe8 100644 --- a/website/api/token.jade +++ b/website/api/token.jade @@ -302,6 +302,80 @@ p A sequence of the token's immediate syntactic children. +cell #[code Token] +cell A child token such that #[code child.head==self]. ++h(2, "lefts") Token.lefts + +tag property + +tag-model("parse") + +p + | The leftward immediate children of the word, in the syntactic dependency + | parse. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + lefts = [t.text for t in doc[3].lefts] + assert lefts == [u'New'] + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell yields + +cell #[code Token] + +cell A left-child of the token. + ++h(2, "rights") Token.rights + +tag property + +tag-model("parse") + +p + | The rightward immediate children of the word, in the syntactic + | dependency parse. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + rights = [t.text for t in doc[3].rights] + assert rights == [u'in'] + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell yields + +cell #[code Token] + +cell A right-child of the token. + ++h(2, "n_lefts") Token.n_lefts + +tag property + +tag-model("parse") + +p + | The number of leftward immediate children of the word, in the syntactic + | dependency parse. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + assert doc[3].n_lefts == 1 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of left-child tokens. + ++h(2, "n_rights") Token.n_rights + +tag property + +tag-model("parse") + +p + | The number of rightward immediate children of the word, in the syntactic + | dependency parse. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + assert doc[3].n_rights == 1 + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell int + +cell The number of right-child tokens. + +h(2, "subtree") Token.subtree +tag property +tag-model("parse") @@ -489,15 +563,35 @@ p The L2 norm of the token's vector representation. +cell unicode +cell Base form of the token, with no inflectional suffixes. + +row + +cell #[code norm] + +cell int + +cell + | The token's norm, i.e. a normalised form of the token text. + | Usually set in the language's + | #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or + | #[+a("/usage/adding-languages#norm-exceptions") norm exceptions]. + + +row + +cell #[code norm_] + +cell unicode + +cell + | The token's norm, i.e. a normalised form of the token text. 
+ | Usually set in the language's + | #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions] or + | #[+a("/usage/adding-languages#norm-exceptions") norm exceptions]. + +row +cell #[code lower] +cell int - +cell Lower-case form of the token. + +cell Lowercase form of the token. +row +cell #[code lower_] +cell unicode - +cell Lower-case form of the token. + +cell + | Lowercase form of the token text. Equivalent to + | #[code Token.text.lower()]. +row +cell #[code shape] @@ -537,7 +631,9 @@ p The L2 norm of the token's vector representation. +row +cell #[code suffix_] +cell unicode - +cell Length-N substring from the end of the token. Defaults to #[code N=3]. + +cell + | Length-N substring from the end of the token. Defaults to + | #[code N=3]. +row +cell #[code is_alpha] @@ -672,6 +768,7 @@ p The L2 norm of the token's vector representation. +cell #[code lang] +cell int +cell Language of the parent document's vocabulary. + +row +cell #[code lang_] +cell unicode @@ -690,9 +787,30 @@ p The L2 norm of the token's vector representation. +row +cell #[code sentiment] +cell float - +cell A scalar value indicating the positivity or negativity of the token. + +cell + | A scalar value indicating the positivity or negativity of the + | token. +row +cell #[code lex_id] +cell int - +cell ID of the token's lexical type. + +cell Sequential ID of the token's lexical type. + + +row + +cell #[code rank] + +cell int + +cell + | Sequential ID of the token's lexical type, used to index into + | tables, e.g. for word vectors. + + +row + +cell #[code cluster] + +cell int + +cell Brown cluster ID. + + +row + +cell #[code _] + +cell #[code Underscore] + +cell + | User space for adding custom + | #[+a("/usage/processing-pipelines#custom-components-attributes") attribute extensions]. diff --git a/website/api/vectors.jade b/website/api/vectors.jade index e08f34643..692bd1ca8 100644 --- a/website/api/vectors.jade +++ b/website/api/vectors.jade @@ -36,12 +36,14 @@ p | that maps strings to hash values, and vice versa. +row - +cell #[code data] - +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell #[code width] + +cell int + +cell Number of dimensions. +row - +cell #[code width] - +cell Number of dimensions. + +cell #[code data] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell The vector data. +row("foot") +cell returns @@ -208,7 +210,7 @@ p +row("foot") +cell returns +cell tuple - +cell #[code (rows, dims)] pairs. + +cell A #[code (rows, dims)] pair. +h(2, "from_glove") Vectors.from_glove +tag method @@ -238,11 +240,16 @@ p Save the current state to a directory. +table(["Name", "Type", "Description"]) +row +cell #[code path] - +cell unicode or #[code Path] + +cell unicode / #[code Path] +cell | A path to a directory, which will be created if it doesn't exist. | Paths may be either strings or #[code Path]-like objects. + +row + +cell #[code **exclude] + +cell - + +cell Named attributes to prevent from being saved. + +h(2, "from_disk") Vectors.from_disk +tag method @@ -255,7 +262,7 @@ p Loads state from a directory. Modifies the object in place and returns it. +table(["Name", "Type", "Description"]) +row +cell #[code path] - +cell unicode or #[code Path] + +cell unicode / #[code Path] +cell | A path to a directory. Paths may be either strings or | #[code Path]-like objects. @@ -297,7 +304,7 @@ p Load state from a binary string. +table(["Name", "Type", "Description"]) +row - +cell #[code bytes_data] + +cell #[code data] +cell bytes +cell The data to load from. 
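To round off the `Vectors` serialization changes above, here is a minimal round-trip sketch. The constructor and `add()` arguments follow the `(strings, width)` signature used elsewhere in this diff and may differ in other releases; treat them as an assumption.

# Round-trip sketch for the Vectors serialization API documented above.
# Constructor and add() arguments follow the signatures used in this diff
# and are an assumption for other releases.
import numpy
from spacy.strings import StringStore
from spacy.vectors import Vectors

vectors = Vectors(StringStore(), width=3)
vectors.add(u'apple', vector=numpy.asarray([1.0, 2.0, 3.0], dtype='f'))

# to_bytes() returns the serialized table; from_bytes(data) loads it into
# an existing instance and returns that instance. **exclude can be used to
# skip named attributes in either direction.
data = vectors.to_bytes()
restored = Vectors(StringStore(), width=3).from_bytes(data)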
diff --git a/website/usage/_linguistic-features/_dependency-parse.jade b/website/usage/_linguistic-features/_dependency-parse.jade index 85d9179df..0fcdd4713 100644 --- a/website/usage/_linguistic-features/_dependency-parse.jade +++ b/website/usage/_linguistic-features/_dependency-parse.jade @@ -111,11 +111,13 @@ p p | A few more convenience attributes are provided for iterating around the - | local tree from the token. The #[code .lefts] and #[code .rights] - | attributes provide sequences of syntactic children that occur before and - | after the token. Both sequences are in sentences order. There are also - | two integer-typed attributes, #[code .n_rights] and #[code .n_lefts], - | that give the number of left and right children. + | local tree from the token. The #[+api("token#lefts") #[code Token.lefts]] + | and #[+api("token#rights") #[code Token.rights]] attributes provide + | sequences of syntactic children that occur before and after the token. + | Both sequences are in sentence order. There are also two integer-typed + | attributes, #[+api("token#n_rights") #[code Token.n_rights]] and + | #[+api("token#n_lefts") #[code Token.n_lefts]], that give the number of + | left and right children. +code. doc = nlp(u'bright red apples on the tree') @@ -126,10 +128,11 @@ p p | You can get a whole phrase by its syntactic head using the - | #[code .subtree] attribute. This returns an ordered sequence of tokens. - | You can walk up the tree with the #[code .ancestors] attribute, and - | check dominance with the #[+api("token#is_ancestor") #[code .is_ancestor()]] - | method. + | #[+api("token#subtree") #[code Token.subtree]] attribute. This returns an + | ordered sequence of tokens. You can walk up the tree with the + | #[+api("token#ancestors") #[code Token.ancestors]] attribute, and + | check dominance with + | #[+api("token#is_ancestor") #[code Token.is_ancestor()]]. +aside("Projective vs. non-projective") | For the #[+a("/models/en") default English model], the