mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Merge changes from master
This commit is contained in:
commit
d2118792e7
|
@ -32,7 +32,7 @@ test_script:
|
||||||
# Note that you must use the environment variable %PYTHON% to refer to
|
# Note that you must use the environment variable %PYTHON% to refer to
|
||||||
# the interpreter you're using - Appveyor does not do anything special
|
# the interpreter you're using - Appveyor does not do anything special
|
||||||
# to put the Python version you want to use on PATH.
|
# to put the Python version you want to use on PATH.
|
||||||
- "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs"
|
- "%PYTHON%\\python.exe -m pytest spacy/"
|
||||||
|
|
||||||
after_test:
|
after_test:
|
||||||
# This step builds your wheels.
|
# This step builds your wheels.
|
||||||
|
|
|
@ -1,11 +0,0 @@
|
||||||
steps:
|
|
||||||
-
|
|
||||||
command: "fab env clean make test wheel"
|
|
||||||
label: ":dizzy: :python:"
|
|
||||||
artifact_paths: "dist/*.whl"
|
|
||||||
- wait
|
|
||||||
- trigger: "spacy-train-from-wheel"
|
|
||||||
label: ":dizzy: :train:"
|
|
||||||
build:
|
|
||||||
env:
|
|
||||||
SPACY_VERSION: "{$SPACY_VERSION}"
|
|
|
@ -182,7 +182,7 @@ If you've made a contribution to spaCy, you should fill in the
|
||||||
[spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that
|
[spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that
|
||||||
your contribution can be used across the project. If you agree to be bound by
|
your contribution can be used across the project. If you agree to be bound by
|
||||||
the terms of the agreement, fill in the [template](.github/CONTRIBUTOR_AGREEMENT.md)
|
the terms of the agreement, fill in the [template](.github/CONTRIBUTOR_AGREEMENT.md)
|
||||||
and include it with your pull request, or submit it separately to
|
and include it with your pull request, or submit it separately to
|
||||||
[`.github/contributors/`](/.github/contributors). The name of the file should be
|
[`.github/contributors/`](/.github/contributors). The name of the file should be
|
||||||
your GitHub username, with the extension `.md`. For example, the user
|
your GitHub username, with the extension `.md`. For example, the user
|
||||||
example_user would create the file `.github/contributors/example_user.md`.
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
|
@ -1,392 +0,0 @@
|
||||||
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
|
||||||
.conllu format for development data, allowing the official scorer to be used.
|
|
||||||
'''
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
import plac
|
|
||||||
import tqdm
|
|
||||||
import attr
|
|
||||||
from pathlib import Path
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import json
|
|
||||||
|
|
||||||
import spacy
|
|
||||||
import spacy.util
|
|
||||||
from spacy.tokens import Token, Doc
|
|
||||||
from spacy.gold import GoldParse
|
|
||||||
from spacy.syntax.nonproj import projectivize
|
|
||||||
from collections import defaultdict, Counter
|
|
||||||
from timeit import default_timer as timer
|
|
||||||
from spacy.matcher import Matcher
|
|
||||||
|
|
||||||
import itertools
|
|
||||||
import random
|
|
||||||
import numpy.random
|
|
||||||
import cytoolz
|
|
||||||
|
|
||||||
import conll17_ud_eval
|
|
||||||
|
|
||||||
import spacy.lang.zh
|
|
||||||
import spacy.lang.ja
|
|
||||||
|
|
||||||
spacy.lang.zh.Chinese.Defaults.use_jieba = False
|
|
||||||
spacy.lang.ja.Japanese.Defaults.use_janome = False
|
|
||||||
|
|
||||||
random.seed(0)
|
|
||||||
numpy.random.seed(0)
|
|
||||||
|
|
||||||
def minibatch_by_words(items, size=5000):
    '''Yield shuffled minibatches of (doc, gold) pairs, budgeted by word count.

    items: Iterable of (doc, gold) pairs; len(doc) is the word count.
    size: Word budget per batch. Either an int (same budget for every batch)
        or an iterable of ints (one budget per batch). When a finite
        schedule runs out, its last value is reused for the remaining
        batches.
    YIELDS: Lists of (doc, gold) pairs.
    RAISES: ValueError if size is an empty iterable.

    A batch keeps receiving pairs while its remaining budget is >= 0, so it
    may overshoot the budget by one document. Shuffling is done on a copy, so
    the caller's sequence is left untouched.
    '''
    items = list(items)
    random.shuffle(items)
    if isinstance(size, int):
        sizes = itertools.repeat(size)
    else:
        # iter() lets plain lists/tuples work as well as generators.
        sizes = iter(size)
    items = iter(items)
    last_size = None
    while True:
        # A bare next(sizes) would leak StopIteration out of this generator,
        # which Python 3.7+ converts into a RuntimeError (PEP 479).
        last_size = next(sizes, last_size)
        if last_size is None:
            raise ValueError("size must be an int or a non-empty iterable of ints")
        budget = last_size
        batch = []
        while budget >= 0:
            try:
                doc, gold = next(items)
            except StopIteration:
                if batch:
                    yield batch
                return
            budget -= len(doc)
            batch.append((doc, gold))
        if batch:
            yield batch
        else:
            # A negative budget can never admit a document; stop.
            break
|
|
||||||
|
|
||||||
################
|
|
||||||
# Data reading #
|
|
||||||
################
|
|
||||||
|
|
||||||
# Raw string: '\s' in a plain literal is an invalid escape sequence.
space_re = re.compile(r'\s+')


def split_text(text):
    '''Split raw text into paragraphs on blank lines ('\\n\\n'), stripping each
    paragraph and collapsing every run of whitespace in it to one space.

    text: The raw text.
    RETURNS: List of normalised paragraph strings.
    '''
    return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
|
|
||||||
|
|
||||||
|
|
||||||
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
              max_doc_length=None, limit=None):
    '''Build parallel lists of Doc and GoldParse objects from CoNLL-U input.

    nlp: Pipeline supplying the vocab and make_doc.
    conllu_file: Open file of CoNLL-U annotations.
    text_file: Open file of raw text, paragraphs separated by blank lines.
        The paragraph text itself is not used; pairing with it only bounds
        how many CoNLL-U documents are read.
    raw_text: If True, add docs made by nlp.make_doc from text rebuilt out
        of groups of gold sentences.
    oracle_segments: If True, add one Doc per gold sentence, built directly
        from the gold words and spaces.
    max_doc_length: Number of sentences at which a raw-text doc is cut.
    limit: Return once at least this many docs are collected (falsy: no limit).
    RETURNS: (docs, golds) tuple of lists.
    RAISES: ValueError if both raw_text and oracle_segments are False.
    '''
    if not (raw_text or oracle_segments):
        raise ValueError("At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
    conllu = read_conllu(conllu_file)
    docs = []
    golds = []
    for _paragraph, conllu_doc in zip(paragraphs, conllu):
        pending = []
        for conllu_sent in conllu_doc:
            sent = defaultdict(list)
            for row in conllu_sent:
                id_, word, _lemma, _pos, tag, _morph, head, dep, _, space_after = row
                # IDs containing '.' or '-' are not plain token rows
                # (CoNLL-U empty nodes and multiword-token ranges).
                if '.' in id_ or '-' in id_:
                    continue
                index = int(id_) - 1
                # A head of '0' marks the root, stored as pointing at itself.
                head_index = index if head == '0' else int(head) - 1
                sent['words'].append(word)
                sent['tags'].append(tag)
                sent['heads'].append(head_index)
                sent['deps'].append('ROOT' if dep == 'root' else dep)
                sent['spaces'].append(space_after == '_')
            sent['entities'] = ['-'] * len(sent['words'])
            sent['heads'], sent['deps'] = projectivize(sent['heads'],
                                                       sent['deps'])
            if oracle_segments:
                oracle_doc = Doc(nlp.vocab, words=sent['words'],
                                 spaces=sent['spaces'])
                docs.append(oracle_doc)
                golds.append(GoldParse(oracle_doc, **sent))

            pending.append(sent)
            if raw_text and max_doc_length and len(pending) >= max_doc_length:
                doc, gold = _make_gold(nlp, None, pending)
                pending = []
                docs.append(doc)
                golds.append(gold)
                # NOTE(review): confirm this limit check belongs inside the
                # max_doc_length branch rather than after every sentence.
                if limit and len(docs) >= limit:
                    return docs, golds

        # Flush the sentences left over at the end of the paragraph.
        if raw_text and pending:
            doc, gold = _make_gold(nlp, None, pending)
            docs.append(doc)
            golds.append(gold)
        # NOTE(review): confirm this limit check runs for every paragraph
        # rather than only after a flush.
        if limit and len(docs) >= limit:
            return docs, golds
    return docs, golds
|
|
||||||
|
|
||||||
|
|
||||||
def read_conllu(file_):
    '''Parse CoNLL-U lines into nested lists: docs -> sentences -> token rows.

    file_: Iterable of lines (e.g. an open text file).
    RETURNS: List of documents; each document is a list of sentences; each
        sentence is a list of 10-item lists of field strings.
    RAISES: ValueError if a token line does not have exactly 10
        tab-separated fields.

    A line starting with '# newdoc' begins a new document, other '#' lines
    are skipped, and a blank line ends the current sentence.
    '''
    docs = []
    doc = []
    sent = []
    for line in file_:
        if line.startswith('# newdoc'):
            # Tokens still pending belong to the document being closed, not
            # to the one that starts here.
            if sent:
                doc.append(sent)
                sent = []
            if doc:
                docs.append(doc)
            doc = []
        elif line.startswith('#'):
            continue
        elif not line.strip():
            if sent:
                doc.append(sent)
            sent = []
        else:
            fields = line.strip().split('\t')
            if len(fields) != 10:
                raise ValueError(
                    "Expected 10 tab-separated CoNLL-U fields, found %d: %r"
                    % (len(fields), line))
            sent.append(fields)
    # Flush whatever is left when the input does not end with a blank line.
    if sent:
        doc.append(sent)
    if doc:
        docs.append(doc)
    return docs
|
|
||||||
|
|
||||||
|
|
||||||
def _make_gold(nlp, text, sent_annots):
    '''Merge per-sentence annotations into one (Doc, GoldParse) pair.

    nlp: Pipeline whose make_doc tokenizes the text.
    text: Text of the document, or None to rebuild it from the gold words
        and their trailing-space flags.
    sent_annots: List of dicts with 'words', 'tags', 'heads', 'deps',
        'entities' and 'spaces' lists, one dict per sentence.
    RETURNS: (doc, gold) tuple.
    '''
    merged = defaultdict(list)
    for annots in sent_annots:
        # Heads are sentence-relative; shift them by the number of tokens
        # merged so far. This must happen before the words are extended.
        shift = len(merged['words'])
        merged['heads'].extend(shift + head for head in annots['heads'])
        for key in ('words', 'tags', 'deps', 'entities', 'spaces'):
            merged[key].extend(annots[key])
    assert len(merged['words']) == len(merged['spaces'])
    if text is None:
        pieces = []
        for word, space in zip(merged['words'], merged['spaces']):
            pieces.append(word + ' ' * space)
        text = ''.join(pieces)
    doc = nlp.make_doc(text)
    # 'spaces' was only needed to rebuild the text; it is not passed on.
    merged.pop('spaces')
    return doc, GoldParse(doc, **merged)
|
|
||||||
|
|
||||||
#############################
|
|
||||||
# Data transforms for spaCy #
|
|
||||||
#############################
|
|
||||||
|
|
||||||
def golds_to_gold_tuples(docs, golds):
    '''Convert parallel Doc / GoldParse lists into (text, sentences) tuples.

    docs: Objects with a .text attribute.
    golds: Objects whose .orig_annot is a sequence of
        (id, word, tag, head, label, iob) rows.
    RETURNS: List of (text, [((ids, words, tags, heads, labels, iob), [])])
        tuples, one per doc, each holding a single sentence entry.
    '''
    def as_tuple(doc, gold):
        # Transpose the per-token rows into six per-column tuples.
        ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
        return (doc.text, [((ids, words, tags, heads, labels, iob), [])])

    return [as_tuple(doc, gold) for doc, gold in zip(docs, golds)]
|
|
||||||
|
|
||||||
|
|
||||||
##############
|
|
||||||
# Evaluation #
|
|
||||||
##############
|
|
||||||
|
|
||||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
    '''Parse the raw text, write the parses to sys_loc, and score them
    against gold_loc with conll17_ud_eval.

    nlp: Pipeline to evaluate.
    text_loc: Path to the raw text file.
    gold_loc: Path to the gold-standard CoNLL-U file.
    sys_loc: Path the system CoNLL-U output is written to, then re-read.
    limit: Unused.
    RETURNS: The scores returned by conll17_ud_eval.evaluate.
    '''
    def load_ud(loc):
        with loc.open('r', encoding='utf8') as file_:
            return conll17_ud_eval.load_conllu(file_)

    with text_loc.open('r', encoding='utf8') as text_file:
        texts = split_text(text_file.read())
    docs = list(nlp.pipe(texts))
    with sys_loc.open('w', encoding='utf8') as out_file:
        write_conllu(docs, out_file)
    gold_ud = load_ud(gold_loc)
    sys_ud = load_ud(sys_loc)
    return conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
|
||||||
|
|
||||||
|
|
||||||
def write_conllu(docs, file_):
    '''Write docs to file_ as CoNLL-U, merging 'subtok' runs first.

    docs: List of parsed Doc objects. An empty list writes nothing.
    file_: Writable text file.

    Each doc is modified in place: token runs matched by the 'subtok'
    dependency pattern are merged before the doc is written.
    '''
    if not docs:
        # Nothing to write; docs[0] below would raise IndexError.
        return
    merger = Matcher(docs[0].vocab)
    merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
    for i, doc in enumerate(docs):
        matches = merger(doc)
        spans = [doc[start:end+1] for _, start, end in matches]
        # Collect character offsets before merging, since merging changes
        # the token indices.
        offsets = [(span.start_char, span.end_char) for span in spans]
        for start_char, end_char in offsets:
            doc.merge(start_char, end_char)
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            for k, token in enumerate(sent):
                file_.write(token._.get_conllu_lines(k) + '\n')
            # A blank line ends each sentence.
            file_.write('\n')
|
|
||||||
|
|
||||||
|
|
||||||
def print_progress(itn, losses, ud_scores):
    '''Print one tab-separated row of training metrics, preceded by a header
    row when itn == 0.

    itn: Epoch number (int).
    losses: Dict of losses; only the 'parser' entry is shown (0.0 if absent).
    ud_scores: Mapping with 'Words', 'Sentences', 'XPOS', 'UAS' and 'LAS'
        entries, each exposing an .f1 attribute; shown as percentages.
    '''
    metric_keys = ('Words', 'Sentences', 'XPOS', 'UAS', 'LAS')
    words, sents, tags, uas, las = [ud_scores[key].f1 * 100
                                    for key in metric_keys]
    dep_loss = losses.get('parser', 0.0)
    if itn == 0:
        print('\t'.join(['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']))
    columns = ['{:d}'.format(itn)]
    for value in (dep_loss, las, uas, tags, sents, words):
        columns.append('{:.1f}'.format(value))
    print('\t'.join(columns))
|
|
||||||
|
|
||||||
#def get_sent_conllu(sent, sent_id):
|
|
||||||
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
|
||||||
|
|
||||||
def get_token_conllu(token, i):
    '''Render one token as CoNLL-U text.

    token: The token; reads the begins_fused / inside_fused extensions.
    i: 0-based index of the token within its sentence.
    RETURNS: One tab-separated token line. If the token begins a fused
        group, it is preceded by a range line such as "3-4" whose other
        columns, apart from the form, are '_'.
    '''
    lines = []
    if token._.begins_fused:
        # n counts the tokens in the fused group, including this one.
        n = 1
        try:
            while token.nbor(n)._.inside_fused:
                n += 1
        except IndexError:
            # assumes nbor() raises IndexError past the end of the doc; the
            # group then runs to the last token -- TODO confirm
            pass
        # Token ids are 1-based, so the group covers ids i+1 .. i+n.
        span_id = '%d-%d' % (i + 1, i + n)
        # The range row must be a single tab-separated line.
        lines.append('\t'.join([span_id, token.text] + ['_'] * 8))
    if token.head.i == token.i:
        # A token that is its own head is the root, written as head 0.
        head = 0
    else:
        head = i + (token.head.i - token.i) + 1
    fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
              str(head), token.dep_.lower(), '_', '_']
    lines.append('\t'.join(fields))
    return '\n'.join(lines)
|
|
||||||
|
|
||||||
Token.set_extension('get_conllu_lines', method=get_token_conllu)
|
|
||||||
Token.set_extension('begins_fused', default=False)
|
|
||||||
Token.set_extension('inside_fused', default=False)
|
|
||||||
|
|
||||||
|
|
||||||
##################
|
|
||||||
# Initialization #
|
|
||||||
##################
|
|
||||||
|
|
||||||
|
|
||||||
def load_nlp(corpus, config):
    '''Create a blank pipeline for the corpus language, optionally loading a
    vocab from disk.

    corpus: Corpus or language name; the part before the first '_' is used
        as the language code.
    config: Object with a .vectors attribute: a falsy value, or the
        directory that contains a 'vocab' subdirectory.
    RETURNS: The new pipeline.
    '''
    lang = corpus.split('_')[0]
    nlp = spacy.blank(lang)
    if config.vectors:
        # A JSON config yields a plain string here, and str / str raises
        # TypeError, so coerce to Path first.
        nlp.vocab.from_disk(Path(config.vectors) / 'vocab')
    return nlp
|
|
||||||
|
|
||||||
def initialize_pipeline(nlp, docs, golds, config):
    '''Add a parser and a tagger to nlp, register their labels, and begin
    training.

    nlp: Pipeline to extend.
    docs: Training docs, passed to golds_to_gold_tuples.
    golds: Gold parses; their .labels lists may be modified in place.
    config: Object with multitask_tag and multitask_sent flags.
    RETURNS: The value returned by nlp.begin_training.
    '''
    nlp.add_pipe(nlp.create_pipe('parser'))
    objectives = (
        (config.multitask_tag, 'tag'),
        (config.multitask_sent, 'sent_start'),
    )
    for enabled, objective in objectives:
        if enabled:
            nlp.parser.add_multitask_objective(objective)
    # NOTE(review): assumed unconditional (not part of the multitask_sent
    # branch) -- confirm.
    nlp.parser.moves.add_action(2, 'subtok')
    nlp.add_pipe(nlp.create_pipe('tagger'))
    for gold in golds:
        for tag in gold.tags:
            if tag is None:
                continue
            nlp.tagger.add_label(tag)
    # Replace labels that didn't make the frequency cutoff
    known_labels = {action.split('-')[1]
                    for action in set(nlp.parser.labels) if '-' in action}
    for gold in golds:
        for index, label in enumerate(gold.labels):
            if label is None or label in known_labels:
                continue
            # Keep only the part before '||'.
            gold.labels[index] = label.split('||')[0]
    return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
|
|
||||||
|
|
||||||
|
|
||||||
########################
|
|
||||||
# Command line helpers #
|
|
||||||
########################
|
|
||||||
|
|
||||||
@attr.s
class Config(object):
    '''Training settings, with defaults, loadable from a JSON file.'''
    # Directory holding a 'vocab' subdirectory, or None for no pre-loaded vocab.
    vectors = attr.ib(default=None)
    # Number of sentences at which a raw-text training doc is cut.
    max_doc_length = attr.ib(default=10)
    # Whether to add the 'tag' / 'sent_start' multitask objectives to the parser.
    multitask_tag = attr.ib(default=True)
    multitask_sent = attr.ib(default=True)
    # Number of training epochs.
    nr_epoch = attr.ib(default=30)
    # Word budget per minibatch.
    batch_size = attr.ib(default=1000)
    # Dropout rate passed to nlp.update.
    dropout = attr.ib(default=0.2)

    @classmethod
    def load(cls, loc):
        '''Build a Config from the JSON object stored at loc; its keys are
        passed to the constructor as keyword arguments.'''
        with Path(loc).open('r', encoding='utf8') as file_:
            settings = json.loads(file_.read())
        return cls(**settings)
|
|
||||||
|
|
||||||
|
|
||||||
class Dataset(object):
    '''Locate the .conllu and .txt files of one treebank section.

    path: Directory (Path) to search; subdirectories are not searched.
    section: Substring identifying the section, e.g. 'train' or 'dev'.
    RAISES: IOError if either file is missing.

    Sets self.conllu and self.text to the matching paths, and self.lang to
    the part of the .conllu file name before the first '-' and '_'.
    '''
    def __init__(self, path, section):
        self.path = path
        self.section = section
        self.conllu = None
        self.text = None
        for file_path in self.path.iterdir():
            name = file_path.parts[-1]
            if section in name and name.endswith('conllu'):
                self.conllu = file_path
            elif section in name and name.endswith('txt'):
                self.text = file_path
        if self.conllu is None:
            msg = "Could not find .conllu file in {path} for {section}"
            raise IOError(msg.format(section=section, path=path))
        if self.text is None:
            # Fail here rather than later when the missing path is opened.
            msg = "Could not find .txt file in {path} for {section}"
            raise IOError(msg.format(section=section, path=path))
        self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
|
|
||||||
|
|
||||||
|
|
||||||
class TreebankPaths(object):
    '''Hold the 'train' and 'dev' Dataset of one treebank directory.

    ud_path: Root directory (Path) of the UD corpus.
    treebank: Name of the treebank subdirectory.
    **cfg: Accepted but unused.
    '''
    def __init__(self, ud_path, treebank, **cfg):
        treebank_dir = ud_path / treebank
        self.train = Dataset(treebank_dir, 'train')
        self.dev = Dataset(treebank_dir, 'dev')
        # The language code is taken from the training section.
        self.lang = self.train.lang
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
    ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
    corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
            "positional", None, str),
    parses_dir=("Directory to write the development parses", "positional", None, Path),
    config=("Path to json formatted config file", "positional", None, Config.load),
    limit=("Size limit", "option", "n", int)
)
def main(ud_dir, parses_dir, config, corpus, limit=0):
    '''Train a parser and tagger on one UD treebank, evaluating on the dev
    section after every epoch.

    ud_dir: Root directory of the UD corpus.
    parses_dir: Directory under which '<corpus>/epoch-<i>.conllu' is written.
    config: Loaded Config object.
    corpus: Name of the treebank subdirectory.
    limit: Maximum number of training docs to read (0: no limit).
    '''
    paths = TreebankPaths(ud_dir, corpus)
    if not (parses_dir / corpus).exists():
        (parses_dir / corpus).mkdir()
    print("Train and evaluate", corpus, "using lang", paths.lang)
    nlp = load_nlp(paths.lang, config)

    # Open the training files as UTF-8, as evaluate() does for the dev
    # files, and close them once the data is read.
    with paths.train.conllu.open('r', encoding='utf8') as conllu_file, \
            paths.train.text.open('r', encoding='utf8') as text_file:
        docs, golds = read_data(nlp, conllu_file, text_file,
                                max_doc_length=config.max_doc_length, limit=limit)

    optimizer = initialize_pipeline(nlp, docs, golds, config)

    for i in range(config.nr_epoch):
        # Re-tokenize from text so each epoch starts from fresh docs.
        docs = [nlp.make_doc(doc.text) for doc in docs]
        batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
        losses = {}
        n_train_words = sum(len(doc) for doc in docs)
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            for batch in batches:
                batch_docs, batch_gold = zip(*batch)
                pbar.update(sum(len(doc) for doc in batch_docs))
                nlp.update(batch_docs, batch_gold, sgd=optimizer,
                           drop=config.dropout, losses=losses)

        out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
        # Evaluate with the optimizer's averaged parameters.
        with nlp.use_params(optimizer.averages):
            scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
            print_progress(i, losses, scores)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
plac.call(main)
|
|
88
examples/vectors_tensorboard_standalone.py
Normal file
88
examples/vectors_tensorboard_standalone.py
Normal file
|
@ -0,0 +1,88 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# coding: utf8
|
||||||
|
"""Export spaCy model vectors for use in TensorBoard's standalone embedding projector.
|
||||||
|
https://github.com/tensorflow/embedding-projector-standalone
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
|
||||||
|
python vectors_tensorboard_standalone.py ./myVectorModel ./output [name]
|
||||||
|
|
||||||
|
This outputs two files that have to be copied into the "oss_data" of the standalone projector:
|
||||||
|
|
||||||
|
[name]_labels.tsv - metadata such as human readable labels for vectors
|
||||||
|
[name]_tensors.bytes - numpy.ndarray of numpy.float32 precision vectors
|
||||||
|
|
||||||
|
"""
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
from os import path
|
||||||
|
|
||||||
|
import numpy
|
||||||
|
import plac
|
||||||
|
import spacy
|
||||||
|
import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
@plac.annotations(
    vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str),
    out_loc=("Path to output folder writing tensors and labels data", "positional", None, str),
    name=("Human readable name for tsv file and vectors tensor", "positional", None, str),
)
def main(vectors_loc, out_loc, name="spaCy_vectors"):
    """Export a model's vectors for the standalone embedding projector.

    vectors_loc: Path to a spaCy model that contains vectors.
    out_loc: Existing folder the output files are written to.
    name: Prefix of the output files.

    Writes "<name>_labels.tsv" (text and frequency per row) and
    "<name>_tensors.bytes" (float32 vectors in the same row order), then
    prints the JSON entry to add to the projector config.
    """
    # A tab-separated file that contains information about the vectors for visualization
    #
    # Learn more: https://www.tensorflow.org/programmers_guide/embedding#metadata
    meta_file = "{}_labels.tsv".format(name)
    out_meta_file = path.join(out_loc, meta_file)

    print('Loading spaCy vectors model: {}'.format(vectors_loc))
    model = spacy.load(vectors_loc)

    print('Finding lexemes with vectors attached: {}'.format(vectors_loc))
    vocab_strings = [
        w for w in tqdm.tqdm(model.vocab.strings, total=len(model.vocab.strings), leave=False)
        if model.vocab.has_vector(w)
    ]
    vector_count = len(vocab_strings)

    print('Building Projector labels for {} vectors: {}'.format(vector_count, out_meta_file))
    vector_dimensions = model.vocab.vectors.shape[1]
    tf_vectors_variable = numpy.zeros((vector_count, vector_dimensions), dtype=numpy.float32)

    # Write a tab-separated file that contains information about the vectors for visualization
    #
    # Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata
    with open(out_meta_file, 'wb') as file_metadata:
        # Define columns in the first row
        file_metadata.write("Text\tFrequency\n".encode('utf-8'))
        # Write out a row for each vector that we add to the tensorflow variable we created
        for vec_index, text in enumerate(tqdm.tqdm(vocab_strings, total=len(vocab_strings), leave=False)):
            # https://github.com/tensorflow/tensorflow/issues/9094
            # Only the label is replaced; the lexeme and vector are still
            # looked up with the original string, otherwise whitespace
            # entries would export the vector of '<Space>'.
            label = '<Space>' if text.lstrip() == '' else text
            lex = model.vocab[text]

            # Store vector data and metadata
            tf_vectors_variable[vec_index] = numpy.float64(model.vocab.get_vector(text))
            file_metadata.write("{}\t{}\n".format(label, math.exp(lex.prob) * len(vocab_strings)).encode('utf-8'))

    # Write out "[name]_tensors.bytes" file for standalone embeddings projector to load
    tensor_path = '{}_tensors.bytes'.format(name)
    tf_vectors_variable.tofile(path.join(out_loc, tensor_path))

    print('Done.')
    print('Add the following entry to "oss_data/oss_demo_projector_config.json"')
    print(json.dumps({
        "tensorName": name,
        "tensorShape": [vector_count, vector_dimensions],
        "tensorPath": 'oss_data/{}'.format(tensor_path),
        "metadataPath": 'oss_data/{}'.format(meta_file)
    }, indent=2))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
plac.call(main)
|
83
fabfile.py
vendored
83
fabfile.py
vendored
|
@ -1,92 +1,49 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import contextlib
|
|
||||||
from pathlib import Path
|
|
||||||
from fabric.api import local, lcd, env, settings, prefix
|
from fabric.api import local, lcd, env, settings, prefix
|
||||||
|
from fabtools.python import virtualenv
|
||||||
from os import path, environ
|
from os import path, environ
|
||||||
import shutil
|
|
||||||
|
|
||||||
|
|
||||||
PWD = path.dirname(__file__)
|
PWD = path.dirname(__file__)
|
||||||
ENV = environ['VENV_DIR'] if 'VENV_DIR' in environ else '.env'
|
ENV = environ['VENV_DIR'] if 'VENV_DIR' in environ else '.env'
|
||||||
VENV_DIR = Path(PWD) / ENV
|
VENV_DIR = path.join(PWD, ENV)
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
def env(lang='python2.7'):
|
||||||
def virtualenv(name, create=False, python='/usr/bin/python3.6'):
|
if path.exists(VENV_DIR):
|
||||||
python = Path(python).resolve()
|
|
||||||
env_path = VENV_DIR
|
|
||||||
if create:
|
|
||||||
if env_path.exists():
|
|
||||||
shutil.rmtree(str(env_path))
|
|
||||||
local('{python} -m venv {env_path}'.format(python=python, env_path=VENV_DIR))
|
|
||||||
def wrapped_local(cmd, env_vars=[], capture=False, direct=False):
|
|
||||||
return local('source {}/bin/activate && {}'.format(env_path, cmd),
|
|
||||||
shell='/bin/bash', capture=False)
|
|
||||||
yield wrapped_local
|
|
||||||
|
|
||||||
|
|
||||||
def env(lang='python3.6'):
|
|
||||||
if VENV_DIR.exists():
|
|
||||||
local('rm -rf {env}'.format(env=VENV_DIR))
|
local('rm -rf {env}'.format(env=VENV_DIR))
|
||||||
if lang.startswith('python3'):
|
local('pip install virtualenv')
|
||||||
local('{lang} -m venv {env}'.format(lang=lang, env=VENV_DIR))
|
local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
|
||||||
else:
|
|
||||||
local('{lang} -m pip install virtualenv --no-cache-dir'.format(lang=lang))
|
|
||||||
local('{lang} -m virtualenv {env} --no-cache-dir'.format(lang=lang, env=VENV_DIR))
|
|
||||||
with virtualenv(VENV_DIR) as venv_local:
|
|
||||||
print(venv_local('python --version', capture=True))
|
|
||||||
venv_local('pip install --upgrade setuptools --no-cache-dir')
|
|
||||||
venv_local('pip install pytest --no-cache-dir')
|
|
||||||
venv_local('pip install wheel --no-cache-dir')
|
|
||||||
venv_local('pip install -r requirements.txt --no-cache-dir')
|
|
||||||
venv_local('pip install pex --no-cache-dir')
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def install():
|
def install():
|
||||||
with virtualenv(VENV_DIR) as venv_local:
|
with virtualenv(VENV_DIR):
|
||||||
venv_local('pip install dist/*.tar.gz')
|
local('pip install --upgrade setuptools')
|
||||||
|
local('pip install dist/*.tar.gz')
|
||||||
|
local('pip install pytest')
|
||||||
|
|
||||||
|
|
||||||
def make():
|
def make():
|
||||||
with lcd(path.dirname(__file__)):
|
with virtualenv(VENV_DIR):
|
||||||
local('export PYTHONPATH=`pwd` && source .env/bin/activate && python setup.py build_ext --inplace',
|
with lcd(path.dirname(__file__)):
|
||||||
shell='/bin/bash')
|
local('pip install cython')
|
||||||
|
local('pip install murmurhash')
|
||||||
|
local('pip install -r requirements.txt')
|
||||||
|
local('python setup.py build_ext --inplace')
|
||||||
|
|
||||||
def sdist():
|
def sdist():
|
||||||
with virtualenv(VENV_DIR) as venv_local:
|
with virtualenv(VENV_DIR):
|
||||||
with lcd(path.dirname(__file__)):
|
with lcd(path.dirname(__file__)):
|
||||||
local('python setup.py sdist')
|
local('python setup.py sdist')
|
||||||
|
|
||||||
def wheel():
|
|
||||||
with virtualenv(VENV_DIR) as venv_local:
|
|
||||||
with lcd(path.dirname(__file__)):
|
|
||||||
venv_local('python setup.py bdist_wheel')
|
|
||||||
|
|
||||||
def pex():
|
|
||||||
with virtualenv(VENV_DIR) as venv_local:
|
|
||||||
with lcd(path.dirname(__file__)):
|
|
||||||
sha = local('git rev-parse --short HEAD', capture=True)
|
|
||||||
venv_local('pex dist/*.whl -e spacy -o dist/spacy-%s.pex' % sha,
|
|
||||||
direct=True)
|
|
||||||
|
|
||||||
|
|
||||||
def clean():
|
def clean():
|
||||||
with lcd(path.dirname(__file__)):
|
with lcd(path.dirname(__file__)):
|
||||||
local('rm -f dist/*.whl')
|
local('python setup.py clean --all')
|
||||||
local('rm -f dist/*.pex')
|
|
||||||
with virtualenv(VENV_DIR) as venv_local:
|
|
||||||
venv_local('python setup.py clean --all')
|
|
||||||
|
|
||||||
|
|
||||||
def test():
|
def test():
|
||||||
with virtualenv(VENV_DIR) as venv_local:
|
with virtualenv(VENV_DIR):
|
||||||
with lcd(path.dirname(__file__)):
|
with lcd(path.dirname(__file__)):
|
||||||
venv_local('pytest -x spacy/tests')
|
local('py.test -x spacy/tests')
|
||||||
|
|
||||||
def train():
|
|
||||||
args = environ.get('SPACY_TRAIN_ARGS', '')
|
|
||||||
with virtualenv(VENV_DIR) as venv_local:
|
|
||||||
venv_local('spacy train {args}'.format(args=args))
|
|
||||||
|
|
|
@ -5,8 +5,8 @@ cymem>=1.30,<1.32
|
||||||
preshed>=1.0.0,<2.0.0
|
preshed>=1.0.0,<2.0.0
|
||||||
thinc>=6.11.1.dev10,<6.12.0
|
thinc>=6.11.1.dev10,<6.12.0
|
||||||
murmurhash>=0.28,<0.29
|
murmurhash>=0.28,<0.29
|
||||||
cytoolz>=0.9.0,<0.10.0
|
|
||||||
plac<1.0.0,>=0.9.6
|
plac<1.0.0,>=0.9.6
|
||||||
|
six
|
||||||
ujson>=1.35
|
ujson>=1.35
|
||||||
dill>=0.2,<0.3
|
dill>=0.2,<0.3
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
|
@ -16,3 +16,4 @@ pytest>=3.0.6,<4.0.0
|
||||||
mock>=2.0.0,<3.0.0
|
mock>=2.0.0,<3.0.0
|
||||||
msgpack-python==0.5.4
|
msgpack-python==0.5.4
|
||||||
msgpack-numpy==0.4.1
|
msgpack-numpy==0.4.1
|
||||||
|
html5lib==1.0b8
|
||||||
|
|
4
setup.py
4
setup.py
|
@ -18,7 +18,6 @@ PACKAGES = find_packages()
|
||||||
|
|
||||||
|
|
||||||
MOD_NAMES = [
|
MOD_NAMES = [
|
||||||
'spacy._align',
|
|
||||||
'spacy.parts_of_speech',
|
'spacy.parts_of_speech',
|
||||||
'spacy.strings',
|
'spacy.strings',
|
||||||
'spacy.lexeme',
|
'spacy.lexeme',
|
||||||
|
@ -192,6 +191,8 @@ def setup_package():
|
||||||
'preshed>=1.0.0,<2.0.0',
|
'preshed>=1.0.0,<2.0.0',
|
||||||
'thinc>=6.11.1.dev10,<6.12.0',
|
'thinc>=6.11.1.dev10,<6.12.0',
|
||||||
'plac<1.0.0,>=0.9.6',
|
'plac<1.0.0,>=0.9.6',
|
||||||
|
'six',
|
||||||
|
'html5lib==1.0b8',
|
||||||
'pathlib',
|
'pathlib',
|
||||||
'ujson>=1.35',
|
'ujson>=1.35',
|
||||||
'dill>=0.2,<0.3',
|
'dill>=0.2,<0.3',
|
||||||
|
@ -200,7 +201,6 @@ def setup_package():
|
||||||
'ftfy>=4.4.2,<5.0.0',
|
'ftfy>=4.4.2,<5.0.0',
|
||||||
'msgpack-python==0.5.4',
|
'msgpack-python==0.5.4',
|
||||||
'msgpack-numpy==0.4.1'],
|
'msgpack-numpy==0.4.1'],
|
||||||
setup_requires=['wheel'],
|
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Development Status :: 5 - Production/Stable',
|
'Development Status :: 5 - Production/Stable',
|
||||||
'Environment :: Console',
|
'Environment :: Console',
|
||||||
|
|
|
@ -8,7 +8,6 @@ if __name__ == '__main__':
|
||||||
import sys
|
import sys
|
||||||
from spacy.cli import download, link, info, package, train, convert
|
from spacy.cli import download, link, info, package, train, convert
|
||||||
from spacy.cli import vocab, init_model, profile, evaluate, validate
|
from spacy.cli import vocab, init_model, profile, evaluate, validate
|
||||||
from spacy.cli import ud_train, ud_evaluate
|
|
||||||
from spacy.util import prints
|
from spacy.util import prints
|
||||||
|
|
||||||
commands = {
|
commands = {
|
||||||
|
@ -16,9 +15,7 @@ if __name__ == '__main__':
|
||||||
'link': link,
|
'link': link,
|
||||||
'info': info,
|
'info': info,
|
||||||
'train': train,
|
'train': train,
|
||||||
'ud-train': ud_train,
|
|
||||||
'evaluate': evaluate,
|
'evaluate': evaluate,
|
||||||
'ud-evaluate': ud_evaluate,
|
|
||||||
'convert': convert,
|
'convert': convert,
|
||||||
'package': package,
|
'package': package,
|
||||||
'vocab': vocab,
|
'vocab': vocab,
|
||||||
|
|
251
spacy/_align.pyx
251
spacy/_align.pyx
|
@ -1,251 +0,0 @@
|
||||||
# cython: infer_types=True
|
|
||||||
'''Do Levenshtein alignment, for evaluation of tokenized input.
|
|
||||||
|
|
||||||
Random notes:
|
|
||||||
|
|
||||||
r i n g
|
|
||||||
0 1 2 3 4
|
|
||||||
r 1 0 1 2 3
|
|
||||||
a 2 1 1 2 3
|
|
||||||
n 3 2 2 1 2
|
|
||||||
g 4 3 3 2 1
|
|
||||||
|
|
||||||
0,0: (1,1)=min(0+0,1+1,1+1)=0 S
|
|
||||||
1,0: (2,1)=min(1+1,0+1,2+1)=1 D
|
|
||||||
2,0: (3,1)=min(2+1,3+1,1+1)=2 D
|
|
||||||
3,0: (4,1)=min(3+1,4+1,2+1)=3 D
|
|
||||||
0,1: (1,2)=min(1+1,2+1,0+1)=1 D
|
|
||||||
1,1: (2,2)=min(0+1,1+1,1+1)=1 S
|
|
||||||
2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
|
|
||||||
3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
|
|
||||||
0,2: (1,3)=min(2+1,3+1,1+1)=2 I
|
|
||||||
1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
|
|
||||||
2,2: (3,3)
|
|
||||||
3,2: (4,3)
|
|
||||||
At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
|
|
||||||
|
|
||||||
We know the costs to transition:
|
|
||||||
|
|
||||||
S[:i] -> T[:j] (at D[i,j])
|
|
||||||
S[:i+1] -> T[:j] (at D[i+1,j])
|
|
||||||
S[:i] -> T[:j+1] (at D[i,j+1])
|
|
||||||
|
|
||||||
Further, we know we can transform:
|
|
||||||
S[:i+1] -> S[:i] (DEL) for 1,
|
|
||||||
T[:j+1] -> T[:j] (INS) for 1.
|
|
||||||
S[i+1] -> T[j+1] (SUB) for 0 or 1
|
|
||||||
|
|
||||||
Therefore we have the costs:
|
|
||||||
SUB: Cost(S[:i]->T[:j]) + Cost(S[i]->S[j])
|
|
||||||
i.e. D[i, j] + S[i+1] != T[j+1]
|
|
||||||
INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
|
|
||||||
i.e. D[i+1,j] + 1
|
|
||||||
DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i])
|
|
||||||
i.e. D[i,j+1] + 1
|
|
||||||
|
|
||||||
Source string S has length m, with index i
|
|
||||||
Target string T has length n, with index j
|
|
||||||
|
|
||||||
Output two alignment vectors: i2j (length m) and j2i (length n)
|
|
||||||
# function LevenshteinDistance(char s[1..m], char t[1..n]):
|
|
||||||
# for all i and j, d[i,j] will hold the Levenshtein distance between
|
|
||||||
# the first i characters of s and the first j characters of t
|
|
||||||
# note that d has (m+1)*(n+1) values
|
|
||||||
# set each element in d to zero
|
|
||||||
ring rang
|
|
||||||
- r i n g
|
|
||||||
- 0 0 0 0 0
|
|
||||||
r 0 0 0 0 0
|
|
||||||
a 0 0 0 0 0
|
|
||||||
n 0 0 0 0 0
|
|
||||||
g 0 0 0 0 0
|
|
||||||
|
|
||||||
# source prefixes can be transformed into empty string by
|
|
||||||
# dropping all characters
|
|
||||||
# d[i, 0] := i
|
|
||||||
ring rang
|
|
||||||
- r i n g
|
|
||||||
- 0 0 0 0 0
|
|
||||||
r 1 0 0 0 0
|
|
||||||
a 2 0 0 0 0
|
|
||||||
n 3 0 0 0 0
|
|
||||||
g 4 0 0 0 0
|
|
||||||
|
|
||||||
# target prefixes can be reached from empty source prefix
|
|
||||||
# by inserting every character
|
|
||||||
# d[0, j] := j
|
|
||||||
- r i n g
|
|
||||||
- 0 1 2 3 4
|
|
||||||
r 1 0 0 0 0
|
|
||||||
a 2 0 0 0 0
|
|
||||||
n 3 0 0 0 0
|
|
||||||
g 4 0 0 0 0
|
|
||||||
|
|
||||||
'''
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
from libc.stdint cimport uint32_t
|
|
||||||
import numpy
|
|
||||||
cimport numpy as np
|
|
||||||
from .compat import unicode_
|
|
||||||
from murmurhash.mrmr cimport hash32
|
|
||||||
|
|
||||||
|
|
||||||
def align(S, T):
    '''Align two sequences S and T via an edit-distance matrix.

    Builds an (len(S)+1, len(T)+1) int32 cost matrix with `fill_matrix`,
    then derives the two alignment vectors with `fill_i2j` / `fill_j2i`.

    Returns a 4-tuple `(cost, i2j, j2i, matrix)` where `cost` is the
    bottom-right matrix cell, `i2j` has one entry per item of S and `j2i`
    one entry per item of T. An entry of -1 marks an item with no
    counterpart.
    '''
    cdef int m = len(S)
    cdef int n = len(T)
    cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
    cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
    cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')

    # Items are converted to 32-bit integer arrays so that the matrix can be
    # filled by comparing plain integers through raw pointers.
    cdef np.ndarray S_arr = _convert_sequence(S)
    cdef np.ndarray T_arr = _convert_sequence(T)

    fill_matrix(<int*>matrix.data,
        <const int*>S_arr.data, m, <const int*>T_arr.data, n)
    fill_i2j(i2j, matrix)
    fill_j2i(j2i, matrix)
    # Discard any pairing whose two items differ in length: such a pair was
    # aligned positionally, but the items themselves cannot be the same.
    for i in range(i2j.shape[0]):
        if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
            i2j[i] = -1
    for j in range(j2i.shape[0]):
        if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
            j2i[j] = -1
    return matrix[-1,-1], i2j, j2i, matrix
|
|
||||||
|
|
||||||
|
|
||||||
def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
    '''Recover many-to-one alignments for items left unaligned in i2j / j2i.

    Example of the intended use:

        Guess: [aa bb cc dd]
        Truth: [aa bbcc dd]

    Here `bb` and `cc` have no one-to-one partner, but together they cover
    the same text as `bbcc`, so we want i2j_multi == {1: 1, 2: 1}.

    i2j, j2i: alignment vectors; negative entries are treated as unaligned.
    i_lengths, j_lengths: length of each item on the respective side.

    Returns the pair (i2j_multi, j2i_multi) of dicts produced by
    `_get_mapping`.
    '''
    unaligned_i = _get_regions(i2j, i_lengths)
    unaligned_j = _get_regions(j2i, j_lengths)
    return _get_mapping(unaligned_i, unaligned_j, i_lengths, j_lengths)
|
|
||||||
|
|
||||||
|
|
||||||
def _get_regions(alignment, lengths):
|
|
||||||
regions = {}
|
|
||||||
start = None
|
|
||||||
offset = 0
|
|
||||||
for i in range(len(alignment)):
|
|
||||||
if alignment[i] < 0:
|
|
||||||
if start is None:
|
|
||||||
start = offset
|
|
||||||
regions.setdefault(start, [])
|
|
||||||
regions[start].append(i)
|
|
||||||
else:
|
|
||||||
start = None
|
|
||||||
offset += lengths[i]
|
|
||||||
return regions
|
|
||||||
|
|
||||||
|
|
||||||
def _get_mapping(miss1, miss2, lengths1, lengths2):
|
|
||||||
i2j = {}
|
|
||||||
j2i = {}
|
|
||||||
for start, region1 in miss1.items():
|
|
||||||
if not region1 or start not in miss2:
|
|
||||||
continue
|
|
||||||
region2 = miss2[start]
|
|
||||||
if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
|
|
||||||
j = region2.pop(0)
|
|
||||||
buff = []
|
|
||||||
# Consume tokens from region 1, until we meet the length of the
|
|
||||||
# first token in region2. If we do, align the tokens. If
|
|
||||||
# we exceed the length, break.
|
|
||||||
while region1:
|
|
||||||
buff.append(region1.pop(0))
|
|
||||||
if sum(lengths1[i] for i in buff) == lengths2[j]:
|
|
||||||
for i in buff:
|
|
||||||
i2j[i] = j
|
|
||||||
j2i[j] = buff[-1]
|
|
||||||
j += 1
|
|
||||||
buff = []
|
|
||||||
elif sum(lengths1[i] for i in buff) > lengths2[j]:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
|
|
||||||
for i in buff:
|
|
||||||
i2j[i] = j
|
|
||||||
j2i[j] = buff[-1]
|
|
||||||
return i2j, j2i
|
|
||||||
|
|
||||||
|
|
||||||
def _convert_sequence(seq):
    '''Convert a sequence into a contiguous array of 32-bit unsigned ints.

    A numpy array is returned as a contiguous uint32 array. For any other
    sequence, each item is hashed with `hash32` (unicode items are first
    encoded as UTF-8; everything else must already be bytes), so that equal
    items compare equal as integers.
    '''
    if isinstance(seq, numpy.ndarray):
        # 'uint32' is the numpy dtype name; the C typedef spelling 'uint32_t'
        # is not understood by numpy and raised TypeError here.
        return numpy.ascontiguousarray(seq, dtype='uint32')
    cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
    cdef bytes item_bytes
    for i, item in enumerate(seq):
        if isinstance(item, unicode):
            item_bytes = item.encode('utf8')
        else:
            item_bytes = item
        output[i] = hash32(<void*><char*>item_bytes, len(item_bytes), 0)
    return output
|
|
||||||
|
|
||||||
|
|
||||||
# Fill the edit-distance matrix D for source S (length m) and target T
# (length n). D is a flat buffer addressed as D[i*(n+1) + j], i.e. it holds
# (m+1) rows of (n+1) cells. Runs without the GIL.
cdef void fill_matrix(int* D,
        const int* S, int m, const int* T, int n) nogil:
    m1 = m+1
    n1 = n+1
    # Start from an all-zero matrix.
    for i in range(m1*n1):
        D[i] = 0

    # First column: turning a source prefix of length i into the empty
    # string costs i.
    for i in range(m1):
        D[i*n1] = i

    # First row: producing a target prefix of length j from the empty
    # string costs j.
    for j in range(n1):
        D[j] = j

    cdef int sub_cost, ins_cost, del_cost
    for j in range(n):
        for i in range(m):
            # Flat offsets of the four cells involved in this step.
            i_j = i*n1 + j
            i1_j1 = (i+1)*n1 + j+1
            i1_j = (i+1)*n1 + j
            i_j1 = i*n1 + j+1
            # Substitution is free when the two items are equal.
            if S[i] != T[j]:
                sub_cost = D[i_j] + 1
            else:
                sub_cost = D[i_j]
            del_cost = D[i_j1] + 1
            ins_cost = D[i1_j] + 1
            best = min(min(sub_cost, ins_cost), del_cost)
            D[i1_j1] = best
|
|
||||||
|
|
||||||
|
|
||||||
# Fill i2j by walking the cost matrix D backwards from its bottom-right
# corner. Each i2j[i] receives either a column index j or -1.
cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
    j = D.shape[1]-2
    cdef int i = D.shape[0]-2
    while i >= 0:
        # Step left for as long as the cell to the left is strictly cheaper.
        # NOTE(review): nothing here stops j from dropping below 0, in which
        # case numpy indexing would wrap around -- confirm this cannot happen.
        while D[i+1, j] < D[i+1, j+1]:
            j -= 1
        # If the cell above is strictly cheaper, row i is left unaligned.
        if D[i, j+1] < D[i+1, j+1]:
            i2j[i] = -1
        else:
            i2j[i] = j
            j -= 1
        i -= 1
|
|
||||||
|
|
||||||
# Fill j2i by walking the cost matrix D backwards from its bottom-right
# corner. Each j2i[j] receives either a row index i or -1. Mirrors fill_i2j
# with the roles of rows and columns swapped.
cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
    i = D.shape[0]-2
    cdef int j = D.shape[1]-2
    while j >= 0:
        # Step up for as long as the cell above is strictly cheaper.
        # NOTE(review): nothing here stops i from dropping below 0, in which
        # case numpy indexing would wrap around -- confirm this cannot happen.
        while D[i, j+1] < D[i+1, j+1]:
            i -= 1
        # If the cell to the left is strictly cheaper, column j is left
        # unaligned.
        if D[i+1, j] < D[i+1, j+1]:
            j2i[j] = -1
        else:
            j2i[j] = i
            i -= 1
        j -= 1
|
|
|
@ -1,251 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
class Vocab(object):
    """Empty placeholder vocabulary passed to the stand-in Doc in these tests."""
|
|
||||||
|
|
||||||
|
|
||||||
class Doc(list):
    """Minimal stand-in document: a list of Token objects built from words."""

    def __init__(self, vocab, words=None):
        """Create the document.

        vocab: accepted for signature compatibility; not used.
        words: iterable of token texts. None (the default) yields an empty
            document instead of failing inside enumerate().
        """
        list.__init__(self)
        self.extend(Token(i, w) for i, w in enumerate(words or ()))
|
|
||||||
|
|
||||||
|
|
||||||
class Token(object):
    """Minimal stand-in token holding its position and its text."""

    def __init__(self, i, word):
        # `i` is the token's index in its document, `text` its string form.
        self.i, self.text = i, word
|
|
||||||
|
|
||||||
|
|
||||||
def find_matches(patterns, doc):
    """Run every pattern over `doc` and collect the matches.

    A state is a tuple (pattern, index into pattern, start token or None).
    At each token, every state still open from the previous token is tried
    first, followed by one fresh state per pattern.

    Returns the list of matches accumulated by `transition`.
    """
    fresh_states = [(pattern, 0, None) for pattern in patterns]
    open_states = []
    found = []
    for token in doc:
        carried = []
        for state in (open_states + fresh_states):
            found, carried = transition(state, token, found, carried)
        open_states = carried
    return found
|
|
||||||
|
|
||||||
|
|
||||||
def transition(state, token, matches, nexts):
    """Apply the action chosen by `get_action` to one (state, token) pair.

    The action is a string of three digits read as the flags
    (is_match, keep_state, advance_state).

    `matches` and `nexts` are extended in place and also returned as a tuple.
    """
    emit, stay, advance = (bool(int(flag)) for flag in get_action(state, token))
    pattern, spec_index, start = state
    # A state that has not consumed anything yet starts at the current token.
    start = token.i if start is None else start
    if emit:
        matches.append((pattern, start, token.i+1))
    if advance:
        nexts.append((pattern, spec_index+1, start))
    if stay:
        # TODO: This needs to be zero-width :(.
        nexts.append((pattern, spec_index, start))
    return (matches, nexts)
|
|
||||||
|
|
||||||
|
|
||||||
def get_action(state, token):
    '''Decide what to do with `state` when it meets `token`.

    NOTE: this is an unfinished stub. It computes the three inputs listed
    under "We need to consider" and then raises NotImplementedError.

    We need to consider:

    a) Does the token match the specification? [Yes, No]
    b) What's the quantifier? [1, 0+, ?]
    c) Is this the last specification? [final, non-final]

    We can transition in the following ways:

    a) Do we emit a match?
    b) Do we add a state with (next state, next token)?
    c) Do we add a state with (next state, same token)?
    d) Do we add a state with (same state, next token)?

    We'll code the actions as boolean strings, so 0000 means no to all 4,
    1000 means match but no states added, etc.

    NOTE(review): `transition` in this module unpacks exactly three flags
    from the action string, while the table below uses four -- the two need
    to be reconciled before this is implemented.

    1:
      Yes, final:
        1000
      Yes, non-final:
        0100
      No, final:
        0000
      No, non-final
        0000
    0+:
      Yes, final:
        1001
      Yes, non-final:
        0111
      No, final:
        1000 (note: Don't include last token!)
      No, non-final:
        0010
    ?:
      Yes, final:
        1000
      Yes, non-final:
        0100
      No, final:
        1000 (note: Don't include last token!)
      No, non-final:
        0010

    Problem: If a quantifier is matching, we're adding a lot of open partials
    '''
    is_match = get_is_match(state, token)
    operator = get_operator(state, token)
    is_final = get_is_final(state, token)
    # The decision table above has not been turned into code yet.
    raise NotImplementedError
|
|
||||||
|
|
||||||
|
|
||||||
def get_is_match(state, token):
    """Return whether `token` satisfies the current spec of the state's pattern.

    The token matches when its text equals the spec's 'spec' value; a truthy
    'invert' entry on the spec flips the result.
    """
    pattern, spec_index, _ = state
    spec = pattern[spec_index]
    same_text = token.text == spec['spec']
    return (not same_text) if spec.get('invert') else same_text
|
|
||||||
|
|
||||||
def get_is_final(state, token):
    """Return True when the state points at the last spec of its pattern."""
    pattern, spec_index, _ = state
    last_index = len(pattern)-1
    return spec_index == last_index
|
|
||||||
|
|
||||||
def get_operator(state, token):
    """Return the quantifier of the current spec, defaulting to '1'."""
    pattern, spec_index, _ = state
    spec = pattern[spec_index]
    return spec.get('op', '1')
|
|
||||||
|
|
||||||
|
|
||||||
########################
|
|
||||||
# Tests for get_action #
|
|
||||||
########################
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_action_simple_match():
    """A one-spec pattern whose spec equals the token text yields '100'."""
    pattern = [{'spec': 'a', 'op': '1'}]
    token = Doc(Vocab(), words=['a'])[0]
    assert get_action((pattern, 0, None), token) == '100'
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_action_simple_reject():
    """A one-spec pattern whose spec differs from the token text yields '000'."""
    pattern = [{'spec': 'b', 'op': '1'}]
    token = Doc(Vocab(), words=['a'])[0]
    assert get_action((pattern, 0, None), token) == '000'
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_action_simple_match_match():
    """Two matching specs: the first advances ('001'), the second matches ('100')."""
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'a', 'op': '1'}]
    first, second = Doc(Vocab(), words=['a', 'a'])
    assert get_action((pattern, 0, None), first) == '001'
    assert get_action((pattern, 1, 0), second) == '100'
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_action_simple_match_reject():
    """First spec matches and advances ('001'); second spec rejects ('000').

    This test used to be defined twice with identical bodies; the second
    definition silently replaced the first, so a single copy is kept.
    """
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
    doc = Doc(Vocab(), words=['a', 'a'])
    state = (pattern, 0, None)
    action = get_action(state, doc[0])
    assert action == '001'
    state = (pattern, 1, 0)
    action = get_action(state, doc[1])
    assert action == '000'
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_action_plus_match():
    """A matching '1+' spec both emits a match and keeps its state ('110')."""
    pattern = [{'spec': 'a', 'op': '1+'}]
    token = Doc(Vocab(), words=['a'])[0]
    assert get_action((pattern, 0, None), token) == '110'
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_action_plus_match_match():
    """A '1+' spec yields '110' on each of two consecutive matching tokens."""
    pattern = [{'spec': 'a', 'op': '1+'}]
    first, second = Doc(Vocab(), words=['a', 'a'])
    assert get_action((pattern, 0, None), first) == '110'
    assert get_action((pattern, 0, 0), second) == '110'
|
|
||||||
|
|
||||||
|
|
||||||
##########################
|
|
||||||
# Tests for find_matches #
|
|
||||||
##########################
|
|
||||||
|
|
||||||
def test_find_matches_simple_accept():
    """A one-spec pattern matches a one-token document over tokens [0, 1)."""
    pattern = [{'spec': 'a', 'op': '1'}]
    found = find_matches([pattern], Doc(Vocab(), words=['a']))
    assert found == [(pattern, 0, 1)]
|
|
||||||
|
|
||||||
|
|
||||||
def test_find_matches_simple_reject():
    """A pattern whose spec differs from the only token produces no matches."""
    pattern = [{'spec': 'a', 'op': '1'}]
    found = find_matches([pattern], Doc(Vocab(), words=['b']))
    assert found == []
|
|
||||||
|
|
||||||
|
|
||||||
def test_find_matches_match_twice():
    """A one-spec pattern matches each of two identical tokens separately."""
    pattern = [{'spec': 'a', 'op': '1'}]
    found = find_matches([pattern], Doc(Vocab(), words=['a', 'a']))
    assert found == [(pattern, 0, 1), (pattern, 1, 2)]
|
|
||||||
|
|
||||||
|
|
||||||
def test_find_matches_longer_pattern():
    """A two-spec pattern matches the two-token span it describes."""
    pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
    found = find_matches([pattern], Doc(Vocab(), words=['a', 'b']))
    assert found == [(pattern, 0, 2)]
|
|
||||||
|
|
||||||
|
|
||||||
def test_find_matches_two_patterns():
    """Two one-spec patterns each match their own token."""
    patterns = [[{'spec': 'a', 'op': '1'}], [{'spec': 'b', 'op': '1'}]]
    found = find_matches(patterns, Doc(Vocab(), words=['a', 'b']))
    assert found == [(patterns[0], 0, 1), (patterns[1], 1, 2)]
|
|
||||||
|
|
||||||
|
|
||||||
def test_find_matches_two_patterns_overlap():
    """Two two-spec patterns sharing the middle token both match."""
    patterns = [
        [{'spec': 'a'}, {'spec': 'b'}],
        [{'spec': 'b'}, {'spec': 'c'}],
    ]
    found = find_matches(patterns, Doc(Vocab(), words=['a', 'b', 'c']))
    assert found == [(patterns[0], 0, 2), (patterns[1], 1, 3)]
|
|
||||||
|
|
||||||
|
|
||||||
def test_find_matches_greedy():
    """A '1+' spec reports every span of one or more matching tokens."""
    patterns = [[{'spec': 'a', 'op': '1+'}]]
    only = patterns[0]
    assert find_matches(patterns, Doc(Vocab(), words=['a'])) == [(only, 0, 1)]
    found = find_matches(patterns, Doc(Vocab(), words=['a', 'a']))
    assert found == [(patterns[0], 0, 1), (patterns[0], 0, 2), (patterns[0], 1, 2)]
|
|
||||||
|
|
||||||
def test_find_matches_non_greedy():
    """A leading '0+' spec may match nothing, so the next spec matches alone."""
    patterns = [[{'spec': 'a', 'op': '0+'}, {'spec': 'b', "op": "1"}]]
    found = find_matches(patterns, Doc(Vocab(), words=['b']))
    assert found == [(patterns[0], 0, 1)]
|
|
30
spacy/_ml.py
30
spacy/_ml.py
|
@ -64,6 +64,23 @@ def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
||||||
return (X, lengths), finish_update
|
return (X, lengths), finish_update
|
||||||
|
|
||||||
|
|
||||||
|
@layerize
def _logistic(X, drop=0.):
    """Apply the logistic function element-wise, with a backward callback.

    X is converted to an array of its own array module if it is not one
    already. `drop` is accepted but not used in this body.

    Returns (Y, logistic_bwd), where Y = 1 / (1 + exp(-X)) on the clipped
    input and logistic_bwd maps a gradient dY to dY * Y * (1 - Y).
    """
    xp = get_array_module(X)
    if not isinstance(X, xp.ndarray):
        X = xp.asarray(X)
    # Clip to range (-10, 10)
    # NOTE(review): with numpy's ufunc signature the third positional
    # argument is `out`, so this presumably clips the caller's array in
    # place -- confirm that is intended.
    X = xp.minimum(X, 10., X)
    X = xp.maximum(X, -10., X)
    Y = 1. / (1. + xp.exp(-X))

    def logistic_bwd(dY, sgd=None):
        # Derivative of the logistic function expressed through its output.
        dX = dY * (Y * (1-Y))
        return dX

    return Y, logistic_bwd
|
||||||
|
|
||||||
|
|
||||||
def _zero_init(model):
|
def _zero_init(model):
|
||||||
def _zero_init_impl(self, X, y):
|
def _zero_init_impl(self, X, y):
|
||||||
self.W.fill(0)
|
self.W.fill(0)
|
||||||
|
@ -127,8 +144,8 @@ class PrecomputableAffine(Model):
|
||||||
self.nF = nF
|
self.nF = nF
|
||||||
|
|
||||||
def begin_update(self, X, drop=0.):
|
def begin_update(self, X, drop=0.):
|
||||||
Yf = self.ops.gemm(X,
|
Yf = self.ops.xp.dot(X,
|
||||||
self.W.reshape((self.nF*self.nO*self.nP, self.nI)), trans2=True)
|
self.W.reshape((self.nF*self.nO*self.nP, self.nI)).T)
|
||||||
Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
|
Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
|
||||||
Yf = self._add_padding(Yf)
|
Yf = self._add_padding(Yf)
|
||||||
|
|
||||||
|
@ -144,11 +161,11 @@ class PrecomputableAffine(Model):
|
||||||
Wopfi = self.W.transpose((1, 2, 0, 3))
|
Wopfi = self.W.transpose((1, 2, 0, 3))
|
||||||
Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
|
Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
|
||||||
Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI))
|
Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI))
|
||||||
dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
|
dXf = self.ops.dot(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
|
||||||
|
|
||||||
# Reuse the buffer
|
# Reuse the buffer
|
||||||
dWopfi = Wopfi; dWopfi.fill(0.)
|
dWopfi = Wopfi; dWopfi.fill(0.)
|
||||||
self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
|
self.ops.xp.dot(dY.T, Xf, out=dWopfi)
|
||||||
dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
|
dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
|
||||||
# (o, p, f, i) --> (f, o, p, i)
|
# (o, p, f, i) --> (f, o, p, i)
|
||||||
self.d_W += dWopfi.transpose((2, 0, 1, 3))
|
self.d_W += dWopfi.transpose((2, 0, 1, 3))
|
||||||
|
@ -450,7 +467,6 @@ def SpacyVectors(docs, drop=0.):
|
||||||
|
|
||||||
|
|
||||||
def build_text_classifier(nr_class, width=64, **cfg):
|
def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
depth = cfg.get('depth', 2)
|
|
||||||
nr_vector = cfg.get('nr_vector', 5000)
|
nr_vector = cfg.get('nr_vector', 5000)
|
||||||
pretrained_dims = cfg.get('pretrained_dims', 0)
|
pretrained_dims = cfg.get('pretrained_dims', 0)
|
||||||
with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
|
with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
|
||||||
|
@ -502,7 +518,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
LN(Maxout(width, vectors_width))
|
LN(Maxout(width, vectors_width))
|
||||||
>> Residual(
|
>> Residual(
|
||||||
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
|
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
|
||||||
) ** depth, pad=depth
|
) ** 2, pad=2
|
||||||
)
|
)
|
||||||
>> flatten_add_lengths
|
>> flatten_add_lengths
|
||||||
>> ParametricAttention(width)
|
>> ParametricAttention(width)
|
||||||
|
@ -515,6 +531,8 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
||||||
_preprocess_doc
|
_preprocess_doc
|
||||||
>> LinearModel(nr_class)
|
>> LinearModel(nr_class)
|
||||||
)
|
)
|
||||||
|
#model = linear_model >> logistic
|
||||||
|
|
||||||
model = (
|
model = (
|
||||||
(linear_model | cnn_model)
|
(linear_model | cnn_model)
|
||||||
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
|
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
|
||||||
|
|
|
@ -9,7 +9,7 @@ __uri__ = 'https://spacy.io'
|
||||||
__author__ = 'Explosion AI'
|
__author__ = 'Explosion AI'
|
||||||
__email__ = 'contact@explosion.ai'
|
__email__ = 'contact@explosion.ai'
|
||||||
__license__ = 'MIT'
|
__license__ = 'MIT'
|
||||||
__release__ = False
|
__release__ = True
|
||||||
|
|
||||||
__docs_models__ = 'https://spacy.io/usage/models'
|
__docs_models__ = 'https://spacy.io/usage/models'
|
||||||
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
|
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
|
||||||
|
|
|
@ -131,7 +131,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
'NumValue', 'PartType', 'Polite', 'StyleVariant',
|
'NumValue', 'PartType', 'Polite', 'StyleVariant',
|
||||||
'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
|
'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
|
||||||
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
|
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
|
||||||
'Polarity', 'PrepCase', 'Animacy' # U20
|
'Polarity', 'Animacy' # U20
|
||||||
]
|
]
|
||||||
for key in morph_keys:
|
for key in morph_keys:
|
||||||
if key in stringy_attrs:
|
if key in stringy_attrs:
|
||||||
|
|
|
@ -9,5 +9,3 @@ from .convert import convert
|
||||||
from .vocab import make_vocab as vocab
|
from .vocab import make_vocab as vocab
|
||||||
from .init_model import init_model
|
from .init_model import init_model
|
||||||
from .validate import validate
|
from .validate import validate
|
||||||
from .ud_train import main as ud_train
|
|
||||||
from .conll17_ud_eval import main as ud_evaluate
|
|
||||||
|
|
|
@ -1,571 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
# CoNLL 2017 UD Parsing evaluation script.
|
|
||||||
#
|
|
||||||
# Compatible with Python 2.7 and 3.2+, can be used either as a module
|
|
||||||
# or a standalone executable.
|
|
||||||
#
|
|
||||||
# Copyright 2017 Institute of Formal and Applied Linguistics (UFAL),
|
|
||||||
# Faculty of Mathematics and Physics, Charles University, Czech Republic.
|
|
||||||
#
|
|
||||||
# Changelog:
|
|
||||||
# - [02 Jan 2017] Version 0.9: Initial release
|
|
||||||
# - [25 Jan 2017] Version 0.9.1: Fix bug in LCS alignment computation
|
|
||||||
# - [10 Mar 2017] Version 1.0: Add documentation and test
|
|
||||||
# Compare HEADs correctly using aligned words
|
|
||||||
# Allow evaluation with erroneous spaces in forms
|
|
||||||
# Compare forms in LCS case insensitively
|
|
||||||
# Detect cycles and multiple root nodes
|
|
||||||
# Compute AlignedAccuracy
|
|
||||||
|
|
||||||
# Command line usage
|
|
||||||
# ------------------
|
|
||||||
# conll17_ud_eval.py [-v] [-w weights_file] gold_conllu_file system_conllu_file
|
|
||||||
#
|
|
||||||
# - if no -v is given, only the CoNLL17 UD Shared Task evaluation LAS metrics
|
|
||||||
# is printed
|
|
||||||
# - if -v is given, several metrics are printed (as precision, recall, F1 score,
|
|
||||||
# and in case the metric is computed on aligned words also accuracy on these):
|
|
||||||
# - Tokens: how well do the gold tokens match system tokens
|
|
||||||
# - Sentences: how well do the gold sentences match system sentences
|
|
||||||
# - Words: how well can the gold words be aligned to system words
|
|
||||||
# - UPOS: using aligned words, how well does UPOS match
|
|
||||||
# - XPOS: using aligned words, how well does XPOS match
|
|
||||||
# - Feats: using aligned words, how well does FEATS match
|
|
||||||
# - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match
|
|
||||||
# - Lemmas: using aligned words, how well does LEMMA match
|
|
||||||
# - UAS: using aligned words, how well does HEAD match
|
|
||||||
# - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match
|
|
||||||
# - if weights_file is given (with lines containing deprel-weight pairs),
|
|
||||||
# one more metric is shown:
|
|
||||||
# - WeightedLAS: as LAS, but each deprel (ignoring subtypes) has different weight
|
|
||||||
|
|
||||||
# API usage
|
|
||||||
# ---------
|
|
||||||
# - load_conllu(file)
|
|
||||||
# - loads CoNLL-U file from given file object to an internal representation
|
|
||||||
# - the file object should return str on both Python 2 and Python 3
|
|
||||||
# - raises UDError exception if the given file cannot be loaded
|
|
||||||
# - evaluate(gold_ud, system_ud)
|
|
||||||
# - evaluate the given gold and system CoNLL-U files (loaded with load_conllu)
|
|
||||||
# - raises UDError if the concatenated tokens of gold and system file do not match
|
|
||||||
# - returns a dictionary with the metrics described above, each metrics having
|
|
||||||
# three fields: precision, recall and f1
|
|
||||||
|
|
||||||
# Description of token matching
|
|
||||||
# -----------------------------
|
|
||||||
# In order to match tokens of gold file and system file, we consider the text
|
|
||||||
# resulting from concatenation of gold tokens and text resulting from
|
|
||||||
# concatenation of system tokens. These texts should match -- if they do not,
|
|
||||||
# the evaluation fails.
|
|
||||||
#
|
|
||||||
# If the texts do match, every token is represented as a range in this original
|
|
||||||
# text, and tokens are equal only if their range is the same.
|
|
||||||
|
|
||||||
# Description of word matching
|
|
||||||
# ----------------------------
|
|
||||||
# When matching words of gold file and system file, we first match the tokens.
|
|
||||||
# The words which are also tokens are matched as tokens, but words in multi-word
|
|
||||||
# tokens have to be handled differently.
|
|
||||||
#
|
|
||||||
# To handle multi-word tokens, we start by finding "multi-word spans".
|
|
||||||
# Multi-word span is a span in the original text such that
|
|
||||||
# - it contains at least one multi-word token
|
|
||||||
# - all multi-word tokens in the span (considering both gold and system ones)
|
|
||||||
# are completely inside the span (i.e., they do not "stick out")
|
|
||||||
# - the multi-word span is as small as possible
|
|
||||||
#
|
|
||||||
# For every multi-word span, we align the gold and system words completely
|
|
||||||
# inside this span using LCS on their FORMs. The words not intersecting
|
|
||||||
# (even partially) any multi-word span are then aligned as tokens.
|
|
||||||
|
|
||||||
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import io
|
|
||||||
import sys
|
|
||||||
import unittest
|
|
||||||
|
|
||||||
# CoNLL-U column names
ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)


# UD Error is used when raising exceptions in this module
class UDError(Exception):
    pass


# Load given CoNLL-U file into internal representation
def load_conllu(file):
    """Parse a CoNLL-U file object into an in-memory representation.

    file: object with a readline() method returning str lines.

    Returns an object with four attributes:
      characters -- list of all token characters, without whitespace
      tokens     -- spans (start, end, text) into `characters`, one per token
      words      -- one entry per syntactic word, with `columns`, `span`,
                    `is_multiword` and `parent` (None for the root)
      sentences  -- spans into `characters`, one per sentence

    Raises UDError for any malformed input: wrong column count, unparsable
    or out-of-sequence IDs, unparsable/negative/out-of-range HEADs, empty
    FORMs, cycles, a sentence without exactly one root, or a file that does
    not end with an empty line.
    """
    # Internal representation classes
    class UDRepresentation:
        def __init__(self):
            # Characters of all the tokens in the whole file.
            # Whitespace between tokens is not included.
            self.characters = []
            # List of UDSpan instances with start&end indices into `characters`.
            self.tokens = []
            # List of UDWord instances.
            self.words = []
            # List of UDSpan instances with start&end indices into `characters`.
            self.sentences = []

    class UDSpan:
        def __init__(self, start, end, characters):
            self.start = start
            # Note that self.end marks the first position **after the end** of span,
            # so we can use characters[start:end] or range(start, end).
            self.end = end
            self.characters = characters

        @property
        def text(self):
            return ''.join(self.characters[self.start:self.end])

        def __str__(self):
            return self.text

        def __repr__(self):
            return self.text

    class UDWord:
        def __init__(self, span, columns, is_multiword):
            # Span of this word (or MWT, see below) within ud_representation.characters.
            self.span = span
            # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,...
            self.columns = columns
            # is_multiword==True means that this word is part of a multi-word token.
            # In that case, self.span marks the span of the whole multi-word token.
            self.is_multiword = is_multiword
            # Reference to the UDWord instance representing the HEAD (or None if root).
            self.parent = None
            # Let's ignore language-specific deprel subtypes.
            self.columns[DEPREL] = columns[DEPREL].split(':')[0]

    ud = UDRepresentation()

    # Load the CoNLL-U file
    index, sentence_start = 0, None
    linenum = 0

    def parse_head(raw_head):
        # Turn a HEAD column into a non-negative int or raise UDError.
        try:
            head = int(raw_head)
        except ValueError:
            raise UDError("Cannot parse HEAD '{}'".format(raw_head))
        if head < 0:
            raise UDError("HEAD cannot be negative")
        return head

    def process_word(word):
        # Add parent UDWord links and check there are no cycles. While a
        # word's ancestors are being resolved its parent holds the marker
        # string "remapping"; meeting that marker again means a cycle.
        if word.parent == "remapping":
            raise UDError("There is a cycle in a sentence")
        if word.parent is None:
            # Words inside multi-word tokens reach this point without their
            # HEAD having been validated, so validate it here.
            head = parse_head(word.columns[HEAD])
            if head > len(ud.words) - sentence_start:
                raise UDError("Line {}: HEAD '{}' points outside of the sentence".format(
                    linenum, word.columns[HEAD]))
            if head:
                parent = ud.words[sentence_start + head - 1]
                word.parent = "remapping"
                process_word(parent)
                word.parent = parent

    while True:
        line = file.readline()
        linenum += 1
        if not line:
            break
        line = line.rstrip("\r\n")

        # Handle sentence start boundaries
        if sentence_start is None:
            # Skip comments
            if line.startswith("#"):
                continue
            # Start a new sentence
            ud.sentences.append(UDSpan(index, 0, ud.characters))
            sentence_start = len(ud.words)
        if not line:
            for word in ud.words[sentence_start:]:
                process_word(word)

            # Check there is a single root node
            if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
                raise UDError("There are multiple roots in a sentence")

            # End the sentence
            ud.sentences[-1].end = index
            sentence_start = None
            continue

        # Read next token/word
        columns = line.split("\t")
        if len(columns) != 10:
            raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, line))

        # Skip empty nodes
        if "." in columns[ID]:
            continue

        # Delete spaces from FORM so gold.characters == system.characters
        # even if one of them tokenizes the space.
        columns[FORM] = columns[FORM].replace(" ", "")
        if not columns[FORM]:
            raise UDError("There is an empty FORM in the CoNLL-U file -- line %d" % linenum)

        # Save token
        ud.characters.extend(columns[FORM])
        ud.tokens.append(UDSpan(index, index + len(columns[FORM]), ud.characters))
        index += len(columns[FORM])

        # Handle multi-word tokens to save word(s)
        if "-" in columns[ID]:
            try:
                start, end = map(int, columns[ID].split("-"))
            except ValueError:
                raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))

            for _ in range(start, end + 1):
                word_line = file.readline().rstrip("\r\n")
                # Keep the line counter in step with the file so that later
                # error messages point at the right line.
                linenum += 1
                word_columns = word_line.split("\t")
                if len(word_columns) != 10:
                    raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, word_line))
                ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
        # Basic tokens/words
        else:
            try:
                word_id = int(columns[ID])
            except ValueError:
                raise UDError("Cannot parse word ID '{}'".format(columns[ID]))
            if word_id != len(ud.words) - sentence_start + 1:
                raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1))

            # Validate HEAD eagerly so the error surfaces at the offending line.
            parse_head(columns[HEAD])

            ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))

    if sentence_start is not None:
        raise UDError("The CoNLL-U file does not end with empty line")

    return ud
|
|
||||||
|
|
||||||
# Evaluate the gold and system treebanks (loaded using load_conllu).
|
|
||||||
def evaluate(gold_ud, system_ud, deprel_weights=None):
    """Score a system treebank against a gold treebank.

    Both arguments are treebank objects as produced by load_conllu; this
    function reads their ``characters``, ``tokens``, ``sentences`` and
    ``words`` attributes.

    Returns a dict mapping metric name ("Tokens", "Sentences", "Words",
    "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS" and, when
    ``deprel_weights`` is given, "WeightedLAS") to a Score object exposing
    ``precision``, ``recall``, ``f1`` and ``aligned_accuracy``.

    Raises UDError when the concatenated characters of the two treebanks
    differ, including the case where one is a strict prefix of the other.
    """
    class Score:
        def __init__(self, gold_total, system_total, correct, aligned_total=None):
            self.precision = correct / system_total if system_total else 0.0
            self.recall = correct / gold_total if gold_total else 0.0
            self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0
            # Stays None (or 0) when no aligned total is available.
            self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total

    class AlignmentWord:
        def __init__(self, gold_word, system_word):
            self.gold_word = gold_word
            self.system_word = system_word
            self.gold_parent = None
            self.system_parent_gold_aligned = None

    class Alignment:
        def __init__(self, gold_words, system_words):
            self.gold_words = gold_words
            self.system_words = system_words
            self.matched_words = []
            self.matched_words_map = {}

        def append_aligned_words(self, gold_word, system_word):
            self.matched_words.append(AlignmentWord(gold_word, system_word))
            self.matched_words_map[system_word] = gold_word

        def fill_parents(self):
            # We represent root parents in both gold and system data by '0'.
            # For gold data, we represent non-root parent by corresponding gold word.
            # For system data, we represent non-root parent by either gold word aligned
            # to parent system nodes, or by None if no gold words is aligned to the parent.
            for words in self.matched_words:
                words.gold_parent = words.gold_word.parent if words.gold_word.parent is not None else 0
                words.system_parent_gold_aligned = self.matched_words_map.get(words.system_word.parent, None) \
                    if words.system_word.parent is not None else 0

    def lower(text):
        # On Python 2 byte strings must be decoded before lower-casing.
        if sys.version_info < (3, 0) and isinstance(text, str):
            return text.decode("utf-8").lower()
        return text.lower()

    def spans_score(gold_spans, system_spans):
        # Merge-walk both span lists; a span is correct when start and end match.
        correct, gi, si = 0, 0, 0
        while gi < len(gold_spans) and si < len(system_spans):
            if system_spans[si].start < gold_spans[gi].start:
                si += 1
            elif gold_spans[gi].start < system_spans[si].start:
                gi += 1
            else:
                correct += gold_spans[gi].end == system_spans[si].end
                si += 1
                gi += 1

        return Score(len(gold_spans), len(system_spans), correct)

    def alignment_score(alignment, key_fn, weight_fn=lambda w: 1):
        gold, system, aligned, correct = 0, 0, 0, 0

        for word in alignment.gold_words:
            gold += weight_fn(word)

        for word in alignment.system_words:
            system += weight_fn(word)

        for words in alignment.matched_words:
            aligned += weight_fn(words.gold_word)

        if key_fn is None:
            # Return score for whole aligned words
            return Score(gold, system, aligned)

        for words in alignment.matched_words:
            if key_fn(words.gold_word, words.gold_parent) == key_fn(words.system_word, words.system_parent_gold_aligned):
                correct += weight_fn(words.gold_word)

        return Score(gold, system, correct, aligned)

    def beyond_end(words, i, multiword_span_end):
        if i >= len(words):
            return True
        if words[i].is_multiword:
            return words[i].span.start >= multiword_span_end
        return words[i].span.end > multiword_span_end

    def extend_end(word, multiword_span_end):
        if word.is_multiword and word.span.end > multiword_span_end:
            return word.span.end
        return multiword_span_end

    def find_multiword_span(gold_words, system_words, gi, si):
        # We know gold_words[gi].is_multiword or system_words[si].is_multiword.
        # Find the start of the multiword span (gs, ss), so the multiword span is minimal.
        # Initialize multiword_span_end characters index.
        if gold_words[gi].is_multiword:
            multiword_span_end = gold_words[gi].span.end
            if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start:
                si += 1
        else:  # if system_words[si].is_multiword
            multiword_span_end = system_words[si].span.end
            if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start:
                gi += 1
        gs, ss = gi, si

        # Find the end of the multiword span
        # (so both gi and si are pointing to the word following the multiword span end).
        while not beyond_end(gold_words, gi, multiword_span_end) or \
                not beyond_end(system_words, si, multiword_span_end):
            if gi < len(gold_words) and (si >= len(system_words) or
                                         gold_words[gi].span.start <= system_words[si].span.start):
                multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
                gi += 1
            else:
                multiword_span_end = extend_end(system_words[si], multiword_span_end)
                si += 1
        return gs, ss, gi, si

    def compute_lcs(gold_words, system_words, gi, si, gs, ss):
        # Longest-common-subsequence table over lower-cased FORMs, filled
        # from the bottom-right corner.
        lcs = [[0] * (si - ss) for i in range(gi - gs)]
        for g in reversed(range(gi - gs)):
            for s in reversed(range(si - ss)):
                if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
                    lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
        return lcs

    def align_words(gold_words, system_words):
        alignment = Alignment(gold_words, system_words)

        gi, si = 0, 0
        while gi < len(gold_words) and si < len(system_words):
            if gold_words[gi].is_multiword or system_words[si].is_multiword:
                # A: Multi-word tokens => align via LCS within the whole "multiword span".
                gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)

                if si > ss and gi > gs:
                    lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)

                    # Store aligned words
                    s, g = 0, 0
                    while g < gi - gs and s < si - ss:
                        if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
                            alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
                            g += 1
                            s += 1
                        elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0):
                            g += 1
                        else:
                            s += 1
            else:
                # B: No multi-word token => align according to spans.
                if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end):
                    alignment.append_aligned_words(gold_words[gi], system_words[si])
                    gi += 1
                    si += 1
                elif gold_words[gi].span.start <= system_words[si].span.start:
                    gi += 1
                else:
                    si += 1

        alignment.fill_parents()

        return alignment

    # Check that underlying character sequences do match
    if gold_ud.characters != system_ud.characters:
        index = 0
        # Bound the scan by the shorter sequence: when one sequence is a
        # prefix of the other there is no differing position inside it.
        common_length = min(len(gold_ud.characters), len(system_ud.characters))
        while index < common_length and gold_ud.characters[index] == system_ud.characters[index]:
            index += 1

        raise UDError(
            "The concatenation of tokens in gold file and in system file differ!\n" +
            "First 20 differing characters in gold file: '{}' and system file: '{}'".format(
                "".join(gold_ud.characters[index:index + 20]),
                "".join(system_ud.characters[index:index + 20])
            )
        )

    # Align words
    alignment = align_words(gold_ud.words, system_ud.words)

    # Compute the F1-scores
    result = {
        "Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
        "Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
        "Words": alignment_score(alignment, None),
        "UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]),
        "XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]),
        "Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
        "AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
        "Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
        "UAS": alignment_score(alignment, lambda w, parent: parent),
        "LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])),
    }

    # Add WeightedLAS if weights are given
    if deprel_weights is not None:
        def weighted_las(word):
            return deprel_weights.get(word.columns[DEPREL], 1.0)
        result["WeightedLAS"] = alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL]), weighted_las)

    return result
|
|
||||||
|
|
||||||
def load_deprel_weights(weights_file):
    """Read dependency-relation weights from an iterable of text lines.

    Returns None when ``weights_file`` is None, otherwise a dict mapping
    the first whitespace-separated column of each line to the float value
    of the second. Lines starting with "#" and blank lines are skipped.

    Raises ValueError when a remaining line does not have exactly two
    columns.
    """
    if weights_file is None:
        return None

    weights = {}
    for raw_line in weights_file:
        is_comment = raw_line.startswith("#")
        is_blank = not raw_line.strip()
        if is_comment or is_blank:
            continue

        fields = raw_line.rstrip("\r\n").split()
        if len(fields) != 2:
            raise ValueError("Expected two columns in the UD Relations weights file on line '{}'".format(raw_line))

        relation, weight = fields
        weights[relation] = float(weight)

    return weights
|
|
||||||
|
|
||||||
def load_conllu_file(path):
    """Open the CoNLL-U file at ``path`` and parse it with load_conllu.

    On Python 3 the file is opened as UTF-8 text; on Python 2 it is
    opened with the default mode. The handle is closed once parsing has
    finished (or failed) instead of being left to the garbage collector.
    """
    open_kwargs = {"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}
    with open(path, mode="r", **open_kwargs) as _file:
        return load_conllu(_file)
|
|
||||||
|
|
||||||
def evaluate_wrapper(args):
    """Load the files named in ``args`` and evaluate them.

    Reads ``args.gold_file``, ``args.system_file`` and ``args.weights``
    and returns the result of evaluate() on the loaded treebanks.
    """
    # Load both CoNLL-U files, gold first.
    gold_treebank = load_conllu_file(args.gold_file)
    system_treebank = load_conllu_file(args.system_file)

    # Weights are optional; load_deprel_weights passes None through.
    weights = load_deprel_weights(args.weights)

    return evaluate(gold_treebank, system_treebank, weights)
|
|
||||||
|
|
||||||
def main():
    """Command-line entry point: parse arguments, evaluate, print scores.

    Without --verbose only the LAS F1 score is printed. With --verbose, or
    whenever a weights file is supplied, a table of all metrics is printed.
    """
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("gold_file", type=str,
                        help="Name of the CoNLL-U file with the gold data.")
    parser.add_argument("system_file", type=str,
                        help="Name of the CoNLL-U file with the predicted data.")
    parser.add_argument("--weights", "-w", type=argparse.FileType("r"), default=None,
                        metavar="deprel_weights_file",
                        help="Compute WeightedLAS using given weights for Universal Dependency Relations.")
    parser.add_argument("--verbose", "-v", default=0, action="count",
                        help="Print all metrics.")
    args = parser.parse_args()

    # Use verbose if weights are supplied
    if args.weights is not None and not args.verbose:
        args.verbose = 1

    # Evaluate
    evaluation = evaluate_wrapper(args)

    # Print the evaluation
    if not args.verbose:
        print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
    else:
        metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS"]
        if args.weights is not None:
            metrics.append("WeightedLAS")

        # Header cells are padded to the same 11-character columns as the
        # separator line and the row format below, so the table lines up.
        print("Metrics    | Precision |    Recall |  F1 Score | AligndAcc")
        print("-----------+-----------+-----------+-----------+-----------")
        for metric in metrics:
            print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
                metric,
                100 * evaluation[metric].precision,
                100 * evaluation[metric].recall,
                100 * evaluation[metric].f1,
                "{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else ""
            ))


if __name__ == "__main__":
    main()
|
|
||||||
|
|
||||||
# Tests, which can be executed with `python -m unittest conll17_ud_eval`.
|
|
||||||
class TestAlignment(unittest.TestCase):
    """Word-alignment tests driven by small synthetic CoNLL-U inputs.

    Each input is a list of strings. A string without spaces is a plain
    word; a string with spaces is a multi-word token whose first piece is
    the surface form and whose remaining pieces are its words.
    """

    @staticmethod
    def _load_words(words):
        """Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors."""
        conllu_lines = []
        word_count = 0
        for entry in words:
            pieces = entry.split(" ")
            if len(pieces) > 1:
                # Multi-word token: emit the range line, then its words.
                first_id = word_count + 1
                last_id = word_count + len(pieces) - 1
                conllu_lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(first_id, last_id, pieces[0]))
                forms = pieces[1:]
            else:
                forms = pieces
            for form in forms:
                word_count += 1
                # HEAD is 0 for the first word and 1 for every later word.
                conllu_lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(word_count, form, int(word_count>1)))
        buffer_class = io.StringIO if sys.version_info >= (3, 0) else io.BytesIO
        return load_conllu(buffer_class("\n".join(conllu_lines+["\n"])))

    def _test_exception(self, gold, system):
        gold_ud = self._load_words(gold)
        system_ud = self._load_words(system)
        self.assertRaises(UDError, evaluate, gold_ud, system_ud)

    def _test_ok(self, gold, system, correct):
        metrics = evaluate(self._load_words(gold), self._load_words(system))
        # A multi-word entry contributes its words, not its surface form.
        gold_words = sum(max(1, len(word.split(" ")) - 1) for word in gold)
        system_words = sum(max(1, len(word.split(" ")) - 1) for word in system)
        words_score = metrics["Words"]
        observed = (words_score.precision, words_score.recall, words_score.f1)
        expected = (correct / system_words,
                    correct / gold_words,
                    2 * correct / (gold_words + system_words))
        self.assertEqual(observed, expected)

    def test_exception(self):
        self._test_exception(["a"], ["b"])

    def test_equal(self):
        self._test_ok(["a"], ["a"], 1)
        self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3)

    def test_equal_with_multiword(self):
        self._test_ok(["abc a b c"], ["a", "b", "c"], 3)
        self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4)
        self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4)
        self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5)

    def test_alignment(self):
        self._test_ok(["abcd"], ["a", "b", "c", "d"], 0)
        self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1)
        self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2)
        self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2)
        self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4)
        self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2)
        self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)
|
|
|
@ -8,8 +8,8 @@ from thinc.neural._classes.model import Model
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
|
|
||||||
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
||||||
from ..gold import GoldCorpus
|
from ..gold import GoldCorpus, minibatch
|
||||||
from ..util import prints, minibatch, minibatch_by_words
|
from ..util import prints
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
from .. import displacy
|
from .. import displacy
|
||||||
|
@ -51,6 +51,8 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
train_path = util.ensure_path(train_data)
|
train_path = util.ensure_path(train_data)
|
||||||
dev_path = util.ensure_path(dev_data)
|
dev_path = util.ensure_path(dev_data)
|
||||||
meta_path = util.ensure_path(meta_path)
|
meta_path = util.ensure_path(meta_path)
|
||||||
|
if not output_path.exists():
|
||||||
|
output_path.mkdir()
|
||||||
if not train_path.exists():
|
if not train_path.exists():
|
||||||
prints(train_path, title="Training data not found", exits=1)
|
prints(train_path, title="Training data not found", exits=1)
|
||||||
if dev_path and not dev_path.exists():
|
if dev_path and not dev_path.exists():
|
||||||
|
@ -63,14 +65,7 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
title="Not a valid meta.json format", exits=1)
|
title="Not a valid meta.json format", exits=1)
|
||||||
meta.setdefault('lang', lang)
|
meta.setdefault('lang', lang)
|
||||||
meta.setdefault('name', 'unnamed')
|
meta.setdefault('name', 'unnamed')
|
||||||
|
|
||||||
if not output_path.exists():
|
|
||||||
output_path.mkdir()
|
|
||||||
|
|
||||||
print("Counting training words (limit=%s" % n_sents)
|
|
||||||
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
|
||||||
n_train_words = corpus.count_train()
|
|
||||||
print(n_train_words)
|
|
||||||
pipeline = ['tagger', 'parser', 'ner']
|
pipeline = ['tagger', 'parser', 'ner']
|
||||||
if no_tagger and 'tagger' in pipeline:
|
if no_tagger and 'tagger' in pipeline:
|
||||||
pipeline.remove('tagger')
|
pipeline.remove('tagger')
|
||||||
|
@ -86,9 +81,13 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
|
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
|
||||||
util.env_opt('dropout_to', 0.2),
|
util.env_opt('dropout_to', 0.2),
|
||||||
util.env_opt('dropout_decay', 0.0))
|
util.env_opt('dropout_decay', 0.0))
|
||||||
batch_sizes = util.compounding(util.env_opt('batch_from', 1000),
|
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
|
||||||
util.env_opt('batch_to', 1000),
|
util.env_opt('batch_to', 16),
|
||||||
util.env_opt('batch_compound', 1.001))
|
util.env_opt('batch_compound', 1.001))
|
||||||
|
max_doc_len = util.env_opt('max_doc_len', 5000)
|
||||||
|
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
||||||
|
n_train_words = corpus.count_train()
|
||||||
|
|
||||||
lang_class = util.get_lang_class(lang)
|
lang_class = util.get_lang_class(lang)
|
||||||
nlp = lang_class()
|
nlp = lang_class()
|
||||||
meta['pipeline'] = pipeline
|
meta['pipeline'] = pipeline
|
||||||
|
@ -106,7 +105,6 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
lex.is_oov = False
|
lex.is_oov = False
|
||||||
for name in pipeline:
|
for name in pipeline:
|
||||||
nlp.add_pipe(nlp.create_pipe(name), name=name)
|
nlp.add_pipe(nlp.create_pipe(name), name=name)
|
||||||
nlp.add_pipe(nlp.create_pipe('merge_subtokens'))
|
|
||||||
if parser_multitasks:
|
if parser_multitasks:
|
||||||
for objective in parser_multitasks.split(','):
|
for objective in parser_multitasks.split(','):
|
||||||
nlp.parser.add_multitask_objective(objective)
|
nlp.parser.add_multitask_objective(objective)
|
||||||
|
@ -118,20 +116,21 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
|
|
||||||
print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
|
print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
|
||||||
try:
|
try:
|
||||||
|
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
|
||||||
|
gold_preproc=gold_preproc, max_length=0)
|
||||||
|
train_docs = list(train_docs)
|
||||||
for i in range(n_iter):
|
for i in range(n_iter):
|
||||||
train_docs = corpus.train_docs(nlp, noise_level=0.0,
|
|
||||||
gold_preproc=gold_preproc, max_length=0)
|
|
||||||
words_seen = 0
|
|
||||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||||
losses = {}
|
losses = {}
|
||||||
for batch in minibatch_by_words(train_docs, size=batch_sizes):
|
for batch in minibatch(train_docs, size=batch_sizes):
|
||||||
|
batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
|
||||||
if not batch:
|
if not batch:
|
||||||
continue
|
continue
|
||||||
docs, golds = zip(*batch)
|
docs, golds = zip(*batch)
|
||||||
nlp.update(docs, golds, sgd=optimizer,
|
nlp.update(docs, golds, sgd=optimizer,
|
||||||
drop=next(dropout_rates), losses=losses)
|
drop=next(dropout_rates), losses=losses)
|
||||||
pbar.update(sum(len(doc) for doc in docs))
|
pbar.update(sum(len(doc) for doc in docs))
|
||||||
words_seen += sum(len(doc) for doc in docs)
|
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
util.set_env_log(False)
|
util.set_env_log(False)
|
||||||
epoch_model_path = output_path / ('model%d' % i)
|
epoch_model_path = output_path / ('model%d' % i)
|
||||||
|
|
|
@ -1,372 +0,0 @@
|
||||||
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
|
||||||
.conllu format for development data, allowing the official scorer to be used.
|
|
||||||
'''
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
import plac
|
|
||||||
import tqdm
|
|
||||||
from pathlib import Path
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import json
|
|
||||||
|
|
||||||
import spacy
|
|
||||||
import spacy.util
|
|
||||||
from ..tokens import Token, Doc
|
|
||||||
from ..gold import GoldParse
|
|
||||||
from ..util import compounding, minibatch_by_words
|
|
||||||
from ..syntax.nonproj import projectivize
|
|
||||||
from ..matcher import Matcher
|
|
||||||
from .. import displacy
|
|
||||||
from collections import defaultdict, Counter
|
|
||||||
from timeit import default_timer as timer
|
|
||||||
|
|
||||||
import itertools
|
|
||||||
import random
|
|
||||||
import numpy.random
|
|
||||||
import cytoolz
|
|
||||||
|
|
||||||
from . import conll17_ud_eval
|
|
||||||
|
|
||||||
from .. import lang
|
|
||||||
from .. import lang
|
|
||||||
from ..lang import zh
|
|
||||||
from ..lang import ja
|
|
||||||
|
|
||||||
|
|
||||||
################
|
|
||||||
# Data reading #
|
|
||||||
################
|
|
||||||
|
|
||||||
# Raw string: '\s' in a plain literal is an invalid escape sequence.
space_re = re.compile(r'\s+')


def split_text(text):
    """Split ``text`` into paragraphs on blank lines.

    Each paragraph is stripped and every run of whitespace inside it is
    collapsed to a single space. Returns a list of strings; an empty
    input yields a list with one empty string.
    """
    return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
|
|
||||||
|
|
||||||
|
|
||||||
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
              max_doc_length=None, limit=None):
    '''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
    include Doc objects created using nlp.make_doc and then aligned against
    the gold-standard sequences. If oracle_segments=True, include Doc objects
    created from the gold-standard segments. At least one must be True.

    Returns a pair of parallel lists (docs, golds). When ``limit`` is set,
    reading stops once at least that many docs have been collected.
    '''
    if not (raw_text or oracle_segments):
        raise ValueError("At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
    conllu = read_conllu(conllu_file)
    docs = []
    golds = []
    # The paragraph text is only used to pair paragraphs with CoNLL-U docs;
    # raw documents are rebuilt from the annotations in _make_gold.
    for _paragraph, conllu_doc in zip(paragraphs, conllu):
        pending = []
        for conllu_sent in conllu_doc:
            sent = defaultdict(list)
            for row in conllu_sent:
                id_, word, _lemma, _pos, tag, _morph, head, dep, _misc, space_after = row
                # Skip empty nodes ("1.1") and multi-word ranges ("1-2").
                if '.' in id_ or '-' in id_:
                    continue
                index = int(id_)-1
                # A root word (HEAD "0") is attached to itself.
                head_index = int(head)-1 if head != '0' else index
                sent['words'].append(word)
                sent['tags'].append(tag)
                sent['heads'].append(head_index)
                sent['deps'].append('ROOT' if dep == 'root' else dep)
                sent['spaces'].append(space_after == '_')
            sent['entities'] = ['-'] * len(sent['words'])
            sent['heads'], sent['deps'] = projectivize(sent['heads'],
                                                       sent['deps'])
            if oracle_segments:
                segment = Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces'])
                docs.append(segment)
                golds.append(GoldParse(segment, **sent))

            pending.append(sent)
            if raw_text and max_doc_length and len(pending) >= max_doc_length:
                doc, gold = _make_gold(nlp, None, pending)
                pending = []
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
                    return docs, golds

        if raw_text and pending:
            doc, gold = _make_gold(nlp, None, pending)
            docs.append(doc)
            golds.append(gold)
        if limit and len(docs) >= limit:
            return docs, golds
    return docs, golds
|
|
||||||
|
|
||||||
|
|
||||||
def read_conllu(file_):
    """Parse an iterable of CoNLL-U lines into nested lists.

    Returns a list of documents; each document is a list of sentences and
    each sentence is a list of 10-item column lists. A line starting with
    "# newdoc" starts a new document, other "#" lines are ignored, and a
    blank line ends the current sentence.

    Raises ValueError, naming the offending line, when a token line does
    not have exactly 10 tab-separated columns.
    """
    docs = []
    sent = []
    doc = []
    for line in file_:
        if line.startswith('# newdoc'):
            if doc:
                docs.append(doc)
            doc = []
        elif line.startswith('#'):
            continue
        elif not line.strip():
            if sent:
                doc.append(sent)
            sent = []
        else:
            columns = list(line.strip().split('\t'))
            if len(columns) != 10:
                # Report the bad line in the exception itself instead of
                # printing it and raising an empty ValueError.
                raise ValueError(
                    "Expected 10 tab-separated columns in CoNLL-U line, "
                    "found {n}: {line!r}".format(n=len(columns), line=line))
            sent.append(columns)
    # Flush a trailing sentence/document that was not closed by a blank line.
    if sent:
        doc.append(sent)
    if doc:
        docs.append(doc)
    return docs
|
|
||||||
|
|
||||||
|
|
||||||
def _make_gold(nlp, text, sent_annots):
    """Merge per-sentence annotations into one (doc, gold) pair.

    ``sent_annots`` is a list of dicts with 'words', 'tags', 'heads',
    'deps', 'entities' and 'spaces' lists. When ``text`` is None it is
    rebuilt from the words and their trailing-space flags. The document
    is created with nlp.make_doc and the gold parse with GoldParse.
    """
    merged = defaultdict(list)
    copied_fields = ['words', 'tags', 'deps', 'entities', 'spaces']
    for annot in sent_annots:
        # Heads are sentence-relative; shift them by the number of words
        # already merged so they index into the whole document.
        offset = len(merged['words'])
        merged['heads'].extend(offset+head for head in annot['heads'])
        for name in copied_fields:
            merged[name].extend(annot[name])
    # Construct text if necessary
    assert len(merged['words']) == len(merged['spaces'])
    if text is None:
        pieces = []
        for word, space in zip(merged['words'], merged['spaces']):
            pieces.append(word+' '*space)
        text = ''.join(pieces)
    doc = nlp.make_doc(text)
    # 'spaces' is only used to rebuild the text; it is not passed to GoldParse.
    merged.pop('spaces')
    return doc, GoldParse(doc, **merged)
|
|
||||||
|
|
||||||
#############################
|
|
||||||
# Data transforms for spaCy #
|
|
||||||
#############################
|
|
||||||
|
|
||||||
def golds_to_gold_tuples(docs, golds):
    '''Convert parallel docs and GoldParse objects into the 'tuples' format
    used by begin_training.

    Each result item is (text, [((ids, words, tags, heads, labels, iob), [])]),
    built from doc.text and the transposed gold.orig_annot rows.'''
    gold_tuples = []
    for doc, gold in zip(docs, golds):
        # Transpose the per-token rows into six parallel columns.
        ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
        annotation = (ids, words, tags, heads, labels, iob)
        gold_tuples.append((doc.text, [(annotation, [])]))
    return gold_tuples
|
|
||||||
|
|
||||||
|
|
||||||
##############
|
|
||||||
# Evaluation #
|
|
||||||
##############
|
|
||||||
|
|
||||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
    """Parse the raw text, write the parses as CoNLL-U, and score them.

    Reads paragraphs from ``text_loc``, runs them through ``nlp.pipe``,
    writes the output to ``sys_loc`` and scores it against ``gold_loc``
    with conll17_ud_eval. Returns (docs, scores). ``limit`` is accepted
    but not used.
    """
    with text_loc.open('r', encoding='utf8') as text_file:
        paragraphs = split_text(text_file.read())
    docs = list(nlp.pipe(paragraphs))

    # Write the system output first so it can be read back in the same
    # format as the gold file.
    with sys_loc.open('w', encoding='utf8') as out_file:
        write_conllu(docs, out_file)

    with gold_loc.open('r', encoding='utf8') as gold_file:
        gold_ud = conll17_ud_eval.load_conllu(gold_file)
    with sys_loc.open('r', encoding='utf8') as sys_file:
        sys_ud = conll17_ud_eval.load_conllu(sys_file)

    return docs, conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
|
||||||
|
|
||||||
|
|
||||||
def write_conllu(docs, file_):
    """Write ``docs`` to ``file_`` in CoNLL-U format.

    Runs of tokens matched by the 'subtok' dependency pattern are merged
    in place first. Each doc is then written with a "# newdoc" header, and
    each sentence with "# sent_id" and "# text" headers followed by one
    entry per token and a blank line.
    """
    merger = Matcher(docs[0].vocab)
    merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
    for doc_index, doc in enumerate(docs):
        # Collect every character range before merging anything: merging
        # changes token indices, character offsets stay usable.
        char_ranges = []
        for _, start, end in merger(doc):
            span = doc[start:end+1]
            char_ranges.append((span.start_char, span.end_char))
        for start_char, end_char in char_ranges:
            doc.merge(start_char, end_char)

        file_.write("# newdoc id = {i}\n".format(i=doc_index))
        for sent_index, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=doc_index, j=sent_index))
            file_.write("# text = {text}\n".format(text=sent.text))
            for token_index, token in enumerate(sent):
                file_.write(token._.get_conllu_lines(token_index) + '\n')
            file_.write('\n')
|
|
||||||
|
|
||||||
|
|
||||||
def print_progress(itn, losses, ud_scores):
    """Print one tab-separated progress row for iteration ``itn``.

    The row shows the parser loss from ``losses`` and the F1 percentages
    for LAS, UAS, XPOS, Sentences and Words from ``ud_scores``. The column
    header is printed before the row when ``itn`` is 0.
    """
    f1_sources = (
        ('words', 'Words'),
        ('sents', 'Sentences'),
        ('tags', 'XPOS'),
        ('uas', 'UAS'),
        ('las', 'LAS'),
    )
    fields = {
        'dep_loss': losses.get('parser', 0.0),
        'tag_loss': losses.get('tagger', 0.0),
    }
    for field_name, metric in f1_sources:
        fields[field_name] = ud_scores[metric].f1 * 100

    if itn == 0:
        print('\t'.join(['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']))

    column_specs = [
        '{:d}',
        '{dep_loss:.1f}',
        '{las:.1f}',
        '{uas:.1f}',
        '{tags:.1f}',
        '{sents:.1f}',
        '{words:.1f}',
    ]
    row_template = '\t'.join(column_specs)
    print(row_template.format(itn, **fields))
|
|
||||||
|
|
||||||
#def get_sent_conllu(sent, sent_id):
|
|
||||||
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
|
||||||
|
|
||||||
def get_token_conllu(token, i):
    """Return the CoNLL-U text for ``token`` at 0-based sentence index ``i``.

    The result is one tab-separated 10-column line for the token. When the
    token begins a fused (multi-word) token, a range line "start-end" with
    the surface form comes first, separated by a newline.
    """
    lines = []
    if token._.begins_fused:
        # n ends up as the number of tokens in the fused group: this token
        # plus every following neighbour flagged inside_fused.
        n = 1
        while token.nbor(n)._.inside_fused:
            n += 1
        # Word IDs are 1-based (see str(i+1) below), so the group covers
        # IDs i+1 .. i+n.
        id_ = '%d-%d' % (i+1, i+n)
        # The range record is a single tab-separated line, like any other.
        lines.append('\t'.join([id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']))
    if token.head.i == token.i:
        # A token that is its own head is the root, written as HEAD 0.
        head = 0
    else:
        head = i + (token.head.i - token.i) + 1
    fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
              str(head), token.dep_.lower(), '_', '_']
    lines.append('\t'.join(fields))
    return '\n'.join(lines)
|
|
||||||
|
|
||||||
# Register the custom Token attributes used when writing CoNLL-U output:
# one method extension plus two boolean flags that default to False.
Token.set_extension('get_conllu_lines', method=get_token_conllu)
for _fused_flag in ('begins_fused', 'inside_fused'):
    Token.set_extension(_fused_flag, default=False)
|
|
||||||
|
|
||||||
|
|
||||||
##################
|
|
||||||
# Initialization #
|
|
||||||
##################
|
|
||||||
|
|
||||||
|
|
||||||
def load_nlp(corpus, config):
    """Create a blank pipeline for the language named by ``corpus``.

    The language code is the part of ``corpus`` before the first
    underscore. When ``config.vectors`` is set, the vocab is loaded from
    the 'vocab' directory under that path.
    """
    lang_code = corpus.partition('_')[0]
    nlp = spacy.blank(lang_code)
    if not config.vectors:
        return nlp
    nlp.vocab.from_disk(Path(config.vectors) / 'vocab')
    return nlp
|
|
||||||
|
|
||||||
def initialize_pipeline(nlp, docs, golds, config):
    """Add a parser and a tagger to ``nlp`` and start training.

    Multitask objectives are added to the parser according to
    ``config.multitask_tag`` and ``config.multitask_sent``. Every non-None
    tag found in ``golds`` is registered with the tagger. Returns whatever
    nlp.begin_training returns.
    """
    nlp.add_pipe(nlp.create_pipe('parser'))
    requested_objectives = (
        (config.multitask_tag, 'tag'),
        (config.multitask_sent, 'sent_start'),
    )
    for enabled, objective in requested_objectives:
        if enabled:
            nlp.parser.add_multitask_objective(objective)
    nlp.add_pipe(nlp.create_pipe('tagger'))
    for gold in golds:
        for tag in gold.tags:
            if tag is None:
                continue
            nlp.tagger.add_label(tag)

    def get_gold_tuples():
        return golds_to_gold_tuples(docs, golds)

    return nlp.begin_training(get_gold_tuples)
|
|
||||||
|
|
||||||
|
|
||||||
########################
|
|
||||||
# Command line helpers #
|
|
||||||
########################
|
|
||||||
|
|
||||||
class Config(object):
    """Training settings, stored as plain attributes."""

    def __init__(self, vectors=None, max_doc_length=10, multitask_tag=True,
                 multitask_sent=True, nr_epoch=30, batch_size=1000, dropout=0.2):
        # Assign each setting explicitly. Copying locals() would also
        # store ``self`` as an attribute of itself.
        self.vectors = vectors
        self.max_doc_length = max_doc_length
        self.multitask_tag = multitask_tag
        self.multitask_sent = multitask_sent
        self.nr_epoch = nr_epoch
        self.batch_size = batch_size
        self.dropout = dropout

    @classmethod
    def load(cls, loc):
        """Build a Config from the JSON object stored in the file at ``loc``.

        The JSON keys are passed as keyword arguments to the constructor,
        so an unknown key raises TypeError.
        """
        with Path(loc).open('r', encoding='utf8') as file_:
            cfg = json.load(file_)
        return cls(**cfg)
|
|
||||||
|
|
||||||
|
|
||||||
class Dataset(object):
    """Locate the .conllu and .txt files for one section of a treebank.

    ``path`` is a directory (anything providing ``iterdir()``) and
    ``section`` a substring of the file names, e.g. 'train' or 'dev'.
    After construction ``conllu`` and ``text`` hold the matching paths and
    ``lang`` the language code taken from the .conllu file name.

    Raises IOError when either file cannot be found.
    """

    def __init__(self, path, section):
        self.path = path
        self.section = section
        self.conllu = None
        self.text = None
        for file_path in self.path.iterdir():
            name = file_path.parts[-1]
            if section in name and name.endswith('conllu'):
                self.conllu = file_path
            elif section in name and name.endswith('txt'):
                self.text = file_path
        if self.conllu is None:
            # Name the file type that is actually missing.
            msg = "Could not find .conllu file in {path} for {section}"
            raise IOError(msg.format(section=section, path=path))
        if self.text is None:
            # Fail here with a clear message rather than later, when the
            # missing path is used.
            msg = "Could not find .txt file in {path} for {section}"
            raise IOError(msg.format(section=section, path=path))
        # File names look like "<lang>[_<treebank>]-ud-<section>.conllu".
        self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
|
|
||||||
|
|
||||||
|
|
||||||
class TreebankPaths(object):
|
|
||||||
def __init__(self, ud_path, treebank, **cfg):
|
|
||||||
self.train = Dataset(ud_path / treebank, 'train')
|
|
||||||
self.dev = Dataset(ud_path / treebank, 'dev')
|
|
||||||
self.lang = self.train.lang
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
|
|
||||||
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
|
||||||
"positional", None, str),
|
|
||||||
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
|
||||||
config=("Path to json formatted config file", "positional"),
|
|
||||||
limit=("Size limit", "option", "n", int)
|
|
||||||
)
|
|
||||||
def main(ud_dir, parses_dir, config, corpus, limit=0):
|
|
||||||
lang.zh.Chinese.Defaults.use_jieba = False
|
|
||||||
lang.ja.Japanese.Defaults.use_janome = False
|
|
||||||
|
|
||||||
random.seed(0)
|
|
||||||
numpy.random.seed(0)
|
|
||||||
|
|
||||||
config = Config.load(config)
|
|
||||||
paths = TreebankPaths(ud_dir, corpus)
|
|
||||||
if not (parses_dir / corpus).exists():
|
|
||||||
(parses_dir / corpus).mkdir()
|
|
||||||
print("Train and evaluate", corpus, "using lang", paths.lang)
|
|
||||||
nlp = load_nlp(paths.lang, config)
|
|
||||||
|
|
||||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
|
||||||
max_doc_length=config.max_doc_length, limit=limit)
|
|
||||||
|
|
||||||
optimizer = initialize_pipeline(nlp, docs, golds, config)
|
|
||||||
|
|
||||||
batch_sizes = compounding(config.batch_size //10, config.batch_size, 1.001)
|
|
||||||
for i in range(config.nr_epoch):
|
|
||||||
docs = [nlp.make_doc(doc.text) for doc in docs]
|
|
||||||
Xs = list(zip(docs, golds))
|
|
||||||
random.shuffle(Xs)
|
|
||||||
batches = minibatch_by_words(Xs, size=batch_sizes)
|
|
||||||
losses = {}
|
|
||||||
n_train_words = sum(len(doc) for doc in docs)
|
|
||||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
|
||||||
for batch in batches:
|
|
||||||
batch_docs, batch_gold = zip(*batch)
|
|
||||||
pbar.update(sum(len(doc) for doc in batch_docs))
|
|
||||||
nlp.update(batch_docs, batch_gold, sgd=optimizer,
|
|
||||||
drop=config.dropout, losses=losses)
|
|
||||||
|
|
||||||
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
|
|
||||||
with nlp.use_params(optimizer.averages):
|
|
||||||
parsed_docs, scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
|
|
||||||
print_progress(i, losses, scores)
|
|
||||||
_render_parses(i, parsed_docs[:50])
|
|
||||||
|
|
||||||
|
|
||||||
def _render_parses(i, to_render):
|
|
||||||
to_render[0].user_data['title'] = "Batch %d" % i
|
|
||||||
with Path('/tmp/parses.html').open('w') as file_:
|
|
||||||
html = displacy.render(to_render[:5], style='dep', page=True)
|
|
||||||
file_.write(html)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
plac.call(main)
|
|
|
@ -1,6 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import six
|
||||||
import ftfy
|
import ftfy
|
||||||
import sys
|
import sys
|
||||||
import ujson
|
import ujson
|
||||||
|
@ -46,10 +47,9 @@ is_windows = sys.platform.startswith('win')
|
||||||
is_linux = sys.platform.startswith('linux')
|
is_linux = sys.platform.startswith('linux')
|
||||||
is_osx = sys.platform == 'darwin'
|
is_osx = sys.platform == 'darwin'
|
||||||
|
|
||||||
# See: https://github.com/benjaminp/six/blob/master/six.py
|
is_python2 = six.PY2
|
||||||
is_python2 = sys.version_info[0] == 2
|
is_python3 = six.PY3
|
||||||
is_python3 = sys.version_info[0] == 3
|
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1]<5)
|
||||||
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)
|
|
||||||
|
|
||||||
if is_python2:
|
if is_python2:
|
||||||
bytes_ = str
|
bytes_ = str
|
||||||
|
|
360
spacy/gold.pyx
360
spacy/gold.pyx
|
@ -3,25 +3,16 @@
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
import ujson
|
||||||
import random
|
import random
|
||||||
import cytoolz
|
import cytoolz
|
||||||
import itertools
|
import itertools
|
||||||
import numpy
|
|
||||||
import tempfile
|
|
||||||
import shutil
|
|
||||||
from pathlib import Path
|
|
||||||
import msgpack
|
|
||||||
|
|
||||||
import ujson
|
|
||||||
|
|
||||||
from . import _align
|
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .tokens import Doc
|
from .tokens import Doc
|
||||||
from . import util
|
from . import util
|
||||||
from .util import minibatch, itershuffle
|
from .util import minibatch
|
||||||
from .compat import json_dumps
|
|
||||||
|
|
||||||
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
|
|
||||||
|
|
||||||
def tags_to_entities(tags):
|
def tags_to_entities(tags):
|
||||||
entities = []
|
entities = []
|
||||||
|
@ -68,138 +59,160 @@ def merge_sents(sents):
|
||||||
return [(m_deps, m_brackets)]
|
return [(m_deps, m_brackets)]
|
||||||
|
|
||||||
|
|
||||||
punct_re = re.compile(r'\W')
|
|
||||||
def align(cand_words, gold_words):
|
def align(cand_words, gold_words):
|
||||||
|
cost, edit_path = _min_edit_path(cand_words, gold_words)
|
||||||
|
alignment = []
|
||||||
|
i_of_gold = 0
|
||||||
|
for move in edit_path:
|
||||||
|
if move == 'M':
|
||||||
|
alignment.append(i_of_gold)
|
||||||
|
i_of_gold += 1
|
||||||
|
elif move == 'S':
|
||||||
|
alignment.append(None)
|
||||||
|
i_of_gold += 1
|
||||||
|
elif move == 'D':
|
||||||
|
alignment.append(None)
|
||||||
|
elif move == 'I':
|
||||||
|
i_of_gold += 1
|
||||||
|
else:
|
||||||
|
raise Exception(move)
|
||||||
|
return alignment
|
||||||
|
|
||||||
|
|
||||||
|
punct_re = re.compile(r'\W')
|
||||||
|
|
||||||
|
|
||||||
|
def _min_edit_path(cand_words, gold_words):
|
||||||
|
cdef:
|
||||||
|
Pool mem
|
||||||
|
int i, j, n_cand, n_gold
|
||||||
|
int* curr_costs
|
||||||
|
int* prev_costs
|
||||||
|
|
||||||
|
# TODO: Fix this --- just do it properly, make the full edit matrix and
|
||||||
|
# then walk back over it...
|
||||||
|
# Preprocess inputs
|
||||||
|
cand_words = [punct_re.sub('', w).lower() for w in cand_words]
|
||||||
|
gold_words = [punct_re.sub('', w).lower() for w in gold_words]
|
||||||
|
|
||||||
if cand_words == gold_words:
|
if cand_words == gold_words:
|
||||||
alignment = numpy.arange(len(cand_words))
|
return 0, ''.join(['M' for _ in gold_words])
|
||||||
return 0, alignment, alignment, {}, {}
|
mem = Pool()
|
||||||
cand_words = [w.replace(' ', '') for w in cand_words]
|
n_cand = len(cand_words)
|
||||||
gold_words = [w.replace(' ', '') for w in gold_words]
|
n_gold = len(gold_words)
|
||||||
cost, i2j, j2i, matrix = _align.align(cand_words, gold_words)
|
# Levenshtein distance, except we need the history, and we may want
|
||||||
i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in cand_words],
|
# different costs. Mark operations with a string, and score the history
|
||||||
[len(w) for w in gold_words])
|
# using _edit_cost.
|
||||||
for i, j in list(i2j_multi.items()):
|
previous_row = []
|
||||||
if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
|
prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
||||||
i2j[i] = j
|
curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
||||||
i2j_multi.pop(i)
|
for i in range(n_gold + 1):
|
||||||
for j, i in list(j2i_multi.items()):
|
cell = ''
|
||||||
if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
|
for j in range(i):
|
||||||
j2i[j] = i
|
cell += 'I'
|
||||||
j2i_multi.pop(j)
|
previous_row.append('I' * i)
|
||||||
return cost, i2j, j2i, i2j_multi, j2i_multi
|
prev_costs[i] = i
|
||||||
|
for i, cand in enumerate(cand_words):
|
||||||
|
current_row = ['D' * (i + 1)]
|
||||||
|
curr_costs[0] = i+1
|
||||||
|
for j, gold in enumerate(gold_words):
|
||||||
|
if gold.lower() == cand.lower():
|
||||||
|
s_cost = prev_costs[j]
|
||||||
|
i_cost = curr_costs[j] + 1
|
||||||
|
d_cost = prev_costs[j + 1] + 1
|
||||||
|
else:
|
||||||
|
s_cost = prev_costs[j] + 1
|
||||||
|
i_cost = curr_costs[j] + 1
|
||||||
|
d_cost = prev_costs[j + 1] + (1 if cand else 0)
|
||||||
|
|
||||||
|
if s_cost <= i_cost and s_cost <= d_cost:
|
||||||
|
best_cost = s_cost
|
||||||
|
best_hist = previous_row[j] + ('M' if gold == cand else 'S')
|
||||||
|
elif i_cost <= s_cost and i_cost <= d_cost:
|
||||||
|
best_cost = i_cost
|
||||||
|
best_hist = current_row[j] + 'I'
|
||||||
|
else:
|
||||||
|
best_cost = d_cost
|
||||||
|
best_hist = previous_row[j + 1] + 'D'
|
||||||
|
|
||||||
|
current_row.append(best_hist)
|
||||||
|
curr_costs[j+1] = best_cost
|
||||||
|
previous_row = current_row
|
||||||
|
for j in range(len(gold_words) + 1):
|
||||||
|
prev_costs[j] = curr_costs[j]
|
||||||
|
curr_costs[j] = 0
|
||||||
|
|
||||||
|
return prev_costs[n_gold], previous_row[-1]
|
||||||
|
|
||||||
|
|
||||||
class GoldCorpus(object):
|
class GoldCorpus(object):
|
||||||
"""An annotated corpus, using the JSON file format. Manages
|
"""An annotated corpus, using the JSON file format. Manages
|
||||||
annotations for tagging, dependency parsing and NER."""
|
annotations for tagging, dependency parsing and NER."""
|
||||||
def __init__(self, train, dev, gold_preproc=False, limit=None):
|
def __init__(self, train_path, dev_path, gold_preproc=True, limit=None):
|
||||||
"""Create a GoldCorpus.
|
"""Create a GoldCorpus.
|
||||||
|
|
||||||
train_path (unicode or Path): File or directory of training data.
|
train_path (unicode or Path): File or directory of training data.
|
||||||
dev_path (unicode or Path): File or directory of development data.
|
dev_path (unicode or Path): File or directory of development data.
|
||||||
RETURNS (GoldCorpus): The newly created object.
|
RETURNS (GoldCorpus): The newly created object.
|
||||||
"""
|
"""
|
||||||
|
self.train_path = util.ensure_path(train_path)
|
||||||
|
self.dev_path = util.ensure_path(dev_path)
|
||||||
self.limit = limit
|
self.limit = limit
|
||||||
if isinstance(train, str) or isinstance(train, Path):
|
self.train_locs = self.walk_corpus(self.train_path)
|
||||||
train = self.read_tuples(self.walk_corpus(train))
|
self.dev_locs = self.walk_corpus(self.dev_path)
|
||||||
dev = self.read_tuples(self.walk_corpus(dev))
|
|
||||||
|
|
||||||
# Write temp directory with one doc per file, so we can shuffle
|
@property
|
||||||
# and stream
|
def train_tuples(self):
|
||||||
self.tmp_dir = Path(tempfile.mkdtemp())
|
|
||||||
self.write_msgpack(self.tmp_dir / 'train', train)
|
|
||||||
self.write_msgpack(self.tmp_dir / 'dev', dev)
|
|
||||||
|
|
||||||
def __del__(self):
|
|
||||||
shutil.rmtree(self.tmp_dir)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def write_msgpack(directory, doc_tuples):
|
|
||||||
if not directory.exists():
|
|
||||||
directory.mkdir()
|
|
||||||
for i, doc_tuple in enumerate(doc_tuples):
|
|
||||||
with open(directory / '{}.msg'.format(i), 'wb') as file_:
|
|
||||||
msgpack.dump([doc_tuple], file_, use_bin_type=True, encoding='utf8')
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def walk_corpus(path):
|
|
||||||
path = util.ensure_path(path)
|
|
||||||
if not path.is_dir():
|
|
||||||
return [path]
|
|
||||||
paths = [path]
|
|
||||||
locs = []
|
|
||||||
seen = set()
|
|
||||||
for path in paths:
|
|
||||||
if str(path) in seen:
|
|
||||||
continue
|
|
||||||
seen.add(str(path))
|
|
||||||
if path.parts[-1].startswith('.'):
|
|
||||||
continue
|
|
||||||
elif path.is_dir():
|
|
||||||
paths.extend(path.iterdir())
|
|
||||||
elif path.parts[-1].endswith('.json'):
|
|
||||||
locs.append(path)
|
|
||||||
return locs
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def read_tuples(locs, limit=0):
|
|
||||||
i = 0
|
i = 0
|
||||||
for loc in locs:
|
for loc in self.train_locs:
|
||||||
loc = util.ensure_path(loc)
|
gold_tuples = read_json_file(loc)
|
||||||
if loc.parts[-1].endswith('json'):
|
|
||||||
gold_tuples = read_json_file(loc)
|
|
||||||
elif loc.parts[-1].endswith('msg'):
|
|
||||||
with loc.open('rb') as file_:
|
|
||||||
gold_tuples = msgpack.load(file_, encoding='utf8')
|
|
||||||
else:
|
|
||||||
msg = "Cannot read from file: %s. Supported formats: .json, .msg"
|
|
||||||
raise ValueError(msg % loc)
|
|
||||||
for item in gold_tuples:
|
for item in gold_tuples:
|
||||||
yield item
|
yield item
|
||||||
i += len(item[1])
|
i += len(item[1])
|
||||||
if limit and i >= limit:
|
if self.limit and i >= self.limit:
|
||||||
break
|
break
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def dev_tuples(self):
|
def dev_tuples(self):
|
||||||
locs = (self.tmp_dir / 'dev').iterdir()
|
i = 0
|
||||||
yield from self.read_tuples(locs, limit=self.limit)
|
for loc in self.dev_locs:
|
||||||
|
gold_tuples = read_json_file(loc)
|
||||||
@property
|
for item in gold_tuples:
|
||||||
def train_tuples(self):
|
yield item
|
||||||
locs = (self.tmp_dir / 'train').iterdir()
|
i += len(item[1])
|
||||||
yield from self.read_tuples(locs, limit=self.limit)
|
if self.limit and i >= self.limit:
|
||||||
|
break
|
||||||
|
|
||||||
def count_train(self):
|
def count_train(self):
|
||||||
n = 0
|
n = 0
|
||||||
i = 0
|
i = 0
|
||||||
for raw_text, paragraph_tuples in self.train_tuples:
|
for raw_text, paragraph_tuples in self.train_tuples:
|
||||||
for sent_tuples, brackets in paragraph_tuples:
|
n += sum([len(s[0][1]) for s in paragraph_tuples])
|
||||||
n += len(sent_tuples[1])
|
|
||||||
if self.limit and i >= self.limit:
|
if self.limit and i >= self.limit:
|
||||||
break
|
break
|
||||||
i += len(paragraph_tuples)
|
i += len(paragraph_tuples)
|
||||||
return n
|
return n
|
||||||
|
|
||||||
def train_docs(self, nlp, gold_preproc=False, max_length=None,
|
def train_docs(self, nlp, gold_preproc=False,
|
||||||
noise_level=0.0):
|
projectivize=False, max_length=None,
|
||||||
locs = list((self.tmp_dir / 'train').iterdir())
|
noise_level=0.0):
|
||||||
random.shuffle(locs)
|
train_tuples = self.train_tuples
|
||||||
train_tuples = self.read_tuples(locs, limit=self.limit)
|
if projectivize:
|
||||||
|
train_tuples = nonproj.preprocess_training_data(
|
||||||
|
self.train_tuples, label_freq_cutoff=100)
|
||||||
|
random.shuffle(train_tuples)
|
||||||
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
||||||
max_length=max_length,
|
max_length=max_length,
|
||||||
noise_level=noise_level,
|
noise_level=noise_level)
|
||||||
make_projective=True)
|
|
||||||
yield from gold_docs
|
yield from gold_docs
|
||||||
|
|
||||||
def dev_docs(self, nlp, gold_preproc=False):
|
def dev_docs(self, nlp, gold_preproc=False):
|
||||||
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples,
|
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
|
||||||
gold_preproc=gold_preproc)
|
|
||||||
yield from gold_docs
|
yield from gold_docs
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
|
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
|
||||||
noise_level=0.0, make_projective=False):
|
noise_level=0.0):
|
||||||
for raw_text, paragraph_tuples in tuples:
|
for raw_text, paragraph_tuples in tuples:
|
||||||
if gold_preproc:
|
if gold_preproc:
|
||||||
raw_text = None
|
raw_text = None
|
||||||
|
@ -207,7 +220,7 @@ class GoldCorpus(object):
|
||||||
paragraph_tuples = merge_sents(paragraph_tuples)
|
paragraph_tuples = merge_sents(paragraph_tuples)
|
||||||
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
||||||
gold_preproc, noise_level=noise_level)
|
gold_preproc, noise_level=noise_level)
|
||||||
golds = cls._make_golds(docs, paragraph_tuples, make_projective)
|
golds = cls._make_golds(docs, paragraph_tuples)
|
||||||
for doc, gold in zip(docs, golds):
|
for doc, gold in zip(docs, golds):
|
||||||
if (not max_length) or len(doc) < max_length:
|
if (not max_length) or len(doc) < max_length:
|
||||||
yield doc, gold
|
yield doc, gold
|
||||||
|
@ -224,18 +237,35 @@ class GoldCorpus(object):
|
||||||
for (sent_tuples, brackets) in paragraph_tuples]
|
for (sent_tuples, brackets) in paragraph_tuples]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _make_golds(cls, docs, paragraph_tuples, make_projective):
|
def _make_golds(cls, docs, paragraph_tuples):
|
||||||
assert len(docs) == len(paragraph_tuples)
|
assert len(docs) == len(paragraph_tuples)
|
||||||
if len(docs) == 1:
|
if len(docs) == 1:
|
||||||
return [GoldParse.from_annot_tuples(docs[0],
|
return [GoldParse.from_annot_tuples(docs[0],
|
||||||
paragraph_tuples[0][0],
|
paragraph_tuples[0][0])]
|
||||||
make_projective=make_projective)]
|
|
||||||
else:
|
else:
|
||||||
return [GoldParse.from_annot_tuples(doc, sent_tuples,
|
return [GoldParse.from_annot_tuples(doc, sent_tuples)
|
||||||
make_projective=make_projective)
|
|
||||||
for doc, (sent_tuples, brackets)
|
for doc, (sent_tuples, brackets)
|
||||||
in zip(docs, paragraph_tuples)]
|
in zip(docs, paragraph_tuples)]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def walk_corpus(path):
|
||||||
|
if not path.is_dir():
|
||||||
|
return [path]
|
||||||
|
paths = [path]
|
||||||
|
locs = []
|
||||||
|
seen = set()
|
||||||
|
for path in paths:
|
||||||
|
if str(path) in seen:
|
||||||
|
continue
|
||||||
|
seen.add(str(path))
|
||||||
|
if path.parts[-1].startswith('.'):
|
||||||
|
continue
|
||||||
|
elif path.is_dir():
|
||||||
|
paths.extend(path.iterdir())
|
||||||
|
elif path.parts[-1].endswith('.json'):
|
||||||
|
locs.append(path)
|
||||||
|
return locs
|
||||||
|
|
||||||
|
|
||||||
def add_noise(orig, noise_level):
|
def add_noise(orig, noise_level):
|
||||||
if random.random() >= noise_level:
|
if random.random() >= noise_level:
|
||||||
|
@ -267,7 +297,11 @@ def read_json_file(loc, docs_filter=None, limit=None):
|
||||||
for filename in loc.iterdir():
|
for filename in loc.iterdir():
|
||||||
yield from read_json_file(loc / filename, limit=limit)
|
yield from read_json_file(loc / filename, limit=limit)
|
||||||
else:
|
else:
|
||||||
for doc in _json_iterate(loc):
|
with loc.open('r', encoding='utf8') as file_:
|
||||||
|
docs = ujson.load(file_)
|
||||||
|
if limit is not None:
|
||||||
|
docs = docs[:limit]
|
||||||
|
for doc in docs:
|
||||||
if docs_filter is not None and not docs_filter(doc):
|
if docs_filter is not None and not docs_filter(doc):
|
||||||
continue
|
continue
|
||||||
paragraphs = []
|
paragraphs = []
|
||||||
|
@ -297,56 +331,6 @@ def read_json_file(loc, docs_filter=None, limit=None):
|
||||||
yield [paragraph.get('raw', None), sents]
|
yield [paragraph.get('raw', None), sents]
|
||||||
|
|
||||||
|
|
||||||
def _json_iterate(loc):
|
|
||||||
# We should've made these files jsonl...But since we didn't, parse out
|
|
||||||
# the docs one-by-one to reduce memory usage.
|
|
||||||
# It's okay to read in the whole file -- just don't parse it into JSON.
|
|
||||||
cdef bytes py_raw
|
|
||||||
loc = util.ensure_path(loc)
|
|
||||||
with loc.open('rb') as file_:
|
|
||||||
py_raw = file_.read()
|
|
||||||
raw = <char*>py_raw
|
|
||||||
cdef int square_depth = 0
|
|
||||||
cdef int curly_depth = 0
|
|
||||||
cdef int inside_string = 0
|
|
||||||
cdef int escape = 0
|
|
||||||
cdef int start = -1
|
|
||||||
cdef char c
|
|
||||||
cdef char quote = ord('"')
|
|
||||||
cdef char backslash = ord('\\')
|
|
||||||
cdef char open_square = ord('[')
|
|
||||||
cdef char close_square = ord(']')
|
|
||||||
cdef char open_curly = ord('{')
|
|
||||||
cdef char close_curly = ord('}')
|
|
||||||
for i in range(len(py_raw)):
|
|
||||||
c = raw[i]
|
|
||||||
if c == backslash:
|
|
||||||
escape = True
|
|
||||||
continue
|
|
||||||
if escape:
|
|
||||||
escape = False
|
|
||||||
continue
|
|
||||||
if c == quote:
|
|
||||||
inside_string = not inside_string
|
|
||||||
continue
|
|
||||||
if inside_string:
|
|
||||||
continue
|
|
||||||
if c == open_square:
|
|
||||||
square_depth += 1
|
|
||||||
elif c == close_square:
|
|
||||||
square_depth -= 1
|
|
||||||
elif c == open_curly:
|
|
||||||
if square_depth == 1 and curly_depth == 0:
|
|
||||||
start = i
|
|
||||||
curly_depth += 1
|
|
||||||
elif c == close_curly:
|
|
||||||
curly_depth -= 1
|
|
||||||
if square_depth == 1 and curly_depth == 0:
|
|
||||||
py_str = py_raw[start : i+1].decode('utf8')
|
|
||||||
yield ujson.loads(py_str)
|
|
||||||
start = -1
|
|
||||||
|
|
||||||
|
|
||||||
def iob_to_biluo(tags):
|
def iob_to_biluo(tags):
|
||||||
out = []
|
out = []
|
||||||
curr_label = None
|
curr_label = None
|
||||||
|
@ -450,21 +434,8 @@ cdef class GoldParse:
|
||||||
self.labels = [None] * len(doc)
|
self.labels = [None] * len(doc)
|
||||||
self.ner = [None] * len(doc)
|
self.ner = [None] * len(doc)
|
||||||
|
|
||||||
# This needs to be done before we align the words
|
self.cand_to_gold = align([t.orth_ for t in doc], words)
|
||||||
if make_projective and heads is not None and deps is not None:
|
self.gold_to_cand = align(words, [t.orth_ for t in doc])
|
||||||
heads, deps = nonproj.projectivize(heads, deps)
|
|
||||||
|
|
||||||
# Do many-to-one alignment for misaligned tokens.
|
|
||||||
# If we over-segment, we'll have one gold word that covers a sequence
|
|
||||||
# of predicted words
|
|
||||||
# If we under-segment, we'll have one predicted word that covers a
|
|
||||||
# sequence of gold words.
|
|
||||||
# If we "mis-segment", we'll have a sequence of predicted words covering
|
|
||||||
# a sequence of gold words. That's many-to-many -- we don't do that.
|
|
||||||
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
|
|
||||||
|
|
||||||
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
|
|
||||||
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
|
|
||||||
|
|
||||||
annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
|
annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
|
||||||
self.orig_annot = list(zip(*annot_tuples))
|
self.orig_annot = list(zip(*annot_tuples))
|
||||||
|
@ -472,47 +443,12 @@ cdef class GoldParse:
|
||||||
for i, gold_i in enumerate(self.cand_to_gold):
|
for i, gold_i in enumerate(self.cand_to_gold):
|
||||||
if doc[i].text.isspace():
|
if doc[i].text.isspace():
|
||||||
self.words[i] = doc[i].text
|
self.words[i] = doc[i].text
|
||||||
self.tags[i] = '_SP'
|
self.tags[i] = 'SP'
|
||||||
self.heads[i] = None
|
self.heads[i] = None
|
||||||
self.labels[i] = None
|
self.labels[i] = None
|
||||||
self.ner[i] = 'O'
|
self.ner[i] = 'O'
|
||||||
if gold_i is None:
|
if gold_i is None:
|
||||||
if i in i2j_multi:
|
pass
|
||||||
self.words[i] = words[i2j_multi[i]]
|
|
||||||
self.tags[i] = tags[i2j_multi[i]]
|
|
||||||
is_last = i2j_multi[i] != i2j_multi.get(i+1)
|
|
||||||
is_first = i2j_multi[i] != i2j_multi.get(i-1)
|
|
||||||
# Set next word in multi-token span as head, until last
|
|
||||||
if not is_last:
|
|
||||||
self.heads[i] = i+1
|
|
||||||
self.labels[i] = 'subtok'
|
|
||||||
else:
|
|
||||||
self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
|
|
||||||
self.labels[i] = deps[i2j_multi[i]]
|
|
||||||
# Now set NER...This is annoying because if we've split
|
|
||||||
# got an entity word split into two, we need to adjust the
|
|
||||||
# BILOU tags. We can't have BB or LL etc.
|
|
||||||
# Case 1: O -- easy.
|
|
||||||
ner_tag = entities[i2j_multi[i]]
|
|
||||||
if ner_tag == 'O':
|
|
||||||
self.ner[i] = 'O'
|
|
||||||
# Case 2: U. This has to become a B I* L sequence.
|
|
||||||
elif ner_tag.startswith('U-'):
|
|
||||||
if is_first:
|
|
||||||
self.ner[i] = ner_tag.replace('U-', 'B-', 1)
|
|
||||||
elif is_last:
|
|
||||||
self.ner[i] = ner_tag.replace('U-', 'L-', 1)
|
|
||||||
else:
|
|
||||||
self.ner[i] = ner_tag.replace('U-', 'I-', 1)
|
|
||||||
# Case 3: L. If not last, change to I.
|
|
||||||
elif ner_tag.startswith('L-'):
|
|
||||||
if is_last:
|
|
||||||
self.ner[i] = ner_tag
|
|
||||||
else:
|
|
||||||
self.ner[i] = ner_tag.replace('L-', 'I-', 1)
|
|
||||||
# Case 4: I. Stays correct
|
|
||||||
elif ner_tag.startswith('I-'):
|
|
||||||
self.ner[i] = ner_tag
|
|
||||||
else:
|
else:
|
||||||
self.words[i] = words[gold_i]
|
self.words[i] = words[gold_i]
|
||||||
self.tags[i] = tags[gold_i]
|
self.tags[i] = tags[gold_i]
|
||||||
|
@ -527,6 +463,10 @@ cdef class GoldParse:
|
||||||
if cycle is not None:
|
if cycle is not None:
|
||||||
raise Exception("Cycle found: %s" % cycle)
|
raise Exception("Cycle found: %s" % cycle)
|
||||||
|
|
||||||
|
if make_projective:
|
||||||
|
proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
|
||||||
|
self.heads = proj_heads
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""Get the number of gold-standard tokens.
|
"""Get the number of gold-standard tokens.
|
||||||
|
|
||||||
|
|
|
@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
|
||||||
must my myself
|
must my myself
|
||||||
|
|
||||||
name namely neither never nevertheless next nine no nobody none noone nor not
|
name namely neither never nevertheless next nine no nobody none noone nor not
|
||||||
nothing now nowhere n't
|
nothing now nowhere
|
||||||
|
|
||||||
of off often on once one only onto or other others otherwise our ours ourselves
|
of off often on once one only onto or other others otherwise our ours ourselves
|
||||||
out over own
|
out over own
|
||||||
|
@ -66,6 +66,4 @@ whereafter whereas whereby wherein whereupon wherever whether which while
|
||||||
whither who whoever whole whom whose why will with within without would
|
whither who whoever whole whom whose why will with within without would
|
||||||
|
|
||||||
yet you your yours yourself yourselves
|
yet you your yours yourself yourselves
|
||||||
|
|
||||||
'd 'll 'm 're 's 've
|
|
||||||
""".split())
|
""".split())
|
||||||
|
|
|
@ -6,19 +6,17 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(obj):
|
||||||
doc = obj.doc
|
doc = obj.doc
|
||||||
if not len(doc):
|
np_label = doc.vocab.strings['NP']
|
||||||
return
|
|
||||||
np_label = doc.vocab.strings.add('NP')
|
|
||||||
left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
|
left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']
|
||||||
right_labels = ['flat', 'fixed', 'compound', 'neg']
|
right_labels = ['flat', 'fixed', 'compound', 'neg']
|
||||||
stop_labels = ['punct']
|
stop_labels = ['punct']
|
||||||
np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
|
np_left_deps = [doc.vocab.strings[label] for label in left_labels]
|
||||||
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
|
np_right_deps = [doc.vocab.strings[label] for label in right_labels]
|
||||||
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
|
stop_deps = [doc.vocab.strings[label] for label in stop_labels]
|
||||||
token = doc[0]
|
token = doc[0]
|
||||||
while token and token.i < len(doc):
|
while token and token.i < len(doc):
|
||||||
if token.pos in [PROPN, NOUN, PRON]:
|
if token.pos in [PROPN, NOUN, PRON]:
|
||||||
left, right = noun_bounds(doc, token, np_left_deps, np_right_deps, stop_deps)
|
left, right = noun_bounds(token)
|
||||||
yield left.i, right.i+1, np_label
|
yield left.i, right.i+1, np_label
|
||||||
token = right
|
token = right
|
||||||
token = next_token(token)
|
token = next_token(token)
|
||||||
|
@ -35,7 +33,7 @@ def next_token(token):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
|
def noun_bounds(root):
|
||||||
left_bound = root
|
left_bound = root
|
||||||
for token in reversed(list(root.lefts)):
|
for token in reversed(list(root.lefts)):
|
||||||
if token.dep in np_left_deps:
|
if token.dep in np_left_deps:
|
||||||
|
@ -43,7 +41,7 @@ def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
|
||||||
right_bound = root
|
right_bound = root
|
||||||
for token in root.rights:
|
for token in root.rights:
|
||||||
if (token.dep in np_right_deps):
|
if (token.dep in np_right_deps):
|
||||||
left, right = noun_bounds(doc, token, np_left_deps, np_right_deps, stop_deps)
|
left, right = noun_bounds(token)
|
||||||
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
|
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
|
||||||
doc[left_bound.i: right.i])):
|
doc[left_bound.i: right.i])):
|
||||||
break
|
break
|
||||||
|
|
|
@ -1,15 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
"""
|
|
||||||
Example sentences to test spaCy and its language models.
|
|
||||||
>>> from spacy.lang.fi.examples import sentences
|
|
||||||
>>> docs = nlp.pipe(sentences)
|
|
||||||
"""
|
|
||||||
|
|
||||||
sentences = [
|
|
||||||
"Apple harkitsee ostavansa startup-yrityksen UK:sta 1 miljardilla dollarilla.",
|
|
||||||
"Itseajavat autot siirtävät vakuutusriskin valmistajille.",
|
|
||||||
"San Francisco harkitsee jakelurobottien kieltämistä jalkakäytävillä.",
|
|
||||||
"Lontoo on iso kaupunki Iso-Britanniassa."
|
|
||||||
]
|
|
|
@ -1,26 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
# import the symbols for the attrs you want to overwrite
|
|
||||||
from ...attrs import LIKE_NUM
|
|
||||||
|
|
||||||
# check if token resembles a number
|
|
||||||
|
|
||||||
_num_words = ['nolla', 'yksi', 'kaksi', 'kolme', 'neljä', 'viisi', 'kuusi', 'seitsemän', 'kahdeksan', 'yhdeksän', 'kymmenen', 'yksitoista', 'kaksitoista', 'kolmetoista' 'neljätoista', 'viisitoista', 'kuusitoista', 'seitsemäntoista', 'kahdeksantoista', 'yhdeksäntoista', 'kaksikymmentä', 'kolmekymmentä', 'neljäkymmentä', 'viisikymmentä', 'kuusikymmentä'v, 'seitsemänkymmentä', 'kahdeksankymmentä', 'yhdeksänkymmentä', 'sata', 'tuhat', 'miljoona', 'miljardi', 'triljoona']
|
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
|
||||||
text = text.replace('.', '').replace(',', '')
|
|
||||||
if text.isdigit():
|
|
||||||
return True
|
|
||||||
if text.count('/') == 1:
|
|
||||||
num, denom = text.split('/')
|
|
||||||
if num.isdigit() and denom.isdigit():
|
|
||||||
return True
|
|
||||||
if text in _num_words:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
LEX_ATTRS = {
|
|
||||||
LIKE_NUM: like_num
|
|
||||||
}
|
|
|
@ -79,7 +79,7 @@ pienestä pieni pienin poikki puolesta puolestaan päälle
|
||||||
|
|
||||||
runsaasti
|
runsaasti
|
||||||
|
|
||||||
saakka sama samaa samaan samalla saman samat samoin satojen se
|
saakka sama samaa samaan samalla saman samat samoin sata sataa satojen se
|
||||||
seitsemän sekä sen seuraavat siellä sieltä siihen siinä siis siitä sijaan siksi
|
seitsemän sekä sen seuraavat siellä sieltä siihen siinä siis siitä sijaan siksi
|
||||||
sille silloin sillä silti siltä sinne sinua sinulla sinulle sinulta sinun
|
sille silloin sillä silti siltä sinne sinua sinulla sinulle sinulta sinun
|
||||||
sinussa sinusta sinut sinuun sinä sisäkkäin sisällä siten sitten sitä ssa sta
|
sinussa sinusta sinut sinuun sinä sisäkkäin sisällä siten sitten sitä ssa sta
|
||||||
|
@ -89,7 +89,7 @@ taa taas taemmas tahansa tai takaa takaisin takana takia tallä tapauksessa
|
||||||
tarpeeksi tavalla tavoitteena te teidän teidät teihin teille teillä teiltä
|
tarpeeksi tavalla tavoitteena te teidän teidät teihin teille teillä teiltä
|
||||||
teissä teistä teitä tietysti todella toinen toisaalla toisaalle toisaalta
|
teissä teistä teitä tietysti todella toinen toisaalla toisaalle toisaalta
|
||||||
toiseen toiseksi toisella toiselle toiselta toisemme toisen toisensa toisessa
|
toiseen toiseksi toisella toiselle toiselta toisemme toisen toisensa toisessa
|
||||||
toisesta toista toistaiseksi toki tosin tule tulee tulemme tulen
|
toisesta toista toistaiseksi toki tosin tuhannen tuhat tule tulee tulemme tulen
|
||||||
tulet tulette tulevat tulimme tulin tulisi tulisimme tulisin tulisit tulisitte
|
tulet tulette tulevat tulimme tulin tulisi tulisimme tulisin tulisit tulisitte
|
||||||
tulisivat tulit tulitte tulivat tulla tulleet tullut tuntuu tuo tuohon tuoksi
|
tulisivat tulit tulitte tulivat tulla tulleet tullut tuntuu tuo tuohon tuoksi
|
||||||
tuolla tuolle tuolloin tuolta tuon tuona tuonne tuossa tuosta tuota tuskin tykö
|
tuolla tuolle tuolloin tuolta tuon tuona tuonne tuossa tuosta tuota tuskin tykö
|
||||||
|
|
|
@ -35,32 +35,14 @@ class JapaneseTokenizer(object):
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, **exclude):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
class JapaneseCharacterSegmenter(object):
|
|
||||||
def __init__(self, vocab):
|
|
||||||
self.vocab = vocab
|
|
||||||
|
|
||||||
def __call__(self, text):
|
|
||||||
words = []
|
|
||||||
spaces = []
|
|
||||||
doc = self.tokenizer(text)
|
|
||||||
for token in self.tokenizer(text):
|
|
||||||
words.extend(list(token.text))
|
|
||||||
spaces.extend([False]*len(token.text))
|
|
||||||
spaces[-1] = bool(token.whitespace_)
|
|
||||||
return Doc(self.vocab, words=words, spaces=spaces)
|
|
||||||
|
|
||||||
|
|
||||||
class JapaneseDefaults(Language.Defaults):
|
class JapaneseDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: 'ja'
|
lex_attr_getters[LANG] = lambda text: 'ja'
|
||||||
use_janome = True
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create_tokenizer(cls, nlp=None):
|
def create_tokenizer(cls, nlp=None):
|
||||||
if cls.use_janome:
|
return JapaneseTokenizer(cls, nlp)
|
||||||
return JapaneseTokenizer(cls, nlp)
|
|
||||||
else:
|
|
||||||
return JapaneseCharacterSegmenter(cls, nlp.vocab)
|
|
||||||
|
|
||||||
|
|
||||||
class Japanese(Language):
|
class Japanese(Language):
|
||||||
|
|
|
@ -144,7 +144,7 @@ def is_lower(string): return string.islower()
|
||||||
def is_space(string): return string.isspace()
|
def is_space(string): return string.isspace()
|
||||||
def is_title(string): return string.istitle()
|
def is_title(string): return string.istitle()
|
||||||
def is_upper(string): return string.isupper()
|
def is_upper(string): return string.isupper()
|
||||||
def is_stop(string, stops=set()): return string.lower() in stops
|
def is_stop(string, stops=set()): return string in stops
|
||||||
def is_oov(string): return True
|
def is_oov(string): return True
|
||||||
def get_prob(string): return -20.
|
def get_prob(string): return -20.
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .tag_map import TAG_MAP
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
@ -18,7 +17,6 @@ class PolishDefaults(Language.Defaults):
|
||||||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
tag_map = TAG_MAP
|
|
||||||
|
|
||||||
|
|
||||||
class Polish(Language):
|
class Polish(Language):
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,7 +1,7 @@
|
||||||
# encoding: utf8
|
# encoding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN
|
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN, ADP
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
@ -12,11 +12,24 @@ for exc_data in [
|
||||||
{ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
|
{ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
|
||||||
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
|
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
|
||||||
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
|
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
|
||||||
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
|
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ},
|
||||||
|
{ORTH: "adw.", LEMMA: "adwokat", POS: NOUN},
|
||||||
|
{ORTH: "afr.", LEMMA: "afrykański", POS: ADJ},
|
||||||
|
{ORTH: "c.b.d.o.", LEMMA: "co było do okazania", POS: ADV},
|
||||||
|
{ORTH: "cbdu.", LEMMA: "co było do udowodnienia", POS: ADV},
|
||||||
|
{ORTH: "mn.w.", LEMMA: "mniej więcej", POS: ADV},
|
||||||
|
{ORTH: "nt.", LEMMA: "na temat", POS: ADP},
|
||||||
|
{ORTH: "ok.", LEMMA: "około"},
|
||||||
|
{ORTH: "n.p.u.", LEMMA: "na psa urok"},
|
||||||
|
{ORTH: "ww.", LEMMA: "wyżej wymieniony", POS: ADV}]:
|
||||||
_exc[exc_data[ORTH]] = [exc_data]
|
_exc[exc_data[ORTH]] = [exc_data]
|
||||||
|
|
||||||
for orth in [
|
for orth in [
|
||||||
"w.", "r."]:
|
"w.", "r.", "br.", "bm.", "b.r.", "amer.", "am.", "bdb.", "św.", "p.", "lit.",
|
||||||
|
"wym.", "czyt.", "daw.", "d.", "zob.", "gw.", "dn.", "dyr.", "im.", "mł.",
|
||||||
|
"min.", "dot.", "muz.", "k.k.", "k.p.a.", "k.p.c.", "n.p.m.", "p.p.m.", "nb.",
|
||||||
|
"ob.", "n.e.", "p.n.e.", "zw.", "zool.", "zach.", "żarg.", "żart.", "wzgl.",
|
||||||
|
"wyj.", "xx.", "ks.", "x.", "wyd.", "wsch.", "o.o."]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -24,5 +24,5 @@ TAG_MAP = {
|
||||||
"ADJ": {POS: ADJ},
|
"ADJ": {POS: ADJ},
|
||||||
"VERB": {POS: VERB},
|
"VERB": {POS: VERB},
|
||||||
"PART": {POS: PART},
|
"PART": {POS: PART},
|
||||||
"_SP": {POS: SPACE}
|
"SP": {POS: SPACE}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,19 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from ...attrs import LANG
|
|
||||||
from ...language import Language
|
|
||||||
from ...tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
class VietnameseDefaults(Language.Defaults):
|
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
|
||||||
lex_attr_getters[LANG] = lambda text: 'vi' # for pickling
|
|
||||||
|
|
||||||
|
|
||||||
class Vietnamese(Language):
|
|
||||||
lang = 'vi'
|
|
||||||
Defaults = VietnameseDefaults # override defaults
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Vietnamese']
|
|
|
@ -9,7 +9,6 @@ from ...tokens import Doc
|
||||||
class ChineseDefaults(Language.Defaults):
|
class ChineseDefaults(Language.Defaults):
|
||||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||||
lex_attr_getters[LANG] = lambda text: 'zh' # for pickling
|
lex_attr_getters[LANG] = lambda text: 'zh' # for pickling
|
||||||
use_jieba = True
|
|
||||||
|
|
||||||
|
|
||||||
class Chinese(Language):
|
class Chinese(Language):
|
||||||
|
@ -17,25 +16,14 @@ class Chinese(Language):
|
||||||
Defaults = ChineseDefaults # override defaults
|
Defaults = ChineseDefaults # override defaults
|
||||||
|
|
||||||
def make_doc(self, text):
|
def make_doc(self, text):
|
||||||
if self.Defaults.use_jieba:
|
try:
|
||||||
try:
|
import jieba
|
||||||
import jieba
|
except ImportError:
|
||||||
except ImportError:
|
raise ImportError("The Chinese tokenizer requires the Jieba library: "
|
||||||
msg = ("Jieba not installed. Either set Chinese.use_jieba = False, "
|
"https://github.com/fxsjy/jieba")
|
||||||
"or install it https://github.com/fxsjy/jieba")
|
words = list(jieba.cut(text, cut_all=False))
|
||||||
raise ImportError(msg)
|
words = [x for x in words if x]
|
||||||
words = list(jieba.cut(text, cut_all=False))
|
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||||
words = [x for x in words if x]
|
|
||||||
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
|
||||||
else:
|
|
||||||
words = []
|
|
||||||
spaces = []
|
|
||||||
doc = self.tokenizer(text)
|
|
||||||
for token in self.tokenizer(text):
|
|
||||||
words.extend(list(token.text))
|
|
||||||
spaces.extend([False]*len(token.text))
|
|
||||||
spaces[-1] = bool(token.whitespace_)
|
|
||||||
return Doc(self.vocab, words=words, spaces=spaces)
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Chinese']
|
__all__ = ['Chinese']
|
||||||
|
|
|
@ -17,7 +17,7 @@ from .vocab import Vocab
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
|
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
|
||||||
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
|
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
|
||||||
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
|
from .pipeline import merge_noun_chunks, merge_entities
|
||||||
from .compat import json_dumps, izip, basestring_
|
from .compat import json_dumps, izip, basestring_
|
||||||
from .gold import GoldParse
|
from .gold import GoldParse
|
||||||
from .scorer import Scorer
|
from .scorer import Scorer
|
||||||
|
@ -108,8 +108,7 @@ class Language(object):
|
||||||
'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
|
'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
|
||||||
'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
|
'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
|
||||||
'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks,
|
'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks,
|
||||||
'merge_entities': lambda nlp, **cfg: merge_entities,
|
'merge_entities': lambda nlp, **cfg: merge_entities
|
||||||
'merge_subtokens': lambda nlp, **cfg: merge_subtokens,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
|
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
|
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
|
||||||
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
||||||
|
|
||||||
|
|
||||||
|
@ -27,13 +27,11 @@ class Lemmatizer(object):
|
||||||
univ_pos = 'adj'
|
univ_pos = 'adj'
|
||||||
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
|
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
|
||||||
univ_pos = 'punct'
|
univ_pos = 'punct'
|
||||||
elif univ_pos in (PROPN, 'PROPN'):
|
|
||||||
return [string]
|
|
||||||
else:
|
else:
|
||||||
return [string.lower()]
|
return list(set([string.lower()]))
|
||||||
# See Issue #435 for example of where this logic is required.
|
# See Issue #435 for example of where this logic is required.
|
||||||
if self.is_base_form(univ_pos, morphology):
|
if self.is_base_form(univ_pos, morphology):
|
||||||
return [string.lower()]
|
return list(set([string.lower()]))
|
||||||
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
|
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
|
||||||
self.exc.get(univ_pos, {}),
|
self.exc.get(univ_pos, {}),
|
||||||
self.rules.get(univ_pos, []))
|
self.rules.get(univ_pos, []))
|
||||||
|
@ -90,7 +88,6 @@ class Lemmatizer(object):
|
||||||
|
|
||||||
|
|
||||||
def lemmatize(string, index, exceptions, rules):
|
def lemmatize(string, index, exceptions, rules):
|
||||||
orig = string
|
|
||||||
string = string.lower()
|
string = string.lower()
|
||||||
forms = []
|
forms = []
|
||||||
forms.extend(exceptions.get(string, []))
|
forms.extend(exceptions.get(string, []))
|
||||||
|
@ -108,5 +105,5 @@ def lemmatize(string, index, exceptions, rules):
|
||||||
if not forms:
|
if not forms:
|
||||||
forms.extend(oov_forms)
|
forms.extend(oov_forms)
|
||||||
if not forms:
|
if not forms:
|
||||||
forms.append(orig)
|
forms.append(string)
|
||||||
return list(set(forms))
|
return list(set(forms))
|
||||||
|
|
|
@ -1,19 +1,24 @@
|
||||||
# cython: infer_types=True
|
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
|
# cython: infer_types=True
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from libcpp.vector cimport vector
|
|
||||||
from libc.stdint cimport int32_t, uint64_t, uint16_t
|
import ujson
|
||||||
from preshed.maps cimport PreshMap
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
from preshed.maps cimport PreshMap
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
from libcpp.pair cimport pair
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
from .typedefs cimport attr_t, hash_t
|
from libc.stdint cimport int32_t
|
||||||
|
|
||||||
|
from .typedefs cimport attr_t
|
||||||
|
from .typedefs cimport hash_t
|
||||||
from .structs cimport TokenC
|
from .structs cimport TokenC
|
||||||
from .lexeme cimport attr_id_t
|
from .tokens.doc cimport Doc, get_token_attr
|
||||||
from .vocab cimport Vocab
|
from .vocab cimport Vocab
|
||||||
from .tokens.doc cimport Doc
|
|
||||||
from .tokens.doc cimport get_token_attr
|
|
||||||
from .attrs cimport ID, attr_id_t, NULL_ATTR
|
|
||||||
from .attrs import IDS
|
from .attrs import IDS
|
||||||
|
from .attrs cimport attr_id_t, ID, NULL_ATTR
|
||||||
from .attrs import FLAG61 as U_ENT
|
from .attrs import FLAG61 as U_ENT
|
||||||
from .attrs import FLAG60 as B2_ENT
|
from .attrs import FLAG60 as B2_ENT
|
||||||
from .attrs import FLAG59 as B3_ENT
|
from .attrs import FLAG59 as B3_ENT
|
||||||
|
@ -43,24 +48,29 @@ from .attrs import FLAG36 as L9_ENT
|
||||||
from .attrs import FLAG35 as L10_ENT
|
from .attrs import FLAG35 as L10_ENT
|
||||||
|
|
||||||
|
|
||||||
cdef enum action_t:
|
cpdef enum quantifier_t:
|
||||||
REJECT = 0000
|
_META
|
||||||
MATCH = 1000
|
ONE
|
||||||
ADVANCE = 0100
|
|
||||||
RETRY = 0010
|
|
||||||
RETRY_EXTEND = 0011
|
|
||||||
MATCH_EXTEND = 1001
|
|
||||||
MATCH_REJECT = 2000
|
|
||||||
|
|
||||||
|
|
||||||
cdef enum quantifier_t:
|
|
||||||
ZERO
|
ZERO
|
||||||
ZERO_ONE
|
ZERO_ONE
|
||||||
ZERO_PLUS
|
ZERO_PLUS
|
||||||
ONE
|
|
||||||
ONE_PLUS
|
|
||||||
|
|
||||||
|
|
||||||
|
cdef enum action_t:
|
||||||
|
REJECT
|
||||||
|
ADVANCE
|
||||||
|
REPEAT
|
||||||
|
ACCEPT
|
||||||
|
ADVANCE_ZERO
|
||||||
|
ACCEPT_PREV
|
||||||
|
PANIC
|
||||||
|
|
||||||
|
# A "match expression" consists of one or more token patterns
|
||||||
|
# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
|
||||||
|
# A state is an (int, pattern pointer) pair, where the int is the start
|
||||||
|
# position, and the pattern pointer shows where we're up to
|
||||||
|
# in the pattern.
|
||||||
|
|
||||||
cdef struct AttrValueC:
|
cdef struct AttrValueC:
|
||||||
attr_id_t attr
|
attr_id_t attr
|
||||||
attr_t value
|
attr_t value
|
||||||
|
@ -70,231 +80,10 @@ cdef struct TokenPatternC:
|
||||||
AttrValueC* attrs
|
AttrValueC* attrs
|
||||||
int32_t nr_attr
|
int32_t nr_attr
|
||||||
quantifier_t quantifier
|
quantifier_t quantifier
|
||||||
hash_t key
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct ActionC:
|
ctypedef TokenPatternC* TokenPatternC_ptr
|
||||||
char emit_match
|
ctypedef pair[int, TokenPatternC_ptr] StateC
|
||||||
char next_state_next_token
|
|
||||||
char next_state_same_token
|
|
||||||
char same_state_next_token
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct PatternStateC:
|
|
||||||
TokenPatternC* pattern
|
|
||||||
int32_t start
|
|
||||||
int32_t length
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct MatchC:
|
|
||||||
attr_t pattern_id
|
|
||||||
int32_t start
|
|
||||||
int32_t length
|
|
||||||
|
|
||||||
|
|
||||||
cdef find_matches(TokenPatternC** patterns, int n, Doc doc):
|
|
||||||
cdef vector[PatternStateC] states
|
|
||||||
cdef vector[MatchC] matches
|
|
||||||
cdef PatternStateC state
|
|
||||||
cdef Pool mem = Pool()
|
|
||||||
# TODO: Prefill this with the extra attribute values.
|
|
||||||
extra_attrs = <attr_t**>mem.alloc(len(doc), sizeof(attr_t*))
|
|
||||||
# Main loop
|
|
||||||
cdef int i, j
|
|
||||||
for i in range(doc.length):
|
|
||||||
for j in range(n):
|
|
||||||
states.push_back(PatternStateC(patterns[j], i, 0))
|
|
||||||
transition_states(states, matches, &doc.c[i], extra_attrs[i])
|
|
||||||
# Handle matches that end in 0-width patterns
|
|
||||||
finish_states(matches, states)
|
|
||||||
return [(matches[i].pattern_id, matches[i].start, matches[i].start+matches[i].length)
|
|
||||||
for i in range(matches.size())]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
|
|
||||||
const TokenC* token, const attr_t* extra_attrs) except *:
|
|
||||||
cdef int q = 0
|
|
||||||
cdef vector[PatternStateC] new_states
|
|
||||||
for i in range(states.size()):
|
|
||||||
action = get_action(states[i], token, extra_attrs)
|
|
||||||
if action == REJECT:
|
|
||||||
continue
|
|
||||||
state = states[i]
|
|
||||||
states[q] = state
|
|
||||||
while action in (RETRY, RETRY_EXTEND):
|
|
||||||
if action == RETRY_EXTEND:
|
|
||||||
new_states.push_back(
|
|
||||||
PatternStateC(pattern=state.pattern, start=state.start,
|
|
||||||
length=state.length+1))
|
|
||||||
states[q].pattern += 1
|
|
||||||
action = get_action(states[q], token, extra_attrs)
|
|
||||||
if action == REJECT:
|
|
||||||
pass
|
|
||||||
elif action == ADVANCE:
|
|
||||||
states[q].pattern += 1
|
|
||||||
states[q].length += 1
|
|
||||||
q += 1
|
|
||||||
else:
|
|
||||||
ent_id = state.pattern[1].attrs.value
|
|
||||||
if action == MATCH:
|
|
||||||
matches.push_back(
|
|
||||||
MatchC(pattern_id=ent_id, start=state.start,
|
|
||||||
length=state.length+1))
|
|
||||||
elif action == MATCH_REJECT:
|
|
||||||
matches.push_back(
|
|
||||||
MatchC(pattern_id=ent_id, start=state.start,
|
|
||||||
length=state.length))
|
|
||||||
elif action == MATCH_EXTEND:
|
|
||||||
matches.push_back(
|
|
||||||
MatchC(pattern_id=ent_id, start=state.start,
|
|
||||||
length=state.length))
|
|
||||||
states[q].length += 1
|
|
||||||
q += 1
|
|
||||||
states.resize(q)
|
|
||||||
for i in range(new_states.size()):
|
|
||||||
states.push_back(new_states[i])
|
|
||||||
|
|
||||||
|
|
||||||
cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *:
|
|
||||||
'''Handle states that end in zero-width patterns.'''
|
|
||||||
cdef PatternStateC state
|
|
||||||
for i in range(states.size()):
|
|
||||||
state = states[i]
|
|
||||||
while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE):
|
|
||||||
is_final = get_is_final(state)
|
|
||||||
if is_final:
|
|
||||||
ent_id = state.pattern[1].attrs.value
|
|
||||||
matches.push_back(
|
|
||||||
MatchC(pattern_id=ent_id, start=state.start, length=state.length))
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
state.pattern += 1
|
|
||||||
|
|
||||||
|
|
||||||
cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
|
|
||||||
'''We need to consider:
|
|
||||||
|
|
||||||
a) Does the token match the specification? [Yes, No]
|
|
||||||
b) What's the quantifier? [1, 0+, ?]
|
|
||||||
c) Is this the last specification? [final, non-final]
|
|
||||||
|
|
||||||
We can transition in the following ways:
|
|
||||||
|
|
||||||
a) Do we emit a match?
|
|
||||||
b) Do we add a state with (next state, next token)?
|
|
||||||
c) Do we add a state with (next state, same token)?
|
|
||||||
d) Do we add a state with (same state, next token)?
|
|
||||||
|
|
||||||
We'll code the actions as boolean strings, so 0000 means no to all 4,
|
|
||||||
1000 means match but no states added, etc.
|
|
||||||
|
|
||||||
1:
|
|
||||||
Yes, final:
|
|
||||||
1000
|
|
||||||
Yes, non-final:
|
|
||||||
0100
|
|
||||||
No, final:
|
|
||||||
0000
|
|
||||||
No, non-final
|
|
||||||
0000
|
|
||||||
0+:
|
|
||||||
Yes, final:
|
|
||||||
1001
|
|
||||||
Yes, non-final:
|
|
||||||
0011
|
|
||||||
No, final:
|
|
||||||
1000 (note: Don't include last token!)
|
|
||||||
No, non-final:
|
|
||||||
0010
|
|
||||||
?:
|
|
||||||
Yes, final:
|
|
||||||
1000
|
|
||||||
Yes, non-final:
|
|
||||||
0100
|
|
||||||
No, final:
|
|
||||||
1000 (note: Don't include last token!)
|
|
||||||
No, non-final:
|
|
||||||
0010
|
|
||||||
|
|
||||||
Possible combinations: 1000, 0100, 0000, 1001, 0011, 0010,
|
|
||||||
|
|
||||||
We'll name the bits "match", "advance", "retry", "extend"
|
|
||||||
REJECT = 0000
|
|
||||||
MATCH = 1000
|
|
||||||
ADVANCE = 0100
|
|
||||||
RETRY = 0010
|
|
||||||
MATCH_EXTEND = 1001
|
|
||||||
RETRY_EXTEND = 0011
|
|
||||||
MATCH_REJECT = 2000 # Match, but don't include last token
|
|
||||||
|
|
||||||
Problem: If a quantifier is matching, we're adding a lot of open partials
|
|
||||||
'''
|
|
||||||
cdef char is_match
|
|
||||||
is_match = get_is_match(state, token, extra_attrs)
|
|
||||||
quantifier = get_quantifier(state)
|
|
||||||
is_final = get_is_final(state)
|
|
||||||
if quantifier == ZERO:
|
|
||||||
is_match = not is_match
|
|
||||||
quantifier = ONE
|
|
||||||
if quantifier == ONE:
|
|
||||||
if is_match and is_final:
|
|
||||||
# Yes, final: 1000
|
|
||||||
return MATCH
|
|
||||||
elif is_match and not is_final:
|
|
||||||
# Yes, non-final: 0100
|
|
||||||
return ADVANCE
|
|
||||||
elif not is_match and is_final:
|
|
||||||
# No, final: 0000
|
|
||||||
return REJECT
|
|
||||||
else:
|
|
||||||
return REJECT
|
|
||||||
elif quantifier == ZERO_PLUS:
|
|
||||||
if is_match and is_final:
|
|
||||||
# Yes, final: 1001
|
|
||||||
return MATCH_EXTEND
|
|
||||||
elif is_match and not is_final:
|
|
||||||
# Yes, non-final: 0011
|
|
||||||
return RETRY_EXTEND
|
|
||||||
elif not is_match and is_final:
|
|
||||||
# No, final 2000 (note: Don't include last token!)
|
|
||||||
return MATCH_REJECT
|
|
||||||
else:
|
|
||||||
# No, non-final 0010
|
|
||||||
return RETRY
|
|
||||||
elif quantifier == ZERO_ONE:
|
|
||||||
if is_match and is_final:
|
|
||||||
# Yes, final: 1000
|
|
||||||
return MATCH
|
|
||||||
elif is_match and not is_final:
|
|
||||||
# Yes, non-final: 0100
|
|
||||||
return ADVANCE
|
|
||||||
elif not is_match and is_final:
|
|
||||||
# No, final 2000 (note: Don't include last token!)
|
|
||||||
return MATCH_REJECT
|
|
||||||
else:
|
|
||||||
# No, non-final 0010
|
|
||||||
return RETRY
|
|
||||||
|
|
||||||
|
|
||||||
cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
|
|
||||||
spec = state.pattern
|
|
||||||
for attr in spec.attrs[:spec.nr_attr]:
|
|
||||||
if get_token_attr(token, attr.attr) != attr.value:
|
|
||||||
return 0
|
|
||||||
else:
|
|
||||||
return 1
|
|
||||||
|
|
||||||
|
|
||||||
cdef char get_is_final(PatternStateC state) nogil:
|
|
||||||
if state.pattern[1].attrs[0].attr == ID and state.pattern[1].nr_attr == 0:
|
|
||||||
return 1
|
|
||||||
else:
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
cdef char get_quantifier(PatternStateC state) nogil:
|
|
||||||
return state.pattern.quantifier
|
|
||||||
|
|
||||||
|
|
||||||
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
|
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
|
||||||
|
@ -308,7 +97,6 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
|
||||||
for j, (attr, value) in enumerate(spec):
|
for j, (attr, value) in enumerate(spec):
|
||||||
pattern[i].attrs[j].attr = attr
|
pattern[i].attrs[j].attr = attr
|
||||||
pattern[i].attrs[j].value = value
|
pattern[i].attrs[j].value = value
|
||||||
pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
|
|
||||||
i = len(token_specs)
|
i = len(token_specs)
|
||||||
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
|
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
|
||||||
pattern[i].attrs[0].attr = ID
|
pattern[i].attrs[0].attr = ID
|
||||||
|
@ -317,16 +105,48 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
|
||||||
return pattern
|
return pattern
|
||||||
|
|
||||||
|
|
||||||
cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil:
|
cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
|
||||||
while pattern.nr_attr != 0:
|
while pattern.nr_attr != 0:
|
||||||
pattern += 1
|
pattern += 1
|
||||||
id_attr = pattern[0].attrs[0]
|
id_attr = pattern[0].attrs[0]
|
||||||
|
assert id_attr.attr == ID
|
||||||
return id_attr.value
|
return id_attr.value
|
||||||
|
|
||||||
|
|
||||||
|
cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
|
||||||
|
lookahead = &pattern[1]
|
||||||
|
for attr in pattern.attrs[:pattern.nr_attr]:
|
||||||
|
if get_token_attr(token, attr.attr) != attr.value:
|
||||||
|
if pattern.quantifier == ONE:
|
||||||
|
return REJECT
|
||||||
|
elif pattern.quantifier == ZERO:
|
||||||
|
return ACCEPT if lookahead.nr_attr == 0 else ADVANCE
|
||||||
|
elif pattern.quantifier in (ZERO_ONE, ZERO_PLUS):
|
||||||
|
return ACCEPT_PREV if lookahead.nr_attr == 0 else ADVANCE_ZERO
|
||||||
|
else:
|
||||||
|
return PANIC
|
||||||
|
if pattern.quantifier == ZERO:
|
||||||
|
return REJECT
|
||||||
|
elif lookahead.nr_attr == 0:
|
||||||
|
return ACCEPT
|
||||||
|
elif pattern.quantifier in (ONE, ZERO_ONE):
|
||||||
|
return ADVANCE
|
||||||
|
elif pattern.quantifier == ZERO_PLUS:
|
||||||
|
# This is a bandaid over the 'shadowing' problem described here:
|
||||||
|
# https://github.com/explosion/spaCy/issues/864
|
||||||
|
next_action = get_action(lookahead, token)
|
||||||
|
if next_action is REJECT:
|
||||||
|
return REPEAT
|
||||||
|
else:
|
||||||
|
return ADVANCE_ZERO
|
||||||
|
else:
|
||||||
|
return PANIC
|
||||||
|
|
||||||
|
|
||||||
def _convert_strings(token_specs, string_store):
|
def _convert_strings(token_specs, string_store):
|
||||||
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
|
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
|
||||||
operators = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
|
operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
|
||||||
'?': (ZERO_ONE,), '1': (ONE,), '!': (ZERO,)}
|
'?': (ZERO_ONE,), '1': (ONE,)}
|
||||||
tokens = []
|
tokens = []
|
||||||
op = ONE
|
op = ONE
|
||||||
for spec in token_specs:
|
for spec in token_specs:
|
||||||
|
@ -356,6 +176,21 @@ def _convert_strings(token_specs, string_store):
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
|
||||||
|
def merge_phrase(matcher, doc, i, matches):
|
||||||
|
"""Callback to merge a phrase on match."""
|
||||||
|
ent_id, label, start, end = matches[i]
|
||||||
|
span = doc[start:end]
|
||||||
|
span.merge(ent_type=label, ent_id=ent_id)
|
||||||
|
|
||||||
|
|
||||||
|
def unpickle_matcher(vocab, patterns, callbacks):
|
||||||
|
matcher = Matcher(vocab)
|
||||||
|
for key, specs in patterns.items():
|
||||||
|
callback = callbacks.get(key, None)
|
||||||
|
matcher.add(key, callback, *specs)
|
||||||
|
return matcher
|
||||||
|
|
||||||
|
|
||||||
cdef class Matcher:
|
cdef class Matcher:
|
||||||
"""Match sequences of tokens, based on pattern rules."""
|
"""Match sequences of tokens, based on pattern rules."""
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
|
@ -476,7 +311,7 @@ cdef class Matcher:
|
||||||
if key not in self._patterns:
|
if key not in self._patterns:
|
||||||
return default
|
return default
|
||||||
return (self._callbacks[key], self._patterns[key])
|
return (self._callbacks[key], self._patterns[key])
|
||||||
|
|
||||||
def pipe(self, docs, batch_size=1000, n_threads=2):
|
def pipe(self, docs, batch_size=1000, n_threads=2):
|
||||||
"""Match a stream of documents, yielding them in turn.
|
"""Match a stream of documents, yielding them in turn.
|
||||||
|
|
||||||
|
@ -498,9 +333,85 @@ cdef class Matcher:
|
||||||
describing the matches. A match tuple describes a span
|
describing the matches. A match tuple describes a span
|
||||||
`doc[start:end]`. The `label_id` and `key` are both integers.
|
`doc[start:end]`. The `label_id` and `key` are both integers.
|
||||||
"""
|
"""
|
||||||
matches = find_matches(&self.patterns[0], self.patterns.size(), doc)
|
cdef vector[StateC] partials
|
||||||
for i, (key, start, end) in enumerate(matches):
|
cdef int n_partials = 0
|
||||||
on_match = self._callbacks.get(key, None)
|
cdef int q = 0
|
||||||
|
cdef int i, token_i
|
||||||
|
cdef const TokenC* token
|
||||||
|
cdef StateC state
|
||||||
|
matches = []
|
||||||
|
for token_i in range(doc.length):
|
||||||
|
token = &doc.c[token_i]
|
||||||
|
q = 0
|
||||||
|
# Go over the open matches, extending or finalizing if able.
|
||||||
|
# Otherwise, we over-write them (q doesn't advance)
|
||||||
|
for state in partials:
|
||||||
|
action = get_action(state.second, token)
|
||||||
|
if action == PANIC:
|
||||||
|
raise Exception("Error selecting action in matcher")
|
||||||
|
while action == ADVANCE_ZERO:
|
||||||
|
state.second += 1
|
||||||
|
action = get_action(state.second, token)
|
||||||
|
if action == PANIC:
|
||||||
|
raise Exception("Error selecting action in matcher")
|
||||||
|
|
||||||
|
if action == REPEAT:
|
||||||
|
# Leave the state in the queue, and advance to next slot
|
||||||
|
# (i.e. we don't overwrite -- we want to greedily match
|
||||||
|
# more of the pattern).
|
||||||
|
q += 1
|
||||||
|
elif action == REJECT:
|
||||||
|
pass
|
||||||
|
elif action == ADVANCE:
|
||||||
|
partials[q] = state
|
||||||
|
partials[q].second += 1
|
||||||
|
q += 1
|
||||||
|
elif action in (ACCEPT, ACCEPT_PREV):
|
||||||
|
# TODO: What to do about patterns starting with ZERO? Need
|
||||||
|
# to adjust the start position.
|
||||||
|
start = state.first
|
||||||
|
end = token_i+1 if action == ACCEPT else token_i
|
||||||
|
ent_id = state.second[1].attrs[0].value
|
||||||
|
label = state.second[1].attrs[1].value
|
||||||
|
matches.append((ent_id, start, end))
|
||||||
|
|
||||||
|
partials.resize(q)
|
||||||
|
# Check whether we open any new patterns on this token
|
||||||
|
for pattern in self.patterns:
|
||||||
|
action = get_action(pattern, token)
|
||||||
|
if action == PANIC:
|
||||||
|
raise Exception("Error selecting action in matcher")
|
||||||
|
while action == ADVANCE_ZERO:
|
||||||
|
pattern += 1
|
||||||
|
action = get_action(pattern, token)
|
||||||
|
if action == REPEAT:
|
||||||
|
state.first = token_i
|
||||||
|
state.second = pattern
|
||||||
|
partials.push_back(state)
|
||||||
|
elif action == ADVANCE:
|
||||||
|
# TODO: What to do about patterns starting with ZERO? Need
|
||||||
|
# to adjust the start position.
|
||||||
|
state.first = token_i
|
||||||
|
state.second = pattern + 1
|
||||||
|
partials.push_back(state)
|
||||||
|
elif action in (ACCEPT, ACCEPT_PREV):
|
||||||
|
start = token_i
|
||||||
|
end = token_i+1 if action == ACCEPT else token_i
|
||||||
|
ent_id = pattern[1].attrs[0].value
|
||||||
|
label = pattern[1].attrs[1].value
|
||||||
|
matches.append((ent_id, start, end))
|
||||||
|
# Look for open patterns that are actually satisfied
|
||||||
|
for state in partials:
|
||||||
|
while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
|
||||||
|
state.second += 1
|
||||||
|
if state.second.nr_attr == 0:
|
||||||
|
start = state.first
|
||||||
|
end = len(doc)
|
||||||
|
ent_id = state.second.attrs[0].value
|
||||||
|
label = state.second.attrs[0].value
|
||||||
|
matches.append((ent_id, start, end))
|
||||||
|
for i, (ent_id, start, end) in enumerate(matches):
|
||||||
|
on_match = self._callbacks.get(ent_id)
|
||||||
if on_match is not None:
|
if on_match is not None:
|
||||||
on_match(self, doc, i, matches)
|
on_match(self, doc, i, matches)
|
||||||
return matches
|
return matches
|
||||||
|
@ -512,37 +423,31 @@ cdef class Matcher:
|
||||||
return key
|
return key
|
||||||
|
|
||||||
|
|
||||||
def unpickle_matcher(vocab, patterns, callbacks):
|
|
||||||
matcher = Matcher(vocab)
|
|
||||||
for key, specs in patterns.items():
|
|
||||||
callback = callbacks.get(key, None)
|
|
||||||
matcher.add(key, callback, *specs)
|
|
||||||
return matcher
|
|
||||||
|
|
||||||
|
|
||||||
def _get_longest_matches(matches):
|
|
||||||
'''Filter out matches that have a longer equivalent.'''
|
|
||||||
longest_matches = {}
|
|
||||||
for pattern_id, start, end in matches:
|
|
||||||
key = (pattern_id, start)
|
|
||||||
length = end-start
|
|
||||||
if key not in longest_matches or length > longest_matches[key]:
|
|
||||||
longest_matches[key] = length
|
|
||||||
return [(pattern_id, start, start+length)
|
|
||||||
for (pattern_id, start), length in longest_matches.items()]
|
|
||||||
|
|
||||||
|
|
||||||
def get_bilou(length):
|
def get_bilou(length):
|
||||||
if length == 0:
|
if length == 1:
|
||||||
raise ValueError("Length must be >= 1")
|
|
||||||
elif length == 1:
|
|
||||||
return [U_ENT]
|
return [U_ENT]
|
||||||
elif length == 2:
|
elif length == 2:
|
||||||
return [B2_ENT, L2_ENT]
|
return [B2_ENT, L2_ENT]
|
||||||
elif length == 3:
|
elif length == 3:
|
||||||
return [B3_ENT, I3_ENT, L3_ENT]
|
return [B3_ENT, I3_ENT, L3_ENT]
|
||||||
|
elif length == 4:
|
||||||
|
return [B4_ENT, I4_ENT, I4_ENT, L4_ENT]
|
||||||
|
elif length == 5:
|
||||||
|
return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT]
|
||||||
|
elif length == 6:
|
||||||
|
return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT]
|
||||||
|
elif length == 7:
|
||||||
|
return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT]
|
||||||
|
elif length == 8:
|
||||||
|
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
|
||||||
|
elif length == 9:
|
||||||
|
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
|
||||||
|
L9_ENT]
|
||||||
|
elif length == 10:
|
||||||
|
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
|
||||||
|
I10_ENT, I10_ENT, L10_ENT]
|
||||||
else:
|
else:
|
||||||
return [B4_ENT, I4_ENT] + [I4_ENT] * (length-3) + [L4_ENT]
|
raise ValueError("Max length currently 10 for phrase matching")
|
||||||
|
|
||||||
|
|
||||||
cdef class PhraseMatcher:
|
cdef class PhraseMatcher:
|
||||||
|
@ -551,21 +456,21 @@ cdef class PhraseMatcher:
|
||||||
cdef Matcher matcher
|
cdef Matcher matcher
|
||||||
cdef PreshMap phrase_ids
|
cdef PreshMap phrase_ids
|
||||||
cdef int max_length
|
cdef int max_length
|
||||||
|
cdef attr_t* _phrase_key
|
||||||
cdef public object _callbacks
|
cdef public object _callbacks
|
||||||
cdef public object _patterns
|
cdef public object _patterns
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, max_length=10):
|
def __init__(self, Vocab vocab, max_length=10):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
|
self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
|
||||||
self.max_length = max_length
|
self.max_length = max_length
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.matcher = Matcher(self.vocab)
|
self.matcher = Matcher(self.vocab)
|
||||||
self.phrase_ids = PreshMap()
|
self.phrase_ids = PreshMap()
|
||||||
abstract_patterns = [
|
abstract_patterns = []
|
||||||
[{U_ENT: True}],
|
for length in range(1, max_length):
|
||||||
[{B2_ENT: True}, {L2_ENT: True}],
|
abstract_patterns.append([{tag: True}
|
||||||
[{B3_ENT: True}, {I3_ENT: True}, {L3_ENT: True}],
|
for tag in get_bilou(length)])
|
||||||
[{B4_ENT: True}, {I4_ENT: True}, {I4_ENT: True, "OP": "+"}, {L4_ENT: True}],
|
|
||||||
]
|
|
||||||
self.matcher.add('Candidate', None, *abstract_patterns)
|
self.matcher.add('Candidate', None, *abstract_patterns)
|
||||||
self._callbacks = {}
|
self._callbacks = {}
|
||||||
|
|
||||||
|
@ -599,24 +504,29 @@ cdef class PhraseMatcher:
|
||||||
*docs (Doc): `Doc` objects representing match patterns.
|
*docs (Doc): `Doc` objects representing match patterns.
|
||||||
"""
|
"""
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
|
for doc in docs:
|
||||||
|
if len(doc) >= self.max_length:
|
||||||
|
msg = (
|
||||||
|
"Pattern length (%d) >= phrase_matcher.max_length (%d). "
|
||||||
|
"Length can be set on initialization, up to 10."
|
||||||
|
)
|
||||||
|
raise ValueError(msg % (len(doc), self.max_length))
|
||||||
cdef hash_t ent_id = self.matcher._normalize_key(key)
|
cdef hash_t ent_id = self.matcher._normalize_key(key)
|
||||||
self._callbacks[ent_id] = on_match
|
self._callbacks[ent_id] = on_match
|
||||||
cdef int length
|
cdef int length
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef hash_t phrase_hash
|
cdef hash_t phrase_hash
|
||||||
cdef Pool mem = Pool()
|
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
length = doc.length
|
length = doc.length
|
||||||
if length == 0:
|
|
||||||
continue
|
|
||||||
tags = get_bilou(length)
|
tags = get_bilou(length)
|
||||||
phrase_key = <attr_t*>mem.alloc(length, sizeof(attr_t))
|
for i in range(self.max_length):
|
||||||
|
self._phrase_key[i] = 0
|
||||||
for i, tag in enumerate(tags):
|
for i, tag in enumerate(tags):
|
||||||
lexeme = self.vocab[doc.c[i].lex.orth]
|
lexeme = self.vocab[doc.c[i].lex.orth]
|
||||||
lexeme.set_flag(tag, True)
|
lexeme.set_flag(tag, True)
|
||||||
phrase_key[i] = lexeme.orth
|
self._phrase_key[i] = lexeme.orth
|
||||||
phrase_hash = hash64(phrase_key,
|
phrase_hash = hash64(self._phrase_key,
|
||||||
length * sizeof(attr_t), 0)
|
self.max_length * sizeof(attr_t), 0)
|
||||||
self.phrase_ids.set(phrase_hash, <void*>ent_id)
|
self.phrase_ids.set(phrase_hash, <void*>ent_id)
|
||||||
|
|
||||||
def __call__(self, Doc doc):
|
def __call__(self, Doc doc):
|
||||||
|
@ -638,45 +548,28 @@ cdef class PhraseMatcher:
|
||||||
on_match(self, doc, i, matches)
|
on_match(self, doc, i, matches)
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=1000, n_threads=2, return_matches=False,
|
def pipe(self, stream, batch_size=1000, n_threads=2):
|
||||||
as_tuples=False):
|
|
||||||
"""Match a stream of documents, yielding them in turn.
|
"""Match a stream of documents, yielding them in turn.
|
||||||
|
|
||||||
docs (iterable): A stream of documents.
|
docs (iterable): A stream of documents.
|
||||||
batch_size (int): Number of documents to accumulate into a working set.
|
batch_size (int): Number of documents to accumulate into a working set.
|
||||||
n_threads (int): The number of threads with which to work on the buffer
|
n_threads (int): The number of threads with which to work on the buffer
|
||||||
in parallel, if the implementation supports multi-threading.
|
in parallel, if the implementation supports multi-threading.
|
||||||
return_matches (bool): Yield the match lists along with the docs, making
|
|
||||||
results (doc, matches) tuples.
|
|
||||||
as_tuples (bool): Interpret the input stream as (doc, context) tuples,
|
|
||||||
and yield (result, context) tuples out.
|
|
||||||
If both return_matches and as_tuples are True, the output will
|
|
||||||
be a sequence of ((doc, matches), context) tuples.
|
|
||||||
YIELDS (Doc): Documents, in order.
|
YIELDS (Doc): Documents, in order.
|
||||||
"""
|
"""
|
||||||
if as_tuples:
|
for doc in stream:
|
||||||
for doc, context in stream:
|
self(doc)
|
||||||
matches = self(doc)
|
yield doc
|
||||||
if return_matches:
|
|
||||||
yield ((doc, matches), context)
|
|
||||||
else:
|
|
||||||
yield (doc, context)
|
|
||||||
else:
|
|
||||||
for doc in stream:
|
|
||||||
matches = self(doc)
|
|
||||||
if return_matches:
|
|
||||||
yield (doc, matches)
|
|
||||||
else:
|
|
||||||
yield doc
|
|
||||||
|
|
||||||
def accept_match(self, Doc doc, int start, int end):
|
def accept_match(self, Doc doc, int start, int end):
|
||||||
|
assert (end - start) < self.max_length
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
cdef Pool mem = Pool()
|
for i in range(self.max_length):
|
||||||
phrase_key = <attr_t*>mem.alloc(end-start, sizeof(attr_t))
|
self._phrase_key[i] = 0
|
||||||
for i, j in enumerate(range(start, end)):
|
for i, j in enumerate(range(start, end)):
|
||||||
phrase_key[i] = doc.c[j].lex.orth
|
self._phrase_key[i] = doc.c[j].lex.orth
|
||||||
cdef hash_t key = hash64(phrase_key,
|
cdef hash_t key = hash64(self._phrase_key,
|
||||||
(end-start) * sizeof(attr_t), 0)
|
self.max_length * sizeof(attr_t), 0)
|
||||||
ent_id = <hash_t>self.phrase_ids.get(key)
|
ent_id = <hash_t>self.phrase_ids.get(key)
|
||||||
if ent_id == 0:
|
if ent_id == 0:
|
||||||
return None
|
return None
|
||||||
|
|
|
@ -47,9 +47,7 @@ cdef class Morphology:
|
||||||
cdef enum univ_morph_t:
|
cdef enum univ_morph_t:
|
||||||
NIL = 0
|
NIL = 0
|
||||||
Animacy_anim = symbols.Animacy_anim
|
Animacy_anim = symbols.Animacy_anim
|
||||||
Animacy_inan
|
Animacy_inam
|
||||||
Animacy_hum
|
|
||||||
Animacy_nhum
|
|
||||||
Aspect_freq
|
Aspect_freq
|
||||||
Aspect_imp
|
Aspect_imp
|
||||||
Aspect_mod
|
Aspect_mod
|
||||||
|
|
|
@ -184,9 +184,7 @@ cdef class Morphology:
|
||||||
|
|
||||||
IDS = {
|
IDS = {
|
||||||
"Animacy_anim": Animacy_anim,
|
"Animacy_anim": Animacy_anim,
|
||||||
"Animacy_inan": Animacy_inan,
|
"Animacy_inam": Animacy_inam,
|
||||||
"Animacy_hum": Animacy_hum, # U20
|
|
||||||
"Animacy_nhum": Animacy_nhum,
|
|
||||||
"Aspect_freq": Aspect_freq,
|
"Aspect_freq": Aspect_freq,
|
||||||
"Aspect_imp": Aspect_imp,
|
"Aspect_imp": Aspect_imp,
|
||||||
"Aspect_mod": Aspect_mod,
|
"Aspect_mod": Aspect_mod,
|
||||||
|
|
|
@ -25,7 +25,6 @@ from .morphology cimport Morphology
|
||||||
from .vocab cimport Vocab
|
from .vocab cimport Vocab
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .compat import json_dumps
|
from .compat import json_dumps
|
||||||
from .matcher import Matcher
|
|
||||||
|
|
||||||
from .attrs import POS
|
from .attrs import POS
|
||||||
from .parts_of_speech import X
|
from .parts_of_speech import X
|
||||||
|
@ -98,17 +97,6 @@ def merge_entities(doc):
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
||||||
def merge_subtokens(doc, label='subtok'):
|
|
||||||
merger = Matcher(doc.vocab)
|
|
||||||
merger.add('SUBTOK', None, [{'DEP': label, 'op': '+'}])
|
|
||||||
matches = merger(doc)
|
|
||||||
spans = [doc[start:end+1] for _, start, end in matches]
|
|
||||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
|
||||||
for start_char, end_char in offsets:
|
|
||||||
doc.merge(start_char, end_char)
|
|
||||||
return doc
|
|
||||||
|
|
||||||
|
|
||||||
class Pipe(object):
|
class Pipe(object):
|
||||||
"""This class is not instantiated directly. Components inherit from it, and
|
"""This class is not instantiated directly. Components inherit from it, and
|
||||||
it defines the interface that components should follow to function as
|
it defines the interface that components should follow to function as
|
||||||
|
@ -179,7 +167,7 @@ class Pipe(object):
|
||||||
problem.
|
problem.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def create_optimizer(self):
|
def create_optimizer(self):
|
||||||
return create_default_optimizer(self.model.ops,
|
return create_default_optimizer(self.model.ops,
|
||||||
**self.cfg.get('optimizer', {}))
|
**self.cfg.get('optimizer', {}))
|
||||||
|
@ -664,13 +652,11 @@ class MultitaskObjective(Tagger):
|
||||||
self.make_label = self.make_dep_tag_offset
|
self.make_label = self.make_dep_tag_offset
|
||||||
elif target == 'ent_tag':
|
elif target == 'ent_tag':
|
||||||
self.make_label = self.make_ent_tag
|
self.make_label = self.make_ent_tag
|
||||||
elif target == 'sent_start':
|
|
||||||
self.make_label = self.make_sent_start
|
|
||||||
elif hasattr(target, '__call__'):
|
elif hasattr(target, '__call__'):
|
||||||
self.make_label = target
|
self.make_label = target
|
||||||
else:
|
else:
|
||||||
raise ValueError("MultitaskObjective target should be function or "
|
raise ValueError("MultitaskObjective target should be function or "
|
||||||
"one of: dep, tag, ent, sent_start, dep_tag_offset, ent_tag.")
|
"one of: dep, tag, ent, dep_tag_offset, ent_tag.")
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
||||||
self.cfg.setdefault('pretrained_dims',
|
self.cfg.setdefault('pretrained_dims',
|
||||||
|
@ -730,7 +716,11 @@ class MultitaskObjective(Tagger):
|
||||||
for i, gold in enumerate(golds):
|
for i, gold in enumerate(golds):
|
||||||
for j in range(len(docs[i])):
|
for j in range(len(docs[i])):
|
||||||
# Handes alignment for tokenization differences
|
# Handes alignment for tokenization differences
|
||||||
label = self.make_label(j, gold.words, gold.tags,
|
gold_idx = gold.cand_to_gold[j]
|
||||||
|
if gold_idx is None:
|
||||||
|
idx += 1
|
||||||
|
continue
|
||||||
|
label = self.make_label(gold_idx, gold.words, gold.tags,
|
||||||
gold.heads, gold.labels, gold.ents)
|
gold.heads, gold.labels, gold.ents)
|
||||||
if label is None or label not in self.labels:
|
if label is None or label not in self.labels:
|
||||||
correct[idx] = guesses[idx]
|
correct[idx] = guesses[idx]
|
||||||
|
@ -775,51 +765,6 @@ class MultitaskObjective(Tagger):
|
||||||
else:
|
else:
|
||||||
return '%s-%s' % (tags[i], ents[i])
|
return '%s-%s' % (tags[i], ents[i])
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def make_sent_start(target, words, tags, heads, deps, ents, cache=True, _cache={}):
|
|
||||||
'''A multi-task objective for representing sentence boundaries,
|
|
||||||
using BILU scheme. (O is impossible)
|
|
||||||
|
|
||||||
The implementation of this method uses an internal cache that relies
|
|
||||||
on the identity of the heads array, to avoid requiring a new piece
|
|
||||||
of gold data. You can pass cache=False if you know the cache will
|
|
||||||
do the wrong thing.
|
|
||||||
'''
|
|
||||||
assert len(words) == len(heads)
|
|
||||||
assert target < len(words), (target, len(words))
|
|
||||||
if cache:
|
|
||||||
if id(heads) in _cache:
|
|
||||||
return _cache[id(heads)][target]
|
|
||||||
else:
|
|
||||||
for key in list(_cache.keys()):
|
|
||||||
_cache.pop(key)
|
|
||||||
sent_tags = ['I-SENT'] * len(words)
|
|
||||||
_cache[id(heads)] = sent_tags
|
|
||||||
else:
|
|
||||||
sent_tags = ['I-SENT'] * len(words)
|
|
||||||
|
|
||||||
def _find_root(child):
|
|
||||||
seen = set([child])
|
|
||||||
while child is not None and heads[child] != child:
|
|
||||||
seen.add(child)
|
|
||||||
child = heads[child]
|
|
||||||
return child
|
|
||||||
|
|
||||||
sentences = {}
|
|
||||||
for i in range(len(words)):
|
|
||||||
root = _find_root(i)
|
|
||||||
if root is None:
|
|
||||||
sent_tags[i] = None
|
|
||||||
else:
|
|
||||||
sentences.setdefault(root, []).append(i)
|
|
||||||
for root, span in sorted(sentences.items()):
|
|
||||||
if len(span) == 1:
|
|
||||||
sent_tags[span[0]] = 'U-SENT'
|
|
||||||
else:
|
|
||||||
sent_tags[span[0]] = 'B-SENT'
|
|
||||||
sent_tags[span[-1]] = 'L-SENT'
|
|
||||||
return sent_tags[target]
|
|
||||||
|
|
||||||
|
|
||||||
class SimilarityHook(Pipe):
|
class SimilarityHook(Pipe):
|
||||||
"""
|
"""
|
||||||
|
@ -878,8 +823,8 @@ class TextCategorizer(Pipe):
|
||||||
name = 'textcat'
|
name = 'textcat'
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, **cfg):
|
def Model(cls, nr_class=1, width=64, **cfg):
|
||||||
return build_text_classifier(**cfg)
|
return build_text_classifier(nr_class, width, **cfg)
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
|
@ -945,15 +890,6 @@ class TextCategorizer(Pipe):
|
||||||
if label in self.labels:
|
if label in self.labels:
|
||||||
return 0
|
return 0
|
||||||
if self.model not in (None, True, False):
|
if self.model not in (None, True, False):
|
||||||
# This functionality was available previously, but was broken.
|
|
||||||
# The problem is that we resize the last layer, but the last layer
|
|
||||||
# is actually just an ensemble. We're not resizing the child layers
|
|
||||||
# -- a huge problem.
|
|
||||||
raise ValueError(
|
|
||||||
"Cannot currently add labels to pre-trained text classifier. "
|
|
||||||
"Add labels before training begins. This functionality was "
|
|
||||||
"available in previous versions, but had significant bugs that "
|
|
||||||
"let to poor performance")
|
|
||||||
smaller = self.model._layers[-1]
|
smaller = self.model._layers[-1]
|
||||||
larger = Affine(len(self.labels)+1, smaller.nI)
|
larger = Affine(len(self.labels)+1, smaller.nI)
|
||||||
copy_array(larger.W[:smaller.nO], smaller.W)
|
copy_array(larger.W[:smaller.nO], smaller.W)
|
||||||
|
@ -969,9 +905,8 @@ class TextCategorizer(Pipe):
|
||||||
token_vector_width = 64
|
token_vector_width = 64
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
self.cfg['nr_class'] = len(self.labels)
|
self.model = self.Model(len(self.labels), token_vector_width,
|
||||||
self.cfg['width'] = token_vector_width
|
**self.cfg)
|
||||||
self.model = self.Model(**self.cfg)
|
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
sgd = self.create_optimizer()
|
sgd = self.create_optimizer()
|
||||||
|
@ -985,7 +920,7 @@ cdef class DependencyParser(Parser):
|
||||||
@property
|
@property
|
||||||
def postprocesses(self):
|
def postprocesses(self):
|
||||||
return [nonproj.deprojectivize]
|
return [nonproj.deprojectivize]
|
||||||
|
|
||||||
def add_multitask_objective(self, target):
|
def add_multitask_objective(self, target):
|
||||||
labeller = MultitaskObjective(self.vocab, target=target)
|
labeller = MultitaskObjective(self.vocab, target=target)
|
||||||
self._multitasks.append(labeller)
|
self._multitasks.append(labeller)
|
||||||
|
@ -1006,7 +941,7 @@ cdef class EntityRecognizer(Parser):
|
||||||
TransitionSystem = BiluoPushDown
|
TransitionSystem = BiluoPushDown
|
||||||
|
|
||||||
nr_feature = 6
|
nr_feature = 6
|
||||||
|
|
||||||
def add_multitask_objective(self, target):
|
def add_multitask_objective(self, target):
|
||||||
labeller = MultitaskObjective(self.vocab, target=target)
|
labeller = MultitaskObjective(self.vocab, target=target)
|
||||||
self._multitasks.append(labeller)
|
self._multitasks.append(labeller)
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import division, print_function, unicode_literals
|
from __future__ import division, print_function, unicode_literals
|
||||||
|
|
||||||
from .gold import tags_to_entities, GoldParse
|
from .gold import tags_to_entities
|
||||||
|
|
||||||
|
|
||||||
class PRFScore(object):
|
class PRFScore(object):
|
||||||
|
@ -84,8 +84,6 @@ class Scorer(object):
|
||||||
}
|
}
|
||||||
|
|
||||||
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
|
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
|
||||||
if len(tokens) != len(gold):
|
|
||||||
gold = GoldParse.from_annot_tuples(tokens, zip(*gold.orig_annot))
|
|
||||||
assert len(tokens) == len(gold)
|
assert len(tokens) == len(gold)
|
||||||
gold_deps = set()
|
gold_deps = set()
|
||||||
gold_tags = set()
|
gold_tags = set()
|
||||||
|
@ -102,7 +100,8 @@ class Scorer(object):
|
||||||
continue
|
continue
|
||||||
gold_i = gold.cand_to_gold[token.i]
|
gold_i = gold.cand_to_gold[token.i]
|
||||||
if gold_i is None:
|
if gold_i is None:
|
||||||
self.tokens.fp += 1
|
if token.dep_.lower() not in punct_labels:
|
||||||
|
self.tokens.fp += 1
|
||||||
else:
|
else:
|
||||||
self.tokens.tp += 1
|
self.tokens.tp += 1
|
||||||
cand_tags.add((gold_i, token.tag_))
|
cand_tags.add((gold_i, token.tag_))
|
||||||
|
|
|
@ -85,7 +85,6 @@ cdef enum symbol_t:
|
||||||
SENT_START
|
SENT_START
|
||||||
SPACY
|
SPACY
|
||||||
PROB
|
PROB
|
||||||
LANG
|
|
||||||
|
|
||||||
ADJ
|
ADJ
|
||||||
ADP
|
ADP
|
||||||
|
@ -109,9 +108,8 @@ cdef enum symbol_t:
|
||||||
SPACE
|
SPACE
|
||||||
|
|
||||||
Animacy_anim
|
Animacy_anim
|
||||||
Animacy_inan
|
Animacy_inam
|
||||||
Animacy_hum # U20
|
Animacy_hum # U20
|
||||||
Animacy_nhum
|
|
||||||
Aspect_freq
|
Aspect_freq
|
||||||
Aspect_imp
|
Aspect_imp
|
||||||
Aspect_mod
|
Aspect_mod
|
||||||
|
@ -395,7 +393,6 @@ cdef enum symbol_t:
|
||||||
EVENT
|
EVENT
|
||||||
WORK_OF_ART
|
WORK_OF_ART
|
||||||
LANGUAGE
|
LANGUAGE
|
||||||
LAW
|
|
||||||
|
|
||||||
DATE
|
DATE
|
||||||
TIME
|
TIME
|
||||||
|
@ -454,9 +451,10 @@ cdef enum symbol_t:
|
||||||
prt
|
prt
|
||||||
punct
|
punct
|
||||||
quantmod
|
quantmod
|
||||||
relcl
|
|
||||||
rcmod
|
rcmod
|
||||||
root
|
root
|
||||||
xcomp
|
xcomp
|
||||||
|
|
||||||
acl
|
acl
|
||||||
|
LAW
|
||||||
|
LANG
|
||||||
|
|
|
@ -114,9 +114,8 @@ IDS = {
|
||||||
"SPACE": SPACE,
|
"SPACE": SPACE,
|
||||||
|
|
||||||
"Animacy_anim": Animacy_anim,
|
"Animacy_anim": Animacy_anim,
|
||||||
"Animacy_inam": Animacy_inan,
|
"Animacy_inam": Animacy_inam,
|
||||||
"Animacy_hum": Animacy_hum, # U20
|
"Animacy_hum": Animacy_hum, # U20
|
||||||
"Animacy_nhum": Animacy_nhum,
|
|
||||||
"Aspect_freq": Aspect_freq,
|
"Aspect_freq": Aspect_freq,
|
||||||
"Aspect_imp": Aspect_imp,
|
"Aspect_imp": Aspect_imp,
|
||||||
"Aspect_mod": Aspect_mod,
|
"Aspect_mod": Aspect_mod,
|
||||||
|
@ -459,7 +458,6 @@ IDS = {
|
||||||
"punct": punct,
|
"punct": punct,
|
||||||
"quantmod": quantmod,
|
"quantmod": quantmod,
|
||||||
"rcmod": rcmod,
|
"rcmod": rcmod,
|
||||||
"relcl": relcl,
|
|
||||||
"root": root,
|
"root": root,
|
||||||
"xcomp": xcomp,
|
"xcomp": xcomp,
|
||||||
|
|
||||||
|
|
|
@ -108,7 +108,7 @@ cdef cppclass StateC:
|
||||||
ids[1] = this.B(1)
|
ids[1] = this.B(1)
|
||||||
ids[2] = this.S(0)
|
ids[2] = this.S(0)
|
||||||
ids[3] = this.S(1)
|
ids[3] = this.S(1)
|
||||||
ids[4] = this.S(2)
|
ids[4] = this.H(this.S(0))
|
||||||
ids[5] = this.L(this.B(0), 1)
|
ids[5] = this.L(this.B(0), 1)
|
||||||
ids[6] = this.L(this.S(0), 1)
|
ids[6] = this.L(this.S(0), 1)
|
||||||
ids[7] = this.R(this.S(0), 1)
|
ids[7] = this.R(this.S(0), 1)
|
||||||
|
|
|
@ -6,19 +6,16 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from cpython.ref cimport Py_INCREF
|
from cpython.ref cimport Py_INCREF
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from collections import OrderedDict, defaultdict, Counter
|
from collections import OrderedDict
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
import json
|
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
from . import nonproj
|
from .nonproj import is_nonproj_tree
|
||||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||||
from ..gold cimport GoldParse, GoldParseC
|
from ..gold cimport GoldParse, GoldParseC
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
|
|
||||||
# Calculate cost as gold/not gold. We don't use scalar value anyway.
|
|
||||||
cdef int BINARY_COSTS = 1
|
|
||||||
|
|
||||||
DEF NON_MONOTONIC = True
|
DEF NON_MONOTONIC = True
|
||||||
DEF USE_BREAK = True
|
DEF USE_BREAK = True
|
||||||
|
@ -57,8 +54,6 @@ cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) no
|
||||||
cost += 1
|
cost += 1
|
||||||
if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)):
|
if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)):
|
||||||
cost += 1
|
cost += 1
|
||||||
if BINARY_COSTS and cost >= 1:
|
|
||||||
return cost
|
|
||||||
cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0
|
cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
|
@ -72,8 +67,6 @@ cdef weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nog
|
||||||
cost += gold.heads[target] == B_i
|
cost += gold.heads[target] == B_i
|
||||||
if gold.heads[B_i] == B_i or gold.heads[B_i] < target:
|
if gold.heads[B_i] == B_i or gold.heads[B_i] < target:
|
||||||
break
|
break
|
||||||
if BINARY_COSTS and cost >= 1:
|
|
||||||
return cost
|
|
||||||
if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0:
|
if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0:
|
||||||
cost += 1
|
cost += 1
|
||||||
return cost
|
return cost
|
||||||
|
@ -117,8 +110,7 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
|
||||||
cdef class Shift:
|
cdef class Shift:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
sent_start = st._sent[st.B_(0).l_edge].sent_start
|
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and st.B_(0).sent_start != 1
|
||||||
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and sent_start != 1
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, attr_t label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
|
@ -178,8 +170,7 @@ cdef class Reduce:
|
||||||
cdef class LeftArc:
|
cdef class LeftArc:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
sent_start = st._sent[st.B_(0).l_edge].sent_start
|
return st.B_(0).sent_start != 1
|
||||||
return sent_start != 1
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, attr_t label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
|
@ -214,8 +205,7 @@ cdef class RightArc:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
# If there's (perhaps partial) parse pre-set, don't allow cycle.
|
# If there's (perhaps partial) parse pre-set, don't allow cycle.
|
||||||
sent_start = st._sent[st.B_(0).l_edge].sent_start
|
return st.B_(0).sent_start != 1 and st.H(st.S(0)) != st.B(0)
|
||||||
return sent_start != 1 and st.H(st.S(0)) != st.B(0)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, attr_t label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
|
@ -322,42 +312,39 @@ cdef class ArcEager(TransitionSystem):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_actions(cls, **kwargs):
|
def get_actions(cls, **kwargs):
|
||||||
min_freq = kwargs.get('min_freq', None)
|
actions = kwargs.get('actions', OrderedDict((
|
||||||
actions = defaultdict(lambda: Counter())
|
(SHIFT, ['']),
|
||||||
actions[SHIFT][''] = 1
|
(REDUCE, ['']),
|
||||||
actions[REDUCE][''] = 1
|
(RIGHT, []),
|
||||||
|
(LEFT, []),
|
||||||
|
(BREAK, ['ROOT']))
|
||||||
|
))
|
||||||
|
seen_actions = set()
|
||||||
for label in kwargs.get('left_labels', []):
|
for label in kwargs.get('left_labels', []):
|
||||||
actions[LEFT][label] = 1
|
if label.upper() != 'ROOT':
|
||||||
actions[SHIFT][label] = 1
|
if (LEFT, label) not in seen_actions:
|
||||||
|
actions[LEFT].append(label)
|
||||||
|
seen_actions.add((LEFT, label))
|
||||||
for label in kwargs.get('right_labels', []):
|
for label in kwargs.get('right_labels', []):
|
||||||
actions[RIGHT][label] = 1
|
if label.upper() != 'ROOT':
|
||||||
actions[REDUCE][label] = 1
|
if (RIGHT, label) not in seen_actions:
|
||||||
|
actions[RIGHT].append(label)
|
||||||
|
seen_actions.add((RIGHT, label))
|
||||||
|
|
||||||
for raw_text, sents in kwargs.get('gold_parses', []):
|
for raw_text, sents in kwargs.get('gold_parses', []):
|
||||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||||
heads, labels = nonproj.projectivize(heads, labels)
|
|
||||||
for child, head, label in zip(ids, heads, labels):
|
for child, head, label in zip(ids, heads, labels):
|
||||||
if label.upper() == 'ROOT' :
|
if label.upper() == 'ROOT':
|
||||||
label = 'ROOT'
|
label = 'ROOT'
|
||||||
if head == child:
|
if label != 'ROOT':
|
||||||
actions[BREAK][label] += 1
|
if head < child:
|
||||||
elif head < child:
|
if (RIGHT, label) not in seen_actions:
|
||||||
actions[RIGHT][label] += 1
|
actions[RIGHT].append(label)
|
||||||
actions[REDUCE][''] += 1
|
seen_actions.add((RIGHT, label))
|
||||||
elif head > child:
|
elif head > child:
|
||||||
actions[LEFT][label] += 1
|
if (LEFT, label) not in seen_actions:
|
||||||
actions[SHIFT][''] += 1
|
actions[LEFT].append(label)
|
||||||
if min_freq is not None:
|
seen_actions.add((LEFT, label))
|
||||||
for action, label_freqs in actions.items():
|
|
||||||
for label, freq in list(label_freqs.items()):
|
|
||||||
if freq < min_freq:
|
|
||||||
label_freqs.pop(label)
|
|
||||||
# Ensure these actions are present
|
|
||||||
actions[BREAK].setdefault('ROOT', 0)
|
|
||||||
actions[RIGHT].setdefault('subtok', 0)
|
|
||||||
actions[LEFT].setdefault('subtok', 0)
|
|
||||||
# Used for backoff
|
|
||||||
actions[RIGHT].setdefault('dep', 0)
|
|
||||||
actions[LEFT].setdefault('dep', 0)
|
|
||||||
return actions
|
return actions
|
||||||
|
|
||||||
property action_types:
|
property action_types:
|
||||||
|
@ -389,34 +376,18 @@ cdef class ArcEager(TransitionSystem):
|
||||||
def preprocess_gold(self, GoldParse gold):
|
def preprocess_gold(self, GoldParse gold):
|
||||||
if not self.has_gold(gold):
|
if not self.has_gold(gold):
|
||||||
return None
|
return None
|
||||||
for i, (head, dep) in enumerate(zip(gold.heads, gold.labels)):
|
for i in range(gold.length):
|
||||||
# Missing values
|
# Missing values
|
||||||
if head is None or dep is None:
|
if gold.heads[i] is None or gold.labels[i] is None:
|
||||||
gold.c.heads[i] = i
|
gold.c.heads[i] = i
|
||||||
gold.c.has_dep[i] = False
|
gold.c.has_dep[i] = False
|
||||||
else:
|
else:
|
||||||
if head > i:
|
label = gold.labels[i]
|
||||||
action = LEFT
|
|
||||||
elif head < i:
|
|
||||||
action = RIGHT
|
|
||||||
else:
|
|
||||||
action = BREAK
|
|
||||||
if dep not in self.labels[action]:
|
|
||||||
if action == BREAK:
|
|
||||||
dep = 'ROOT'
|
|
||||||
elif nonproj.is_decorated(dep):
|
|
||||||
backoff = nonproj.decompose(dep)[0]
|
|
||||||
if backoff in self.labels[action]:
|
|
||||||
dep = backoff
|
|
||||||
else:
|
|
||||||
dep = 'dep'
|
|
||||||
else:
|
|
||||||
dep = 'dep'
|
|
||||||
gold.c.has_dep[i] = True
|
gold.c.has_dep[i] = True
|
||||||
if dep.upper() == 'ROOT':
|
if label.upper() == 'ROOT':
|
||||||
dep = 'ROOT'
|
label = 'ROOT'
|
||||||
gold.c.heads[i] = head
|
gold.c.heads[i] = gold.heads[i]
|
||||||
gold.c.labels[i] = self.strings.add(dep)
|
gold.c.labels[i] = self.strings.add(label)
|
||||||
return gold
|
return gold
|
||||||
|
|
||||||
def get_beam_parses(self, Beam beam):
|
def get_beam_parses(self, Beam beam):
|
||||||
|
@ -556,13 +527,8 @@ cdef class ArcEager(TransitionSystem):
|
||||||
is_valid[i] = False
|
is_valid[i] = False
|
||||||
costs[i] = 9000
|
costs[i] = 9000
|
||||||
if n_gold < 1:
|
if n_gold < 1:
|
||||||
# Check label set --- leading cause
|
# Check projectivity --- leading cause
|
||||||
label_set = set([self.strings[self.c[i].label] for i in range(self.n_moves)])
|
if is_nonproj_tree(gold.heads):
|
||||||
for label_str in gold.labels:
|
|
||||||
if label_str is not None and label_str not in label_set:
|
|
||||||
raise ValueError("Cannot get gold parser action: unknown label: %s" % label_str)
|
|
||||||
# Check projectivity --- other leading cause
|
|
||||||
if nonproj.is_nonproj_tree(gold.heads):
|
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Could not find a gold-standard action to supervise the "
|
"Could not find a gold-standard action to supervise the "
|
||||||
"dependency parser. Likely cause: the tree is "
|
"dependency parser. Likely cause: the tree is "
|
||||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
from collections import OrderedDict, Counter
|
from collections import OrderedDict
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
|
@ -64,18 +64,21 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_actions(cls, **kwargs):
|
def get_actions(cls, **kwargs):
|
||||||
actions = {
|
actions = kwargs.get('actions', OrderedDict((
|
||||||
MISSING: Counter(),
|
(MISSING, ['']),
|
||||||
BEGIN: Counter(),
|
(BEGIN, []),
|
||||||
IN: Counter(),
|
(IN, []),
|
||||||
LAST: Counter(),
|
(LAST, []),
|
||||||
UNIT: Counter(),
|
(UNIT, []),
|
||||||
OUT: Counter()
|
(OUT, [''])
|
||||||
}
|
)))
|
||||||
actions[OUT][''] = 1
|
seen_entities = set()
|
||||||
for entity_type in kwargs.get('entity_types', []):
|
for entity_type in kwargs.get('entity_types', []):
|
||||||
|
if entity_type in seen_entities:
|
||||||
|
continue
|
||||||
|
seen_entities.add(entity_type)
|
||||||
for action in (BEGIN, IN, LAST, UNIT):
|
for action in (BEGIN, IN, LAST, UNIT):
|
||||||
actions[action][entity_type] = 1
|
actions[action].append(entity_type)
|
||||||
moves = ('M', 'B', 'I', 'L', 'U')
|
moves = ('M', 'B', 'I', 'L', 'U')
|
||||||
for raw_text, sents in kwargs.get('gold_parses', []):
|
for raw_text, sents in kwargs.get('gold_parses', []):
|
||||||
for (ids, words, tags, heads, labels, biluo), _ in sents:
|
for (ids, words, tags, heads, labels, biluo), _ in sents:
|
||||||
|
@ -84,8 +87,10 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
if ner_tag.count('-') != 1:
|
if ner_tag.count('-') != 1:
|
||||||
raise ValueError(ner_tag)
|
raise ValueError(ner_tag)
|
||||||
_, label = ner_tag.split('-')
|
_, label = ner_tag.split('-')
|
||||||
for action in (BEGIN, IN, LAST, UNIT):
|
if label not in seen_entities:
|
||||||
actions[action][label] += 1
|
seen_entities.add(label)
|
||||||
|
for move_str in ('B', 'I', 'L', 'U'):
|
||||||
|
actions[moves.index(move_str)].append(label)
|
||||||
return actions
|
return actions
|
||||||
|
|
||||||
property action_types:
|
property action_types:
|
||||||
|
@ -208,7 +213,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
raise Exception(move)
|
raise Exception(move)
|
||||||
return t
|
return t
|
||||||
|
|
||||||
def add_action(self, int action, label_name, freq=None):
|
def add_action(self, int action, label_name):
|
||||||
cdef attr_t label_id
|
cdef attr_t label_id
|
||||||
if not isinstance(label_name, (int, long)):
|
if not isinstance(label_name, (int, long)):
|
||||||
label_id = self.strings.add(label_name)
|
label_id = self.strings.add(label_name)
|
||||||
|
@ -229,12 +234,6 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
|
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
|
||||||
assert self.c[self.n_moves].label == label_id
|
assert self.c[self.n_moves].label == label_id
|
||||||
self.n_moves += 1
|
self.n_moves += 1
|
||||||
if self.labels.get(action, []):
|
|
||||||
freq = min(0, min(self.labels[action].values()))
|
|
||||||
self.labels[action][label_name] = freq-1
|
|
||||||
else:
|
|
||||||
self.labels[action] = Counter()
|
|
||||||
self.labels[action][label_name] = -1
|
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
cdef int initialize_state(self, StateC* st) nogil:
|
cdef int initialize_state(self, StateC* st) nogil:
|
||||||
|
|
|
@ -15,7 +15,7 @@ cdef class Parser:
|
||||||
cdef readonly object cfg
|
cdef readonly object cfg
|
||||||
cdef public object _multitasks
|
cdef public object _multitasks
|
||||||
|
|
||||||
cdef void _parseC(self, StateC** states, int nr_task,
|
cdef void _parseC(self, StateC* state,
|
||||||
const float* feat_weights, const float* bias,
|
const float* feat_weights, const float* bias,
|
||||||
const float* hW, const float* hb,
|
const float* hW, const float* hb,
|
||||||
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil
|
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
# cython: cdivision=True
|
# cython: cdivision=True
|
||||||
# cython: boundscheck=False
|
# cython: boundscheck=False
|
||||||
|
# cython: profile=True
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
@ -27,8 +28,6 @@ from thinc.misc import LayerNorm
|
||||||
from thinc.neural.ops import CupyOps
|
from thinc.neural.ops import CupyOps
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module
|
||||||
from thinc.linalg cimport Vec, VecVec
|
from thinc.linalg cimport Vec, VecVec
|
||||||
from thinc cimport openblas
|
|
||||||
|
|
||||||
|
|
||||||
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
|
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
|
||||||
from .._ml import link_vectors_to_models, create_default_optimizer
|
from .._ml import link_vectors_to_models, create_default_optimizer
|
||||||
|
@ -172,8 +171,8 @@ cdef void sum_state_features(float* output,
|
||||||
else:
|
else:
|
||||||
idx = token_ids[f] * F * O + f*O
|
idx = token_ids[f] * F * O + f*O
|
||||||
feature = &cached[idx]
|
feature = &cached[idx]
|
||||||
VecVec.add_i(output,
|
for i in range(O):
|
||||||
feature, 1., O)
|
output[i] += feature[i]
|
||||||
output += O
|
output += O
|
||||||
token_ids += F
|
token_ids += F
|
||||||
|
|
||||||
|
@ -266,7 +265,7 @@ cdef class Parser:
|
||||||
|
|
||||||
with Model.use_device('cpu'):
|
with Model.use_device('cpu'):
|
||||||
upper = chain(
|
upper = chain(
|
||||||
clone(Maxout(hidden_width, hidden_width), depth-1),
|
clone(LayerNorm(Maxout(hidden_width, hidden_width)), depth-1),
|
||||||
zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
|
zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -302,7 +301,7 @@ cdef class Parser:
|
||||||
"""
|
"""
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
if moves is True:
|
if moves is True:
|
||||||
self.moves = self.TransitionSystem(self.vocab.strings)
|
self.moves = self.TransitionSystem(self.vocab.strings, {})
|
||||||
else:
|
else:
|
||||||
self.moves = moves
|
self.moves = moves
|
||||||
if 'beam_width' not in cfg:
|
if 'beam_width' not in cfg:
|
||||||
|
@ -311,7 +310,12 @@ cdef class Parser:
|
||||||
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
|
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
|
||||||
if 'pretrained_dims' not in cfg:
|
if 'pretrained_dims' not in cfg:
|
||||||
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
||||||
|
cfg.setdefault('cnn_maxout_pieces', 3)
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
|
if 'actions' in self.cfg:
|
||||||
|
for action, labels in self.cfg.get('actions', {}).items():
|
||||||
|
for label in labels:
|
||||||
|
self.moves.add_action(action, label)
|
||||||
self.model = model
|
self.model = model
|
||||||
self._multitasks = []
|
self._multitasks = []
|
||||||
|
|
||||||
|
@ -418,81 +422,69 @@ cdef class Parser:
|
||||||
cdef int nr_hidden = hidden_weights.shape[0]
|
cdef int nr_hidden = hidden_weights.shape[0]
|
||||||
cdef int nr_task = states.size()
|
cdef int nr_task = states.size()
|
||||||
with nogil:
|
with nogil:
|
||||||
self._parseC(&states[0], nr_task, feat_weights, bias, hW, hb,
|
for i in range(nr_task):
|
||||||
nr_class, nr_hidden, nr_feat, nr_piece)
|
self._parseC(states[i],
|
||||||
|
feat_weights, bias, hW, hb,
|
||||||
|
nr_class, nr_hidden, nr_feat, nr_piece)
|
||||||
PyErr_CheckSignals()
|
PyErr_CheckSignals()
|
||||||
tokvecs = self.model[0].ops.unflatten(tokvecs,
|
tokvecs = self.model[0].ops.unflatten(tokvecs,
|
||||||
[len(doc) for doc in docs])
|
[len(doc) for doc in docs])
|
||||||
return state_objs, tokvecs
|
return state_objs, tokvecs
|
||||||
|
|
||||||
cdef void _parseC(self, StateC** states, int nr_task,
|
cdef void _parseC(self, StateC* state,
|
||||||
const float* feat_weights, const float* bias,
|
const float* feat_weights, const float* bias,
|
||||||
const float* hW, const float* hb,
|
const float* hW, const float* hb,
|
||||||
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
|
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
|
||||||
token_ids = <int*>calloc(nr_feat, sizeof(int))
|
token_ids = <int*>calloc(nr_feat, sizeof(int))
|
||||||
is_valid = <int*>calloc(nr_class, sizeof(int))
|
is_valid = <int*>calloc(nr_class, sizeof(int))
|
||||||
vectors = <float*>calloc(nr_hidden * nr_task, sizeof(float))
|
vectors = <float*>calloc(nr_hidden * nr_piece, sizeof(float))
|
||||||
unmaxed = <float*>calloc(nr_hidden * nr_piece, sizeof(float))
|
scores = <float*>calloc(nr_class, sizeof(float))
|
||||||
scores = <float*>calloc(nr_class*nr_task, sizeof(float))
|
|
||||||
if not (token_ids and is_valid and vectors and scores):
|
if not (token_ids and is_valid and vectors and scores):
|
||||||
with gil:
|
with gil:
|
||||||
PyErr_SetFromErrno(MemoryError)
|
PyErr_SetFromErrno(MemoryError)
|
||||||
PyErr_CheckSignals()
|
PyErr_CheckSignals()
|
||||||
cdef int nr_todo = nr_task
|
cdef float feature
|
||||||
cdef int i, j
|
while not state.is_final():
|
||||||
cdef vector[StateC*] unfinished
|
state.set_context_tokens(token_ids, nr_feat)
|
||||||
while nr_todo >= 1:
|
memset(vectors, 0, nr_hidden * nr_piece * sizeof(float))
|
||||||
memset(vectors, 0, nr_todo * nr_hidden * sizeof(float))
|
memset(scores, 0, nr_class * sizeof(float))
|
||||||
memset(scores, 0, nr_todo * nr_class * sizeof(float))
|
sum_state_features(vectors,
|
||||||
for i in range(nr_todo):
|
feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
|
||||||
state = states[i]
|
for i in range(nr_hidden * nr_piece):
|
||||||
state.set_context_tokens(token_ids, nr_feat)
|
vectors[i] += bias[i]
|
||||||
memset(unmaxed, 0, nr_hidden * nr_piece * sizeof(float))
|
V = vectors
|
||||||
sum_state_features(unmaxed,
|
W = hW
|
||||||
feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
|
for i in range(nr_hidden):
|
||||||
VecVec.add_i(unmaxed,
|
if nr_piece == 1:
|
||||||
bias, 1., nr_hidden*nr_piece)
|
feature = V[0] if V[0] >= 0. else 0.
|
||||||
state_vector = &vectors[i*nr_hidden]
|
elif nr_piece == 2:
|
||||||
for j in range(nr_hidden):
|
feature = V[0] if V[0] >= V[1] else V[1]
|
||||||
index = j * nr_piece
|
else:
|
||||||
which = Vec.arg_max(&unmaxed[index], nr_piece)
|
feature = Vec.max(V, nr_piece)
|
||||||
state_vector[j] = unmaxed[index + which]
|
for j in range(nr_class):
|
||||||
# Compute hidden-to-output
|
scores[j] += feature * W[j]
|
||||||
openblas.simple_gemm(scores, nr_todo, nr_class,
|
W += nr_class
|
||||||
vectors, nr_todo, nr_hidden, hW, nr_hidden, nr_class, 0, 0)
|
V += nr_piece
|
||||||
# Add bias
|
for i in range(nr_class):
|
||||||
for i in range(nr_todo):
|
scores[i] += hb[i]
|
||||||
VecVec.add_i(&scores[i*nr_class],
|
self.moves.set_valid(is_valid, state)
|
||||||
hb, 1., nr_class)
|
guess = arg_max_if_valid(scores, is_valid, nr_class)
|
||||||
# Validate actions, argmax, take action.
|
action = self.moves.c[guess]
|
||||||
for i in range(nr_todo):
|
action.do(state, action.label)
|
||||||
state = states[i]
|
state.push_hist(guess)
|
||||||
self.moves.set_valid(is_valid, state)
|
|
||||||
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
|
|
||||||
action = self.moves.c[guess]
|
|
||||||
action.do(state, action.label)
|
|
||||||
state.push_hist(guess)
|
|
||||||
if not state.is_final():
|
|
||||||
unfinished.push_back(state)
|
|
||||||
for i in range(unfinished.size()):
|
|
||||||
states[i] = unfinished[i]
|
|
||||||
nr_todo = unfinished.size()
|
|
||||||
unfinished.clear()
|
|
||||||
free(token_ids)
|
free(token_ids)
|
||||||
free(is_valid)
|
free(is_valid)
|
||||||
free(vectors)
|
free(vectors)
|
||||||
free(unmaxed)
|
|
||||||
free(scores)
|
free(scores)
|
||||||
|
|
||||||
def beam_parse(self, docs, int beam_width=3, float beam_density=0.001,
|
def beam_parse(self, docs, int beam_width=3, float beam_density=0.001):
|
||||||
float drop=0.):
|
|
||||||
cdef Beam beam
|
cdef Beam beam
|
||||||
cdef np.ndarray scores
|
cdef np.ndarray scores
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef int nr_class = self.moves.n_moves
|
cdef int nr_class = self.moves.n_moves
|
||||||
cuda_stream = util.get_cuda_stream()
|
cuda_stream = util.get_cuda_stream()
|
||||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||||
docs, cuda_stream, drop)
|
docs, cuda_stream, 0.0)
|
||||||
cdef int offset = 0
|
cdef int offset = 0
|
||||||
cdef int j = 0
|
cdef int j = 0
|
||||||
cdef int k
|
cdef int k
|
||||||
|
@ -531,8 +523,8 @@ cdef class Parser:
|
||||||
n_states += 1
|
n_states += 1
|
||||||
if n_states == 0:
|
if n_states == 0:
|
||||||
break
|
break
|
||||||
vectors, _ = state2vec.begin_update(token_ids[:n_states], drop)
|
vectors = state2vec(token_ids[:n_states])
|
||||||
scores, _ = vec2scores.begin_update(vectors, drop=drop)
|
scores = vec2scores(vectors)
|
||||||
c_scores = <float*>scores.data
|
c_scores = <float*>scores.data
|
||||||
for beam in todo:
|
for beam in todo:
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
|
@ -563,10 +555,7 @@ cdef class Parser:
|
||||||
for multitask in self._multitasks:
|
for multitask in self._multitasks:
|
||||||
multitask.update(docs, golds, drop=drop, sgd=sgd)
|
multitask.update(docs, golds, drop=drop, sgd=sgd)
|
||||||
cuda_stream = util.get_cuda_stream()
|
cuda_stream = util.get_cuda_stream()
|
||||||
# Chop sequences into lengths of this many transitions, to make the
|
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
||||||
# batch uniform length.
|
|
||||||
cut_gold = numpy.random.choice(range(20, 100))
|
|
||||||
states, golds, max_steps = self._init_gold_batch(docs, golds, max_length=cut_gold)
|
|
||||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||||
drop)
|
drop)
|
||||||
todo = [(s, g) for (s, g) in zip(states, golds)
|
todo = [(s, g) for (s, g) in zip(states, golds)
|
||||||
|
@ -669,7 +658,8 @@ cdef class Parser:
|
||||||
for beam in beams:
|
for beam in beams:
|
||||||
_cleanup(beam)
|
_cleanup(beam)
|
||||||
|
|
||||||
def _init_gold_batch(self, whole_docs, whole_golds, min_length=5, max_length=500):
|
|
||||||
|
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||||
"""Make a square batch, of length equal to the shortest doc. A long
|
"""Make a square batch, of length equal to the shortest doc. A long
|
||||||
doc will get multiple states. Let's say we have a doc of length 2*N,
|
doc will get multiple states. Let's say we have a doc of length 2*N,
|
||||||
where N is the shortest doc. We'll make two states, one representing
|
where N is the shortest doc. We'll make two states, one representing
|
||||||
|
@ -678,7 +668,7 @@ cdef class Parser:
|
||||||
StateClass state
|
StateClass state
|
||||||
Transition action
|
Transition action
|
||||||
whole_states = self.moves.init_batch(whole_docs)
|
whole_states = self.moves.init_batch(whole_docs)
|
||||||
max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs])))
|
max_length = max(5, min(50, min([len(doc) for doc in whole_docs])))
|
||||||
max_moves = 0
|
max_moves = 0
|
||||||
states = []
|
states = []
|
||||||
golds = []
|
golds = []
|
||||||
|
@ -800,11 +790,6 @@ cdef class Parser:
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
hook(doc)
|
hook(doc)
|
||||||
|
|
||||||
@property
|
|
||||||
def labels(self):
|
|
||||||
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
|
|
||||||
return class_names
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tok2vec(self):
|
def tok2vec(self):
|
||||||
'''Return the embedding and convolutional layer of the model.'''
|
'''Return the embedding and convolutional layer of the model.'''
|
||||||
|
@ -823,6 +808,9 @@ cdef class Parser:
|
||||||
for action in self.moves.action_types:
|
for action in self.moves.action_types:
|
||||||
added = self.moves.add_action(action, label)
|
added = self.moves.add_action(action, label)
|
||||||
if added:
|
if added:
|
||||||
|
# Important that the labels be stored as a list! We need the
|
||||||
|
# order, or the model goes out of synch
|
||||||
|
self.cfg.setdefault('extra_labels', []).append(label)
|
||||||
resized = True
|
resized = True
|
||||||
if self.model not in (True, False, None) and resized:
|
if self.model not in (True, False, None) and resized:
|
||||||
# Weights are stored in (nr_out, nr_in) format, so we're basically
|
# Weights are stored in (nr_out, nr_in) format, so we're basically
|
||||||
|
@ -836,10 +824,12 @@ cdef class Parser:
|
||||||
def begin_training(self, gold_tuples, pipeline=None, sgd=None, **cfg):
|
def begin_training(self, gold_tuples, pipeline=None, sgd=None, **cfg):
|
||||||
if 'model' in cfg:
|
if 'model' in cfg:
|
||||||
self.model = cfg['model']
|
self.model = cfg['model']
|
||||||
cfg.setdefault('min_action_freq', 30)
|
gold_tuples = nonproj.preprocess_training_data(gold_tuples,
|
||||||
actions = self.moves.get_actions(gold_parses=gold_tuples,
|
label_freq_cutoff=100)
|
||||||
min_freq=cfg.get('min_action_freq', 30))
|
actions = self.moves.get_actions(gold_parses=gold_tuples)
|
||||||
self.moves.initialize_actions(actions)
|
for action, labels in actions.items():
|
||||||
|
for label in labels:
|
||||||
|
self.moves.add_action(action, label)
|
||||||
cfg.setdefault('token_vector_width', 128)
|
cfg.setdefault('token_vector_width', 128)
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
cfg['pretrained_dims'] = self.vocab.vectors_length
|
cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||||
|
@ -847,7 +837,7 @@ cdef class Parser:
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
sgd = self.create_optimizer()
|
sgd = self.create_optimizer()
|
||||||
self.model[1].begin_training(
|
self.model[1].begin_training(
|
||||||
self.model[1].ops.allocate((5, cfg['token_vector_width'])))
|
self.model[1].ops.allocate((5, cfg['token_vector_width'])))
|
||||||
if pipeline is not None:
|
if pipeline is not None:
|
||||||
self.init_multitask_objectives(gold_tuples, pipeline, sgd=sgd, **cfg)
|
self.init_multitask_objectives(gold_tuples, pipeline, sgd=sgd, **cfg)
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
|
|
|
@ -9,7 +9,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc, set_children_from_heads
|
from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
|
|
||||||
DELIMITER = '||'
|
DELIMITER = '||'
|
||||||
|
@ -74,21 +74,7 @@ def decompose(label):
|
||||||
|
|
||||||
|
|
||||||
def is_decorated(label):
|
def is_decorated(label):
|
||||||
return DELIMITER in label
|
return label.find(DELIMITER) != -1
|
||||||
|
|
||||||
def count_decorated_labels(gold_tuples):
|
|
||||||
freqs = {}
|
|
||||||
for raw_text, sents in gold_tuples:
|
|
||||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
|
||||||
proj_heads, deco_labels = projectivize(heads, labels)
|
|
||||||
# set the label to ROOT for each root dependent
|
|
||||||
deco_labels = ['ROOT' if head == i else deco_labels[i]
|
|
||||||
for i, head in enumerate(proj_heads)]
|
|
||||||
# count label frequencies
|
|
||||||
for label in deco_labels:
|
|
||||||
if is_decorated(label):
|
|
||||||
freqs[label] = freqs.get(label, 0) + 1
|
|
||||||
return freqs
|
|
||||||
|
|
||||||
|
|
||||||
def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
|
def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
|
||||||
|
@ -138,9 +124,8 @@ cpdef deprojectivize(Doc doc):
|
||||||
if DELIMITER in label:
|
if DELIMITER in label:
|
||||||
new_label, head_label = label.split(DELIMITER)
|
new_label, head_label = label.split(DELIMITER)
|
||||||
new_head = _find_new_head(doc[i], head_label)
|
new_head = _find_new_head(doc[i], head_label)
|
||||||
doc.c[i].head = new_head.i - i
|
doc[i].head = new_head
|
||||||
doc.c[i].dep = doc.vocab.strings.add(new_label)
|
doc.c[i].dep = doc.vocab.strings.add(new_label)
|
||||||
set_children_from_heads(doc.c, doc.length)
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
||||||
|
@ -206,12 +191,9 @@ def _filter_labels(gold_tuples, cutoff, freqs):
|
||||||
for raw_text, sents in gold_tuples:
|
for raw_text, sents in gold_tuples:
|
||||||
filtered_sents = []
|
filtered_sents = []
|
||||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||||
filtered_labels = []
|
filtered_labels = [decompose(label)[0]
|
||||||
for label in labels:
|
if freqs.get(label, cutoff) < cutoff
|
||||||
if is_decorated(label) and freqs.get(label, 0) < cutoff:
|
else label for label in labels]
|
||||||
filtered_labels.append(decompose(label)[0])
|
|
||||||
else:
|
|
||||||
filtered_labels.append(label)
|
|
||||||
filtered_sents.append(
|
filtered_sents.append(
|
||||||
((ids, words, tags, heads, filtered_labels, iob), ctnts))
|
((ids, words, tags, heads, filtered_labels, iob), ctnts))
|
||||||
filtered.append((raw_text, filtered_sents))
|
filtered.append((raw_text, filtered_sents))
|
||||||
|
|
|
@ -42,7 +42,6 @@ cdef class TransitionSystem:
|
||||||
cdef public attr_t root_label
|
cdef public attr_t root_label
|
||||||
cdef public freqs
|
cdef public freqs
|
||||||
cdef init_state_t init_beam_state
|
cdef init_state_t init_beam_state
|
||||||
cdef public object labels
|
|
||||||
|
|
||||||
cdef int initialize_state(self, StateC* state) nogil
|
cdef int initialize_state(self, StateC* state) nogil
|
||||||
cdef int finalize_state(self, StateC* state) nogil
|
cdef int finalize_state(self, StateC* state) nogil
|
||||||
|
|
|
@ -5,7 +5,7 @@ from __future__ import unicode_literals
|
||||||
from cpython.ref cimport Py_INCREF
|
from cpython.ref cimport Py_INCREF
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
from collections import OrderedDict, Counter
|
from collections import OrderedDict
|
||||||
import ujson
|
import ujson
|
||||||
|
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
|
@ -28,7 +28,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
||||||
|
|
||||||
|
|
||||||
cdef class TransitionSystem:
|
cdef class TransitionSystem:
|
||||||
def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
|
def __init__(self, StringStore string_table, labels_by_action):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.strings = string_table
|
self.strings = string_table
|
||||||
self.n_moves = 0
|
self.n_moves = 0
|
||||||
|
@ -36,14 +36,21 @@ cdef class TransitionSystem:
|
||||||
|
|
||||||
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
|
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
|
||||||
|
|
||||||
self.labels = {}
|
for action, label_strs in labels_by_action.items():
|
||||||
if labels_by_action:
|
for label_str in label_strs:
|
||||||
self.initialize_actions(labels_by_action, min_freq=min_freq)
|
self.add_action(int(action), label_str)
|
||||||
self.root_label = self.strings.add('ROOT')
|
self.root_label = self.strings.add('ROOT')
|
||||||
self.init_beam_state = _init_state
|
self.init_beam_state = _init_state
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (self.__class__, (self.strings, self.labels), None, None)
|
labels_by_action = OrderedDict()
|
||||||
|
cdef Transition t
|
||||||
|
for trans in self.c[:self.n_moves]:
|
||||||
|
label_str = self.strings[trans.label]
|
||||||
|
labels_by_action.setdefault(trans.move, []).append(label_str)
|
||||||
|
return (self.__class__,
|
||||||
|
(self.strings, labels_by_action),
|
||||||
|
None, None)
|
||||||
|
|
||||||
def init_batch(self, docs):
|
def init_batch(self, docs):
|
||||||
cdef StateClass state
|
cdef StateClass state
|
||||||
|
@ -139,22 +146,6 @@ cdef class TransitionSystem:
|
||||||
act = self.c[clas]
|
act = self.c[clas]
|
||||||
return self.move_name(act.move, act.label)
|
return self.move_name(act.move, act.label)
|
||||||
|
|
||||||
def initialize_actions(self, labels_by_action, min_freq=None):
|
|
||||||
self.labels = {}
|
|
||||||
self.n_moves = 0
|
|
||||||
for action, label_freqs in sorted(labels_by_action.items()):
|
|
||||||
action = int(action)
|
|
||||||
# Make sure we take a copy here, and that we get a Counter
|
|
||||||
self.labels[action] = Counter()
|
|
||||||
# Have to be careful here: Sorting must be stable, or our model
|
|
||||||
# won't be read back in correctly.
|
|
||||||
sorted_labels = [(f, L) for L, f in label_freqs.items()]
|
|
||||||
sorted_labels.sort()
|
|
||||||
sorted_labels.reverse()
|
|
||||||
for freq, label_str in sorted_labels:
|
|
||||||
self.add_action(int(action), label_str)
|
|
||||||
self.labels[action][label_str] = freq
|
|
||||||
|
|
||||||
def add_action(self, int action, label_name):
|
def add_action(self, int action, label_name):
|
||||||
cdef attr_t label_id
|
cdef attr_t label_id
|
||||||
if not isinstance(label_name, int) and \
|
if not isinstance(label_name, int) and \
|
||||||
|
@ -173,14 +164,6 @@ cdef class TransitionSystem:
|
||||||
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
|
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
|
||||||
assert self.c[self.n_moves].label == label_id
|
assert self.c[self.n_moves].label == label_id
|
||||||
self.n_moves += 1
|
self.n_moves += 1
|
||||||
if self.labels.get(action, []):
|
|
||||||
new_freq = min(self.labels[action].values())
|
|
||||||
else:
|
|
||||||
self.labels[action] = Counter()
|
|
||||||
new_freq = -1
|
|
||||||
if new_freq > 0:
|
|
||||||
new_freq = 0
|
|
||||||
self.labels[action][label_name] = new_freq-1
|
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, **exclude):
|
||||||
|
@ -195,18 +178,26 @@ cdef class TransitionSystem:
|
||||||
|
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
transitions = []
|
transitions = []
|
||||||
|
for trans in self.c[:self.n_moves]:
|
||||||
|
transitions.append({
|
||||||
|
'clas': trans.clas,
|
||||||
|
'move': trans.move,
|
||||||
|
'label': self.strings[trans.label],
|
||||||
|
'name': self.move_name(trans.move, trans.label)
|
||||||
|
})
|
||||||
serializers = {
|
serializers = {
|
||||||
'moves': lambda: json_dumps(self.labels),
|
'transitions': lambda: json_dumps(transitions),
|
||||||
'strings': lambda: self.strings.to_bytes()
|
'strings': lambda: self.strings.to_bytes()
|
||||||
}
|
}
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
labels = {}
|
transitions = []
|
||||||
deserializers = {
|
deserializers = {
|
||||||
'moves': lambda b: labels.update(ujson.loads(b)),
|
'transitions': lambda b: transitions.extend(ujson.loads(b)),
|
||||||
'strings': lambda b: self.strings.from_bytes(b)
|
'strings': lambda b: self.strings.from_bytes(b)
|
||||||
}
|
}
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
self.initialize_actions(labels)
|
for trans in transitions:
|
||||||
|
self.add_action(trans['move'], trans['label'])
|
||||||
return self
|
return self
|
||||||
|
|
|
@ -19,15 +19,6 @@ def doc(en_tokenizer):
|
||||||
return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
|
return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def doc_not_parsed(en_tokenizer):
|
|
||||||
text = "This is a sentence. This is another sentence. And a third."
|
|
||||||
tokens = en_tokenizer(text)
|
|
||||||
d = get_doc(tokens.vocab, [t.text for t in tokens])
|
|
||||||
d.is_parsed = False
|
|
||||||
return d
|
|
||||||
|
|
||||||
|
|
||||||
def test_spans_sent_spans(doc):
|
def test_spans_sent_spans(doc):
|
||||||
sents = list(doc.sents)
|
sents = list(doc.sents)
|
||||||
assert sents[0].start == 0
|
assert sents[0].start == 0
|
||||||
|
@ -43,7 +34,6 @@ def test_spans_root(doc):
|
||||||
assert span.root.text == 'sentence'
|
assert span.root.text == 'sentence'
|
||||||
assert span.root.head.text == 'is'
|
assert span.root.head.text == 'is'
|
||||||
|
|
||||||
|
|
||||||
def test_spans_string_fn(doc):
|
def test_spans_string_fn(doc):
|
||||||
span = doc[0:4]
|
span = doc[0:4]
|
||||||
assert len(span) == 4
|
assert len(span) == 4
|
||||||
|
@ -51,7 +41,6 @@ def test_spans_string_fn(doc):
|
||||||
assert span.upper_ == 'THIS IS A SENTENCE'
|
assert span.upper_ == 'THIS IS A SENTENCE'
|
||||||
assert span.lower_ == 'this is a sentence'
|
assert span.lower_ == 'this is a sentence'
|
||||||
|
|
||||||
|
|
||||||
def test_spans_root2(en_tokenizer):
|
def test_spans_root2(en_tokenizer):
|
||||||
text = "through North and South Carolina"
|
text = "through North and South Carolina"
|
||||||
heads = [0, 3, -1, -2, -4]
|
heads = [0, 3, -1, -2, -4]
|
||||||
|
@ -60,17 +49,12 @@ def test_spans_root2(en_tokenizer):
|
||||||
assert doc[-2:].root.text == 'Carolina'
|
assert doc[-2:].root.text == 'Carolina'
|
||||||
|
|
||||||
|
|
||||||
def test_spans_span_sent(doc, doc_not_parsed):
|
def test_spans_span_sent(doc):
|
||||||
"""Test span.sent property"""
|
"""Test span.sent property"""
|
||||||
assert len(list(doc.sents))
|
assert len(list(doc.sents))
|
||||||
assert doc[:2].sent.root.text == 'is'
|
assert doc[:2].sent.root.text == 'is'
|
||||||
assert doc[:2].sent.text == 'This is a sentence .'
|
assert doc[:2].sent.text == 'This is a sentence .'
|
||||||
assert doc[6:7].sent.root.left_edge.text == 'This'
|
assert doc[6:7].sent.root.left_edge.text == 'This'
|
||||||
# test on manual sbd
|
|
||||||
doc_not_parsed[0].is_sent_start = True
|
|
||||||
doc_not_parsed[5].is_sent_start = True
|
|
||||||
assert doc_not_parsed[1:3].sent == doc_not_parsed[0:5]
|
|
||||||
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
|
|
||||||
|
|
||||||
|
|
||||||
def test_spans_lca_matrix(en_tokenizer):
|
def test_spans_lca_matrix(en_tokenizer):
|
||||||
|
@ -145,7 +129,7 @@ def test_span_to_array(doc):
|
||||||
assert arr[0, 1] == len(span[0])
|
assert arr[0, 1] == len(span[0])
|
||||||
|
|
||||||
|
|
||||||
#def test_span_as_doc(doc):
|
def test_span_as_doc(doc):
|
||||||
# span = doc[4:10]
|
span = doc[4:10]
|
||||||
# span_doc = span.as_doc()
|
span_doc = span.as_doc()
|
||||||
# assert span.text == span_doc.text.strip()
|
assert span.text == span_doc.text.strip()
|
||||||
|
|
36
spacy/tests/gold/test_lev_align.py
Normal file
36
spacy/tests/gold/test_lev_align.py
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
# coding: utf-8
|
||||||
|
"""Find the min-cost alignment between two tokenizations"""
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ...gold import _min_edit_path as min_edit_path
|
||||||
|
from ...gold import align
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('cand,gold,path', [
|
||||||
|
(["U.S", ".", "policy"], ["U.S.", "policy"], (0, 'MDM')),
|
||||||
|
(["U.N", ".", "policy"], ["U.S.", "policy"], (1, 'SDM')),
|
||||||
|
(["The", "cat", "sat", "down"], ["The", "cat", "sat", "down"], (0, 'MMMM')),
|
||||||
|
(["cat", "sat", "down"], ["The", "cat", "sat", "down"], (1, 'IMMM')),
|
||||||
|
(["The", "cat", "down"], ["The", "cat", "sat", "down"], (1, 'MMIM')),
|
||||||
|
(["The", "cat", "sag", "down"], ["The", "cat", "sat", "down"], (1, 'MMSM'))])
|
||||||
|
def test_gold_lev_align_edit_path(cand, gold, path):
|
||||||
|
assert min_edit_path(cand, gold) == path
|
||||||
|
|
||||||
|
|
||||||
|
def test_gold_lev_align_edit_path2():
|
||||||
|
cand = ["your", "stuff"]
|
||||||
|
gold = ["you", "r", "stuff"]
|
||||||
|
assert min_edit_path(cand, gold) in [(2, 'ISM'), (2, 'SIM')]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('cand,gold,result', [
|
||||||
|
(["U.S", ".", "policy"], ["U.S.", "policy"], [0, None, 1]),
|
||||||
|
(["your", "stuff"], ["you", "r", "stuff"], [None, 2]),
|
||||||
|
(["i", "like", "2", "guys", " ", "well", "id", "just", "come", "straight", "out"],
|
||||||
|
["i", "like", "2", "guys", "well", "i", "d", "just", "come", "straight", "out"],
|
||||||
|
[0, 1, 2, 3, None, 4, None, 7, 8, 9, 10])])
|
||||||
|
def test_gold_lev_align(cand, gold, result):
|
||||||
|
assert align(cand, gold) == result
|
|
@ -2,9 +2,9 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ....parts_of_speech import SPACE
|
from ....parts_of_speech import SPACE
|
||||||
from ....compat import unicode_
|
|
||||||
from ...util import get_doc
|
from ...util import get_doc
|
||||||
|
|
||||||
|
import six
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@ -24,8 +24,8 @@ def test_tag_names(EN):
|
||||||
text = "I ate pizzas with anchovies."
|
text = "I ate pizzas with anchovies."
|
||||||
doc = EN(text, disable=['parser'])
|
doc = EN(text, disable=['parser'])
|
||||||
assert type(doc[2].pos) == int
|
assert type(doc[2].pos) == int
|
||||||
assert isinstance(doc[2].pos_, unicode_)
|
assert isinstance(doc[2].pos_, six.text_type)
|
||||||
assert isinstance(doc[2].dep_, unicode_)
|
assert isinstance(doc[2].dep_, six.text_type)
|
||||||
assert doc[2].tag_ == u'NNS'
|
assert doc[2].tag_ == u'NNS'
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,75 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
from ...vocab import Vocab
|
|
||||||
from ...pipeline import DependencyParser
|
|
||||||
from ...tokens import Doc
|
|
||||||
from ...gold import GoldParse
|
|
||||||
from ...syntax.nonproj import projectivize
|
|
||||||
|
|
||||||
annot_tuples = [
|
|
||||||
(0, 'When', 'WRB', 11, 'advmod', 'O'),
|
|
||||||
(1, 'Walter', 'NNP', 2, 'compound', 'B-PERSON'),
|
|
||||||
(2, 'Rodgers', 'NNP', 11, 'nsubj', 'L-PERSON'),
|
|
||||||
(3, ',', ',', 2, 'punct', 'O'),
|
|
||||||
(4, 'our', 'PRP$', 6, 'poss', 'O'),
|
|
||||||
(5, 'embedded', 'VBN', 6, 'amod', 'O'),
|
|
||||||
(6, 'reporter', 'NN', 2, 'appos', 'O'),
|
|
||||||
(7, 'with', 'IN', 6, 'prep', 'O'),
|
|
||||||
(8, 'the', 'DT', 10, 'det', 'B-ORG'),
|
|
||||||
(9, '3rd', 'NNP', 10, 'compound', 'I-ORG'),
|
|
||||||
(10, 'Cavalry', 'NNP', 7, 'pobj', 'L-ORG'),
|
|
||||||
(11, 'says', 'VBZ', 44, 'advcl', 'O'),
|
|
||||||
(12, 'three', 'CD', 13, 'nummod', 'U-CARDINAL'),
|
|
||||||
(13, 'battalions', 'NNS', 16, 'nsubj', 'O'),
|
|
||||||
(14, 'of', 'IN', 13, 'prep', 'O'),
|
|
||||||
(15, 'troops', 'NNS', 14, 'pobj', 'O'),
|
|
||||||
(16, 'are', 'VBP', 11, 'ccomp', 'O'),
|
|
||||||
(17, 'on', 'IN', 16, 'prep', 'O'),
|
|
||||||
(18, 'the', 'DT', 19, 'det', 'O'),
|
|
||||||
(19, 'ground', 'NN', 17, 'pobj', 'O'),
|
|
||||||
(20, ',', ',', 17, 'punct', 'O'),
|
|
||||||
(21, 'inside', 'IN', 17, 'prep', 'O'),
|
|
||||||
(22, 'Baghdad', 'NNP', 21, 'pobj', 'U-GPE'),
|
|
||||||
(23, 'itself', 'PRP', 22, 'appos', 'O'),
|
|
||||||
(24, ',', ',', 16, 'punct', 'O'),
|
|
||||||
(25, 'have', 'VBP', 26, 'aux', 'O'),
|
|
||||||
(26, 'taken', 'VBN', 16, 'dep', 'O'),
|
|
||||||
(27, 'up', 'RP', 26, 'prt', 'O'),
|
|
||||||
(28, 'positions', 'NNS', 26, 'dobj', 'O'),
|
|
||||||
(29, 'they', 'PRP', 31, 'nsubj', 'O'),
|
|
||||||
(30, "'re", 'VBP', 31, 'aux', 'O'),
|
|
||||||
(31, 'going', 'VBG', 26, 'parataxis', 'O'),
|
|
||||||
(32, 'to', 'TO', 33, 'aux', 'O'),
|
|
||||||
(33, 'spend', 'VB', 31, 'xcomp', 'O'),
|
|
||||||
(34, 'the', 'DT', 35, 'det', 'B-TIME'),
|
|
||||||
(35, 'night', 'NN', 33, 'dobj', 'L-TIME'),
|
|
||||||
(36, 'there', 'RB', 33, 'advmod', 'O'),
|
|
||||||
(37, 'presumably', 'RB', 33, 'advmod', 'O'),
|
|
||||||
(38, ',', ',', 44, 'punct', 'O'),
|
|
||||||
(39, 'how', 'WRB', 40, 'advmod', 'O'),
|
|
||||||
(40, 'many', 'JJ', 41, 'amod', 'O'),
|
|
||||||
(41, 'soldiers', 'NNS', 44, 'pobj', 'O'),
|
|
||||||
(42, 'are', 'VBP', 44, 'aux', 'O'),
|
|
||||||
(43, 'we', 'PRP', 44, 'nsubj', 'O'),
|
|
||||||
(44, 'talking', 'VBG', 44, 'ROOT', 'O'),
|
|
||||||
(45, 'about', 'IN', 44, 'prep', 'O'),
|
|
||||||
(46, 'right', 'RB', 47, 'advmod', 'O'),
|
|
||||||
(47, 'now', 'RB', 44, 'advmod', 'O'),
|
|
||||||
(48, '?', '.', 44, 'punct', 'O')]
|
|
||||||
|
|
||||||
def test_get_oracle_actions():
|
|
||||||
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
|
|
||||||
parser = DependencyParser(doc.vocab)
|
|
||||||
parser.moves.add_action(0, '')
|
|
||||||
parser.moves.add_action(1, '')
|
|
||||||
parser.moves.add_action(1, '')
|
|
||||||
parser.moves.add_action(4, 'ROOT')
|
|
||||||
for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
|
|
||||||
if head > i:
|
|
||||||
parser.moves.add_action(2, dep)
|
|
||||||
elif head < i:
|
|
||||||
parser.moves.add_action(3, dep)
|
|
||||||
ids, words, tags, heads, deps, ents = zip(*annot_tuples)
|
|
||||||
heads, deps = projectivize(heads, deps)
|
|
||||||
gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
|
|
||||||
parser.moves.preprocess_gold(gold)
|
|
||||||
actions = parser.moves.get_oracle_sequence(doc, gold)
|
|
|
@ -13,8 +13,8 @@ from ...vocab import Vocab
|
||||||
('a b', 0, 2),
|
('a b', 0, 2),
|
||||||
('a c', 0, 1),
|
('a c', 0, 1),
|
||||||
('a b c', 0, 2),
|
('a b c', 0, 2),
|
||||||
('a b b c', 0, 3),
|
('a b b c', 0, 2),
|
||||||
('a b b', 0, 3),
|
('a b b', 0, 2),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
def test_issue1450_matcher_end_zero_plus(string, start, end):
|
def test_issue1450_matcher_end_zero_plus(string, start, end):
|
||||||
|
@ -54,6 +54,5 @@ def test_issue1450_matcher_end_zero_plus(string, start, end):
|
||||||
if start is None or end is None:
|
if start is None or end is None:
|
||||||
assert matches == []
|
assert matches == []
|
||||||
|
|
||||||
print(matches)
|
assert matches[0][1] == start
|
||||||
assert matches[-1][1] == start
|
assert matches[0][2] == end
|
||||||
assert matches[-1][2] == end
|
|
||||||
|
|
|
@ -1,65 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
import re
|
|
||||||
|
|
||||||
from ...matcher import Matcher
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
pattern1 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}]
|
|
||||||
pattern2 = [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}]
|
|
||||||
pattern3 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}]
|
|
||||||
pattern4 = [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
|
|
||||||
pattern5 = [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
|
|
||||||
|
|
||||||
re_pattern1 = 'AA*'
|
|
||||||
re_pattern2 = 'A*A'
|
|
||||||
re_pattern3 = 'AA'
|
|
||||||
re_pattern4 = 'BA*B'
|
|
||||||
re_pattern5 = 'B*A*B'
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def text():
|
|
||||||
return "(ABBAAAAAB)."
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def doc(en_tokenizer,text):
|
|
||||||
doc = en_tokenizer(' '.join(text))
|
|
||||||
return doc
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
@pytest.mark.parametrize('pattern,re_pattern',[
|
|
||||||
(pattern1,re_pattern1),
|
|
||||||
(pattern2,re_pattern2),
|
|
||||||
(pattern3,re_pattern3),
|
|
||||||
(pattern4,re_pattern4),
|
|
||||||
(pattern5,re_pattern5)])
|
|
||||||
def test_greedy_matching(doc,text,pattern,re_pattern):
|
|
||||||
"""
|
|
||||||
Test that the greedy matching behavior of the * op
|
|
||||||
is consistant with other re implementations
|
|
||||||
"""
|
|
||||||
matcher = Matcher(doc.vocab)
|
|
||||||
matcher.add(re_pattern,None,pattern)
|
|
||||||
matches = matcher(doc)
|
|
||||||
re_matches = [m.span() for m in re.finditer(re_pattern,text)]
|
|
||||||
for match,re_match in zip(matches,re_matches):
|
|
||||||
assert match[1:]==re_match
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
@pytest.mark.parametrize('pattern,re_pattern',[
|
|
||||||
(pattern1,re_pattern1),
|
|
||||||
(pattern2,re_pattern2),
|
|
||||||
(pattern3,re_pattern3),
|
|
||||||
(pattern4,re_pattern4),
|
|
||||||
(pattern5,re_pattern5)])
|
|
||||||
def test_match_consuming(doc,text,pattern,re_pattern):
|
|
||||||
"""
|
|
||||||
Test that matcher.__call__ consumes tokens on a match
|
|
||||||
similar to re.findall
|
|
||||||
"""
|
|
||||||
matcher = Matcher(doc.vocab)
|
|
||||||
matcher.add(re_pattern,None,pattern)
|
|
||||||
matches = matcher(doc)
|
|
||||||
re_matches = [m.span() for m in re.finditer(re_pattern,text)]
|
|
||||||
assert len(matches)==len(re_matches)
|
|
|
@ -1,11 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
from ...lang.lex_attrs import is_stop
|
|
||||||
from ...lang.en.stop_words import STOP_WORDS
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('word', ['the'])
|
|
||||||
def test_lex_attrs_stop_words_case_sensitivity(word):
|
|
||||||
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
|
|
|
@ -6,6 +6,7 @@ from ...vocab import Vocab
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ...matcher import Matcher
|
from ...matcher import Matcher
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
def test_issue1945():
|
def test_issue1945():
|
||||||
text = "a a a"
|
text = "a a a"
|
||||||
matcher = Matcher(Vocab())
|
matcher = Matcher(Vocab())
|
||||||
|
|
|
@ -22,9 +22,10 @@ def test_basic_case():
|
||||||
assert end == 4
|
assert end == 4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
def test_issue850():
|
def test_issue850():
|
||||||
"""The variable-length pattern matches the
|
"""The problem here is that the variable-length pattern matches the
|
||||||
succeeding token. Check we handle the ambiguity correctly."""
|
succeeding token. We then don't handle the ambiguity correctly."""
|
||||||
matcher = Matcher(Vocab(
|
matcher = Matcher(Vocab(
|
||||||
lex_attr_getters={LOWER: lambda string: string.lower()}))
|
lex_attr_getters={LOWER: lambda string: string.lower()}))
|
||||||
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
|
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
|
||||||
|
|
|
@ -1,66 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
import pytest
|
|
||||||
from .._align import align, multi_align
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('string1,string2,cost', [
|
|
||||||
('hello', 'hell', 1),
|
|
||||||
('rat', 'cat', 1),
|
|
||||||
('rat', 'rat', 0),
|
|
||||||
('rat', 'catsie', 4),
|
|
||||||
('t', 'catsie', 5),
|
|
||||||
])
|
|
||||||
def test_align_costs(string1, string2, cost):
|
|
||||||
output_cost, i2j, j2i, matrix = align(string1, string2)
|
|
||||||
assert output_cost == cost
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('string1,string2,i2j', [
|
|
||||||
('hello', 'hell', [0,1,2,3,-1]),
|
|
||||||
('rat', 'cat', [0,1,2]),
|
|
||||||
('rat', 'rat', [0,1,2]),
|
|
||||||
('rat', 'catsie', [0,1,2]),
|
|
||||||
('t', 'catsie', [2]),
|
|
||||||
])
|
|
||||||
def test_align_i2j(string1, string2, i2j):
|
|
||||||
output_cost, output_i2j, j2i, matrix = align(string1, string2)
|
|
||||||
assert list(output_i2j) == i2j
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('string1,string2,j2i', [
|
|
||||||
('hello', 'hell', [0,1,2,3]),
|
|
||||||
('rat', 'cat', [0,1,2]),
|
|
||||||
('rat', 'rat', [0,1,2]),
|
|
||||||
('rat', 'catsie', [0,1,2, -1, -1, -1]),
|
|
||||||
('t', 'catsie', [-1, -1, 0, -1, -1, -1]),
|
|
||||||
])
|
|
||||||
def test_align_i2j(string1, string2, j2i):
|
|
||||||
output_cost, output_i2j, output_j2i, matrix = align(string1, string2)
|
|
||||||
assert list(output_j2i) == j2i
|
|
||||||
|
|
||||||
def test_align_strings():
|
|
||||||
words1 = ['hello', 'this', 'is', 'test!']
|
|
||||||
words2 = ['hellothis', 'is', 'test', '!']
|
|
||||||
cost, i2j, j2i, matrix = align(words1, words2)
|
|
||||||
assert cost == 4
|
|
||||||
assert list(i2j) == [-1, -1, 1, -1]
|
|
||||||
assert list(j2i) == [-1, 2, -1, -1]
|
|
||||||
|
|
||||||
def test_align_many_to_one():
|
|
||||||
words1 = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
|
|
||||||
words2 = ['ab', 'bc', 'e', 'fg', 'h']
|
|
||||||
cost, i2j, j2i, matrix = align(words1, words2)
|
|
||||||
assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4]
|
|
||||||
lengths1 = [len(w) for w in words1]
|
|
||||||
lengths2 = [len(w) for w in words2]
|
|
||||||
i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2)
|
|
||||||
assert i2j_multi[0] == 0
|
|
||||||
assert i2j_multi[1] == 0
|
|
||||||
assert i2j_multi[2] == 1
|
|
||||||
assert i2j_multi[3] == 1
|
|
||||||
assert i2j_multi[3] == 1
|
|
||||||
assert i2j_multi[5] == 3
|
|
||||||
assert i2j_multi[6] == 3
|
|
||||||
|
|
||||||
assert j2i_multi[0] == 1
|
|
||||||
assert j2i_multi[1] == 3
|
|
|
@ -3,17 +3,12 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..matcher import Matcher, PhraseMatcher
|
from ..matcher import Matcher, PhraseMatcher
|
||||||
from .util import get_doc
|
from .util import get_doc
|
||||||
from ..util import get_lang_class
|
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def en_vocab():
|
|
||||||
return get_lang_class('en').Defaults.create_vocab()
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
@pytest.fixture(scope="session")
|
|
||||||
def matcher(en_vocab):
|
def matcher(en_vocab):
|
||||||
rules = {
|
rules = {
|
||||||
'JS': [[{'ORTH': 'JavaScript'}]],
|
'JS': [[{'ORTH': 'JavaScript'}]],
|
||||||
|
@ -26,196 +21,187 @@ def matcher(en_vocab):
|
||||||
return matcher
|
return matcher
|
||||||
|
|
||||||
|
|
||||||
#def test_matcher_from_api_docs(en_vocab):
|
def test_matcher_from_api_docs(en_vocab):
|
||||||
# matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
# pattern = [{'ORTH': 'test'}]
|
pattern = [{'ORTH': 'test'}]
|
||||||
# assert len(matcher) == 0
|
assert len(matcher) == 0
|
||||||
# matcher.add('Rule', None, pattern)
|
matcher.add('Rule', None, pattern)
|
||||||
# assert len(matcher) == 1
|
assert len(matcher) == 1
|
||||||
# matcher.remove('Rule')
|
matcher.remove('Rule')
|
||||||
# assert 'Rule' not in matcher
|
assert 'Rule' not in matcher
|
||||||
# matcher.add('Rule', None, pattern)
|
matcher.add('Rule', None, pattern)
|
||||||
# assert 'Rule' in matcher
|
assert 'Rule' in matcher
|
||||||
# on_match, patterns = matcher.get('Rule')
|
on_match, patterns = matcher.get('Rule')
|
||||||
# assert len(patterns[0])
|
assert len(patterns[0])
|
||||||
#
|
|
||||||
#
|
|
||||||
#def test_matcher_from_usage_docs(en_vocab):
|
|
||||||
# text = "Wow 😀 This is really cool! 😂 😂"
|
|
||||||
# doc = get_doc(en_vocab, words=text.split(' '))
|
|
||||||
# pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']
|
|
||||||
# pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
|
|
||||||
#
|
|
||||||
# def label_sentiment(matcher, doc, i, matches):
|
|
||||||
# match_id, start, end = matches[i]
|
|
||||||
# if doc.vocab.strings[match_id] == 'HAPPY':
|
|
||||||
# doc.sentiment += 0.1
|
|
||||||
# span = doc[start : end]
|
|
||||||
# token = span.merge()
|
|
||||||
# token.vocab[token.text].norm_ = 'happy emoji'
|
|
||||||
#
|
|
||||||
# matcher = Matcher(en_vocab)
|
|
||||||
# matcher.add('HAPPY', label_sentiment, *pos_patterns)
|
|
||||||
# matches = matcher(doc)
|
|
||||||
# assert doc.sentiment != 0
|
|
||||||
# assert doc[1].norm_ == 'happy emoji'
|
|
||||||
|
|
||||||
|
|
||||||
#@pytest.mark.parametrize('words', [["Some", "words"]])
|
def test_matcher_from_usage_docs(en_vocab):
|
||||||
#def test_matcher_init(en_vocab, words):
|
text = "Wow 😀 This is really cool! 😂 😂"
|
||||||
# matcher = Matcher(en_vocab)
|
doc = get_doc(en_vocab, words=text.split(' '))
|
||||||
# doc = get_doc(en_vocab, words)
|
pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']
|
||||||
# assert len(matcher) == 0
|
pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
|
||||||
# assert matcher(doc) == []
|
|
||||||
#
|
def label_sentiment(matcher, doc, i, matches):
|
||||||
#
|
match_id, start, end = matches[i]
|
||||||
#def test_matcher_contains(matcher):
|
if doc.vocab.strings[match_id] == 'HAPPY':
|
||||||
# matcher.add('TEST', None, [{'ORTH': 'test'}])
|
doc.sentiment += 0.1
|
||||||
# assert 'TEST' in matcher
|
span = doc[start : end]
|
||||||
# assert 'TEST2' not in matcher
|
token = span.merge()
|
||||||
#
|
token.vocab[token.text].norm_ = 'happy emoji'
|
||||||
#
|
|
||||||
#def test_matcher_no_match(matcher):
|
matcher = Matcher(en_vocab)
|
||||||
# words = ["I", "like", "cheese", "."]
|
matcher.add('HAPPY', label_sentiment, *pos_patterns)
|
||||||
# doc = get_doc(matcher.vocab, words)
|
matches = matcher(doc)
|
||||||
# assert matcher(doc) == []
|
assert doc.sentiment != 0
|
||||||
#
|
assert doc[1].norm_ == 'happy emoji'
|
||||||
#
|
|
||||||
#def test_matcher_compile(en_vocab):
|
|
||||||
# rules = {
|
@pytest.mark.parametrize('words', [["Some", "words"]])
|
||||||
# 'JS': [[{'ORTH': 'JavaScript'}]],
|
def test_matcher_init(en_vocab, words):
|
||||||
# 'GoogleNow': [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]],
|
matcher = Matcher(en_vocab)
|
||||||
# 'Java': [[{'LOWER': 'java'}]]
|
doc = get_doc(en_vocab, words)
|
||||||
# }
|
assert len(matcher) == 0
|
||||||
# matcher = Matcher(en_vocab)
|
assert matcher(doc) == []
|
||||||
# for key, patterns in rules.items():
|
|
||||||
# matcher.add(key, None, *patterns)
|
|
||||||
# assert len(matcher) == 3
|
def test_matcher_contains(matcher):
|
||||||
#
|
matcher.add('TEST', None, [{'ORTH': 'test'}])
|
||||||
#
|
assert 'TEST' in matcher
|
||||||
#def test_matcher_match_start(matcher):
|
assert 'TEST2' not in matcher
|
||||||
# words = ["JavaScript", "is", "good"]
|
|
||||||
# doc = get_doc(matcher.vocab, words)
|
|
||||||
# assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)]
|
def test_matcher_no_match(matcher):
|
||||||
#
|
words = ["I", "like", "cheese", "."]
|
||||||
#
|
doc = get_doc(matcher.vocab, words)
|
||||||
#def test_matcher_match_end(matcher):
|
assert matcher(doc) == []
|
||||||
# words = ["I", "like", "java"]
|
|
||||||
# doc = get_doc(matcher.vocab, words)
|
|
||||||
# assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)]
|
def test_matcher_compile(matcher):
|
||||||
#
|
assert len(matcher) == 3
|
||||||
#
|
|
||||||
#def test_matcher_match_middle(matcher):
|
|
||||||
# words = ["I", "like", "Google", "Now", "best"]
|
def test_matcher_match_start(matcher):
|
||||||
# doc = get_doc(matcher.vocab, words)
|
words = ["JavaScript", "is", "good"]
|
||||||
# assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)]
|
doc = get_doc(matcher.vocab, words)
|
||||||
#
|
assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)]
|
||||||
#
|
|
||||||
#def test_matcher_match_multi(matcher):
|
|
||||||
# words = ["I", "like", "Google", "Now", "and", "java", "best"]
|
def test_matcher_match_end(matcher):
|
||||||
# doc = get_doc(matcher.vocab, words)
|
words = ["I", "like", "java"]
|
||||||
# assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4),
|
doc = get_doc(matcher.vocab, words)
|
||||||
# (doc.vocab.strings['Java'], 5, 6)]
|
assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)]
|
||||||
#
|
|
||||||
#
|
|
||||||
#def test_matcher_empty_dict(en_vocab):
|
def test_matcher_match_middle(matcher):
|
||||||
# '''Test matcher allows empty token specs, meaning match on any token.'''
|
words = ["I", "like", "Google", "Now", "best"]
|
||||||
# matcher = Matcher(en_vocab)
|
doc = get_doc(matcher.vocab, words)
|
||||||
# abc = ["a", "b", "c"]
|
assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)]
|
||||||
# doc = get_doc(matcher.vocab, abc)
|
|
||||||
# matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
|
|
||||||
# matches = matcher(doc)
|
def test_matcher_match_multi(matcher):
|
||||||
# assert len(matches) == 1
|
words = ["I", "like", "Google", "Now", "and", "java", "best"]
|
||||||
# assert matches[0][1:] == (0, 3)
|
doc = get_doc(matcher.vocab, words)
|
||||||
# matcher = Matcher(en_vocab)
|
assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4),
|
||||||
# matcher.add('A.', None, [{'ORTH': 'a'}, {}])
|
(doc.vocab.strings['Java'], 5, 6)]
|
||||||
# matches = matcher(doc)
|
|
||||||
# assert matches[0][1:] == (0, 2)
|
|
||||||
#
|
def test_matcher_empty_dict(en_vocab):
|
||||||
#
|
'''Test matcher allows empty token specs, meaning match on any token.'''
|
||||||
#def test_matcher_operator_shadow(en_vocab):
|
matcher = Matcher(en_vocab)
|
||||||
# matcher = Matcher(en_vocab)
|
abc = ["a", "b", "c"]
|
||||||
# abc = ["a", "b", "c"]
|
doc = get_doc(matcher.vocab, abc)
|
||||||
# doc = get_doc(matcher.vocab, abc)
|
matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
|
||||||
# matcher.add('A.C', None, [{'ORTH': 'a'},
|
matches = matcher(doc)
|
||||||
# {"IS_ALPHA": True, "OP": "+"},
|
assert len(matches) == 1
|
||||||
# {'ORTH': 'c'}])
|
assert matches[0][1:] == (0, 3)
|
||||||
# matches = matcher(doc)
|
matcher = Matcher(en_vocab)
|
||||||
# assert len(matches) == 1
|
matcher.add('A.', None, [{'ORTH': 'a'}, {}])
|
||||||
# assert matches[0][1:] == (0, 3)
|
matches = matcher(doc)
|
||||||
#
|
assert matches[0][1:] == (0, 2)
|
||||||
#
|
|
||||||
#def test_matcher_phrase_matcher(en_vocab):
|
|
||||||
# words = ["Google", "Now"]
|
def test_matcher_operator_shadow(en_vocab):
|
||||||
# doc = get_doc(en_vocab, words)
|
matcher = Matcher(en_vocab)
|
||||||
# matcher = PhraseMatcher(en_vocab)
|
abc = ["a", "b", "c"]
|
||||||
# matcher.add('COMPANY', None, doc)
|
doc = get_doc(matcher.vocab, abc)
|
||||||
# words = ["I", "like", "Google", "Now", "best"]
|
matcher.add('A.C', None, [{'ORTH': 'a'},
|
||||||
# doc = get_doc(en_vocab, words)
|
{"IS_ALPHA": True, "OP": "+"},
|
||||||
# assert len(matcher(doc)) == 1
|
{'ORTH': 'c'}])
|
||||||
#
|
matches = matcher(doc)
|
||||||
#
|
assert len(matches) == 1
|
||||||
#def test_phrase_matcher_length(en_vocab):
|
assert matches[0][1:] == (0, 3)
|
||||||
# matcher = PhraseMatcher(en_vocab)
|
|
||||||
# assert len(matcher) == 0
|
|
||||||
# matcher.add('TEST', None, get_doc(en_vocab, ['test']))
|
def test_matcher_phrase_matcher(en_vocab):
|
||||||
# assert len(matcher) == 1
|
words = ["Google", "Now"]
|
||||||
# matcher.add('TEST2', None, get_doc(en_vocab, ['test2']))
|
doc = get_doc(en_vocab, words)
|
||||||
# assert len(matcher) == 2
|
matcher = PhraseMatcher(en_vocab)
|
||||||
#
|
matcher.add('COMPANY', None, doc)
|
||||||
#
|
words = ["I", "like", "Google", "Now", "best"]
|
||||||
#def test_phrase_matcher_contains(en_vocab):
|
doc = get_doc(en_vocab, words)
|
||||||
# matcher = PhraseMatcher(en_vocab)
|
assert len(matcher(doc)) == 1
|
||||||
# matcher.add('TEST', None, get_doc(en_vocab, ['test']))
|
|
||||||
# assert 'TEST' in matcher
|
|
||||||
# assert 'TEST2' not in matcher
|
def test_phrase_matcher_length(en_vocab):
|
||||||
#
|
matcher = PhraseMatcher(en_vocab)
|
||||||
#
|
assert len(matcher) == 0
|
||||||
#def test_matcher_match_zero(matcher):
|
matcher.add('TEST', None, get_doc(en_vocab, ['test']))
|
||||||
# words1 = 'He said , " some words " ...'.split()
|
assert len(matcher) == 1
|
||||||
# words2 = 'He said , " some three words " ...'.split()
|
matcher.add('TEST2', None, get_doc(en_vocab, ['test2']))
|
||||||
# pattern1 = [{'ORTH': '"'},
|
assert len(matcher) == 2
|
||||||
# {'OP': '!', 'IS_PUNCT': True},
|
|
||||||
# {'OP': '!', 'IS_PUNCT': True},
|
|
||||||
# {'ORTH': '"'}]
|
def test_phrase_matcher_contains(en_vocab):
|
||||||
# pattern2 = [{'ORTH': '"'},
|
matcher = PhraseMatcher(en_vocab)
|
||||||
# {'IS_PUNCT': True},
|
matcher.add('TEST', None, get_doc(en_vocab, ['test']))
|
||||||
# {'IS_PUNCT': True},
|
assert 'TEST' in matcher
|
||||||
# {'IS_PUNCT': True},
|
assert 'TEST2' not in matcher
|
||||||
# {'ORTH': '"'}]
|
|
||||||
#
|
|
||||||
# matcher.add('Quote', None, pattern1)
|
def test_matcher_match_zero(matcher):
|
||||||
# doc = get_doc(matcher.vocab, words1)
|
words1 = 'He said , " some words " ...'.split()
|
||||||
# assert len(matcher(doc)) == 1
|
words2 = 'He said , " some three words " ...'.split()
|
||||||
#
|
pattern1 = [{'ORTH': '"'},
|
||||||
# doc = get_doc(matcher.vocab, words2)
|
{'OP': '!', 'IS_PUNCT': True},
|
||||||
# assert len(matcher(doc)) == 0
|
{'OP': '!', 'IS_PUNCT': True},
|
||||||
# matcher.add('Quote', None, pattern2)
|
{'ORTH': '"'}]
|
||||||
# assert len(matcher(doc)) == 0
|
pattern2 = [{'ORTH': '"'},
|
||||||
#
|
{'IS_PUNCT': True},
|
||||||
#
|
{'IS_PUNCT': True},
|
||||||
#def test_matcher_match_zero_plus(matcher):
|
{'IS_PUNCT': True},
|
||||||
# words = 'He said , " some words " ...'.split()
|
{'ORTH': '"'}]
|
||||||
# pattern = [{'ORTH': '"'},
|
|
||||||
# {'OP': '*', 'IS_PUNCT': False},
|
matcher.add('Quote', None, pattern1)
|
||||||
# {'ORTH': '"'}]
|
doc = get_doc(matcher.vocab, words1)
|
||||||
# matcher = Matcher(matcher.vocab)
|
assert len(matcher(doc)) == 1
|
||||||
# matcher.add('Quote', None, pattern)
|
|
||||||
# doc = get_doc(matcher.vocab, words)
|
doc = get_doc(matcher.vocab, words2)
|
||||||
# assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 0
|
||||||
#
|
matcher.add('Quote', None, pattern2)
|
||||||
#
|
assert len(matcher(doc)) == 0
|
||||||
#def test_matcher_match_one_plus(matcher):
|
|
||||||
# control = Matcher(matcher.vocab)
|
|
||||||
# control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
|
def test_matcher_match_zero_plus(matcher):
|
||||||
# doc = get_doc(control.vocab, ['Philippe', 'Philippe'])
|
words = 'He said , " some words " ...'.split()
|
||||||
# m = control(doc)
|
pattern = [{'ORTH': '"'},
|
||||||
# assert len(m) == 2
|
{'OP': '*', 'IS_PUNCT': False},
|
||||||
# matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
|
{'ORTH': '"'}]
|
||||||
# {'ORTH': 'Philippe', 'OP': '+'}])
|
matcher.add('Quote', None, pattern)
|
||||||
# m = matcher(doc)
|
doc = get_doc(matcher.vocab, words)
|
||||||
# assert len(m) == 1
|
assert len(matcher(doc)) == 1
|
||||||
#
|
|
||||||
|
|
||||||
|
def test_matcher_match_one_plus(matcher):
|
||||||
|
control = Matcher(matcher.vocab)
|
||||||
|
control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
|
||||||
|
doc = get_doc(control.vocab, ['Philippe', 'Philippe'])
|
||||||
|
m = control(doc)
|
||||||
|
assert len(m) == 2
|
||||||
|
matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
|
||||||
|
{'ORTH': 'Philippe', 'OP': '+'}])
|
||||||
|
m = matcher(doc)
|
||||||
|
assert len(m) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_operator_combos(matcher):
|
def test_operator_combos(matcher):
|
||||||
cases = [
|
cases = [
|
||||||
|
@ -266,8 +252,9 @@ def test_matcher_end_zero_plus(matcher):
|
||||||
)
|
)
|
||||||
nlp = lambda string: Doc(matcher.vocab, words=string.split())
|
nlp = lambda string: Doc(matcher.vocab, words=string.split())
|
||||||
assert len(matcher(nlp(u'a'))) == 1
|
assert len(matcher(nlp(u'a'))) == 1
|
||||||
assert len(matcher(nlp(u'a b'))) == 2
|
assert len(matcher(nlp(u'a b'))) == 1
|
||||||
|
assert len(matcher(nlp(u'a b'))) == 1
|
||||||
assert len(matcher(nlp(u'a c'))) == 1
|
assert len(matcher(nlp(u'a c'))) == 1
|
||||||
assert len(matcher(nlp(u'a b c'))) == 2
|
assert len(matcher(nlp(u'a b c'))) == 1
|
||||||
assert len(matcher(nlp(u'a b b c'))) == 3
|
assert len(matcher(nlp(u'a b b c'))) == 1
|
||||||
assert len(matcher(nlp(u'a b b'))) == 3
|
assert len(matcher(nlp(u'a b b'))) == 1
|
||||||
|
|
|
@ -1,44 +0,0 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
import random
|
|
||||||
import numpy.random
|
|
||||||
|
|
||||||
from ..pipeline import TextCategorizer
|
|
||||||
from ..lang.en import English
|
|
||||||
from ..vocab import Vocab
|
|
||||||
from ..tokens import Doc
|
|
||||||
from ..gold import GoldParse
|
|
||||||
|
|
||||||
|
|
||||||
def test_textcat_learns_multilabel():
|
|
||||||
random.seed(0)
|
|
||||||
numpy.random.seed(0)
|
|
||||||
docs = []
|
|
||||||
nlp = English()
|
|
||||||
vocab = nlp.vocab
|
|
||||||
letters = ['a', 'b', 'c']
|
|
||||||
for w1 in letters:
|
|
||||||
for w2 in letters:
|
|
||||||
cats = {letter: float(w2==letter) for letter in letters}
|
|
||||||
docs.append((Doc(vocab, words=['d']*3 + [w1, w2] + ['d']*3), cats))
|
|
||||||
random.shuffle(docs)
|
|
||||||
model = TextCategorizer(vocab, width=8)
|
|
||||||
for letter in letters:
|
|
||||||
model.add_label(letter)
|
|
||||||
optimizer = model.begin_training()
|
|
||||||
for i in range(30):
|
|
||||||
losses = {}
|
|
||||||
Ys = [GoldParse(doc, cats=cats) for doc, cats in docs]
|
|
||||||
Xs = [doc for doc, cats in docs]
|
|
||||||
model.update(Xs, Ys, sgd=optimizer, losses=losses)
|
|
||||||
random.shuffle(docs)
|
|
||||||
for w1 in letters:
|
|
||||||
for w2 in letters:
|
|
||||||
doc = Doc(vocab, words=['d']*3 + [w1, w2] + ['d']*3)
|
|
||||||
truth = {letter: w2==letter for letter in letters}
|
|
||||||
model(doc)
|
|
||||||
for cat, score in doc.cats.items():
|
|
||||||
if not truth[cat]:
|
|
||||||
assert score < 0.5
|
|
||||||
else:
|
|
||||||
assert score > 0.5
|
|
||||||
|
|
|
@ -19,9 +19,6 @@ ctypedef fused LexemeOrToken:
|
||||||
const_TokenC_ptr
|
const_TokenC_ptr
|
||||||
|
|
||||||
|
|
||||||
cdef int set_children_from_heads(TokenC* tokens, int length) except -1
|
|
||||||
|
|
||||||
|
|
||||||
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2
|
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -186,20 +186,6 @@ cdef class Doc:
|
||||||
def _(self):
|
def _(self):
|
||||||
return Underscore(Underscore.doc_extensions, self)
|
return Underscore(Underscore.doc_extensions, self)
|
||||||
|
|
||||||
@property
|
|
||||||
def is_sentenced(self):
|
|
||||||
# Check if the document has sentence boundaries,
|
|
||||||
# i.e at least one tok has the sent_start in (-1, 1)
|
|
||||||
if 'sents' in self.user_hooks:
|
|
||||||
return True
|
|
||||||
if self.is_parsed:
|
|
||||||
return True
|
|
||||||
for i in range(self.length):
|
|
||||||
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def __getitem__(self, object i):
|
def __getitem__(self, object i):
|
||||||
"""Get a `Token` or `Span` object.
|
"""Get a `Token` or `Span` object.
|
||||||
|
|
||||||
|
@ -531,23 +517,29 @@ cdef class Doc:
|
||||||
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
|
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if not self.is_sentenced:
|
|
||||||
raise ValueError(
|
|
||||||
"Sentence boundaries unset. You can add the 'sentencizer' "
|
|
||||||
"component to the pipeline with: "
|
|
||||||
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
|
|
||||||
"Alternatively, add the dependency parser, or set "
|
|
||||||
"sentence boundaries by setting doc[i].sent_start")
|
|
||||||
if 'sents' in self.user_hooks:
|
if 'sents' in self.user_hooks:
|
||||||
yield from self.user_hooks['sents'](self)
|
yield from self.user_hooks['sents'](self)
|
||||||
else:
|
return
|
||||||
start = 0
|
|
||||||
|
cdef int i
|
||||||
|
if not self.is_parsed:
|
||||||
for i in range(1, self.length):
|
for i in range(1, self.length):
|
||||||
if self.c[i].sent_start == 1:
|
if self.c[i].sent_start != 0:
|
||||||
yield Span(self, start, i)
|
break
|
||||||
start = i
|
else:
|
||||||
if start != self.length:
|
raise ValueError(
|
||||||
yield Span(self, start, self.length)
|
"Sentence boundaries unset. You can add the 'sentencizer' "
|
||||||
|
"component to the pipeline with: "
|
||||||
|
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
|
||||||
|
"Alternatively, add the dependency parser, or set "
|
||||||
|
"sentence boundaries by setting doc[i].sent_start")
|
||||||
|
start = 0
|
||||||
|
for i in range(1, self.length):
|
||||||
|
if self.c[i].sent_start == 1:
|
||||||
|
yield Span(self, start, i)
|
||||||
|
start = i
|
||||||
|
if start != self.length:
|
||||||
|
yield Span(self, start, self.length)
|
||||||
|
|
||||||
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
|
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
|
||||||
if self.length == 0:
|
if self.length == 0:
|
||||||
|
|
|
@ -285,42 +285,16 @@ cdef class Span:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'sent' in self.doc.user_span_hooks:
|
if 'sent' in self.doc.user_span_hooks:
|
||||||
return self.doc.user_span_hooks['sent'](self)
|
return self.doc.user_span_hooks['sent'](self)
|
||||||
# This should raise if we're not parsed
|
# This should raise if we're not parsed.
|
||||||
# or doesen't have any sbd component :)
|
|
||||||
self.doc.sents
|
self.doc.sents
|
||||||
# if doc is parsed we can use the deps to find the sentence
|
|
||||||
# otherwise we use the `sent_start` token attribute
|
|
||||||
cdef int n = 0
|
cdef int n = 0
|
||||||
cdef int i
|
root = &self.doc.c[self.start]
|
||||||
if self.doc.is_parsed:
|
while root.head != 0:
|
||||||
root = &self.doc.c[self.start]
|
root += root.head
|
||||||
n = 0
|
n += 1
|
||||||
while root.head != 0:
|
if n >= self.doc.length:
|
||||||
root += root.head
|
raise RuntimeError
|
||||||
n += 1
|
return self.doc[root.l_edge:root.r_edge + 1]
|
||||||
if n >= self.doc.length:
|
|
||||||
raise RuntimeError
|
|
||||||
return self.doc[root.l_edge:root.r_edge + 1]
|
|
||||||
elif self.doc.is_sentenced:
|
|
||||||
# find start of the sentence
|
|
||||||
start = self.start
|
|
||||||
while self.doc.c[start].sent_start != 1 and start > 0:
|
|
||||||
start += -1
|
|
||||||
# find end of the sentence
|
|
||||||
end = self.end
|
|
||||||
n = 0
|
|
||||||
while end < self.doc.length and self.doc.c[end].sent_start != 1:
|
|
||||||
end += 1
|
|
||||||
n += 1
|
|
||||||
if n >= self.doc.length:
|
|
||||||
break
|
|
||||||
#
|
|
||||||
return self.doc[start:end]
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
"Access to sentence requires either the dependency parse "
|
|
||||||
"or sentence boundaries to be set by setting " +
|
|
||||||
"doc[i].is_sent_start = True")
|
|
||||||
|
|
||||||
property has_vector:
|
property has_vector:
|
||||||
"""RETURNS (bool): Whether a word vector is associated with the object.
|
"""RETURNS (bool): Whether a word vector is associated with the object.
|
||||||
|
|
|
@ -34,11 +34,11 @@ cdef class Token:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_extension(cls, name):
|
def get_extension(cls, name):
|
||||||
return Underscore.span_extensions.get(name)
|
return Underscore.token_extensions.get(name)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def has_extension(cls, name):
|
def has_extension(cls, name):
|
||||||
return name in Underscore.span_extensions
|
return name in Underscore.token_extensions
|
||||||
|
|
||||||
def __cinit__(self, Vocab vocab, Doc doc, int offset):
|
def __cinit__(self, Vocab vocab, Doc doc, int offset):
|
||||||
"""Construct a `Token` object.
|
"""Construct a `Token` object.
|
||||||
|
|
|
@ -436,29 +436,6 @@ def decaying(start, stop, decay):
|
||||||
nr_upd += 1
|
nr_upd += 1
|
||||||
|
|
||||||
|
|
||||||
def minibatch_by_words(items, size, count_words=len):
|
|
||||||
'''Create minibatches of a given number of words.'''
|
|
||||||
if isinstance(size, int):
|
|
||||||
size_ = itertools.repeat(size)
|
|
||||||
else:
|
|
||||||
size_ = size
|
|
||||||
items = iter(items)
|
|
||||||
while True:
|
|
||||||
batch_size = next(size_)
|
|
||||||
batch = []
|
|
||||||
while batch_size >= 0:
|
|
||||||
try:
|
|
||||||
doc, gold = next(items)
|
|
||||||
except StopIteration:
|
|
||||||
if batch:
|
|
||||||
yield batch
|
|
||||||
return
|
|
||||||
batch_size -= count_words(doc)
|
|
||||||
batch.append((doc, gold))
|
|
||||||
if batch:
|
|
||||||
yield batch
|
|
||||||
|
|
||||||
|
|
||||||
def itershuffle(iterable, bufsize=1000):
|
def itershuffle(iterable, bufsize=1000):
|
||||||
"""Shuffle an iterator. This works by holding `bufsize` items back
|
"""Shuffle an iterator. This works by holding `bufsize` items back
|
||||||
and yielding them sometime later. Obviously, this is not unbiased –
|
and yielding them sometime later. Obviously, this is not unbiased –
|
||||||
|
@ -474,7 +451,7 @@ def itershuffle(iterable, bufsize=1000):
|
||||||
try:
|
try:
|
||||||
while True:
|
while True:
|
||||||
for i in range(random.randint(1, bufsize-len(buf))):
|
for i in range(random.randint(1, bufsize-len(buf))):
|
||||||
buf.append(next(iterable))
|
buf.append(iterable.next())
|
||||||
random.shuffle(buf)
|
random.shuffle(buf)
|
||||||
for i in range(random.randint(1, bufsize)):
|
for i in range(random.randint(1, bufsize)):
|
||||||
if buf:
|
if buf:
|
||||||
|
|
|
@ -120,6 +120,9 @@ include ../_includes/_mixins
|
||||||
| A Practical Real-World Approach to Gaining Actionable Insights
|
| A Practical Real-World Approach to Gaining Actionable Insights
|
||||||
| from your Data
|
| from your Data
|
||||||
|
|
||||||
|
+card("Practical Machine Learning with Python", "", "Dipanjan Sarkar et al. (Apress, 2017)", "book")
|
||||||
|
| A Problem-Solver's Guide to Building Real-World Intelligent Systems
|
||||||
|
|
||||||
+section("notebooks")
|
+section("notebooks")
|
||||||
+h(2, "notebooks") Jupyter notebooks
|
+h(2, "notebooks") Jupyter notebooks
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user