Revert "Merge branch 'develop' of https://github.com/explosion/spaCy into develop"
This reverts commitc9ba3d3c2d
, reversing changes made to92c26a35d4
.
This commit is contained in:
parent
8b7a74570f
commit
1f7229f40f
|
@@ -32,7 +32,7 @@ test_script:
|
|||
# Note that you must use the environment variable %PYTHON% to refer to
|
||||
# the interpreter you're using - Appveyor does not do anything special
|
||||
# to put the Python version you want to use on PATH.
|
||||
- "%PYTHON%\\python.exe -m pytest spacy/"
|
||||
- "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs"
|
||||
|
||||
after_test:
|
||||
# This step builds your wheels.
|
||||
|
|
.buildkite/train.yml (new file, 11 lines)
|
@@ -0,0 +1,11 @@
|
|||
steps:
|
||||
-
|
||||
command: "fab env clean make test wheel"
|
||||
label: ":dizzy: :python:"
|
||||
artifact_paths: "dist/*.whl"
|
||||
- wait
|
||||
- trigger: "spacy-train-from-wheel"
|
||||
label: ":dizzy: :train:"
|
||||
build:
|
||||
env:
|
||||
SPACY_VERSION: "{$SPACY_VERSION}"
|
|
@@ -182,7 +182,7 @@ If you've made a contribution to spaCy, you should fill in the
|
|||
[spaCy contributor agreement](.github/CONTRIBUTOR_AGREEMENT.md) to ensure that
|
||||
your contribution can be used across the project. If you agree to be bound by
|
||||
the terms of the agreement, fill in the [template](.github/CONTRIBUTOR_AGREEMENT.md)
|
||||
and include it with your pull request, or sumit it separately to
|
||||
and include it with your pull request, or submit it separately to
|
||||
[`.github/contributors/`](/.github/contributors). The name of the file should be
|
||||
your GitHub username, with the extension `.md`. For example, the user
|
||||
example_user would create the file `.github/contributors/example_user.md`.
|
||||
|
|
examples/training/conllu.py (new file, 392 lines)
|
@@ -0,0 +1,392 @@
|
|||
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
||||
.conllu format for development data, allowing the official scorer to be used.
|
||||
'''
|
||||
from __future__ import unicode_literals
|
||||
import plac
|
||||
import tqdm
|
||||
import attr
|
||||
from pathlib import Path
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
|
||||
import spacy
|
||||
import spacy.util
|
||||
from spacy.tokens import Token, Doc
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.syntax.nonproj import projectivize
|
||||
from collections import defaultdict, Counter
|
||||
from timeit import default_timer as timer
|
||||
from spacy.matcher import Matcher
|
||||
|
||||
import itertools
|
||||
import random
|
||||
import numpy.random
|
||||
import cytoolz
|
||||
|
||||
import conll17_ud_eval
|
||||
|
||||
import spacy.lang.zh
|
||||
import spacy.lang.ja
|
||||
|
||||
spacy.lang.zh.Chinese.Defaults.use_jieba = False
|
||||
spacy.lang.ja.Japanese.Defaults.use_janome = False
|
||||
|
||||
random.seed(0)
|
||||
numpy.random.seed(0)
|
||||
|
||||
def minibatch_by_words(items, size=5000):
|
||||
random.shuffle(items)
|
||||
if isinstance(size, int):
|
||||
size_ = itertools.repeat(size)
|
||||
else:
|
||||
size_ = size
|
||||
items = iter(items)
|
||||
while True:
|
||||
batch_size = next(size_)
|
||||
batch = []
|
||||
while batch_size >= 0:
|
||||
try:
|
||||
doc, gold = next(items)
|
||||
except StopIteration:
|
||||
if batch:
|
||||
yield batch
|
||||
return
|
||||
batch_size -= len(doc)
|
||||
batch.append((doc, gold))
|
||||
if batch:
|
||||
yield batch
|
||||
else:
|
||||
break
|
||||
|
||||
################
|
||||
# Data reading #
|
||||
################
|
||||
|
||||
space_re = re.compile('\s+')
|
||||
def split_text(text):
|
||||
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
|
||||
|
||||
|
||||
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
|
||||
max_doc_length=None, limit=None):
|
||||
'''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
|
||||
include Doc objects created using nlp.make_doc and then aligned against
|
||||
the gold-standard sequences. If oracle_segments=True, include Doc objects
|
||||
created from the gold-standard segments. At least one must be True.'''
|
||||
if not raw_text and not oracle_segments:
|
||||
raise ValueError("At least one of raw_text or oracle_segments must be True")
|
||||
paragraphs = split_text(text_file.read())
|
||||
conllu = read_conllu(conllu_file)
|
||||
# sd is spacy doc; cd is conllu doc
|
||||
# cs is conllu sent, ct is conllu token
|
||||
docs = []
|
||||
golds = []
|
||||
for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)):
|
||||
sent_annots = []
|
||||
for cs in cd:
|
||||
sent = defaultdict(list)
|
||||
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
|
||||
if '.' in id_:
|
||||
continue
|
||||
if '-' in id_:
|
||||
continue
|
||||
id_ = int(id_)-1
|
||||
head = int(head)-1 if head != '0' else id_
|
||||
sent['words'].append(word)
|
||||
sent['tags'].append(tag)
|
||||
sent['heads'].append(head)
|
||||
sent['deps'].append('ROOT' if dep == 'root' else dep)
|
||||
sent['spaces'].append(space_after == '_')
|
||||
sent['entities'] = ['-'] * len(sent['words'])
|
||||
sent['heads'], sent['deps'] = projectivize(sent['heads'],
|
||||
sent['deps'])
|
||||
if oracle_segments:
|
||||
docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
|
||||
golds.append(GoldParse(docs[-1], **sent))
|
||||
|
||||
sent_annots.append(sent)
|
||||
if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
|
||||
doc, gold = _make_gold(nlp, None, sent_annots)
|
||||
sent_annots = []
|
||||
docs.append(doc)
|
||||
golds.append(gold)
|
||||
if limit and len(docs) >= limit:
|
||||
return docs, golds
|
||||
|
||||
if raw_text and sent_annots:
|
||||
doc, gold = _make_gold(nlp, None, sent_annots)
|
||||
docs.append(doc)
|
||||
golds.append(gold)
|
||||
if limit and len(docs) >= limit:
|
||||
return docs, golds
|
||||
return docs, golds
|
||||
|
||||
|
||||
def read_conllu(file_):
|
||||
docs = []
|
||||
sent = []
|
||||
doc = []
|
||||
for line in file_:
|
||||
if line.startswith('# newdoc'):
|
||||
if doc:
|
||||
docs.append(doc)
|
||||
doc = []
|
||||
elif line.startswith('#'):
|
||||
continue
|
||||
elif not line.strip():
|
||||
if sent:
|
||||
doc.append(sent)
|
||||
sent = []
|
||||
else:
|
||||
sent.append(list(line.strip().split('\t')))
|
||||
if len(sent[-1]) != 10:
|
||||
print(repr(line))
|
||||
raise ValueError
|
||||
if sent:
|
||||
doc.append(sent)
|
||||
if doc:
|
||||
docs.append(doc)
|
||||
return docs
|
||||
|
||||
|
||||
def _make_gold(nlp, text, sent_annots):
|
||||
# Flatten the conll annotations, and adjust the head indices
|
||||
flat = defaultdict(list)
|
||||
for sent in sent_annots:
|
||||
flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
|
||||
for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
|
||||
flat[field].extend(sent[field])
|
||||
# Construct text if necessary
|
||||
assert len(flat['words']) == len(flat['spaces'])
|
||||
if text is None:
|
||||
text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces']))
|
||||
doc = nlp.make_doc(text)
|
||||
flat.pop('spaces')
|
||||
gold = GoldParse(doc, **flat)
|
||||
return doc, gold
|
||||
|
||||
#############################
|
||||
# Data transforms for spaCy #
|
||||
#############################
|
||||
|
||||
def golds_to_gold_tuples(docs, golds):
|
||||
'''Get out the annoying 'tuples' format used by begin_training, given the
|
||||
GoldParse objects.'''
|
||||
tuples = []
|
||||
for doc, gold in zip(docs, golds):
|
||||
text = doc.text
|
||||
ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
|
||||
sents = [((ids, words, tags, heads, labels, iob), [])]
|
||||
tuples.append((text, sents))
|
||||
return tuples
|
||||
|
||||
|
||||
##############
|
||||
# Evaluation #
|
||||
##############
|
||||
|
||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||
with text_loc.open('r', encoding='utf8') as text_file:
|
||||
texts = split_text(text_file.read())
|
||||
docs = list(nlp.pipe(texts))
|
||||
with sys_loc.open('w', encoding='utf8') as out_file:
|
||||
write_conllu(docs, out_file)
|
||||
with gold_loc.open('r', encoding='utf8') as gold_file:
|
||||
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
||||
with sys_loc.open('r', encoding='utf8') as sys_file:
|
||||
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
||||
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
||||
return scores
|
||||
|
||||
|
||||
def write_conllu(docs, file_):
|
||||
merger = Matcher(docs[0].vocab)
|
||||
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
|
||||
for i, doc in enumerate(docs):
|
||||
matches = merger(doc)
|
||||
spans = [doc[start:end+1] for _, start, end in matches]
|
||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
||||
for start_char, end_char in offsets:
|
||||
doc.merge(start_char, end_char)
|
||||
file_.write("# newdoc id = {i}\n".format(i=i))
|
||||
for j, sent in enumerate(doc.sents):
|
||||
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
||||
file_.write("# text = {text}\n".format(text=sent.text))
|
||||
for k, token in enumerate(sent):
|
||||
file_.write(token._.get_conllu_lines(k) + '\n')
|
||||
file_.write('\n')
|
||||
|
||||
|
||||
def print_progress(itn, losses, ud_scores):
|
||||
fields = {
|
||||
'dep_loss': losses.get('parser', 0.0),
|
||||
'tag_loss': losses.get('tagger', 0.0),
|
||||
'words': ud_scores['Words'].f1 * 100,
|
||||
'sents': ud_scores['Sentences'].f1 * 100,
|
||||
'tags': ud_scores['XPOS'].f1 * 100,
|
||||
'uas': ud_scores['UAS'].f1 * 100,
|
||||
'las': ud_scores['LAS'].f1 * 100,
|
||||
}
|
||||
header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
|
||||
if itn == 0:
|
||||
print('\t'.join(header))
|
||||
tpl = '\t'.join((
|
||||
'{:d}',
|
||||
'{dep_loss:.1f}',
|
||||
'{las:.1f}',
|
||||
'{uas:.1f}',
|
||||
'{tags:.1f}',
|
||||
'{sents:.1f}',
|
||||
'{words:.1f}',
|
||||
))
|
||||
print(tpl.format(itn, **fields))
|
||||
|
||||
#def get_sent_conllu(sent, sent_id):
|
||||
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
||||
|
||||
def get_token_conllu(token, i):
|
||||
if token._.begins_fused:
|
||||
n = 1
|
||||
while token.nbor(n)._.inside_fused:
|
||||
n += 1
|
||||
id_ = '%d-%d' % (i, i+n)
|
||||
lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']
|
||||
else:
|
||||
lines = []
|
||||
if token.head.i == token.i:
|
||||
head = 0
|
||||
else:
|
||||
head = i + (token.head.i - token.i) + 1
|
||||
fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
|
||||
str(head), token.dep_.lower(), '_', '_']
|
||||
lines.append('\t'.join(fields))
|
||||
return '\n'.join(lines)
|
||||
|
||||
Token.set_extension('get_conllu_lines', method=get_token_conllu)
|
||||
Token.set_extension('begins_fused', default=False)
|
||||
Token.set_extension('inside_fused', default=False)
|
||||
|
||||
|
||||
##################
|
||||
# Initialization #
|
||||
##################
|
||||
|
||||
|
||||
def load_nlp(corpus, config):
|
||||
lang = corpus.split('_')[0]
|
||||
nlp = spacy.blank(lang)
|
||||
if config.vectors:
|
||||
nlp.vocab.from_disk(config.vectors / 'vocab')
|
||||
return nlp
|
||||
|
||||
def initialize_pipeline(nlp, docs, golds, config):
|
||||
nlp.add_pipe(nlp.create_pipe('parser'))
|
||||
if config.multitask_tag:
|
||||
nlp.parser.add_multitask_objective('tag')
|
||||
if config.multitask_sent:
|
||||
nlp.parser.add_multitask_objective('sent_start')
|
||||
nlp.parser.moves.add_action(2, 'subtok')
|
||||
nlp.add_pipe(nlp.create_pipe('tagger'))
|
||||
for gold in golds:
|
||||
for tag in gold.tags:
|
||||
if tag is not None:
|
||||
nlp.tagger.add_label(tag)
|
||||
# Replace labels that didn't make the frequency cutoff
|
||||
actions = set(nlp.parser.labels)
|
||||
label_set = set([act.split('-')[1] for act in actions if '-' in act])
|
||||
for gold in golds:
|
||||
for i, label in enumerate(gold.labels):
|
||||
if label is not None and label not in label_set:
|
||||
gold.labels[i] = label.split('||')[0]
|
||||
return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
|
||||
|
||||
|
||||
########################
|
||||
# Command line helpers #
|
||||
########################
|
||||
|
||||
@attr.s
|
||||
class Config(object):
|
||||
vectors = attr.ib(default=None)
|
||||
max_doc_length = attr.ib(default=10)
|
||||
multitask_tag = attr.ib(default=True)
|
||||
multitask_sent = attr.ib(default=True)
|
||||
nr_epoch = attr.ib(default=30)
|
||||
batch_size = attr.ib(default=1000)
|
||||
dropout = attr.ib(default=0.2)
|
||||
|
||||
@classmethod
|
||||
def load(cls, loc):
|
||||
with Path(loc).open('r', encoding='utf8') as file_:
|
||||
cfg = json.load(file_)
|
||||
return cls(**cfg)
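
A minimal sketch of writing a JSON config that Config.load() above can read. The keys mirror the attr.ib defaults, any subset works because missing fields fall back to those defaults, and "config.json" is only a placeholder name:

import json

cfg = {"max_doc_length": 10, "multitask_tag": True, "multitask_sent": True,
       "nr_epoch": 30, "batch_size": 1000, "dropout": 0.2}
with open("config.json", "w") as file_:
    json.dump(cfg, file_)   # produces a file that Config.load("config.json") accepts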
|
||||
|
||||
|
||||
class Dataset(object):
|
||||
def __init__(self, path, section):
|
||||
self.path = path
|
||||
self.section = section
|
||||
self.conllu = None
|
||||
self.text = None
|
||||
for file_path in self.path.iterdir():
|
||||
name = file_path.parts[-1]
|
||||
if section in name and name.endswith('conllu'):
|
||||
self.conllu = file_path
|
||||
elif section in name and name.endswith('txt'):
|
||||
self.text = file_path
|
||||
if self.conllu is None:
|
||||
msg = "Could not find .txt file in {path} for {section}"
|
||||
raise IOError(msg.format(section=section, path=path))
|
||||
if self.text is None:
|
||||
msg = "Could not find .txt file in {path} for {section}"
|
||||
self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
|
||||
|
||||
|
||||
class TreebankPaths(object):
|
||||
def __init__(self, ud_path, treebank, **cfg):
|
||||
self.train = Dataset(ud_path / treebank, 'train')
|
||||
self.dev = Dataset(ud_path / treebank, 'dev')
|
||||
self.lang = self.train.lang
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
|
||||
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
||||
"positional", None, str),
|
||||
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
||||
config=("Path to json formatted config file", "positional", None, Config.load),
|
||||
limit=("Size limit", "option", "n", int)
|
||||
)
|
||||
def main(ud_dir, parses_dir, config, corpus, limit=0):
|
||||
paths = TreebankPaths(ud_dir, corpus)
|
||||
if not (parses_dir / corpus).exists():
|
||||
(parses_dir / corpus).mkdir()
|
||||
print("Train and evaluate", corpus, "using lang", paths.lang)
|
||||
nlp = load_nlp(paths.lang, config)
|
||||
|
||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
||||
max_doc_length=config.max_doc_length, limit=limit)
|
||||
|
||||
optimizer = initialize_pipeline(nlp, docs, golds, config)
|
||||
|
||||
for i in range(config.nr_epoch):
|
||||
docs = [nlp.make_doc(doc.text) for doc in docs]
|
||||
batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
|
||||
losses = {}
|
||||
n_train_words = sum(len(doc) for doc in docs)
|
||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||
for batch in batches:
|
||||
batch_docs, batch_gold = zip(*batch)
|
||||
pbar.update(sum(len(doc) for doc in batch_docs))
|
||||
nlp.update(batch_docs, batch_gold, sgd=optimizer,
|
||||
drop=config.dropout, losses=losses)
|
||||
|
||||
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
|
||||
with nlp.use_params(optimizer.averages):
|
||||
scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
|
||||
print_progress(i, losses, scores)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
|
@@ -1,88 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# coding: utf8
|
||||
"""Export spaCy model vectors for use in TensorBoard's standalone embedding projector.
|
||||
https://github.com/tensorflow/embedding-projector-standalone
|
||||
|
||||
Usage:
|
||||
|
||||
python vectors_tensorboard_standalone.py ./myVectorModel ./output [name]
|
||||
|
||||
This outputs two files that have to be copied into the "oss_data" of the standalone projector:
|
||||
|
||||
[name]_labels.tsv - metadata such as human readable labels for vectors
|
||||
[name]_tensors.bytes - numpy.ndarray of numpy.float32 precision vectors
|
||||
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import json
|
||||
import math
|
||||
from os import path
|
||||
|
||||
import numpy
|
||||
import plac
|
||||
import spacy
|
||||
import tqdm
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
vectors_loc=("Path to spaCy model that contains vectors", "positional", None, str),
|
||||
out_loc=("Path to output folder writing tensors and labels data", "positional", None, str),
|
||||
name=("Human readable name for tsv file and vectors tensor", "positional", None, str),
|
||||
)
|
||||
def main(vectors_loc, out_loc, name="spaCy_vectors"):
|
||||
# A tab-separated file that contains information about the vectors for visualization
|
||||
#
|
||||
# Learn more: https://www.tensorflow.org/programmers_guide/embedding#metadata
|
||||
meta_file = "{}_labels.tsv".format(name)
|
||||
out_meta_file = path.join(out_loc, meta_file)
|
||||
|
||||
print('Loading spaCy vectors model: {}'.format(vectors_loc))
|
||||
model = spacy.load(vectors_loc)
|
||||
|
||||
print('Finding lexemes with vectors attached: {}'.format(vectors_loc))
|
||||
voacb_strings = [
|
||||
w for w in tqdm.tqdm(model.vocab.strings, total=len(model.vocab.strings), leave=False)
|
||||
if model.vocab.has_vector(w)
|
||||
]
|
||||
vector_count = len(voacb_strings)
|
||||
|
||||
print('Building Projector labels for {} vectors: {}'.format(vector_count, out_meta_file))
|
||||
vector_dimensions = model.vocab.vectors.shape[1]
|
||||
tf_vectors_variable = numpy.zeros((vector_count, vector_dimensions), dtype=numpy.float32)
|
||||
|
||||
# Write a tab-separated file that contains information about the vectors for visualization
|
||||
#
|
||||
# Reference: https://www.tensorflow.org/programmers_guide/embedding#metadata
|
||||
with open(out_meta_file, 'wb') as file_metadata:
|
||||
# Define columns in the first row
|
||||
file_metadata.write("Text\tFrequency\n".encode('utf-8'))
|
||||
# Write out a row for each vector that we add to the tensorflow variable we created
|
||||
vec_index = 0
|
||||
|
||||
for text in tqdm.tqdm(voacb_strings, total=len(voacb_strings), leave=False):
|
||||
# https://github.com/tensorflow/tensorflow/issues/9094
|
||||
text = '<Space>' if text.lstrip() == '' else text
|
||||
lex = model.vocab[text]
|
||||
|
||||
# Store vector data and metadata
|
||||
tf_vectors_variable[vec_index] = numpy.float64(model.vocab.get_vector(text))
|
||||
file_metadata.write("{}\t{}\n".format(text, math.exp(lex.prob) * len(voacb_strings)).encode('utf-8'))
|
||||
vec_index += 1
|
||||
|
||||
# Write out "[name]_tensors.bytes" file for standalone embeddings projector to load
|
||||
tensor_path = '{}_tensors.bytes'.format(name)
|
||||
tf_vectors_variable.tofile(path.join(out_loc, tensor_path))
|
||||
|
||||
print('Done.')
|
||||
print('Add the following entry to "oss_data/oss_demo_projector_config.json"')
|
||||
print(json.dumps({
|
||||
"tensorName": name,
|
||||
"tensorShape": [vector_count, vector_dimensions],
|
||||
"tensorPath": 'oss_data/{}'.format(tensor_path),
|
||||
"metadataPath": 'oss_data/{}'.format(meta_file)
|
||||
}, indent=2))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
fabfile.py (vendored, 83 lines)
|
@@ -1,49 +1,92 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import contextlib
|
||||
from pathlib import Path
|
||||
from fabric.api import local, lcd, env, settings, prefix
|
||||
from fabtools.python import virtualenv
|
||||
from os import path, environ
|
||||
import shutil
|
||||
|
||||
|
||||
PWD = path.dirname(__file__)
|
||||
ENV = environ['VENV_DIR'] if 'VENV_DIR' in environ else '.env'
|
||||
VENV_DIR = path.join(PWD, ENV)
|
||||
VENV_DIR = Path(PWD) / ENV
|
||||
|
||||
|
||||
def env(lang='python2.7'):
|
||||
if path.exists(VENV_DIR):
|
||||
@contextlib.contextmanager
|
||||
def virtualenv(name, create=False, python='/usr/bin/python3.6'):
|
||||
python = Path(python).resolve()
|
||||
env_path = VENV_DIR
|
||||
if create:
|
||||
if env_path.exists():
|
||||
shutil.rmtree(str(env_path))
|
||||
local('{python} -m venv {env_path}'.format(python=python, env_path=VENV_DIR))
|
||||
def wrapped_local(cmd, env_vars=[], capture=False, direct=False):
|
||||
return local('source {}/bin/activate && {}'.format(env_path, cmd),
|
||||
shell='/bin/bash', capture=False)
|
||||
yield wrapped_local
|
||||
|
||||
|
||||
def env(lang='python3.6'):
|
||||
if VENV_DIR.exists():
|
||||
local('rm -rf {env}'.format(env=VENV_DIR))
|
||||
local('pip install virtualenv')
|
||||
local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
|
||||
if lang.startswith('python3'):
|
||||
local('{lang} -m venv {env}'.format(lang=lang, env=VENV_DIR))
|
||||
else:
|
||||
local('{lang} -m pip install virtualenv --no-cache-dir'.format(lang=lang))
|
||||
local('{lang} -m virtualenv {env} --no-cache-dir'.format(lang=lang, env=VENV_DIR))
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
print(venv_local('python --version', capture=True))
|
||||
venv_local('pip install --upgrade setuptools --no-cache-dir')
|
||||
venv_local('pip install pytest --no-cache-dir')
|
||||
venv_local('pip install wheel --no-cache-dir')
|
||||
venv_local('pip install -r requirements.txt --no-cache-dir')
|
||||
venv_local('pip install pex --no-cache-dir')
|
||||
|
||||
|
||||
|
||||
def install():
|
||||
with virtualenv(VENV_DIR):
|
||||
local('pip install --upgrade setuptools')
|
||||
local('pip install dist/*.tar.gz')
|
||||
local('pip install pytest')
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
venv_local('pip install dist/*.tar.gz')
|
||||
|
||||
|
||||
def make():
|
||||
with virtualenv(VENV_DIR):
|
||||
with lcd(path.dirname(__file__)):
|
||||
local('pip install cython')
|
||||
local('pip install murmurhash')
|
||||
local('pip install -r requirements.txt')
|
||||
local('python setup.py build_ext --inplace')
|
||||
with lcd(path.dirname(__file__)):
|
||||
local('export PYTHONPATH=`pwd` && source .env/bin/activate && python setup.py build_ext --inplace',
|
||||
shell='/bin/bash')
|
||||
|
||||
def sdist():
|
||||
with virtualenv(VENV_DIR):
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
with lcd(path.dirname(__file__)):
|
||||
local('python setup.py sdist')
|
||||
|
||||
def wheel():
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
with lcd(path.dirname(__file__)):
|
||||
venv_local('python setup.py bdist_wheel')
|
||||
|
||||
def pex():
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
with lcd(path.dirname(__file__)):
|
||||
sha = local('git rev-parse --short HEAD', capture=True)
|
||||
venv_local('pex dist/*.whl -e spacy -o dist/spacy-%s.pex' % sha,
|
||||
direct=True)
|
||||
|
||||
|
||||
def clean():
|
||||
with lcd(path.dirname(__file__)):
|
||||
local('python setup.py clean --all')
|
||||
local('rm -f dist/*.whl')
|
||||
local('rm -f dist/*.pex')
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
venv_local('python setup.py clean --all')
|
||||
|
||||
|
||||
def test():
|
||||
with virtualenv(VENV_DIR):
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
with lcd(path.dirname(__file__)):
|
||||
local('py.test -x spacy/tests')
|
||||
venv_local('pytest -x spacy/tests')
|
||||
|
||||
def train():
|
||||
args = environ.get('SPACY_TRAIN_ARGS', '')
|
||||
with virtualenv(VENV_DIR) as venv_local:
|
||||
venv_local('spacy train {args}'.format(args=args))
|
||||
|
|
|
@@ -5,8 +5,8 @@ cymem>=1.30,<1.32
|
|||
preshed>=1.0.0,<2.0.0
|
||||
thinc>=6.11.1.dev10,<6.12.0
|
||||
murmurhash>=0.28,<0.29
|
||||
cytoolz>=0.9.0,<0.10.0
|
||||
plac<1.0.0,>=0.9.6
|
||||
six
|
||||
ujson>=1.35
|
||||
dill>=0.2,<0.3
|
||||
requests>=2.13.0,<3.0.0
|
||||
|
@@ -16,4 +16,3 @@ pytest>=3.0.6,<4.0.0
|
|||
mock>=2.0.0,<3.0.0
|
||||
msgpack-python==0.5.4
|
||||
msgpack-numpy==0.4.1
|
||||
html5lib==1.0b8
|
||||
|
|
setup.py (4 lines)
|
@@ -18,6 +18,7 @@ PACKAGES = find_packages()
|
|||
|
||||
|
||||
MOD_NAMES = [
|
||||
'spacy._align',
|
||||
'spacy.parts_of_speech',
|
||||
'spacy.strings',
|
||||
'spacy.lexeme',
|
||||
|
@@ -191,8 +192,6 @@ def setup_package():
|
|||
'preshed>=1.0.0,<2.0.0',
|
||||
'thinc>=6.11.1.dev10,<6.12.0',
|
||||
'plac<1.0.0,>=0.9.6',
|
||||
'six',
|
||||
'html5lib==1.0b8',
|
||||
'pathlib',
|
||||
'ujson>=1.35',
|
||||
'dill>=0.2,<0.3',
|
||||
|
@@ -201,6 +200,7 @@ def setup_package():
|
|||
'ftfy>=4.4.2,<5.0.0',
|
||||
'msgpack-python==0.5.4',
|
||||
'msgpack-numpy==0.4.1'],
|
||||
setup_requires=['wheel'],
|
||||
classifiers=[
|
||||
'Development Status :: 5 - Production/Stable',
|
||||
'Environment :: Console',
|
||||
|
|
|
@@ -8,6 +8,7 @@ if __name__ == '__main__':
|
|||
import sys
|
||||
from spacy.cli import download, link, info, package, train, convert
|
||||
from spacy.cli import vocab, init_model, profile, evaluate, validate
|
||||
from spacy.cli import ud_train, ud_evaluate
|
||||
from spacy.util import prints
|
||||
|
||||
commands = {
|
||||
|
@@ -15,7 +16,9 @@ if __name__ == '__main__':
|
|||
'link': link,
|
||||
'info': info,
|
||||
'train': train,
|
||||
'ud-train': ud_train,
|
||||
'evaluate': evaluate,
|
||||
'ud-evaluate': ud_evaluate,
|
||||
'convert': convert,
|
||||
'package': package,
|
||||
'vocab': vocab,
|
||||
|
|
spacy/_align.pyx (new file, 251 lines)
|
@@ -0,0 +1,251 @@
|
|||
# cython: infer_types=True
|
||||
'''Do Levenshtein alignment, for evaluation of tokenized input.
|
||||
|
||||
Random notes:
|
||||
|
||||
r i n g
|
||||
0 1 2 3 4
|
||||
r 1 0 1 2 3
|
||||
a 2 1 1 2 3
|
||||
n 3 2 2 1 2
|
||||
g 4 3 3 2 1
|
||||
|
||||
0,0: (1,1)=min(0+0,1+1,1+1)=0 S
|
||||
1,0: (2,1)=min(1+1,0+1,2+1)=1 D
|
||||
2,0: (3,1)=min(2+1,3+1,1+1)=2 D
|
||||
3,0: (4,1)=min(3+1,4+1,2+1)=3 D
|
||||
0,1: (1,2)=min(1+1,2+1,0+1)=1 D
|
||||
1,1: (2,2)=min(0+1,1+1,1+1)=1 S
|
||||
2,1: (3,2)=min(1+1,1+1,2+1)=2 S or I
|
||||
3,1: (4,2)=min(2+1,2+1,3+1)=3 S or I
|
||||
0,2: (1,3)=min(2+1,3+1,1+1)=2 I
|
||||
1,2: (2,3)=min(1+1,2+1,1+1)=2 S or I
|
||||
2,2: (3,3)
|
||||
3,2: (4,3)
|
||||
At state (i, j) we're asking "How do I transform S[:i+1] to T[:j+1]?"
|
||||
|
||||
We know the costs to transition:
|
||||
|
||||
S[:i] -> T[:j] (at D[i,j])
|
||||
S[:i+1] -> T[:j] (at D[i+1,j])
|
||||
S[:i] -> T[:j+1] (at D[i,j+1])
|
||||
|
||||
Further, we know we can transform:
|
||||
S[:i+1] -> S[:i] (DEL) for 1,
|
||||
T[:j+1] -> T[:j] (INS) for 1.
|
||||
S[i+1] -> T[j+1] (SUB) for 0 or 1
|
||||
|
||||
Therefore we have the costs:
|
||||
SUB: Cost(S[:i]->T[:j]) + Cost(S[i+1]->T[j+1])
|
||||
i.e. D[i, j] + S[i+1] != T[j+1]
|
||||
INS: Cost(S[:i+1]->T[:j]) + Cost(T[:j+1]->T[:j])
|
||||
i.e. D[i+1,j] + 1
|
||||
DEL: Cost(S[:i]->T[:j+1]) + Cost(S[:i+1]->S[:i])
|
||||
i.e. D[i,j+1] + 1
|
||||
|
||||
Source string S has length m, with index i
|
||||
Target string T has length n, with index j
|
||||
|
||||
Output two alignment vectors: i2j (length m) and j2i (length n)
|
||||
# function LevenshteinDistance(char s[1..m], char t[1..n]):
|
||||
# for all i and j, d[i,j] will hold the Levenshtein distance between
|
||||
# the first i characters of s and the first j characters of t
|
||||
# note that d has (m+1)*(n+1) values
|
||||
# set each element in d to zero
|
||||
ring rang
|
||||
- r i n g
|
||||
- 0 0 0 0 0
|
||||
r 0 0 0 0 0
|
||||
a 0 0 0 0 0
|
||||
n 0 0 0 0 0
|
||||
g 0 0 0 0 0
|
||||
|
||||
# source prefixes can be transformed into empty string by
|
||||
# dropping all characters
|
||||
# d[i, 0] := i
|
||||
ring rang
|
||||
- r i n g
|
||||
- 0 0 0 0 0
|
||||
r 1 0 0 0 0
|
||||
a 2 0 0 0 0
|
||||
n 3 0 0 0 0
|
||||
g 4 0 0 0 0
|
||||
|
||||
# target prefixes can be reached from empty source prefix
|
||||
# by inserting every character
|
||||
# d[0, j] := j
|
||||
- r i n g
|
||||
- 0 1 2 3 4
|
||||
r 1 0 0 0 0
|
||||
a 2 0 0 0 0
|
||||
n 3 0 0 0 0
|
||||
g 4 0 0 0 0
|
||||
|
||||
'''
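
The recurrence sketched in the notes above can be written as a short pure-Python reference, illustrative only and not part of this module; the compiled fill_matrix below is what align() actually uses:

def levenshtein_matrix(S, T):
    # D[i][j] = cost of transforming S[:i] into T[:j]
    m, n = len(S), len(T)
    D = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        D[i][0] = i                  # delete every source character
    for j in range(n + 1):
        D[0][j] = j                  # insert every target character
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            sub = D[i - 1][j - 1] + (S[i - 1] != T[j - 1])
            ins = D[i][j - 1] + 1
            delete = D[i - 1][j] + 1
            D[i][j] = min(sub, ins, delete)
    return D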
|
||||
from __future__ import unicode_literals
|
||||
from libc.stdint cimport uint32_t
|
||||
import numpy
|
||||
cimport numpy as np
|
||||
from .compat import unicode_
|
||||
from murmurhash.mrmr cimport hash32
|
||||
|
||||
|
||||
def align(S, T):
|
||||
cdef int m = len(S)
|
||||
cdef int n = len(T)
|
||||
cdef np.ndarray matrix = numpy.zeros((m+1, n+1), dtype='int32')
|
||||
cdef np.ndarray i2j = numpy.zeros((m,), dtype='i')
|
||||
cdef np.ndarray j2i = numpy.zeros((n,), dtype='i')
|
||||
|
||||
cdef np.ndarray S_arr = _convert_sequence(S)
|
||||
cdef np.ndarray T_arr = _convert_sequence(T)
|
||||
|
||||
fill_matrix(<int*>matrix.data,
|
||||
<const int*>S_arr.data, m, <const int*>T_arr.data, n)
|
||||
fill_i2j(i2j, matrix)
|
||||
fill_j2i(j2i, matrix)
|
||||
for i in range(i2j.shape[0]):
|
||||
if i2j[i] >= 0 and len(S[i]) != len(T[i2j[i]]):
|
||||
i2j[i] = -1
|
||||
for j in range(j2i.shape[0]):
|
||||
if j2i[j] >= 0 and len(T[j]) != len(S[j2i[j]]):
|
||||
j2i[j] = -1
|
||||
return matrix[-1,-1], i2j, j2i, matrix
|
||||
|
||||
|
||||
def multi_align(np.ndarray i2j, np.ndarray j2i, i_lengths, j_lengths):
|
||||
'''Let's say we had:
|
||||
|
||||
Guess: [aa bb cc dd]
|
||||
Truth: [aa bbcc dd]
|
||||
i2j: [0, None, -2, 2]
|
||||
j2i: [0, -2, 3]
|
||||
|
||||
We want:
|
||||
|
||||
i2j_multi: {1: 1, 2: 1}
|
||||
j2i_multi: {}
|
||||
'''
|
||||
i2j_miss = _get_regions(i2j, i_lengths)
|
||||
j2i_miss = _get_regions(j2i, j_lengths)
|
||||
|
||||
i2j_multi, j2i_multi = _get_mapping(i2j_miss, j2i_miss, i_lengths, j_lengths)
|
||||
return i2j_multi, j2i_multi
|
||||
|
||||
|
||||
def _get_regions(alignment, lengths):
|
||||
regions = {}
|
||||
start = None
|
||||
offset = 0
|
||||
for i in range(len(alignment)):
|
||||
if alignment[i] < 0:
|
||||
if start is None:
|
||||
start = offset
|
||||
regions.setdefault(start, [])
|
||||
regions[start].append(i)
|
||||
else:
|
||||
start = None
|
||||
offset += lengths[i]
|
||||
return regions
|
||||
|
||||
|
||||
def _get_mapping(miss1, miss2, lengths1, lengths2):
|
||||
i2j = {}
|
||||
j2i = {}
|
||||
for start, region1 in miss1.items():
|
||||
if not region1 or start not in miss2:
|
||||
continue
|
||||
region2 = miss2[start]
|
||||
if sum(lengths1[i] for i in region1) == sum(lengths2[i] for i in region2):
|
||||
j = region2.pop(0)
|
||||
buff = []
|
||||
# Consume tokens from region 1, until we meet the length of the
|
||||
# first token in region2. If we do, align the tokens. If
|
||||
# we exceed the length, break.
|
||||
while region1:
|
||||
buff.append(region1.pop(0))
|
||||
if sum(lengths1[i] for i in buff) == lengths2[j]:
|
||||
for i in buff:
|
||||
i2j[i] = j
|
||||
j2i[j] = buff[-1]
|
||||
j += 1
|
||||
buff = []
|
||||
elif sum(lengths1[i] for i in buff) > lengths2[j]:
|
||||
break
|
||||
else:
|
||||
if buff and sum(lengths1[i] for i in buff) == lengths2[j]:
|
||||
for i in buff:
|
||||
i2j[i] = j
|
||||
j2i[j] = buff[-1]
|
||||
return i2j, j2i
|
||||
|
||||
|
||||
def _convert_sequence(seq):
|
||||
if isinstance(seq, numpy.ndarray):
|
||||
return numpy.ascontiguousarray(seq, dtype='uint32')
|
||||
cdef np.ndarray output = numpy.zeros((len(seq),), dtype='uint32')
|
||||
cdef bytes item_bytes
|
||||
for i, item in enumerate(seq):
|
||||
if isinstance(item, unicode):
|
||||
item_bytes = item.encode('utf8')
|
||||
else:
|
||||
item_bytes = item
|
||||
output[i] = hash32(<void*><char*>item_bytes, len(item_bytes), 0)
|
||||
return output
|
||||
|
||||
|
||||
cdef void fill_matrix(int* D,
|
||||
const int* S, int m, const int* T, int n) nogil:
|
||||
m1 = m+1
|
||||
n1 = n+1
|
||||
for i in range(m1*n1):
|
||||
D[i] = 0
|
||||
|
||||
for i in range(m1):
|
||||
D[i*n1] = i
|
||||
|
||||
for j in range(n1):
|
||||
D[j] = j
|
||||
|
||||
cdef int sub_cost, ins_cost, del_cost
|
||||
for j in range(n):
|
||||
for i in range(m):
|
||||
i_j = i*n1 + j
|
||||
i1_j1 = (i+1)*n1 + j+1
|
||||
i1_j = (i+1)*n1 + j
|
||||
i_j1 = i*n1 + j+1
|
||||
if S[i] != T[j]:
|
||||
sub_cost = D[i_j] + 1
|
||||
else:
|
||||
sub_cost = D[i_j]
|
||||
del_cost = D[i_j1] + 1
|
||||
ins_cost = D[i1_j] + 1
|
||||
best = min(min(sub_cost, ins_cost), del_cost)
|
||||
D[i1_j1] = best
|
||||
|
||||
|
||||
cdef void fill_i2j(np.ndarray i2j, np.ndarray D) except *:
|
||||
j = D.shape[1]-2
|
||||
cdef int i = D.shape[0]-2
|
||||
while i >= 0:
|
||||
while D[i+1, j] < D[i+1, j+1]:
|
||||
j -= 1
|
||||
if D[i, j+1] < D[i+1, j+1]:
|
||||
i2j[i] = -1
|
||||
else:
|
||||
i2j[i] = j
|
||||
j -= 1
|
||||
i -= 1
|
||||
|
||||
cdef void fill_j2i(np.ndarray j2i, np.ndarray D) except *:
|
||||
i = D.shape[0]-2
|
||||
cdef int j = D.shape[1]-2
|
||||
while j >= 0:
|
||||
while D[i, j+1] < D[i+1, j+1]:
|
||||
i -= 1
|
||||
if D[i+1, j] < D[i+1, j+1]:
|
||||
j2i[j] = -1
|
||||
else:
|
||||
j2i[j] = i
|
||||
i -= 1
|
||||
j -= 1
|
spacy/_matcher2_notes.py (new file, 251 lines)
|
@@ -0,0 +1,251 @@
|
|||
import pytest
|
||||
|
||||
|
||||
class Vocab(object):
|
||||
pass
|
||||
|
||||
|
||||
class Doc(list):
|
||||
def __init__(self, vocab, words=None):
|
||||
list.__init__(self)
|
||||
self.extend([Token(i, w) for i, w in enumerate(words)])
|
||||
|
||||
|
||||
class Token(object):
|
||||
def __init__(self, i, word):
|
||||
self.i = i
|
||||
self.text = word
|
||||
|
||||
|
||||
def find_matches(patterns, doc):
|
||||
init_states = [(pattern, 0, None) for pattern in patterns]
|
||||
curr_states = []
|
||||
matches = []
|
||||
for token in doc:
|
||||
nexts = []
|
||||
for state in (curr_states + init_states):
|
||||
matches, nexts = transition(state, token, matches, nexts)
|
||||
curr_states = nexts
|
||||
return matches
|
||||
|
||||
|
||||
def transition(state, token, matches, nexts):
|
||||
action = get_action(state, token)
|
||||
is_match, keep_state, advance_state = [bool(int(c)) for c in action]
|
||||
pattern, i, start = state
|
||||
if start is None:
|
||||
start = token.i
|
||||
if is_match:
|
||||
matches.append((pattern, start, token.i+1))
|
||||
if advance_state:
|
||||
nexts.append((pattern, i+1, start))
|
||||
if keep_state:
|
||||
# TODO: This needs to be zero-width :(.
|
||||
nexts.append((pattern, i, start))
|
||||
return (matches, nexts)
|
||||
|
||||
|
||||
def get_action(state, token):
|
||||
'''We need to consider:
|
||||
|
||||
a) Does the token match the specification? [Yes, No]
|
||||
b) What's the quantifier? [1, 0+, ?]
|
||||
c) Is this the last specification? [final, non-final]
|
||||
|
||||
We can transition in the following ways:
|
||||
|
||||
a) Do we emit a match?
|
||||
b) Do we add a state with (next state, next token)?
|
||||
c) Do we add a state with (next state, same token)?
|
||||
d) Do we add a state with (same state, next token)?
|
||||
|
||||
We'll code the actions as boolean strings, so 0000 means no to all 4,
|
||||
1000 means match but no states added, etc.
|
||||
|
||||
1:
|
||||
Yes, final:
|
||||
1000
|
||||
Yes, non-final:
|
||||
0100
|
||||
No, final:
|
||||
0000
|
||||
No, non-final
|
||||
0000
|
||||
0+:
|
||||
Yes, final:
|
||||
1001
|
||||
Yes, non-final:
|
||||
0111
|
||||
No, final:
|
||||
1000 (note: Don't include last token!)
|
||||
No, non-final:
|
||||
0010
|
||||
?:
|
||||
Yes, final:
|
||||
1000
|
||||
Yes, non-final:
|
||||
0100
|
||||
No, final:
|
||||
1000 (note: Don't include last token!)
|
||||
No, non-final:
|
||||
0010
|
||||
|
||||
Problem: If a quantifier is matching, we're adding a lot of open partials
|
||||
'''
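
A tiny decoding sketch for these action strings, matching how transition() above unpacks them; the tests below assert the three-character form (the four-character codes sketched in the notes describe a fourth flag this scratch file does not use):

action = '110'   # e.g. a '1+' pattern matching a token, as in test_get_action_plus_match
is_match, keep_state, advance_state = [bool(int(c)) for c in action]
assert (is_match, keep_state, advance_state) == (True, True, False)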
|
||||
is_match = get_is_match(state, token)
|
||||
operator = get_operator(state, token)
|
||||
is_final = get_is_final(state, token)
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def get_is_match(state, token):
|
||||
pattern, i, start = state
|
||||
is_match = token.text == pattern[i]['spec']
|
||||
if pattern[i].get('invert'):
|
||||
return not is_match
|
||||
else:
|
||||
return is_match
|
||||
|
||||
def get_is_final(state, token):
|
||||
pattern, i, start = state
|
||||
return i == len(pattern)-1
|
||||
|
||||
def get_operator(state, token):
|
||||
pattern, i, start = state
|
||||
return pattern[i].get('op', '1')
|
||||
|
||||
|
||||
########################
|
||||
# Tests for get_action #
|
||||
########################
|
||||
|
||||
|
||||
def test_get_action_simple_match():
|
||||
pattern = [{'spec': 'a', 'op': '1'}]
|
||||
doc = Doc(Vocab(), words=['a'])
|
||||
state = (pattern, 0, None)
|
||||
action = get_action(state, doc[0])
|
||||
assert action == '100'
|
||||
|
||||
|
||||
def test_get_action_simple_reject():
|
||||
pattern = [{'spec': 'b', 'op': '1'}]
|
||||
doc = Doc(Vocab(), words=['a'])
|
||||
state = (pattern, 0, None)
|
||||
action = get_action(state, doc[0])
|
||||
assert action == '000'
|
||||
|
||||
|
||||
def test_get_action_simple_match_match():
|
||||
pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'a', 'op': '1'}]
|
||||
doc = Doc(Vocab(), words=['a', 'a'])
|
||||
state = (pattern, 0, None)
|
||||
action = get_action(state, doc[0])
|
||||
assert action == '001'
|
||||
state = (pattern, 1, 0)
|
||||
action = get_action(state, doc[1])
|
||||
assert action == '100'
|
||||
|
||||
|
||||
def test_get_action_simple_match_reject():
|
||||
pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
|
||||
doc = Doc(Vocab(), words=['a', 'a'])
|
||||
state = (pattern, 0, None)
|
||||
action = get_action(state, doc[0])
|
||||
assert action == '001'
|
||||
state = (pattern, 1, 0)
|
||||
action = get_action(state, doc[1])
|
||||
assert action == '000'
|
||||
|
||||
|
||||
def test_get_action_simple_match_reject():
|
||||
pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
|
||||
doc = Doc(Vocab(), words=['a', 'a'])
|
||||
state = (pattern, 0, None)
|
||||
action = get_action(state, doc[0])
|
||||
assert action == '001'
|
||||
state = (pattern, 1, 0)
|
||||
action = get_action(state, doc[1])
|
||||
assert action == '000'
|
||||
|
||||
|
||||
def test_get_action_plus_match():
|
||||
pattern = [{'spec': 'a', 'op': '1+'}]
|
||||
doc = Doc(Vocab(), words=['a'])
|
||||
state = (pattern, 0, None)
|
||||
action = get_action(state, doc[0])
|
||||
assert action == '110'
|
||||
|
||||
|
||||
def test_get_action_plus_match_match():
|
||||
pattern = [{'spec': 'a', 'op': '1+'}]
|
||||
doc = Doc(Vocab(), words=['a', 'a'])
|
||||
state = (pattern, 0, None)
|
||||
action = get_action(state, doc[0])
|
||||
assert action == '110'
|
||||
state = (pattern, 0, 0)
|
||||
action = get_action(state, doc[1])
|
||||
assert action == '110'
|
||||
|
||||
|
||||
##########################
|
||||
# Tests for find_matches #
|
||||
##########################
|
||||
|
||||
def test_find_matches_simple_accept():
|
||||
pattern = [{'spec': 'a', 'op': '1'}]
|
||||
doc = Doc(Vocab(), words=['a'])
|
||||
matches = find_matches([pattern], doc)
|
||||
assert matches == [(pattern, 0, 1)]
|
||||
|
||||
|
||||
def test_find_matches_simple_reject():
|
||||
pattern = [{'spec': 'a', 'op': '1'}]
|
||||
doc = Doc(Vocab(), words=['b'])
|
||||
matches = find_matches([pattern], doc)
|
||||
assert matches == []
|
||||
|
||||
|
||||
def test_find_matches_match_twice():
|
||||
pattern = [{'spec': 'a', 'op': '1'}]
|
||||
doc = Doc(Vocab(), words=['a', 'a'])
|
||||
matches = find_matches([pattern], doc)
|
||||
assert matches == [(pattern, 0, 1), (pattern, 1, 2)]
|
||||
|
||||
|
||||
def test_find_matches_longer_pattern():
|
||||
pattern = [{'spec': 'a', 'op': '1'}, {'spec': 'b', 'op': '1'}]
|
||||
doc = Doc(Vocab(), words=['a', 'b'])
|
||||
matches = find_matches([pattern], doc)
|
||||
assert matches == [(pattern, 0, 2)]
|
||||
|
||||
|
||||
def test_find_matches_two_patterns():
|
||||
patterns = [[{'spec': 'a', 'op': '1'}], [{'spec': 'b', 'op': '1'}]]
|
||||
doc = Doc(Vocab(), words=['a', 'b'])
|
||||
matches = find_matches(patterns, doc)
|
||||
assert matches == [(patterns[0], 0, 1), (patterns[1], 1, 2)]
|
||||
|
||||
|
||||
def test_find_matches_two_patterns_overlap():
|
||||
patterns = [[{'spec': 'a'}, {'spec': 'b'}],
|
||||
[{'spec': 'b'}, {'spec': 'c'}]]
|
||||
doc = Doc(Vocab(), words=['a', 'b', 'c'])
|
||||
matches = find_matches(patterns, doc)
|
||||
assert matches == [(patterns[0], 0, 2), (patterns[1], 1, 3)]
|
||||
|
||||
|
||||
def test_find_matches_greedy():
|
||||
patterns = [[{'spec': 'a', 'op': '1+'}]]
|
||||
doc = Doc(Vocab(), words=['a'])
|
||||
matches = find_matches(patterns, doc)
|
||||
assert matches == [(patterns[0], 0, 1)]
|
||||
doc = Doc(Vocab(), words=['a', 'a'])
|
||||
matches = find_matches(patterns, doc)
|
||||
assert matches == [(patterns[0], 0, 1), (patterns[0], 0, 2), (patterns[0], 1, 2)]
|
||||
|
||||
def test_find_matches_non_greedy():
|
||||
patterns = [[{'spec': 'a', 'op': '0+'}, {'spec': 'b', "op": "1"}]]
|
||||
doc = Doc(Vocab(), words=['b'])
|
||||
matches = find_matches(patterns, doc)
|
||||
assert matches == [(patterns[0], 0, 1)]
|
spacy/_ml.py (30 lines)
|
@@ -64,23 +64,6 @@ def _flatten_add_lengths(seqs, pad=0, drop=0.):
|
|||
return (X, lengths), finish_update
|
||||
|
||||
|
||||
@layerize
|
||||
def _logistic(X, drop=0.):
|
||||
xp = get_array_module(X)
|
||||
if not isinstance(X, xp.ndarray):
|
||||
X = xp.asarray(X)
|
||||
# Clip to range (-10, 10)
|
||||
X = xp.minimum(X, 10., X)
|
||||
X = xp.maximum(X, -10., X)
|
||||
Y = 1. / (1. + xp.exp(-X))
|
||||
|
||||
def logistic_bwd(dY, sgd=None):
|
||||
dX = dY * (Y * (1-Y))
|
||||
return dX
|
||||
|
||||
return Y, logistic_bwd
|
||||
|
||||
|
||||
def _zero_init(model):
|
||||
def _zero_init_impl(self, X, y):
|
||||
self.W.fill(0)
|
||||
|
@@ -144,8 +127,8 @@ class PrecomputableAffine(Model):
|
|||
self.nF = nF
|
||||
|
||||
def begin_update(self, X, drop=0.):
|
||||
Yf = self.ops.xp.dot(X,
|
||||
self.W.reshape((self.nF*self.nO*self.nP, self.nI)).T)
|
||||
Yf = self.ops.gemm(X,
|
||||
self.W.reshape((self.nF*self.nO*self.nP, self.nI)), trans2=True)
|
||||
Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
|
||||
Yf = self._add_padding(Yf)
|
||||
|
||||
|
@@ -161,11 +144,11 @@ class PrecomputableAffine(Model):
|
|||
Wopfi = self.W.transpose((1, 2, 0, 3))
|
||||
Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
|
||||
Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI))
|
||||
dXf = self.ops.dot(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
|
||||
dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
|
||||
|
||||
# Reuse the buffer
|
||||
dWopfi = Wopfi; dWopfi.fill(0.)
|
||||
self.ops.xp.dot(dY.T, Xf, out=dWopfi)
|
||||
self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
|
||||
dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
|
||||
# (o, p, f, i) --> (f, o, p, i)
|
||||
self.d_W += dWopfi.transpose((2, 0, 1, 3))
|
||||
|
@@ -467,6 +450,7 @@ def SpacyVectors(docs, drop=0.):
|
|||
|
||||
|
||||
def build_text_classifier(nr_class, width=64, **cfg):
|
||||
depth = cfg.get('depth', 2)
|
||||
nr_vector = cfg.get('nr_vector', 5000)
|
||||
pretrained_dims = cfg.get('pretrained_dims', 0)
|
||||
with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
|
||||
|
@@ -518,7 +502,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
|||
LN(Maxout(width, vectors_width))
|
||||
>> Residual(
|
||||
(ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
|
||||
) ** 2, pad=2
|
||||
) ** depth, pad=depth
|
||||
)
|
||||
>> flatten_add_lengths
|
||||
>> ParametricAttention(width)
|
||||
|
@@ -531,8 +515,6 @@ def build_text_classifier(nr_class, width=64, **cfg):
|
|||
_preprocess_doc
|
||||
>> LinearModel(nr_class)
|
||||
)
|
||||
#model = linear_model >> logistic
|
||||
|
||||
model = (
|
||||
(linear_model | cnn_model)
|
||||
>> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
|
||||
|
|
|
@@ -9,7 +9,7 @@ __uri__ = 'https://spacy.io'
|
|||
__author__ = 'Explosion AI'
|
||||
__email__ = 'contact@explosion.ai'
|
||||
__license__ = 'MIT'
|
||||
__release__ = True
|
||||
__release__ = False
|
||||
|
||||
__docs_models__ = 'https://spacy.io/usage/models'
|
||||
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
|
||||
|
|
|
@@ -131,7 +131,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
|||
'NumValue', 'PartType', 'Polite', 'StyleVariant',
|
||||
'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
|
||||
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
|
||||
'Polarity', 'Animacy' # U20
|
||||
'Polarity', 'PrepCase', 'Animacy' # U20
|
||||
]
|
||||
for key in morph_keys:
|
||||
if key in stringy_attrs:
|
||||
|
|
|
@@ -9,3 +9,5 @@ from .convert import convert
|
|||
from .vocab import make_vocab as vocab
|
||||
from .init_model import init_model
|
||||
from .validate import validate
|
||||
from .ud_train import main as ud_train
|
||||
from .conll17_ud_eval import main as ud_evaluate
|
||||
|
|
spacy/cli/conll17_ud_eval.py (new file, 571 lines)
|
@@ -0,0 +1,571 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# CoNLL 2017 UD Parsing evaluation script.
|
||||
#
|
||||
# Compatible with Python 2.7 and 3.2+, can be used either as a module
|
||||
# or a standalone executable.
|
||||
#
|
||||
# Copyright 2017 Institute of Formal and Applied Linguistics (UFAL),
|
||||
# Faculty of Mathematics and Physics, Charles University, Czech Republic.
|
||||
#
|
||||
# Changelog:
|
||||
# - [02 Jan 2017] Version 0.9: Initial release
|
||||
# - [25 Jan 2017] Version 0.9.1: Fix bug in LCS alignment computation
|
||||
# - [10 Mar 2017] Version 1.0: Add documentation and test
|
||||
# Compare HEADs correctly using aligned words
|
||||
# Allow evaluation with erroneous spaces in forms
|
||||
# Compare forms in LCS case insensitively
|
||||
# Detect cycles and multiple root nodes
|
||||
# Compute AlignedAccuracy
|
||||
|
||||
# Command line usage
|
||||
# ------------------
|
||||
# conll17_ud_eval.py [-v] [-w weights_file] gold_conllu_file system_conllu_file
|
||||
#
|
||||
# - if no -v is given, only the CoNLL17 UD Shared Task evaluation LAS metrics
|
||||
# is printed
|
||||
# - if -v is given, several metrics are printed (as precision, recall, F1 score,
|
||||
# and in case the metric is computed on aligned words also accuracy on these):
|
||||
# - Tokens: how well do the gold tokens match system tokens
|
||||
# - Sentences: how well do the gold sentences match system sentences
|
||||
# - Words: how well can the gold words be aligned to system words
|
||||
# - UPOS: using aligned words, how well does UPOS match
|
||||
# - XPOS: using aligned words, how well does XPOS match
|
||||
# - Feats: using aligned words, how well does FEATS match
|
||||
# - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match
|
||||
# - Lemmas: using aligned words, how well does LEMMA match
|
||||
# - UAS: using aligned words, how well does HEAD match
|
||||
# - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match
|
||||
# - if weights_file is given (with lines containing deprel-weight pairs),
|
||||
# one more metric is shown:
|
||||
# - WeightedLAS: as LAS, but each deprel (ignoring subtypes) has different weight
|
||||
|
||||
# API usage
|
||||
# ---------
|
||||
# - load_conllu(file)
|
||||
# - loads CoNLL-U file from given file object to an internal representation
|
||||
# - the file object should return str on both Python 2 and Python 3
|
||||
# - raises UDError exception if the given file cannot be loaded
|
||||
# - evaluate(gold_ud, system_ud)
|
||||
# - evaluate the given gold and system CoNLL-U files (loaded with load_conllu)
|
||||
# - raises UDError if the concatenated tokens of gold and system file do not match
|
||||
# - returns a dictionary with the metrics described above, each metrics having
|
||||
# three fields: precision, recall and f1
|
||||
|
||||
# Description of token matching
|
||||
# -----------------------------
|
||||
# In order to match tokens of gold file and system file, we consider the text
|
||||
# resulting from concatenation of gold tokens and text resulting from
|
||||
# concatenation of system tokens. These texts should match -- if they do not,
|
||||
# the evaluation fails.
|
||||
#
|
||||
# If the texts do match, every token is represented as a range in this original
|
||||
# text, and tokens are equal only if their range is the same.
|
||||
|
||||
# Description of word matching
|
||||
# ----------------------------
|
||||
# When matching words of gold file and system file, we first match the tokens.
|
||||
# The words which are also tokens are matched as tokens, but words in multi-word
|
||||
# tokens have to be handled differently.
|
||||
#
|
||||
# To handle multi-word tokens, we start by finding "multi-word spans".
|
||||
# Multi-word span is a span in the original text such that
|
||||
# - it contains at least one multi-word token
|
||||
# - all multi-word tokens in the span (considering both gold and system ones)
|
||||
# are completely inside the span (i.e., they do not "stick out")
|
||||
# - the multi-word span is as small as possible
|
||||
#
|
||||
# For every multi-word span, we align the gold and system words completely
|
||||
# inside this span using LCS on their FORMs. The words not intersecting
|
||||
# (even partially) any multi-word span are then aligned as tokens.
|
||||
|
||||
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import io
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
# CoNLL-U column names
|
||||
ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)
|
||||
|
||||
# UD Error is used when raising exceptions in this module
|
||||
class UDError(Exception):
|
||||
pass
|
||||
|
||||
# Load given CoNLL-U file into internal representation
|
||||
def load_conllu(file):
|
||||
# Internal representation classes
|
||||
class UDRepresentation:
|
||||
def __init__(self):
|
||||
# Characters of all the tokens in the whole file.
|
||||
# Whitespace between tokens is not included.
|
||||
self.characters = []
|
||||
# List of UDSpan instances with start&end indices into `characters`.
|
||||
self.tokens = []
|
||||
# List of UDWord instances.
|
||||
self.words = []
|
||||
# List of UDSpan instances with start&end indices into `characters`.
|
||||
self.sentences = []
|
||||
class UDSpan:
|
||||
def __init__(self, start, end, characters):
|
||||
self.start = start
|
||||
# Note that self.end marks the first position **after the end** of span,
|
||||
# so we can use characters[start:end] or range(start, end).
|
||||
self.end = end
|
||||
self.characters = characters
|
||||
|
||||
@property
|
||||
def text(self):
|
||||
return ''.join(self.characters[self.start:self.end])
|
||||
|
||||
def __str__(self):
|
||||
return self.text
|
||||
|
||||
def __repr__(self):
|
||||
return self.text
|
||||
class UDWord:
|
||||
def __init__(self, span, columns, is_multiword):
|
||||
# Span of this word (or MWT, see below) within ud_representation.characters.
|
||||
self.span = span
|
||||
# 10 columns of the CoNLL-U file: ID, FORM, LEMMA,...
|
||||
self.columns = columns
|
||||
# is_multiword==True means that this word is part of a multi-word token.
|
||||
# In that case, self.span marks the span of the whole multi-word token.
|
||||
self.is_multiword = is_multiword
|
||||
# Reference to the UDWord instance representing the HEAD (or None if root).
|
||||
self.parent = None
|
||||
# Let's ignore language-specific deprel subtypes.
|
||||
self.columns[DEPREL] = columns[DEPREL].split(':')[0]
|
||||
|
||||
ud = UDRepresentation()
|
||||
|
||||
# Load the CoNLL-U file
|
||||
index, sentence_start = 0, None
|
||||
linenum = 0
|
||||
while True:
|
||||
line = file.readline()
|
||||
linenum += 1
|
||||
if not line:
|
||||
break
|
||||
line = line.rstrip("\r\n")
|
||||
|
||||
# Handle sentence start boundaries
|
||||
if sentence_start is None:
|
||||
# Skip comments
|
||||
if line.startswith("#"):
|
||||
continue
|
||||
# Start a new sentence
|
||||
ud.sentences.append(UDSpan(index, 0, ud.characters))
|
||||
sentence_start = len(ud.words)
|
||||
if not line:
|
||||
# Add parent UDWord links and check there are no cycles
|
||||
def process_word(word):
|
||||
if word.parent == "remapping":
|
||||
raise UDError("There is a cycle in a sentence")
|
||||
if word.parent is None:
|
||||
head = int(word.columns[HEAD])
|
||||
if head > len(ud.words) - sentence_start:
|
||||
raise UDError("Line {}: HEAD '{}' points outside of the sentence".format(
|
||||
linenum, word.columns[HEAD]))
|
||||
if head:
|
||||
parent = ud.words[sentence_start + head - 1]
|
||||
word.parent = "remapping"
|
||||
process_word(parent)
|
||||
word.parent = parent
|
||||
|
||||
for word in ud.words[sentence_start:]:
|
||||
process_word(word)
|
||||
|
||||
# Check there is a single root node
|
||||
if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
|
||||
raise UDError("There are multiple roots in a sentence")
|
||||
|
||||
# End the sentence
|
||||
ud.sentences[-1].end = index
|
||||
sentence_start = None
|
||||
continue
|
||||
|
||||
# Read next token/word
|
||||
columns = line.split("\t")
|
||||
if len(columns) != 10:
|
||||
raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, line))
|
||||
|
||||
# Skip empty nodes
|
||||
if "." in columns[ID]:
|
||||
continue
|
||||
|
||||
# Delete spaces from FORM so gold.characters == system.characters
|
||||
# even if one of them tokenizes the space.
|
||||
columns[FORM] = columns[FORM].replace(" ", "")
|
||||
if not columns[FORM]:
|
||||
raise UDError("There is an empty FORM in the CoNLL-U file -- line %d" % linenum)
|
||||
|
||||
# Save token
|
||||
ud.characters.extend(columns[FORM])
|
||||
ud.tokens.append(UDSpan(index, index + len(columns[FORM]), ud.characters))
|
||||
index += len(columns[FORM])
|
||||
|
||||
# Handle multi-word tokens to save word(s)
|
||||
if "-" in columns[ID]:
|
||||
try:
|
||||
start, end = map(int, columns[ID].split("-"))
|
||||
except:
|
||||
raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
|
||||
|
||||
for _ in range(start, end + 1):
|
||||
word_line = file.readline().rstrip("\r\n")
|
||||
word_columns = word_line.split("\t")
|
||||
if len(word_columns) != 10:
|
||||
print(columns)
|
||||
raise UDError("The CoNLL-U line {} does not contain 10 tab-separated columns: '{}'".format(linenum, word_line))
|
||||
ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
|
||||
# Basic tokens/words
|
||||
else:
|
||||
try:
|
||||
word_id = int(columns[ID])
|
||||
except:
|
||||
raise UDError("Cannot parse word ID '{}'".format(columns[ID]))
|
||||
if word_id != len(ud.words) - sentence_start + 1:
|
||||
raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1))
|
||||
|
||||
try:
|
||||
head_id = int(columns[HEAD])
|
||||
except:
|
||||
raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD]))
|
||||
if head_id < 0:
|
||||
raise UDError("HEAD cannot be negative")
|
||||
|
||||
ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))
|
||||
|
||||
if sentence_start is not None:
|
||||
raise UDError("The CoNLL-U file does not end with empty line")
|
||||
|
||||
return ud
|
||||
|
||||
# Evaluate the gold and system treebanks (loaded using load_conllu).
|
||||
def evaluate(gold_ud, system_ud, deprel_weights=None):
|
||||
class Score:
|
||||
def __init__(self, gold_total, system_total, correct, aligned_total=None):
|
||||
self.precision = correct / system_total if system_total else 0.0
|
||||
self.recall = correct / gold_total if gold_total else 0.0
|
||||
self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0
|
||||
self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total
|
||||
class AlignmentWord:
|
||||
def __init__(self, gold_word, system_word):
|
||||
self.gold_word = gold_word
|
||||
self.system_word = system_word
|
||||
self.gold_parent = None
|
||||
self.system_parent_gold_aligned = None
|
||||
class Alignment:
|
||||
def __init__(self, gold_words, system_words):
|
||||
self.gold_words = gold_words
|
||||
self.system_words = system_words
|
||||
self.matched_words = []
|
||||
self.matched_words_map = {}
|
||||
def append_aligned_words(self, gold_word, system_word):
|
||||
self.matched_words.append(AlignmentWord(gold_word, system_word))
|
||||
self.matched_words_map[system_word] = gold_word
|
||||
def fill_parents(self):
|
||||
# We represent root parents in both gold and system data by '0'.
|
||||
# For gold data, we represent non-root parent by corresponding gold word.
|
||||
# For system data, we represent non-root parent by either gold word aligned
|
||||
# to parent system nodes, or by None if no gold words is aligned to the parent.
|
||||
for words in self.matched_words:
|
||||
words.gold_parent = words.gold_word.parent if words.gold_word.parent is not None else 0
|
||||
words.system_parent_gold_aligned = self.matched_words_map.get(words.system_word.parent, None) \
|
||||
if words.system_word.parent is not None else 0
|
||||
|
||||
def lower(text):
|
||||
if sys.version_info < (3, 0) and isinstance(text, str):
|
||||
return text.decode("utf-8").lower()
|
||||
return text.lower()
|
||||
|
||||
def spans_score(gold_spans, system_spans):
|
||||
correct, gi, si = 0, 0, 0
|
||||
while gi < len(gold_spans) and si < len(system_spans):
|
||||
if system_spans[si].start < gold_spans[gi].start:
|
||||
si += 1
|
||||
elif gold_spans[gi].start < system_spans[si].start:
|
||||
gi += 1
|
||||
else:
|
||||
correct += gold_spans[gi].end == system_spans[si].end
|
||||
si += 1
|
||||
gi += 1
|
||||
|
||||
return Score(len(gold_spans), len(system_spans), correct)
|
||||
|
||||
def alignment_score(alignment, key_fn, weight_fn=lambda w: 1):
|
||||
gold, system, aligned, correct = 0, 0, 0, 0
|
||||
|
||||
for word in alignment.gold_words:
|
||||
gold += weight_fn(word)
|
||||
|
||||
for word in alignment.system_words:
|
||||
system += weight_fn(word)
|
||||
|
||||
for words in alignment.matched_words:
|
||||
aligned += weight_fn(words.gold_word)
|
||||
|
||||
if key_fn is None:
|
||||
# Return score for whole aligned words
|
||||
return Score(gold, system, aligned)
|
||||
|
||||
for words in alignment.matched_words:
|
||||
if key_fn(words.gold_word, words.gold_parent) == key_fn(words.system_word, words.system_parent_gold_aligned):
|
||||
correct += weight_fn(words.gold_word)
|
||||
|
||||
return Score(gold, system, correct, aligned)
|
||||
|
||||
def beyond_end(words, i, multiword_span_end):
|
||||
if i >= len(words):
|
||||
return True
|
||||
if words[i].is_multiword:
|
||||
return words[i].span.start >= multiword_span_end
|
||||
return words[i].span.end > multiword_span_end
|
||||
|
||||
def extend_end(word, multiword_span_end):
|
||||
if word.is_multiword and word.span.end > multiword_span_end:
|
||||
return word.span.end
|
||||
return multiword_span_end
|
||||
|
||||
def find_multiword_span(gold_words, system_words, gi, si):
|
||||
# We know gold_words[gi].is_multiword or system_words[si].is_multiword.
|
||||
# Find the start of the multiword span (gs, ss), so the multiword span is minimal.
|
||||
# Initialize multiword_span_end characters index.
|
||||
if gold_words[gi].is_multiword:
|
||||
multiword_span_end = gold_words[gi].span.end
|
||||
if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start:
|
||||
si += 1
|
||||
else: # if system_words[si].is_multiword
|
||||
multiword_span_end = system_words[si].span.end
|
||||
if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start:
|
||||
gi += 1
|
||||
gs, ss = gi, si
|
||||
|
||||
# Find the end of the multiword span
|
||||
# (so both gi and si are pointing to the word following the multiword span end).
|
||||
while not beyond_end(gold_words, gi, multiword_span_end) or \
|
||||
not beyond_end(system_words, si, multiword_span_end):
|
||||
if gi < len(gold_words) and (si >= len(system_words) or
|
||||
gold_words[gi].span.start <= system_words[si].span.start):
|
||||
multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
|
||||
gi += 1
|
||||
else:
|
||||
multiword_span_end = extend_end(system_words[si], multiword_span_end)
|
||||
si += 1
|
||||
return gs, ss, gi, si
|
||||
|
||||
def compute_lcs(gold_words, system_words, gi, si, gs, ss):
|
||||
lcs = [[0] * (si - ss) for i in range(gi - gs)]
|
||||
for g in reversed(range(gi - gs)):
|
||||
for s in reversed(range(si - ss)):
|
||||
if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
|
||||
lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
|
||||
lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
|
||||
lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
|
||||
return lcs
|
||||
|
||||
def align_words(gold_words, system_words):
|
||||
alignment = Alignment(gold_words, system_words)
|
||||
|
||||
gi, si = 0, 0
|
||||
while gi < len(gold_words) and si < len(system_words):
|
||||
if gold_words[gi].is_multiword or system_words[si].is_multiword:
|
||||
# A: Multi-word tokens => align via LCS within the whole "multiword span".
|
||||
gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)
|
||||
|
||||
if si > ss and gi > gs:
|
||||
lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)
|
||||
|
||||
# Store aligned words
|
||||
s, g = 0, 0
|
||||
while g < gi - gs and s < si - ss:
|
||||
if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]):
|
||||
alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
|
||||
g += 1
|
||||
s += 1
|
||||
elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0):
|
||||
g += 1
|
||||
else:
|
||||
s += 1
|
||||
else:
|
||||
# B: No multi-word token => align according to spans.
|
||||
if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end):
|
||||
alignment.append_aligned_words(gold_words[gi], system_words[si])
|
||||
gi += 1
|
||||
si += 1
|
||||
elif gold_words[gi].span.start <= system_words[si].span.start:
|
||||
gi += 1
|
||||
else:
|
||||
si += 1
|
||||
|
||||
alignment.fill_parents()
|
||||
|
||||
return alignment
|
||||
|
||||
# Check that underlying character sequences do match
|
||||
if gold_ud.characters != system_ud.characters:
|
||||
index = 0
|
||||
while gold_ud.characters[index] == system_ud.characters[index]:
|
||||
index += 1
|
||||
|
||||
raise UDError(
|
||||
"The concatenation of tokens in gold file and in system file differ!\n" +
|
||||
"First 20 differing characters in gold file: '{}' and system file: '{}'".format(
|
||||
"".join(gold_ud.characters[index:index + 20]),
|
||||
"".join(system_ud.characters[index:index + 20])
|
||||
)
|
||||
)
|
||||
|
||||
# Align words
|
||||
alignment = align_words(gold_ud.words, system_ud.words)
|
||||
|
||||
# Compute the F1-scores
|
||||
result = {
|
||||
"Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
|
||||
"Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
|
||||
"Words": alignment_score(alignment, None),
|
||||
"UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]),
|
||||
"XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]),
|
||||
"Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
|
||||
"AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
|
||||
"Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
|
||||
"UAS": alignment_score(alignment, lambda w, parent: parent),
|
||||
"LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])),
|
||||
}
|
||||
|
||||
# Add WeightedLAS if weights are given
|
||||
if deprel_weights is not None:
|
||||
def weighted_las(word):
|
||||
return deprel_weights.get(word.columns[DEPREL], 1.0)
|
||||
result["WeightedLAS"] = alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL]), weighted_las)
|
||||
|
||||
return result
|
||||
|
||||
def load_deprel_weights(weights_file):
|
||||
if weights_file is None:
|
||||
return None
|
||||
|
||||
deprel_weights = {}
|
||||
for line in weights_file:
|
||||
# Ignore comments and empty lines
|
||||
if line.startswith("#") or not line.strip():
|
||||
continue
|
||||
|
||||
columns = line.rstrip("\r\n").split()
|
||||
if len(columns) != 2:
|
||||
raise ValueError("Expected two columns in the UD Relations weights file on line '{}'".format(line))
|
||||
|
||||
deprel_weights[columns[0]] = float(columns[1])
|
||||
|
||||
return deprel_weights
|
||||
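For reference, a small sketch of the weights-file format that load_deprel_weights parses above: '#' comments and blank lines are skipped, and each remaining line holds a relation name and a float. The relation names below are ordinary UD labels; the numbers are made up.

import io

weights_text = u"# deprel  weight\nnsubj 2.0\npunct 0.5\n"
weights = load_deprel_weights(io.StringIO(weights_text))
assert weights == {'nsubj': 2.0, 'punct': 0.5}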
|
||||
def load_conllu_file(path):
|
||||
_file = open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}))
|
||||
return load_conllu(_file)
|
||||
|
||||
def evaluate_wrapper(args):
|
||||
# Load CoNLL-U files
|
||||
gold_ud = load_conllu_file(args.gold_file)
|
||||
system_ud = load_conllu_file(args.system_file)
|
||||
|
||||
# Load weights if requested
|
||||
deprel_weights = load_deprel_weights(args.weights)
|
||||
|
||||
return evaluate(gold_ud, system_ud, deprel_weights)
|
||||
|
||||
def main():
|
||||
# Parse arguments
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("gold_file", type=str,
|
||||
help="Name of the CoNLL-U file with the gold data.")
|
||||
parser.add_argument("system_file", type=str,
|
||||
help="Name of the CoNLL-U file with the predicted data.")
|
||||
parser.add_argument("--weights", "-w", type=argparse.FileType("r"), default=None,
|
||||
metavar="deprel_weights_file",
|
||||
help="Compute WeightedLAS using given weights for Universal Dependency Relations.")
|
||||
parser.add_argument("--verbose", "-v", default=0, action="count",
|
||||
help="Print all metrics.")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Use verbose if weights are supplied
|
||||
if args.weights is not None and not args.verbose:
|
||||
args.verbose = 1
|
||||
|
||||
# Evaluate
|
||||
evaluation = evaluate_wrapper(args)
|
||||
|
||||
# Print the evaluation
|
||||
if not args.verbose:
|
||||
print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
|
||||
else:
|
||||
metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS"]
|
||||
if args.weights is not None:
|
||||
metrics.append("WeightedLAS")
|
||||
|
||||
print("Metrics | Precision | Recall | F1 Score | AligndAcc")
|
||||
print("-----------+-----------+-----------+-----------+-----------")
|
||||
for metric in metrics:
|
||||
print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
|
||||
metric,
|
||||
100 * evaluation[metric].precision,
|
||||
100 * evaluation[metric].recall,
|
||||
100 * evaluation[metric].f1,
|
||||
"{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else ""
|
||||
))
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
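Besides the argparse entry point above, the module can also be driven from Python; a minimal sketch (the file names are hypothetical):

import conll17_ud_eval as ud_eval

gold = ud_eval.load_conllu_file('gold.conllu')
system = ud_eval.load_conllu_file('system.conllu')
scores = ud_eval.evaluate(gold, system)
print('LAS F1: {:.2f}'.format(100 * scores['LAS'].f1))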
|
||||
# Tests, which can be executed with `python -m unittest conll17_ud_eval`.
|
||||
class TestAlignment(unittest.TestCase):
|
||||
@staticmethod
|
||||
def _load_words(words):
|
||||
"""Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors."""
|
||||
lines, num_words = [], 0
|
||||
for w in words:
|
||||
parts = w.split(" ")
|
||||
if len(parts) == 1:
|
||||
num_words += 1
|
||||
lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1)))
|
||||
else:
|
||||
lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0]))
|
||||
for part in parts[1:]:
|
||||
num_words += 1
|
||||
lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1)))
|
||||
return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"])))
|
||||
|
||||
def _test_exception(self, gold, system):
|
||||
self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system))
|
||||
|
||||
def _test_ok(self, gold, system, correct):
|
||||
metrics = evaluate(self._load_words(gold), self._load_words(system))
|
||||
gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold))
|
||||
system_words = sum((max(1, len(word.split(" ")) - 1) for word in system))
|
||||
self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1),
|
||||
(correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words)))
|
||||
|
||||
def test_exception(self):
|
||||
self._test_exception(["a"], ["b"])
|
||||
|
||||
def test_equal(self):
|
||||
self._test_ok(["a"], ["a"], 1)
|
||||
self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3)
|
||||
|
||||
def test_equal_with_multiword(self):
|
||||
self._test_ok(["abc a b c"], ["a", "b", "c"], 3)
|
||||
self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4)
|
||||
self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4)
|
||||
self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5)
|
||||
|
||||
def test_alignment(self):
|
||||
self._test_ok(["abcd"], ["a", "b", "c", "d"], 0)
|
||||
self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1)
|
||||
self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2)
|
||||
self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2)
|
||||
self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4)
|
||||
self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2)
|
||||
self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)
|
|
@ -8,8 +8,8 @@ from thinc.neural._classes.model import Model
|
|||
from timeit import default_timer as timer
|
||||
|
||||
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
||||
from ..gold import GoldCorpus, minibatch
|
||||
from ..util import prints
|
||||
from ..gold import GoldCorpus
|
||||
from ..util import prints, minibatch, minibatch_by_words
|
||||
from .. import util
|
||||
from .. import about
|
||||
from .. import displacy
|
||||
|
@ -51,8 +51,6 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
|||
train_path = util.ensure_path(train_data)
|
||||
dev_path = util.ensure_path(dev_data)
|
||||
meta_path = util.ensure_path(meta_path)
|
||||
if not output_path.exists():
|
||||
output_path.mkdir()
|
||||
if not train_path.exists():
|
||||
prints(train_path, title="Training data not found", exits=1)
|
||||
if dev_path and not dev_path.exists():
|
||||
|
@ -65,7 +63,14 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
|||
title="Not a valid meta.json format", exits=1)
|
||||
meta.setdefault('lang', lang)
|
||||
meta.setdefault('name', 'unnamed')
|
||||
|
||||
if not output_path.exists():
|
||||
output_path.mkdir()
|
||||
|
||||
print("Counting training words (limit=%s" % n_sents)
|
||||
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
||||
n_train_words = corpus.count_train()
|
||||
print(n_train_words)
|
||||
pipeline = ['tagger', 'parser', 'ner']
|
||||
if no_tagger and 'tagger' in pipeline:
|
||||
pipeline.remove('tagger')
|
||||
|
@ -81,13 +86,9 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
|||
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
|
||||
util.env_opt('dropout_to', 0.2),
|
||||
util.env_opt('dropout_decay', 0.0))
|
||||
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
|
||||
util.env_opt('batch_to', 16),
|
||||
batch_sizes = util.compounding(util.env_opt('batch_from', 1000),
|
||||
util.env_opt('batch_to', 1000),
|
||||
util.env_opt('batch_compound', 1.001))
|
||||
max_doc_len = util.env_opt('max_doc_len', 5000)
|
||||
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
||||
n_train_words = corpus.count_train()
|
||||
|
||||
lang_class = util.get_lang_class(lang)
|
||||
nlp = lang_class()
|
||||
meta['pipeline'] = pipeline
|
||||
|
@ -105,6 +106,7 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
|||
lex.is_oov = False
|
||||
for name in pipeline:
|
||||
nlp.add_pipe(nlp.create_pipe(name), name=name)
|
||||
nlp.add_pipe(nlp.create_pipe('merge_subtokens'))
|
||||
if parser_multitasks:
|
||||
for objective in parser_multitasks.split(','):
|
||||
nlp.parser.add_multitask_objective(objective)
|
||||
|
@ -116,21 +118,20 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
|||
|
||||
print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
|
||||
try:
|
||||
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
|
||||
gold_preproc=gold_preproc, max_length=0)
|
||||
train_docs = list(train_docs)
|
||||
for i in range(n_iter):
|
||||
train_docs = corpus.train_docs(nlp, noise_level=0.0,
|
||||
gold_preproc=gold_preproc, max_length=0)
|
||||
words_seen = 0
|
||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||
losses = {}
|
||||
for batch in minibatch(train_docs, size=batch_sizes):
|
||||
batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
|
||||
for batch in minibatch_by_words(train_docs, size=batch_sizes):
|
||||
if not batch:
|
||||
continue
|
||||
docs, golds = zip(*batch)
|
||||
nlp.update(docs, golds, sgd=optimizer,
|
||||
drop=next(dropout_rates), losses=losses)
|
||||
pbar.update(sum(len(doc) for doc in docs))
|
||||
|
||||
words_seen += sum(len(doc) for doc in docs)
|
||||
with nlp.use_params(optimizer.averages):
|
||||
util.set_env_log(False)
|
||||
epoch_model_path = output_path / ('model%d' % i)
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import six
|
||||
import ftfy
|
||||
import sys
|
||||
import ujson
|
||||
|
@ -47,9 +46,10 @@ is_windows = sys.platform.startswith('win')
|
|||
is_linux = sys.platform.startswith('linux')
|
||||
is_osx = sys.platform == 'darwin'
|
||||
|
||||
is_python2 = six.PY2
|
||||
is_python3 = six.PY3
|
||||
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1]<5)
|
||||
# See: https://github.com/benjaminp/six/blob/master/six.py
|
||||
is_python2 = sys.version_info[0] == 2
|
||||
is_python3 = sys.version_info[0] == 3
|
||||
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)
|
||||
|
||||
if is_python2:
|
||||
bytes_ = str
|
||||
|
|
spacy/gold.pyx
|
@ -3,16 +3,25 @@
|
|||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import re
|
||||
import ujson
|
||||
import random
|
||||
import cytoolz
|
||||
import itertools
|
||||
import numpy
|
||||
import tempfile
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
import msgpack
|
||||
|
||||
import ujson
|
||||
|
||||
from . import _align
|
||||
from .syntax import nonproj
|
||||
from .tokens import Doc
|
||||
from . import util
|
||||
from .util import minibatch
|
||||
from .util import minibatch, itershuffle
|
||||
from .compat import json_dumps
|
||||
|
||||
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
|
||||
|
||||
def tags_to_entities(tags):
|
||||
entities = []
|
||||
|
@ -59,196 +68,62 @@ def merge_sents(sents):
|
|||
return [(m_deps, m_brackets)]
|
||||
|
||||
|
||||
def align(cand_words, gold_words):
|
||||
cost, edit_path = _min_edit_path(cand_words, gold_words)
|
||||
alignment = []
|
||||
i_of_gold = 0
|
||||
for move in edit_path:
|
||||
if move == 'M':
|
||||
alignment.append(i_of_gold)
|
||||
i_of_gold += 1
|
||||
elif move == 'S':
|
||||
alignment.append(None)
|
||||
i_of_gold += 1
|
||||
elif move == 'D':
|
||||
alignment.append(None)
|
||||
elif move == 'I':
|
||||
i_of_gold += 1
|
||||
else:
|
||||
raise Exception(move)
|
||||
return alignment
|
||||
|
||||
|
||||
punct_re = re.compile(r'\W')
|
||||
|
||||
|
||||
def _min_edit_path(cand_words, gold_words):
|
||||
cdef:
|
||||
Pool mem
|
||||
int i, j, n_cand, n_gold
|
||||
int* curr_costs
|
||||
int* prev_costs
|
||||
|
||||
# TODO: Fix this --- just do it properly, make the full edit matrix and
|
||||
# then walk back over it...
|
||||
# Preprocess inputs
|
||||
cand_words = [punct_re.sub('', w).lower() for w in cand_words]
|
||||
gold_words = [punct_re.sub('', w).lower() for w in gold_words]
|
||||
|
||||
def align(cand_words, gold_words):
|
||||
if cand_words == gold_words:
|
||||
return 0, ''.join(['M' for _ in gold_words])
|
||||
mem = Pool()
|
||||
n_cand = len(cand_words)
|
||||
n_gold = len(gold_words)
|
||||
# Levenshtein distance, except we need the history, and we may want
|
||||
# different costs. Mark operations with a string, and score the history
|
||||
# using _edit_cost.
|
||||
previous_row = []
|
||||
prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
||||
curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
|
||||
for i in range(n_gold + 1):
|
||||
cell = ''
|
||||
for j in range(i):
|
||||
cell += 'I'
|
||||
previous_row.append('I' * i)
|
||||
prev_costs[i] = i
|
||||
for i, cand in enumerate(cand_words):
|
||||
current_row = ['D' * (i + 1)]
|
||||
curr_costs[0] = i+1
|
||||
for j, gold in enumerate(gold_words):
|
||||
if gold.lower() == cand.lower():
|
||||
s_cost = prev_costs[j]
|
||||
i_cost = curr_costs[j] + 1
|
||||
d_cost = prev_costs[j + 1] + 1
|
||||
else:
|
||||
s_cost = prev_costs[j] + 1
|
||||
i_cost = curr_costs[j] + 1
|
||||
d_cost = prev_costs[j + 1] + (1 if cand else 0)
|
||||
|
||||
if s_cost <= i_cost and s_cost <= d_cost:
|
||||
best_cost = s_cost
|
||||
best_hist = previous_row[j] + ('M' if gold == cand else 'S')
|
||||
elif i_cost <= s_cost and i_cost <= d_cost:
|
||||
best_cost = i_cost
|
||||
best_hist = current_row[j] + 'I'
|
||||
else:
|
||||
best_cost = d_cost
|
||||
best_hist = previous_row[j + 1] + 'D'
|
||||
|
||||
current_row.append(best_hist)
|
||||
curr_costs[j+1] = best_cost
|
||||
previous_row = current_row
|
||||
for j in range(len(gold_words) + 1):
|
||||
prev_costs[j] = curr_costs[j]
|
||||
curr_costs[j] = 0
|
||||
|
||||
return prev_costs[n_gold], previous_row[-1]
|
||||
alignment = numpy.arange(len(cand_words))
|
||||
return 0, alignment, alignment, {}, {}
|
||||
cand_words = [w.replace(' ', '') for w in cand_words]
|
||||
gold_words = [w.replace(' ', '') for w in gold_words]
|
||||
cost, i2j, j2i, matrix = _align.align(cand_words, gold_words)
|
||||
i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in cand_words],
|
||||
[len(w) for w in gold_words])
|
||||
for i, j in list(i2j_multi.items()):
|
||||
if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
|
||||
i2j[i] = j
|
||||
i2j_multi.pop(i)
|
||||
for j, i in list(j2i_multi.items()):
|
||||
if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
|
||||
j2i[j] = i
|
||||
j2i_multi.pop(j)
|
||||
return cost, i2j, j2i, i2j_multi, j2i_multi
|
||||
|
||||
|
||||
class GoldCorpus(object):
|
||||
"""An annotated corpus, using the JSON file format. Manages
|
||||
annotations for tagging, dependency parsing and NER."""
|
||||
def __init__(self, train_path, dev_path, gold_preproc=True, limit=None):
|
||||
def __init__(self, train, dev, gold_preproc=False, limit=None):
|
||||
"""Create a GoldCorpus.
|
||||
|
||||
train_path (unicode or Path): File or directory of training data.
|
||||
dev_path (unicode or Path): File or directory of development data.
|
||||
RETURNS (GoldCorpus): The newly created object.
|
||||
"""
|
||||
self.train_path = util.ensure_path(train_path)
|
||||
self.dev_path = util.ensure_path(dev_path)
|
||||
self.limit = limit
|
||||
self.train_locs = self.walk_corpus(self.train_path)
|
||||
self.dev_locs = self.walk_corpus(self.dev_path)
|
||||
if isinstance(train, str) or isinstance(train, Path):
|
||||
train = self.read_tuples(self.walk_corpus(train))
|
||||
dev = self.read_tuples(self.walk_corpus(dev))
|
||||
|
||||
@property
|
||||
def train_tuples(self):
|
||||
i = 0
|
||||
for loc in self.train_locs:
|
||||
gold_tuples = read_json_file(loc)
|
||||
for item in gold_tuples:
|
||||
yield item
|
||||
i += len(item[1])
|
||||
if self.limit and i >= self.limit:
|
||||
break
|
||||
# Write temp directory with one doc per file, so we can shuffle
|
||||
# and stream
|
||||
self.tmp_dir = Path(tempfile.mkdtemp())
|
||||
self.write_msgpack(self.tmp_dir / 'train', train)
|
||||
self.write_msgpack(self.tmp_dir / 'dev', dev)
|
||||
|
||||
@property
|
||||
def dev_tuples(self):
|
||||
i = 0
|
||||
for loc in self.dev_locs:
|
||||
gold_tuples = read_json_file(loc)
|
||||
for item in gold_tuples:
|
||||
yield item
|
||||
i += len(item[1])
|
||||
if self.limit and i >= self.limit:
|
||||
break
|
||||
|
||||
def count_train(self):
|
||||
n = 0
|
||||
i = 0
|
||||
for raw_text, paragraph_tuples in self.train_tuples:
|
||||
n += sum([len(s[0][1]) for s in paragraph_tuples])
|
||||
if self.limit and i >= self.limit:
|
||||
break
|
||||
i += len(paragraph_tuples)
|
||||
return n
|
||||
|
||||
def train_docs(self, nlp, gold_preproc=False,
|
||||
projectivize=False, max_length=None,
|
||||
noise_level=0.0):
|
||||
train_tuples = self.train_tuples
|
||||
if projectivize:
|
||||
train_tuples = nonproj.preprocess_training_data(
|
||||
self.train_tuples, label_freq_cutoff=100)
|
||||
random.shuffle(train_tuples)
|
||||
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
||||
max_length=max_length,
|
||||
noise_level=noise_level)
|
||||
yield from gold_docs
|
||||
|
||||
def dev_docs(self, nlp, gold_preproc=False):
|
||||
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
|
||||
yield from gold_docs
|
||||
|
||||
@classmethod
|
||||
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
|
||||
noise_level=0.0):
|
||||
for raw_text, paragraph_tuples in tuples:
|
||||
if gold_preproc:
|
||||
raw_text = None
|
||||
else:
|
||||
paragraph_tuples = merge_sents(paragraph_tuples)
|
||||
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
||||
gold_preproc, noise_level=noise_level)
|
||||
golds = cls._make_golds(docs, paragraph_tuples)
|
||||
for doc, gold in zip(docs, golds):
|
||||
if (not max_length) or len(doc) < max_length:
|
||||
yield doc, gold
|
||||
|
||||
@classmethod
|
||||
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
|
||||
noise_level=0.0):
|
||||
if raw_text is not None:
|
||||
raw_text = add_noise(raw_text, noise_level)
|
||||
return [nlp.make_doc(raw_text)]
|
||||
else:
|
||||
return [Doc(nlp.vocab,
|
||||
words=add_noise(sent_tuples[1], noise_level))
|
||||
for (sent_tuples, brackets) in paragraph_tuples]
|
||||
|
||||
@classmethod
|
||||
def _make_golds(cls, docs, paragraph_tuples):
|
||||
assert len(docs) == len(paragraph_tuples)
|
||||
if len(docs) == 1:
|
||||
return [GoldParse.from_annot_tuples(docs[0],
|
||||
paragraph_tuples[0][0])]
|
||||
else:
|
||||
return [GoldParse.from_annot_tuples(doc, sent_tuples)
|
||||
for doc, (sent_tuples, brackets)
|
||||
in zip(docs, paragraph_tuples)]
|
||||
def __del__(self):
|
||||
shutil.rmtree(self.tmp_dir)
|
||||
|
||||
@staticmethod
|
||||
def write_msgpack(directory, doc_tuples):
|
||||
if not directory.exists():
|
||||
directory.mkdir()
|
||||
for i, doc_tuple in enumerate(doc_tuples):
|
||||
with open(directory / '{}.msg'.format(i), 'wb') as file_:
|
||||
msgpack.dump([doc_tuple], file_, use_bin_type=True, encoding='utf8')
|
||||
|
||||
@staticmethod
|
||||
def walk_corpus(path):
|
||||
path = util.ensure_path(path)
|
||||
if not path.is_dir():
|
||||
return [path]
|
||||
paths = [path]
|
||||
|
@ -266,6 +141,101 @@ class GoldCorpus(object):
|
|||
locs.append(path)
|
||||
return locs
|
||||
|
||||
@staticmethod
|
||||
def read_tuples(locs, limit=0):
|
||||
i = 0
|
||||
for loc in locs:
|
||||
loc = util.ensure_path(loc)
|
||||
if loc.parts[-1].endswith('json'):
|
||||
gold_tuples = read_json_file(loc)
|
||||
elif loc.parts[-1].endswith('msg'):
|
||||
with loc.open('rb') as file_:
|
||||
gold_tuples = msgpack.load(file_, encoding='utf8')
|
||||
else:
|
||||
msg = "Cannot read from file: %s. Supported formats: .json, .msg"
|
||||
raise ValueError(msg % loc)
|
||||
for item in gold_tuples:
|
||||
yield item
|
||||
i += len(item[1])
|
||||
if limit and i >= limit:
|
||||
break
|
||||
|
||||
@property
|
||||
def dev_tuples(self):
|
||||
locs = (self.tmp_dir / 'dev').iterdir()
|
||||
yield from self.read_tuples(locs, limit=self.limit)
|
||||
|
||||
@property
|
||||
def train_tuples(self):
|
||||
locs = (self.tmp_dir / 'train').iterdir()
|
||||
yield from self.read_tuples(locs, limit=self.limit)
|
||||
|
||||
def count_train(self):
|
||||
n = 0
|
||||
i = 0
|
||||
for raw_text, paragraph_tuples in self.train_tuples:
|
||||
for sent_tuples, brackets in paragraph_tuples:
|
||||
n += len(sent_tuples[1])
|
||||
if self.limit and i >= self.limit:
|
||||
break
|
||||
i += len(paragraph_tuples)
|
||||
return n
|
||||
|
||||
def train_docs(self, nlp, gold_preproc=False, max_length=None,
|
||||
noise_level=0.0):
|
||||
locs = list((self.tmp_dir / 'train').iterdir())
|
||||
random.shuffle(locs)
|
||||
train_tuples = self.read_tuples(locs, limit=self.limit)
|
||||
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
||||
max_length=max_length,
|
||||
noise_level=noise_level,
|
||||
make_projective=True)
|
||||
yield from gold_docs
|
||||
|
||||
def dev_docs(self, nlp, gold_preproc=False):
|
||||
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples,
|
||||
gold_preproc=gold_preproc)
|
||||
yield from gold_docs
|
||||
|
||||
@classmethod
|
||||
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
|
||||
noise_level=0.0, make_projective=False):
|
||||
for raw_text, paragraph_tuples in tuples:
|
||||
if gold_preproc:
|
||||
raw_text = None
|
||||
else:
|
||||
paragraph_tuples = merge_sents(paragraph_tuples)
|
||||
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
||||
gold_preproc, noise_level=noise_level)
|
||||
golds = cls._make_golds(docs, paragraph_tuples, make_projective)
|
||||
for doc, gold in zip(docs, golds):
|
||||
if (not max_length) or len(doc) < max_length:
|
||||
yield doc, gold
|
||||
|
||||
@classmethod
|
||||
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
|
||||
noise_level=0.0):
|
||||
if raw_text is not None:
|
||||
raw_text = add_noise(raw_text, noise_level)
|
||||
return [nlp.make_doc(raw_text)]
|
||||
else:
|
||||
return [Doc(nlp.vocab,
|
||||
words=add_noise(sent_tuples[1], noise_level))
|
||||
for (sent_tuples, brackets) in paragraph_tuples]
|
||||
|
||||
@classmethod
|
||||
def _make_golds(cls, docs, paragraph_tuples, make_projective):
|
||||
assert len(docs) == len(paragraph_tuples)
|
||||
if len(docs) == 1:
|
||||
return [GoldParse.from_annot_tuples(docs[0],
|
||||
paragraph_tuples[0][0],
|
||||
make_projective=make_projective)]
|
||||
else:
|
||||
return [GoldParse.from_annot_tuples(doc, sent_tuples,
|
||||
make_projective=make_projective)
|
||||
for doc, (sent_tuples, brackets)
|
||||
in zip(docs, paragraph_tuples)]
|
||||
|
||||
|
||||
def add_noise(orig, noise_level):
|
||||
if random.random() >= noise_level:
|
||||
|
@ -297,11 +267,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
|
|||
for filename in loc.iterdir():
|
||||
yield from read_json_file(loc / filename, limit=limit)
|
||||
else:
|
||||
with loc.open('r', encoding='utf8') as file_:
|
||||
docs = ujson.load(file_)
|
||||
if limit is not None:
|
||||
docs = docs[:limit]
|
||||
for doc in docs:
|
||||
for doc in _json_iterate(loc):
|
||||
if docs_filter is not None and not docs_filter(doc):
|
||||
continue
|
||||
paragraphs = []
|
||||
|
@ -331,6 +297,56 @@ def read_json_file(loc, docs_filter=None, limit=None):
|
|||
yield [paragraph.get('raw', None), sents]
|
||||
|
||||
|
||||
def _json_iterate(loc):
|
||||
# We should've made these files jsonl...But since we didn't, parse out
|
||||
# the docs one-by-one to reduce memory usage.
|
||||
# It's okay to read in the whole file -- just don't parse it into JSON.
|
||||
cdef bytes py_raw
|
||||
loc = util.ensure_path(loc)
|
||||
with loc.open('rb') as file_:
|
||||
py_raw = file_.read()
|
||||
raw = <char*>py_raw
|
||||
cdef int square_depth = 0
|
||||
cdef int curly_depth = 0
|
||||
cdef int inside_string = 0
|
||||
cdef int escape = 0
|
||||
cdef int start = -1
|
||||
cdef char c
|
||||
cdef char quote = ord('"')
|
||||
cdef char backslash = ord('\\')
|
||||
cdef char open_square = ord('[')
|
||||
cdef char close_square = ord(']')
|
||||
cdef char open_curly = ord('{')
|
||||
cdef char close_curly = ord('}')
|
||||
for i in range(len(py_raw)):
|
||||
c = raw[i]
|
||||
if c == backslash:
|
||||
escape = True
|
||||
continue
|
||||
if escape:
|
||||
escape = False
|
||||
continue
|
||||
if c == quote:
|
||||
inside_string = not inside_string
|
||||
continue
|
||||
if inside_string:
|
||||
continue
|
||||
if c == open_square:
|
||||
square_depth += 1
|
||||
elif c == close_square:
|
||||
square_depth -= 1
|
||||
elif c == open_curly:
|
||||
if square_depth == 1 and curly_depth == 0:
|
||||
start = i
|
||||
curly_depth += 1
|
||||
elif c == close_curly:
|
||||
curly_depth -= 1
|
||||
if square_depth == 1 and curly_depth == 0:
|
||||
py_str = py_raw[start : i+1].decode('utf8')
|
||||
yield ujson.loads(py_str)
|
||||
start = -1
|
||||
|
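For readers who want the gist of _json_iterate without the Cython details, here is a plain-Python sketch of the same bracket-depth scan (a simplified illustration, not the implementation used above): it slices each top-level object out of the outer JSON array and parses the pieces one at a time instead of parsing the whole document.

import json

def iter_json_docs(raw):
    square = curly = 0
    inside_string = escape = False
    start = -1
    for i, c in enumerate(raw):
        if escape:                      # skip the character after a backslash
            escape = False
            continue
        if c == '\\':
            escape = True
            continue
        if c == '"':
            inside_string = not inside_string
            continue
        if inside_string:
            continue
        if c == '[':
            square += 1
        elif c == ']':
            square -= 1
        elif c == '{':
            if square == 1 and curly == 0:
                start = i               # start of a top-level document
            curly += 1
        elif c == '}':
            curly -= 1
            if square == 1 and curly == 0:
                yield json.loads(raw[start:i + 1])

print(list(iter_json_docs('[{"id": 1}, {"id": 2}]')))  # [{'id': 1}, {'id': 2}]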
||||
|
||||
def iob_to_biluo(tags):
|
||||
out = []
|
||||
curr_label = None
|
||||
|
@ -434,8 +450,21 @@ cdef class GoldParse:
|
|||
self.labels = [None] * len(doc)
|
||||
self.ner = [None] * len(doc)
|
||||
|
||||
self.cand_to_gold = align([t.orth_ for t in doc], words)
|
||||
self.gold_to_cand = align(words, [t.orth_ for t in doc])
|
||||
# This needs to be done before we align the words
|
||||
if make_projective and heads is not None and deps is not None:
|
||||
heads, deps = nonproj.projectivize(heads, deps)
|
||||
|
||||
# Do many-to-one alignment for misaligned tokens.
|
||||
# If we over-segment, we'll have one gold word that covers a sequence
|
||||
# of predicted words
|
||||
# If we under-segment, we'll have one predicted word that covers a
|
||||
# sequence of gold words.
|
||||
# If we "mis-segment", we'll have a sequence of predicted words covering
|
||||
# a sequence of gold words. That's many-to-many -- we don't do that.
|
||||
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
|
||||
|
||||
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
|
||||
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
|
||||
|
||||
annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
|
||||
self.orig_annot = list(zip(*annot_tuples))
|
||||
|
@ -443,12 +472,47 @@ cdef class GoldParse:
|
|||
for i, gold_i in enumerate(self.cand_to_gold):
|
||||
if doc[i].text.isspace():
|
||||
self.words[i] = doc[i].text
|
||||
self.tags[i] = 'SP'
|
||||
self.tags[i] = '_SP'
|
||||
self.heads[i] = None
|
||||
self.labels[i] = None
|
||||
self.ner[i] = 'O'
|
||||
if gold_i is None:
|
||||
pass
|
||||
if i in i2j_multi:
|
||||
self.words[i] = words[i2j_multi[i]]
|
||||
self.tags[i] = tags[i2j_multi[i]]
|
||||
is_last = i2j_multi[i] != i2j_multi.get(i+1)
|
||||
is_first = i2j_multi[i] != i2j_multi.get(i-1)
|
||||
# Set next word in multi-token span as head, until last
|
||||
if not is_last:
|
||||
self.heads[i] = i+1
|
||||
self.labels[i] = 'subtok'
|
||||
else:
|
||||
self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
|
||||
self.labels[i] = deps[i2j_multi[i]]
|
||||
# Now set NER... This is annoying because if we've got an entity word
# split into two, we need to adjust the BILOU tags. We can't have
# BB or LL etc.
|
||||
# Case 1: O -- easy.
|
||||
ner_tag = entities[i2j_multi[i]]
|
||||
if ner_tag == 'O':
|
||||
self.ner[i] = 'O'
|
||||
# Case 2: U. This has to become a B I* L sequence.
|
||||
elif ner_tag.startswith('U-'):
|
||||
if is_first:
|
||||
self.ner[i] = ner_tag.replace('U-', 'B-', 1)
|
||||
elif is_last:
|
||||
self.ner[i] = ner_tag.replace('U-', 'L-', 1)
|
||||
else:
|
||||
self.ner[i] = ner_tag.replace('U-', 'I-', 1)
|
||||
# Case 3: L. If not last, change to I.
|
||||
elif ner_tag.startswith('L-'):
|
||||
if is_last:
|
||||
self.ner[i] = ner_tag
|
||||
else:
|
||||
self.ner[i] = ner_tag.replace('L-', 'I-', 1)
|
||||
# Case 4: I. Stays correct
|
||||
elif ner_tag.startswith('I-'):
|
||||
self.ner[i] = ner_tag
|
||||
else:
|
||||
self.words[i] = words[gold_i]
|
||||
self.tags[i] = tags[gold_i]
|
||||
|
@ -463,10 +527,6 @@ cdef class GoldParse:
|
|||
if cycle is not None:
|
||||
raise Exception("Cycle found: %s" % cycle)
|
||||
|
||||
if make_projective:
|
||||
proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
|
||||
self.heads = proj_heads
|
||||
|
||||
def __len__(self):
|
||||
"""Get the number of gold-standard tokens.
|
||||
|
||||
|
|
|
@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
|
|||
must my myself
|
||||
|
||||
name namely neither never nevertheless next nine no nobody none noone nor not
|
||||
nothing now nowhere
|
||||
nothing now nowhere n't
|
||||
|
||||
of off often on once one only onto or other others otherwise our ours ourselves
|
||||
out over own
|
||||
|
@ -66,4 +66,6 @@ whereafter whereas whereby wherein whereupon wherever whether which while
|
|||
whither who whoever whole whom whose why will with within without would
|
||||
|
||||
yet you your yours yourself yourselves
|
||||
|
||||
'd 'll 'm 're 's 've
|
||||
""".split())
|
||||
|
|
|
@ -6,17 +6,19 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
|||
|
||||
def noun_chunks(obj):
|
||||
doc = obj.doc
|
||||
np_label = doc.vocab.strings['NP']
|
||||
if not len(doc):
|
||||
return
|
||||
np_label = doc.vocab.strings.add('NP')
|
||||
left_labels = ['det', 'fixed', 'neg']  # ['nummod', 'det', 'appos', 'fixed']
|
||||
right_labels = ['flat', 'fixed', 'compound', 'neg']
|
||||
stop_labels = ['punct']
|
||||
np_left_deps = [doc.vocab.strings[label] for label in left_labels]
|
||||
np_right_deps = [doc.vocab.strings[label] for label in right_labels]
|
||||
stop_deps = [doc.vocab.strings[label] for label in stop_labels]
|
||||
np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
|
||||
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
|
||||
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
|
||||
token = doc[0]
|
||||
while token and token.i < len(doc):
|
||||
if token.pos in [PROPN, NOUN, PRON]:
|
||||
left, right = noun_bounds(token)
|
||||
left, right = noun_bounds(doc, token, np_left_deps, np_right_deps, stop_deps)
|
||||
yield left.i, right.i+1, np_label
|
||||
token = right
|
||||
token = next_token(token)
|
||||
|
@ -33,7 +35,7 @@ def next_token(token):
|
|||
return None
|
||||
|
||||
|
||||
def noun_bounds(root):
|
||||
def noun_bounds(doc, root, np_left_deps, np_right_deps, stop_deps):
|
||||
left_bound = root
|
||||
for token in reversed(list(root.lefts)):
|
||||
if token.dep in np_left_deps:
|
||||
|
@ -41,7 +43,7 @@ def noun_bounds(root):
|
|||
right_bound = root
|
||||
for token in root.rights:
|
||||
if (token.dep in np_right_deps):
|
||||
left, right = noun_bounds(token)
|
||||
left, right = noun_bounds(doc, token, np_left_deps, np_right_deps, stop_deps)
|
||||
if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,
|
||||
doc[left_bound.i: right.i])):
|
||||
break
|
||||
|
|
spacy/lang/fi/examples.py (new file)
|
@ -0,0 +1,15 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
>>> from spacy.lang.fi.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
sentences = [
|
||||
"Apple harkitsee ostavansa startup-yrityksen UK:sta 1 miljardilla dollarilla.",
|
||||
"Itseajavat autot siirtävät vakuutusriskin valmistajille.",
|
||||
"San Francisco harkitsee jakelurobottien kieltämistä jalkakäytävillä.",
|
||||
"Lontoo on iso kaupunki Iso-Britanniassa."
|
||||
]
|
spacy/lang/fi/lex_attrs.py (new file)
|
@ -0,0 +1,26 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# import the symbols for the attrs you want to overwrite
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
# check if token resembles a number
|
||||
|
||||
_num_words = ['nolla', 'yksi', 'kaksi', 'kolme', 'neljä', 'viisi', 'kuusi',
              'seitsemän', 'kahdeksan', 'yhdeksän', 'kymmenen', 'yksitoista',
              'kaksitoista', 'kolmetoista', 'neljätoista', 'viisitoista',
              'kuusitoista', 'seitsemäntoista', 'kahdeksantoista',
              'yhdeksäntoista', 'kaksikymmentä', 'kolmekymmentä',
              'neljäkymmentä', 'viisikymmentä', 'kuusikymmentä',
              'seitsemänkymmentä', 'kahdeksankymmentä', 'yhdeksänkymmentä',
              'sata', 'tuhat', 'miljoona', 'miljardi', 'triljoona']
|
||||
|
||||
|
||||
def like_num(text):
|
||||
text = text.replace('.', '').replace(',', '')
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count('/') == 1:
|
||||
num, denom = text.split('/')
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
if text in _num_words:
|
||||
return True
|
||||
return False
|
||||
|
||||
LEX_ATTRS = {
|
||||
LIKE_NUM: like_num
|
||||
}
|
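A few sanity checks for the like_num hook defined above (assuming the module is importable as spacy.lang.fi.lex_attrs once this file is added):

from spacy.lang.fi.lex_attrs import like_num

assert like_num('viisi')      # a listed Finnish number word
assert like_num('3/4')        # simple fractions count as number-like
assert like_num('1,000')      # thousands separators are stripped first
assert not like_num('koira')  # an ordinary noun is not number-like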
|
@ -79,7 +79,7 @@ pienestä pieni pienin poikki puolesta puolestaan päälle
|
|||
|
||||
runsaasti
|
||||
|
||||
saakka sama samaa samaan samalla saman samat samoin sata sataa satojen se
|
||||
saakka sama samaa samaan samalla saman samat samoin satojen se
|
||||
seitsemän sekä sen seuraavat siellä sieltä siihen siinä siis siitä sijaan siksi
|
||||
sille silloin sillä silti siltä sinne sinua sinulla sinulle sinulta sinun
|
||||
sinussa sinusta sinut sinuun sinä sisäkkäin sisällä siten sitten sitä ssa sta
|
||||
|
@ -89,7 +89,7 @@ taa taas taemmas tahansa tai takaa takaisin takana takia tallä tapauksessa
|
|||
tarpeeksi tavalla tavoitteena te teidän teidät teihin teille teillä teiltä
|
||||
teissä teistä teitä tietysti todella toinen toisaalla toisaalle toisaalta
|
||||
toiseen toiseksi toisella toiselle toiselta toisemme toisen toisensa toisessa
|
||||
toisesta toista toistaiseksi toki tosin tuhannen tuhat tule tulee tulemme tulen
|
||||
toisesta toista toistaiseksi toki tosin tule tulee tulemme tulen
|
||||
tulet tulette tulevat tulimme tulin tulisi tulisimme tulisin tulisit tulisitte
|
||||
tulisivat tulit tulitte tulivat tulla tulleet tullut tuntuu tuo tuohon tuoksi
|
||||
tuolla tuolle tuolloin tuolta tuon tuona tuonne tuossa tuosta tuota tuskin tykö
|
||||
|
|
|
@ -35,14 +35,32 @@ class JapaneseTokenizer(object):
|
|||
def from_disk(self, path, **exclude):
|
||||
return self
|
||||
|
||||
class JapaneseCharacterSegmenter(object):
|
||||
def __init__(self, vocab):
|
||||
self.vocab = vocab
|
||||
|
||||
def __call__(self, text):
|
||||
words = []
|
||||
spaces = []
|
||||
doc = self.tokenizer(text)
for token in doc:
|
||||
words.extend(list(token.text))
|
||||
spaces.extend([False]*len(token.text))
|
||||
spaces[-1] = bool(token.whitespace_)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
|
||||
class JapaneseDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'ja'
|
||||
use_janome = True
|
||||
|
||||
@classmethod
|
||||
def create_tokenizer(cls, nlp=None):
|
||||
return JapaneseTokenizer(cls, nlp)
|
||||
if cls.use_janome:
|
||||
return JapaneseTokenizer(cls, nlp)
|
||||
else:
|
||||
return JapaneseCharacterSegmenter(cls, nlp.vocab)
|
||||
|
||||
|
||||
class Japanese(Language):
|
||||
|
|
|
@ -144,7 +144,7 @@ def is_lower(string): return string.islower()
|
|||
def is_space(string): return string.isspace()
|
||||
def is_title(string): return string.istitle()
|
||||
def is_upper(string): return string.isupper()
|
||||
def is_stop(string, stops=set()): return string in stops
|
||||
def is_stop(string, stops=set()): return string.lower() in stops
|
||||
def is_oov(string): return True
|
||||
def get_prob(string): return -20.
|
||||
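A quick illustration of the case-insensitivity change above (the module path spacy.lang.lex_attrs is assumed):

from spacy.lang.lex_attrs import is_stop

assert is_stop('The', {'the'})       # matched only case-sensitively before this change
assert not is_stop('The', {'cat'})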
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .tag_map import TAG_MAP
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
@ -17,6 +18,7 @@ class PolishDefaults(Language.Defaults):
|
|||
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
tag_map = TAG_MAP
|
||||
|
||||
|
||||
class Polish(Language):
|
||||
|
|
spacy/lang/pl/tag_map.py (new file; diff suppressed because it is too large)
|
@ -1,7 +1,7 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN, ADP
|
||||
from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN
|
||||
|
||||
|
||||
_exc = {}
|
||||
|
@ -12,24 +12,11 @@ for exc_data in [
|
|||
{ORTH: "mgr.", LEMMA: "magister", POS: NOUN},
|
||||
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
|
||||
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
|
||||
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ},
|
||||
{ORTH: "adw.", LEMMA: "adwokat", POS: NOUN},
|
||||
{ORTH: "afr.", LEMMA: "afrykański", POS: ADJ},
|
||||
{ORTH: "c.b.d.o.", LEMMA: "co było do okazania", POS: ADV},
|
||||
{ORTH: "cbdu.", LEMMA: "co było do udowodnienia", POS: ADV},
|
||||
{ORTH: "mn.w.", LEMMA: "mniej więcej", POS: ADV},
|
||||
{ORTH: "nt.", LEMMA: "na temat", POS: ADP},
|
||||
{ORTH: "ok.", LEMMA: "około"},
|
||||
{ORTH: "n.p.u.", LEMMA: "na psa urok"},
|
||||
{ORTH: "ww.", LEMMA: "wyżej wymieniony", POS: ADV}]:
|
||||
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
|
||||
_exc[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
for orth in [
|
||||
"w.", "r.", "br.", "bm.", "b.r.", "amer.", "am.", "bdb.", "św.", "p.", "lit.",
|
||||
"wym.", "czyt.", "daw.", "d.", "zob.", "gw.", "dn.", "dyr.", "im.", "mł.",
|
||||
"min.", "dot.", "muz.", "k.k.", "k.p.a.", "k.p.c.", "n.p.m.", "p.p.m.", "nb.",
|
||||
"ob.", "n.e.", "p.n.e.", "zw.", "zool.", "zach.", "żarg.", "żart.", "wzgl.",
|
||||
"wyj.", "xx.", "ks.", "x.", "wyd.", "wsch.", "o.o."]:
|
||||
"w.", "r."]:
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
|
||||
|
|
|
@ -24,5 +24,5 @@ TAG_MAP = {
|
|||
"ADJ": {POS: ADJ},
|
||||
"VERB": {POS: VERB},
|
||||
"PART": {POS: PART},
|
||||
"SP": {POS: SPACE}
|
||||
"_SP": {POS: SPACE}
|
||||
}
|
||||
|
|
spacy/lang/vi/__init__.py (new file)
|
@ -0,0 +1,19 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LANG
|
||||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
|
||||
|
||||
class VietnameseDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'vi' # for pickling
|
||||
|
||||
|
||||
class Vietnamese(Language):
|
||||
lang = 'vi'
|
||||
Defaults = VietnameseDefaults # override defaults
|
||||
|
||||
|
||||
__all__ = ['Vietnamese']
|
|
@ -9,6 +9,7 @@ from ...tokens import Doc
|
|||
class ChineseDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'zh' # for pickling
|
||||
use_jieba = True
|
||||
|
||||
|
||||
class Chinese(Language):
|
||||
|
@ -16,14 +17,25 @@ class Chinese(Language):
|
|||
Defaults = ChineseDefaults # override defaults
|
||||
|
||||
def make_doc(self, text):
|
||||
try:
|
||||
import jieba
|
||||
except ImportError:
|
||||
raise ImportError("The Chinese tokenizer requires the Jieba library: "
|
||||
"https://github.com/fxsjy/jieba")
|
||||
words = list(jieba.cut(text, cut_all=False))
|
||||
words = [x for x in words if x]
|
||||
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||
if self.Defaults.use_jieba:
|
||||
try:
|
||||
import jieba
|
||||
except ImportError:
|
||||
msg = ("Jieba not installed. Either set Chinese.use_jieba = False, "
|
||||
"or install it https://github.com/fxsjy/jieba")
|
||||
raise ImportError(msg)
|
||||
words = list(jieba.cut(text, cut_all=False))
|
||||
words = [x for x in words if x]
|
||||
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||
else:
|
||||
words = []
|
||||
spaces = []
|
||||
doc = self.tokenizer(text)
for token in doc:
|
||||
words.extend(list(token.text))
|
||||
spaces.extend([False]*len(token.text))
|
||||
spaces[-1] = bool(token.whitespace_)
|
||||
return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
|
||||
__all__ = ['Chinese']
|
||||
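A minimal usage sketch of the new fallback path in Chinese.make_doc above: flipping the class-level flag before constructing the pipeline skips Jieba and yields one token per character (the example text is arbitrary).

from spacy.lang.zh import Chinese

Chinese.Defaults.use_jieba = False   # use the per-character fallback instead of Jieba
nlp = Chinese()
doc = nlp.make_doc(u'我爱北京')
print([t.text for t in doc])         # expected: one token per character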
|
|
|
@ -17,7 +17,7 @@ from .vocab import Vocab
|
|||
from .lemmatizer import Lemmatizer
|
||||
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
|
||||
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
|
||||
from .pipeline import merge_noun_chunks, merge_entities
|
||||
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
|
||||
from .compat import json_dumps, izip, basestring_
|
||||
from .gold import GoldParse
|
||||
from .scorer import Scorer
|
||||
|
@ -108,7 +108,8 @@ class Language(object):
|
|||
'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
|
||||
'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
|
||||
'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks,
|
||||
'merge_entities': lambda nlp, **cfg: merge_entities
|
||||
'merge_entities': lambda nlp, **cfg: merge_entities,
|
||||
'merge_subtokens': lambda nlp, **cfg: merge_subtokens,
|
||||
}
|
||||
|
||||
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
|
||||
from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
|
||||
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
||||
|
||||
|
||||
|
@ -27,11 +27,13 @@ class Lemmatizer(object):
|
|||
univ_pos = 'adj'
|
||||
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
|
||||
univ_pos = 'punct'
|
||||
elif univ_pos in (PROPN, 'PROPN'):
|
||||
return [string]
|
||||
else:
|
||||
return list(set([string.lower()]))
|
||||
return [string.lower()]
|
||||
# See Issue #435 for an example of where this logic is required.
|
||||
if self.is_base_form(univ_pos, morphology):
|
||||
return list(set([string.lower()]))
|
||||
return [string.lower()]
|
||||
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
|
||||
self.exc.get(univ_pos, {}),
|
||||
self.rules.get(univ_pos, []))
|
||||
|
@ -88,6 +90,7 @@ class Lemmatizer(object):
|
|||
|
||||
|
||||
def lemmatize(string, index, exceptions, rules):
|
||||
orig = string
|
||||
string = string.lower()
|
||||
forms = []
|
||||
forms.extend(exceptions.get(string, []))
|
||||
|
@ -105,5 +108,5 @@ def lemmatize(string, index, exceptions, rules):
|
|||
if not forms:
|
||||
forms.extend(oov_forms)
|
||||
if not forms:
|
||||
forms.append(string)
|
||||
forms.append(orig)
|
||||
return list(set(forms))
|
||||
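Illustrative check of the forms.append(orig) fallback above: with an empty index, exceptions and rules (so the rule loop elided from this hunk produces nothing), the original casing is now preserved instead of being lowercased.

from spacy.lemmatizer import lemmatize

print(lemmatize('Obama', {}, {}, []))   # expected: ['Obama'] (previously ['obama'])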
|
|
|
@ -1,24 +1,19 @@
|
|||
# cython: profile=True
|
||||
# cython: infer_types=True
|
||||
# coding: utf8
|
||||
# cython: profile=True
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import ujson
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
from libcpp.vector cimport vector
|
||||
from libcpp.pair cimport pair
|
||||
from libc.stdint cimport int32_t, uint64_t, uint16_t
|
||||
from preshed.maps cimport PreshMap
|
||||
from cymem.cymem cimport Pool
|
||||
from murmurhash.mrmr cimport hash64
|
||||
from libc.stdint cimport int32_t
|
||||
|
||||
from .typedefs cimport attr_t
|
||||
from .typedefs cimport hash_t
|
||||
from .typedefs cimport attr_t, hash_t
|
||||
from .structs cimport TokenC
|
||||
from .tokens.doc cimport Doc, get_token_attr
|
||||
from .lexeme cimport attr_id_t
|
||||
from .vocab cimport Vocab
|
||||
|
||||
from .tokens.doc cimport Doc
|
||||
from .tokens.doc cimport get_token_attr
|
||||
from .attrs cimport ID, attr_id_t, NULL_ATTR
|
||||
from .attrs import IDS
|
||||
from .attrs cimport attr_id_t, ID, NULL_ATTR
|
||||
from .attrs import FLAG61 as U_ENT
|
||||
from .attrs import FLAG60 as B2_ENT
|
||||
from .attrs import FLAG59 as B3_ENT
|
||||
|
@ -48,29 +43,24 @@ from .attrs import FLAG36 as L9_ENT
|
|||
from .attrs import FLAG35 as L10_ENT
|
||||
|
||||
|
||||
cpdef enum quantifier_t:
|
||||
_META
|
||||
ONE
|
||||
cdef enum action_t:
|
||||
REJECT = 0000
|
||||
MATCH = 1000
|
||||
ADVANCE = 0100
|
||||
RETRY = 0010
|
||||
RETRY_EXTEND = 0011
|
||||
MATCH_EXTEND = 1001
|
||||
MATCH_REJECT = 2000
|
||||
|
||||
|
||||
cdef enum quantifier_t:
|
||||
ZERO
|
||||
ZERO_ONE
|
||||
ZERO_PLUS
|
||||
ONE
|
||||
ONE_PLUS
|
||||
|
||||
|
||||
cdef enum action_t:
|
||||
REJECT
|
||||
ADVANCE
|
||||
REPEAT
|
||||
ACCEPT
|
||||
ADVANCE_ZERO
|
||||
ACCEPT_PREV
|
||||
PANIC
|
||||
|
||||
# A "match expression" conists of one or more token patterns
|
||||
# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
|
||||
# A state is an (int, pattern pointer) pair, where the int is the start
|
||||
# position, and the pattern pointer shows where we're up to
|
||||
# in the pattern.
|
||||
|
||||
cdef struct AttrValueC:
|
||||
attr_id_t attr
|
||||
attr_t value
|
||||
|
@ -80,10 +70,231 @@ cdef struct TokenPatternC:
|
|||
AttrValueC* attrs
|
||||
int32_t nr_attr
|
||||
quantifier_t quantifier
|
||||
hash_t key
|
||||
|
||||
|
||||
ctypedef TokenPatternC* TokenPatternC_ptr
|
||||
ctypedef pair[int, TokenPatternC_ptr] StateC
|
||||
cdef struct ActionC:
|
||||
char emit_match
|
||||
char next_state_next_token
|
||||
char next_state_same_token
|
||||
char same_state_next_token
|
||||
|
||||
|
||||
cdef struct PatternStateC:
|
||||
TokenPatternC* pattern
|
||||
int32_t start
|
||||
int32_t length
|
||||
|
||||
|
||||
cdef struct MatchC:
|
||||
attr_t pattern_id
|
||||
int32_t start
|
||||
int32_t length
|
||||
|
||||
|
||||
cdef find_matches(TokenPatternC** patterns, int n, Doc doc):
|
||||
cdef vector[PatternStateC] states
|
||||
cdef vector[MatchC] matches
|
||||
cdef PatternStateC state
|
||||
cdef Pool mem = Pool()
|
||||
# TODO: Prefill this with the extra attribute values.
|
||||
extra_attrs = <attr_t**>mem.alloc(len(doc), sizeof(attr_t*))
|
||||
# Main loop
|
||||
cdef int i, j
|
||||
for i in range(doc.length):
|
||||
for j in range(n):
|
||||
states.push_back(PatternStateC(patterns[j], i, 0))
|
||||
transition_states(states, matches, &doc.c[i], extra_attrs[i])
|
||||
# Handle matches that end in 0-width patterns
|
||||
finish_states(matches, states)
|
||||
return [(matches[i].pattern_id, matches[i].start, matches[i].start+matches[i].length)
|
||||
for i in range(matches.size())]
|
||||
|
||||
|
||||
|
||||
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
|
||||
const TokenC* token, const attr_t* extra_attrs) except *:
|
||||
cdef int q = 0
|
||||
cdef vector[PatternStateC] new_states
|
||||
for i in range(states.size()):
|
||||
action = get_action(states[i], token, extra_attrs)
|
||||
if action == REJECT:
|
||||
continue
|
||||
state = states[i]
|
||||
states[q] = state
|
||||
while action in (RETRY, RETRY_EXTEND):
|
||||
if action == RETRY_EXTEND:
|
||||
new_states.push_back(
|
||||
PatternStateC(pattern=state.pattern, start=state.start,
|
||||
length=state.length+1))
|
||||
states[q].pattern += 1
|
||||
action = get_action(states[q], token, extra_attrs)
|
||||
if action == REJECT:
|
||||
pass
|
||||
elif action == ADVANCE:
|
||||
states[q].pattern += 1
|
||||
states[q].length += 1
|
||||
q += 1
|
||||
else:
|
||||
ent_id = state.pattern[1].attrs.value
|
||||
if action == MATCH:
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length+1))
|
||||
elif action == MATCH_REJECT:
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length))
|
||||
elif action == MATCH_EXTEND:
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length))
|
||||
states[q].length += 1
|
||||
q += 1
|
||||
states.resize(q)
|
||||
for i in range(new_states.size()):
|
||||
states.push_back(new_states[i])
|
||||
|
||||
|
||||
cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *:
|
||||
'''Handle states that end in zero-width patterns.'''
|
||||
cdef PatternStateC state
|
||||
for i in range(states.size()):
|
||||
state = states[i]
|
||||
while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE):
|
||||
is_final = get_is_final(state)
|
||||
if is_final:
|
||||
ent_id = state.pattern[1].attrs.value
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start, length=state.length))
|
||||
break
|
||||
else:
|
||||
state.pattern += 1
|
||||
|
||||
|
||||
cdef action_t get_action(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
|
||||
'''We need to consider:
|
||||
|
||||
a) Does the token match the specification? [Yes, No]
|
||||
b) What's the quantifier? [1, 0+, ?]
|
||||
c) Is this the last specification? [final, non-final]
|
||||
|
||||
We can transition in the following ways:
|
||||
|
||||
a) Do we emit a match?
|
||||
b) Do we add a state with (next state, next token)?
|
||||
c) Do we add a state with (next state, same token)?
|
||||
d) Do we add a state with (same state, next token)?
|
||||
|
||||
We'll code the actions as boolean strings, so 0000 means no to all 4,
|
||||
1000 means match but no states added, etc.
|
||||
|
||||
1:
|
||||
Yes, final:
|
||||
1000
|
||||
Yes, non-final:
|
||||
0100
|
||||
No, final:
|
||||
0000
|
||||
No, non-final
|
||||
0000
|
||||
0+:
|
||||
Yes, final:
|
||||
1001
|
||||
Yes, non-final:
|
||||
0011
|
||||
No, final:
|
||||
2000 (note: Don't include last token!)
|
||||
No, non-final:
|
||||
0010
|
||||
?:
|
||||
Yes, final:
|
||||
1000
|
||||
Yes, non-final:
|
||||
0100
|
||||
No, final:
|
||||
2000 (note: Don't include last token!)
|
||||
No, non-final:
|
||||
0010
|
||||
|
||||
Possible combinations: 1000, 0100, 0000, 1001, 0011, 0010, 2000
|
||||
|
||||
We'll name the bits "match", "advance", "retry", "extend"
|
||||
REJECT = 0000
|
||||
MATCH = 1000
|
||||
ADVANCE = 0100
|
||||
RETRY = 0010
|
||||
MATCH_EXTEND = 1001
|
||||
RETRY_EXTEND = 0011
|
||||
MATCH_REJECT = 2000 # Match, but don't include last token
|
||||
|
||||
Problem: If a quantifier is matching, we're adding a lot of open partials
|
||||
'''
|
||||
cdef char is_match
|
||||
is_match = get_is_match(state, token, extra_attrs)
|
||||
quantifier = get_quantifier(state)
|
||||
is_final = get_is_final(state)
|
||||
if quantifier == ZERO:
|
||||
is_match = not is_match
|
||||
quantifier = ONE
|
||||
if quantifier == ONE:
|
||||
if is_match and is_final:
|
||||
# Yes, final: 1000
|
||||
return MATCH
|
||||
elif is_match and not is_final:
|
||||
# Yes, non-final: 0100
|
||||
return ADVANCE
|
||||
elif not is_match and is_final:
|
||||
# No, final: 0000
|
||||
return REJECT
|
||||
else:
|
||||
return REJECT
|
||||
elif quantifier == ZERO_PLUS:
|
||||
if is_match and is_final:
|
||||
# Yes, final: 1001
|
||||
return MATCH_EXTEND
|
||||
elif is_match and not is_final:
|
||||
# Yes, non-final: 0011
|
||||
return RETRY_EXTEND
|
||||
elif not is_match and is_final:
|
||||
# No, final 2000 (note: Don't include last token!)
|
||||
return MATCH_REJECT
|
||||
else:
|
||||
# No, non-final 0010
|
||||
return RETRY
|
||||
elif quantifier == ZERO_ONE:
|
||||
if is_match and is_final:
|
||||
# Yes, final: 1000
|
||||
return MATCH
|
||||
elif is_match and not is_final:
|
||||
# Yes, non-final: 0100
|
||||
return ADVANCE
|
||||
elif not is_match and is_final:
|
||||
# No, final 2000 (note: Don't include last token!)
|
||||
return MATCH_REJECT
|
||||
else:
|
||||
# No, non-final 0010
|
||||
return RETRY
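A pure-Python sketch of the decision table above; the constant names and operator strings are illustrative stand-ins, not the module's real enum values:
MATCH, ADVANCE, RETRY, REJECT = 'match', 'advance', 'retry', 'reject'
MATCH_EXTEND, RETRY_EXTEND, MATCH_REJECT = 'match_extend', 'retry_extend', 'match_reject'

def sketch_get_action(is_match, quantifier, is_final):
    if quantifier == '!':          # ZERO: the token must *not* match
        is_match = not is_match
        quantifier = '1'
    if quantifier == '1':          # ONE
        if is_match:
            return MATCH if is_final else ADVANCE
        return REJECT
    if quantifier == '*':          # ZERO_PLUS
        if is_match:
            return MATCH_EXTEND if is_final else RETRY_EXTEND
        return MATCH_REJECT if is_final else RETRY
    if quantifier == '?':          # ZERO_ONE
        if is_match:
            return MATCH if is_final else ADVANCE
        return MATCH_REJECT if is_final else RETRY
    raise ValueError(quantifier)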
|
||||
|
||||
|
||||
cdef char get_is_match(PatternStateC state, const TokenC* token, const attr_t* extra_attrs) nogil:
|
||||
spec = state.pattern
|
||||
for attr in spec.attrs[:spec.nr_attr]:
|
||||
if get_token_attr(token, attr.attr) != attr.value:
|
||||
return 0
|
||||
else:
|
||||
return 1
|
||||
|
||||
|
||||
cdef char get_is_final(PatternStateC state) nogil:
|
||||
if state.pattern[1].attrs[0].attr == ID and state.pattern[1].nr_attr == 0:
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
cdef char get_quantifier(PatternStateC state) nogil:
|
||||
return state.pattern.quantifier
|
||||
|
||||
|
||||
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
|
||||
|
@ -97,6 +308,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
|
|||
for j, (attr, value) in enumerate(spec):
|
||||
pattern[i].attrs[j].attr = attr
|
||||
pattern[i].attrs[j].value = value
|
||||
pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
|
||||
i = len(token_specs)
|
||||
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
|
||||
pattern[i].attrs[0].attr = ID
|
||||
|
@ -105,48 +317,16 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
|
|||
return pattern
|
||||
|
||||
|
||||
cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
|
||||
cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil:
|
||||
while pattern.nr_attr != 0:
|
||||
pattern += 1
|
||||
id_attr = pattern[0].attrs[0]
|
||||
assert id_attr.attr == ID
|
||||
return id_attr.value
|
||||
|
||||
|
||||
cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
|
||||
lookahead = &pattern[1]
|
||||
for attr in pattern.attrs[:pattern.nr_attr]:
|
||||
if get_token_attr(token, attr.attr) != attr.value:
|
||||
if pattern.quantifier == ONE:
|
||||
return REJECT
|
||||
elif pattern.quantifier == ZERO:
|
||||
return ACCEPT if lookahead.nr_attr == 0 else ADVANCE
|
||||
elif pattern.quantifier in (ZERO_ONE, ZERO_PLUS):
|
||||
return ACCEPT_PREV if lookahead.nr_attr == 0 else ADVANCE_ZERO
|
||||
else:
|
||||
return PANIC
|
||||
if pattern.quantifier == ZERO:
|
||||
return REJECT
|
||||
elif lookahead.nr_attr == 0:
|
||||
return ACCEPT
|
||||
elif pattern.quantifier in (ONE, ZERO_ONE):
|
||||
return ADVANCE
|
||||
elif pattern.quantifier == ZERO_PLUS:
|
||||
# This is a bandaid over the 'shadowing' problem described here:
|
||||
# https://github.com/explosion/spaCy/issues/864
|
||||
next_action = get_action(lookahead, token)
|
||||
if next_action is REJECT:
|
||||
return REPEAT
|
||||
else:
|
||||
return ADVANCE_ZERO
|
||||
else:
|
||||
return PANIC
|
||||
|
||||
|
||||
def _convert_strings(token_specs, string_store):
|
||||
# Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
|
||||
operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
|
||||
'?': (ZERO_ONE,), '1': (ONE,)}
|
||||
operators = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
|
||||
'?': (ZERO_ONE,), '1': (ONE,), '!': (ZERO,)}
|
||||
tokens = []
|
||||
op = ONE
|
||||
for spec in token_specs:
|
||||
|
@ -176,21 +356,6 @@ def _convert_strings(token_specs, string_store):
|
|||
return tokens
|
||||
|
||||
|
||||
def merge_phrase(matcher, doc, i, matches):
|
||||
"""Callback to merge a phrase on match."""
|
||||
ent_id, label, start, end = matches[i]
|
||||
span = doc[start:end]
|
||||
span.merge(ent_type=label, ent_id=ent_id)
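A minimal sketch of the Span.merge call this callback relies on (assumes the pre-retokenizer merge API used in this codebase; the text and attribute values are illustrative):
import spacy

nlp = spacy.blank('en')
doc = nlp('New York is big')
span = doc[0:2]
span.merge(ent_type='GPE')     # 'New York' becomes a single token
print([t.text for t in doc])   # ['New York', 'is', 'big']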
|
||||
|
||||
|
||||
def unpickle_matcher(vocab, patterns, callbacks):
|
||||
matcher = Matcher(vocab)
|
||||
for key, specs in patterns.items():
|
||||
callback = callbacks.get(key, None)
|
||||
matcher.add(key, callback, *specs)
|
||||
return matcher
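This helper exists to support pickling the matcher; a hedged round-trip sketch, assuming pickling is wired up via __reduce__ as the surrounding code suggests:
import pickle
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
matcher.add('HELLO', None, [{'LOWER': 'hello'}])
matcher2 = pickle.loads(pickle.dumps(matcher))
assert len(matcher2) == len(matcher)   # same number of rules after the round trip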
|
||||
|
||||
|
||||
cdef class Matcher:
|
||||
"""Match sequences of tokens, based on pattern rules."""
|
||||
cdef Pool mem
|
||||
|
@ -311,7 +476,7 @@ cdef class Matcher:
|
|||
if key not in self._patterns:
|
||||
return default
|
||||
return (self._callbacks[key], self._patterns[key])
|
||||
|
||||
|
||||
def pipe(self, docs, batch_size=1000, n_threads=2):
|
||||
"""Match a stream of documents, yielding them in turn.
|
||||
|
||||
|
@ -333,85 +498,9 @@ cdef class Matcher:
|
|||
describing the matches. A match tuple describes a span
|
||||
`doc[start:end]`. The `label_id` and `key` are both integers.
|
||||
"""
|
||||
cdef vector[StateC] partials
|
||||
cdef int n_partials = 0
|
||||
cdef int q = 0
|
||||
cdef int i, token_i
|
||||
cdef const TokenC* token
|
||||
cdef StateC state
|
||||
matches = []
|
||||
for token_i in range(doc.length):
|
||||
token = &doc.c[token_i]
|
||||
q = 0
|
||||
# Go over the open matches, extending or finalizing if able.
|
||||
# Otherwise, we over-write them (q doesn't advance)
|
||||
for state in partials:
|
||||
action = get_action(state.second, token)
|
||||
if action == PANIC:
|
||||
raise Exception("Error selecting action in matcher")
|
||||
while action == ADVANCE_ZERO:
|
||||
state.second += 1
|
||||
action = get_action(state.second, token)
|
||||
if action == PANIC:
|
||||
raise Exception("Error selecting action in matcher")
|
||||
|
||||
if action == REPEAT:
|
||||
# Leave the state in the queue, and advance to next slot
|
||||
# (i.e. we don't overwrite -- we want to greedily match
|
||||
# more pattern).
|
||||
q += 1
|
||||
elif action == REJECT:
|
||||
pass
|
||||
elif action == ADVANCE:
|
||||
partials[q] = state
|
||||
partials[q].second += 1
|
||||
q += 1
|
||||
elif action in (ACCEPT, ACCEPT_PREV):
|
||||
# TODO: What to do about patterns starting with ZERO? Need
|
||||
# to adjust the start position.
|
||||
start = state.first
|
||||
end = token_i+1 if action == ACCEPT else token_i
|
||||
ent_id = state.second[1].attrs[0].value
|
||||
label = state.second[1].attrs[1].value
|
||||
matches.append((ent_id, start, end))
|
||||
|
||||
partials.resize(q)
|
||||
# Check whether we open any new patterns on this token
|
||||
for pattern in self.patterns:
|
||||
action = get_action(pattern, token)
|
||||
if action == PANIC:
|
||||
raise Exception("Error selecting action in matcher")
|
||||
while action == ADVANCE_ZERO:
|
||||
pattern += 1
|
||||
action = get_action(pattern, token)
|
||||
if action == REPEAT:
|
||||
state.first = token_i
|
||||
state.second = pattern
|
||||
partials.push_back(state)
|
||||
elif action == ADVANCE:
|
||||
# TODO: What to do about patterns starting with ZERO? Need
|
||||
# to adjust the start position.
|
||||
state.first = token_i
|
||||
state.second = pattern + 1
|
||||
partials.push_back(state)
|
||||
elif action in (ACCEPT, ACCEPT_PREV):
|
||||
start = token_i
|
||||
end = token_i+1 if action == ACCEPT else token_i
|
||||
ent_id = pattern[1].attrs[0].value
|
||||
label = pattern[1].attrs[1].value
|
||||
matches.append((ent_id, start, end))
|
||||
# Look for open patterns that are actually satisfied
|
||||
for state in partials:
|
||||
while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
|
||||
state.second += 1
|
||||
if state.second.nr_attr == 0:
|
||||
start = state.first
|
||||
end = len(doc)
|
||||
ent_id = state.second.attrs[0].value
|
||||
label = state.second.attrs[0].value
|
||||
matches.append((ent_id, start, end))
|
||||
for i, (ent_id, start, end) in enumerate(matches):
|
||||
on_match = self._callbacks.get(ent_id)
|
||||
matches = find_matches(&self.patterns[0], self.patterns.size(), doc)
|
||||
for i, (key, start, end) in enumerate(matches):
|
||||
on_match = self._callbacks.get(key, None)
|
||||
if on_match is not None:
|
||||
on_match(self, doc, i, matches)
|
||||
return matches
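A hedged sketch of the on_match callback signature invoked above; the pattern and names are illustrative:
def print_match(matcher, doc, i, matches):
    # Called once per match, with the full list of (match_id, start, end) tuples.
    match_id, start, end = matches[i]
    print('Matched:', doc[start:end].text)

# matcher.add('GREETING', print_match, [{'LOWER': 'hello'}])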
|
||||
|
@ -423,31 +512,37 @@ cdef class Matcher:
|
|||
return key
|
||||
|
||||
|
||||
def unpickle_matcher(vocab, patterns, callbacks):
|
||||
matcher = Matcher(vocab)
|
||||
for key, specs in patterns.items():
|
||||
callback = callbacks.get(key, None)
|
||||
matcher.add(key, callback, *specs)
|
||||
return matcher
|
||||
|
||||
|
||||
def _get_longest_matches(matches):
|
||||
'''Filter out matches that have a longer equivalent.'''
|
||||
longest_matches = {}
|
||||
for pattern_id, start, end in matches:
|
||||
key = (pattern_id, start)
|
||||
length = end-start
|
||||
if key not in longest_matches or length > longest_matches[key]:
|
||||
longest_matches[key] = length
|
||||
return [(pattern_id, start, start+length)
|
||||
for (pattern_id, start), length in longest_matches.items()]
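A pure-Python restatement of the same filtering, with a worked example: matches that share a (pattern_id, start) key keep only the longest end.
matches = [(1, 0, 2), (1, 0, 4), (2, 3, 5)]
longest = {}
for pattern_id, start, end in matches:
    key = (pattern_id, start)
    longest[key] = max(longest.get(key, end), end)
print(sorted((p, s, e) for (p, s), e in longest.items()))   # [(1, 0, 4), (2, 3, 5)]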
|
||||
|
||||
|
||||
def get_bilou(length):
|
||||
if length == 1:
|
||||
if length == 0:
|
||||
raise ValueError("Length must be >= 1")
|
||||
elif length == 1:
|
||||
return [U_ENT]
|
||||
elif length == 2:
|
||||
return [B2_ENT, L2_ENT]
|
||||
elif length == 3:
|
||||
return [B3_ENT, I3_ENT, L3_ENT]
|
||||
elif length == 4:
|
||||
return [B4_ENT, I4_ENT, I4_ENT, L4_ENT]
|
||||
elif length == 5:
|
||||
return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT]
|
||||
elif length == 6:
|
||||
return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT]
|
||||
elif length == 7:
|
||||
return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT]
|
||||
elif length == 8:
|
||||
return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
|
||||
elif length == 9:
|
||||
return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
|
||||
L9_ENT]
|
||||
elif length == 10:
|
||||
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
|
||||
I10_ENT, I10_ENT, L10_ENT]
|
||||
else:
|
||||
raise ValueError("Max length currently 10 for phrase matching")
|
||||
return [B4_ENT, I4_ENT] + [I4_ENT] * (length-3) + [L4_ENT]
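A quick check of the generalized return above; the tag names here are illustrative stand-ins for the flag symbols:
B4_ENT, I4_ENT, L4_ENT = 'B4_ENT', 'I4_ENT', 'L4_ENT'   # stand-ins for the real flags
length = 6
tags = [B4_ENT, I4_ENT] + [I4_ENT] * (length - 3) + [L4_ENT]
assert len(tags) == length   # B, I, I, I, I, L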
|
||||
|
||||
|
||||
cdef class PhraseMatcher:
|
||||
|
@ -456,21 +551,21 @@ cdef class PhraseMatcher:
|
|||
cdef Matcher matcher
|
||||
cdef PreshMap phrase_ids
|
||||
cdef int max_length
|
||||
cdef attr_t* _phrase_key
|
||||
cdef public object _callbacks
|
||||
cdef public object _patterns
|
||||
|
||||
def __init__(self, Vocab vocab, max_length=10):
|
||||
self.mem = Pool()
|
||||
self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
|
||||
self.max_length = max_length
|
||||
self.vocab = vocab
|
||||
self.matcher = Matcher(self.vocab)
|
||||
self.phrase_ids = PreshMap()
|
||||
abstract_patterns = []
|
||||
for length in range(1, max_length):
|
||||
abstract_patterns.append([{tag: True}
|
||||
for tag in get_bilou(length)])
|
||||
abstract_patterns = [
|
||||
[{U_ENT: True}],
|
||||
[{B2_ENT: True}, {L2_ENT: True}],
|
||||
[{B3_ENT: True}, {I3_ENT: True}, {L3_ENT: True}],
|
||||
[{B4_ENT: True}, {I4_ENT: True}, {I4_ENT: True, "OP": "+"}, {L4_ENT: True}],
|
||||
]
|
||||
self.matcher.add('Candidate', None, *abstract_patterns)
|
||||
self._callbacks = {}
|
||||
|
||||
|
@ -504,29 +599,24 @@ cdef class PhraseMatcher:
|
|||
*docs (Doc): `Doc` objects representing match patterns.
|
||||
"""
|
||||
cdef Doc doc
|
||||
for doc in docs:
|
||||
if len(doc) >= self.max_length:
|
||||
msg = (
|
||||
"Pattern length (%d) >= phrase_matcher.max_length (%d). "
|
||||
"Length can be set on initialization, up to 10."
|
||||
)
|
||||
raise ValueError(msg % (len(doc), self.max_length))
|
||||
cdef hash_t ent_id = self.matcher._normalize_key(key)
|
||||
self._callbacks[ent_id] = on_match
|
||||
cdef int length
|
||||
cdef int i
|
||||
cdef hash_t phrase_hash
|
||||
cdef Pool mem = Pool()
|
||||
for doc in docs:
|
||||
length = doc.length
|
||||
if length == 0:
|
||||
continue
|
||||
tags = get_bilou(length)
|
||||
for i in range(self.max_length):
|
||||
self._phrase_key[i] = 0
|
||||
phrase_key = <attr_t*>mem.alloc(length, sizeof(attr_t))
|
||||
for i, tag in enumerate(tags):
|
||||
lexeme = self.vocab[doc.c[i].lex.orth]
|
||||
lexeme.set_flag(tag, True)
|
||||
self._phrase_key[i] = lexeme.orth
|
||||
phrase_hash = hash64(self._phrase_key,
|
||||
self.max_length * sizeof(attr_t), 0)
|
||||
phrase_key[i] = lexeme.orth
|
||||
phrase_hash = hash64(phrase_key,
|
||||
length * sizeof(attr_t), 0)
|
||||
self.phrase_ids.set(phrase_hash, <void*>ent_id)
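A hedged usage sketch of the PhraseMatcher API defined here (v2-style positional arguments; the blank 'en' pipeline and phrases are illustrative):
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')
phrase_matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp(text) for text in ('Barack Obama', 'Angela Merkel')]
phrase_matcher.add('LEADER', None, *patterns)
doc = nlp('Barack Obama visited Berlin')
matches = phrase_matcher(doc)   # list of (match_id, start, end) tuples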
|
||||
|
||||
def __call__(self, Doc doc):
|
||||
|
@ -548,28 +638,45 @@ cdef class PhraseMatcher:
|
|||
on_match(self, doc, i, matches)
|
||||
return matches
|
||||
|
||||
def pipe(self, stream, batch_size=1000, n_threads=2):
|
||||
def pipe(self, stream, batch_size=1000, n_threads=2, return_matches=False,
|
||||
as_tuples=False):
|
||||
"""Match a stream of documents, yielding them in turn.
|
||||
|
||||
docs (iterable): A stream of documents.
|
||||
batch_size (int): Number of documents to accumulate into a working set.
|
||||
n_threads (int): The number of threads with which to work on the buffer
|
||||
in parallel, if the implementation supports multi-threading.
|
||||
return_matches (bool): Yield the match lists along with the docs, making
|
||||
results (doc, matches) tuples.
|
||||
as_tuples (bool): Interpret the input stream as (doc, context) tuples,
|
||||
and yield (result, context) tuples out.
|
||||
If both return_matches and as_tuples are True, the output will
|
||||
be a sequence of ((doc, matches), context) tuples.
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
for doc in stream:
|
||||
self(doc)
|
||||
yield doc
|
||||
if as_tuples:
|
||||
for doc, context in stream:
|
||||
matches = self(doc)
|
||||
if return_matches:
|
||||
yield ((doc, matches), context)
|
||||
else:
|
||||
yield (doc, context)
|
||||
else:
|
||||
for doc in stream:
|
||||
matches = self(doc)
|
||||
if return_matches:
|
||||
yield (doc, matches)
|
||||
else:
|
||||
yield doc
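A hedged sketch of the new pipe() options: with both flags enabled, the stream yields ((doc, matches), context) tuples. The setup below is illustrative.
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')
phrase_matcher = PhraseMatcher(nlp.vocab)
phrase_matcher.add('LEADER', None, nlp('Barack Obama'))
data = [(nlp('Barack Obama spoke'), {'id': 1}),
        (nlp('no match here'), {'id': 2})]
for (doc, matches), context in phrase_matcher.pipe(data, return_matches=True,
                                                   as_tuples=True):
    print(context['id'], len(matches))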
|
||||
|
||||
def accept_match(self, Doc doc, int start, int end):
|
||||
assert (end - start) < self.max_length
|
||||
cdef int i, j
|
||||
for i in range(self.max_length):
|
||||
self._phrase_key[i] = 0
|
||||
cdef Pool mem = Pool()
|
||||
phrase_key = <attr_t*>mem.alloc(end-start, sizeof(attr_t))
|
||||
for i, j in enumerate(range(start, end)):
|
||||
self._phrase_key[i] = doc.c[j].lex.orth
|
||||
cdef hash_t key = hash64(self._phrase_key,
|
||||
self.max_length * sizeof(attr_t), 0)
|
||||
phrase_key[i] = doc.c[j].lex.orth
|
||||
cdef hash_t key = hash64(phrase_key,
|
||||
(end-start) * sizeof(attr_t), 0)
|
||||
ent_id = <hash_t>self.phrase_ids.get(key)
|
||||
if ent_id == 0:
|
||||
return None
|
||||
|
|
|
@ -47,7 +47,9 @@ cdef class Morphology:
|
|||
cdef enum univ_morph_t:
|
||||
NIL = 0
|
||||
Animacy_anim = symbols.Animacy_anim
|
||||
Animacy_inam
|
||||
Animacy_inan
|
||||
Animacy_hum
|
||||
Animacy_nhum
|
||||
Aspect_freq
|
||||
Aspect_imp
|
||||
Aspect_mod
|
||||
|
|
|
@ -184,7 +184,9 @@ cdef class Morphology:
|
|||
|
||||
IDS = {
|
||||
"Animacy_anim": Animacy_anim,
|
||||
"Animacy_inam": Animacy_inam,
|
||||
"Animacy_inan": Animacy_inan,
|
||||
"Animacy_hum": Animacy_hum, # U20
|
||||
"Animacy_nhum": Animacy_nhum,
|
||||
"Aspect_freq": Aspect_freq,
|
||||
"Aspect_imp": Aspect_imp,
|
||||
"Aspect_mod": Aspect_mod,
|
||||
|
|
|
@ -25,6 +25,7 @@ from .morphology cimport Morphology
|
|||
from .vocab cimport Vocab
|
||||
from .syntax import nonproj
|
||||
from .compat import json_dumps
|
||||
from .matcher import Matcher
|
||||
|
||||
from .attrs import POS
|
||||
from .parts_of_speech import X
|
||||
|
@ -97,6 +98,17 @@ def merge_entities(doc):
|
|||
return doc
|
||||
|
||||
|
||||
def merge_subtokens(doc, label='subtok'):
|
||||
merger = Matcher(doc.vocab)
|
||||
merger.add('SUBTOK', None, [{'DEP': label, 'op': '+'}])
|
||||
matches = merger(doc)
|
||||
spans = [doc[start:end+1] for _, start, end in matches]
|
||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
||||
for start_char, end_char in offsets:
|
||||
doc.merge(start_char, end_char)
|
||||
return doc
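A hedged usage sketch (assumes a trained pipeline whose parser emits the 'subtok' dependency; the model name and import path are assumptions, not confirmed by this diff):
import spacy
from spacy.pipeline import merge_subtokens   # assumed import path for this helper

nlp = spacy.load('en_core_web_sm')
doc = nlp('Some text whose parse contains subtok arcs')
doc = merge_subtokens(doc, label='subtok')   # collapses each subtok chain into one token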
|
||||
|
||||
|
||||
class Pipe(object):
|
||||
"""This class is not instantiated directly. Components inherit from it, and
|
||||
it defines the interface that components should follow to function as
|
||||
|
@ -167,7 +179,7 @@ class Pipe(object):
|
|||
problem.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def create_optimizer(self):
|
||||
return create_default_optimizer(self.model.ops,
|
||||
**self.cfg.get('optimizer', {}))
|
||||
|
@ -652,11 +664,13 @@ class MultitaskObjective(Tagger):
|
|||
self.make_label = self.make_dep_tag_offset
|
||||
elif target == 'ent_tag':
|
||||
self.make_label = self.make_ent_tag
|
||||
elif target == 'sent_start':
|
||||
self.make_label = self.make_sent_start
|
||||
elif hasattr(target, '__call__'):
|
||||
self.make_label = target
|
||||
else:
|
||||
raise ValueError("MultitaskObjective target should be function or "
|
||||
"one of: dep, tag, ent, dep_tag_offset, ent_tag.")
|
||||
"one of: dep, tag, ent, sent_start, dep_tag_offset, ent_tag.")
|
||||
self.cfg = dict(cfg)
|
||||
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
||||
self.cfg.setdefault('pretrained_dims',
|
||||
|
@ -716,11 +730,7 @@ class MultitaskObjective(Tagger):
|
|||
for i, gold in enumerate(golds):
|
||||
for j in range(len(docs[i])):
|
||||
# Handles alignment for tokenization differences
|
||||
gold_idx = gold.cand_to_gold[j]
|
||||
if gold_idx is None:
|
||||
idx += 1
|
||||
continue
|
||||
label = self.make_label(gold_idx, gold.words, gold.tags,
|
||||
label = self.make_label(j, gold.words, gold.tags,
|
||||
gold.heads, gold.labels, gold.ents)
|
||||
if label is None or label not in self.labels:
|
||||
correct[idx] = guesses[idx]
|
||||
|
@ -765,6 +775,51 @@ class MultitaskObjective(Tagger):
|
|||
else:
|
||||
return '%s-%s' % (tags[i], ents[i])
|
||||
|
||||
@staticmethod
|
||||
def make_sent_start(target, words, tags, heads, deps, ents, cache=True, _cache={}):
|
||||
'''A multi-task objective for representing sentence boundaries,
|
||||
using BILU scheme. (O is impossible)
|
||||
|
||||
The implementation of this method uses an internal cache that relies
|
||||
on the identity of the heads array, to avoid requiring a new piece
|
||||
of gold data. You can pass cache=False if you know the cache will
|
||||
do the wrong thing.
|
||||
'''
|
||||
assert len(words) == len(heads)
|
||||
assert target < len(words), (target, len(words))
|
||||
if cache:
|
||||
if id(heads) in _cache:
|
||||
return _cache[id(heads)][target]
|
||||
else:
|
||||
for key in list(_cache.keys()):
|
||||
_cache.pop(key)
|
||||
sent_tags = ['I-SENT'] * len(words)
|
||||
_cache[id(heads)] = sent_tags
|
||||
else:
|
||||
sent_tags = ['I-SENT'] * len(words)
|
||||
|
||||
def _find_root(child):
|
||||
seen = set([child])
|
||||
while child is not None and heads[child] != child:
|
||||
seen.add(child)
|
||||
child = heads[child]
|
||||
return child
|
||||
|
||||
sentences = {}
|
||||
for i in range(len(words)):
|
||||
root = _find_root(i)
|
||||
if root is None:
|
||||
sent_tags[i] = None
|
||||
else:
|
||||
sentences.setdefault(root, []).append(i)
|
||||
for root, span in sorted(sentences.items()):
|
||||
if len(span) == 1:
|
||||
sent_tags[span[0]] = 'U-SENT'
|
||||
else:
|
||||
sent_tags[span[0]] = 'B-SENT'
|
||||
sent_tags[span[-1]] = 'L-SENT'
|
||||
return sent_tags[target]
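A standalone toy illustration of this tagging scheme (pure Python; the cache and missing-head handling are omitted). heads[i] == i marks a root; each root's dependents form one "sentence", tagged B-SENT ... L-SENT, or U-SENT for a single token.
def toy_sent_tags(heads):
    def find_root(child):
        while heads[child] != child:
            child = heads[child]
        return child
    tags = ['I-SENT'] * len(heads)
    sentences = {}
    for i in range(len(heads)):
        sentences.setdefault(find_root(i), []).append(i)
    for root, span in sorted(sentences.items()):
        if len(span) == 1:
            tags[span[0]] = 'U-SENT'
        else:
            tags[span[0]] = 'B-SENT'
            tags[span[-1]] = 'L-SENT'
    return tags

print(toy_sent_tags([1, 1, 1, 4, 4]))   # ['B-SENT', 'I-SENT', 'L-SENT', 'B-SENT', 'L-SENT']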
|
||||
|
||||
|
||||
class SimilarityHook(Pipe):
|
||||
"""
|
||||
|
@ -823,8 +878,8 @@ class TextCategorizer(Pipe):
|
|||
name = 'textcat'
|
||||
|
||||
@classmethod
|
||||
def Model(cls, nr_class=1, width=64, **cfg):
|
||||
return build_text_classifier(nr_class, width, **cfg)
|
||||
def Model(cls, **cfg):
|
||||
return build_text_classifier(**cfg)
|
||||
|
||||
def __init__(self, vocab, model=True, **cfg):
|
||||
self.vocab = vocab
|
||||
|
@ -890,6 +945,15 @@ class TextCategorizer(Pipe):
|
|||
if label in self.labels:
|
||||
return 0
|
||||
if self.model not in (None, True, False):
|
||||
# This functionality was available previously, but was broken.
|
||||
# The problem is that we resize the last layer, but the last layer
|
||||
# is actually just an ensemble. We're not resizing the child layers
|
||||
# -- a huge problem.
|
||||
raise ValueError(
|
||||
"Cannot currently add labels to pre-trained text classifier. "
|
||||
"Add labels before training begins. This functionality was "
|
||||
"available in previous versions, but had significant bugs that "
|
||||
"let to poor performance")
|
||||
smaller = self.model._layers[-1]
|
||||
larger = Affine(len(self.labels)+1, smaller.nI)
|
||||
copy_array(larger.W[:smaller.nO], smaller.W)
|
||||
|
@ -905,8 +969,9 @@ class TextCategorizer(Pipe):
|
|||
token_vector_width = 64
|
||||
if self.model is True:
|
||||
self.cfg['pretrained_dims'] = self.vocab.vectors_length
|
||||
self.model = self.Model(len(self.labels), token_vector_width,
|
||||
**self.cfg)
|
||||
self.cfg['nr_class'] = len(self.labels)
|
||||
self.cfg['width'] = token_vector_width
|
||||
self.model = self.Model(**self.cfg)
|
||||
link_vectors_to_models(self.vocab)
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
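Because nr_class is now baked into the cfg when the model is built, labels have to be added before training starts; a hedged sketch of that order (the pipe setup and label names are illustrative):
import spacy

nlp = spacy.blank('en')
textcat = nlp.create_pipe('textcat')
nlp.add_pipe(textcat)
textcat.add_label('POSITIVE')
textcat.add_label('NEGATIVE')
optimizer = nlp.begin_training()   # model is built here with nr_class == 2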
|
||||
|
@ -920,7 +985,7 @@ cdef class DependencyParser(Parser):
|
|||
@property
|
||||
def postprocesses(self):
|
||||
return [nonproj.deprojectivize]
|
||||
|
||||
|
||||
def add_multitask_objective(self, target):
|
||||
labeller = MultitaskObjective(self.vocab, target=target)
|
||||
self._multitasks.append(labeller)
|
||||
|
@ -941,7 +1006,7 @@ cdef class EntityRecognizer(Parser):
|
|||
TransitionSystem = BiluoPushDown
|
||||
|
||||
nr_feature = 6
|
||||
|
||||
|
||||
def add_multitask_objective(self, target):
|
||||
labeller = MultitaskObjective(self.vocab, target=target)
|
||||
self._multitasks.append(labeller)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# coding: utf8
|
||||
from __future__ import division, print_function, unicode_literals
|
||||
|
||||
from .gold import tags_to_entities
|
||||
from .gold import tags_to_entities, GoldParse
|
||||
|
||||
|
||||
class PRFScore(object):
|
||||
|
@ -84,6 +84,8 @@ class Scorer(object):
|
|||
}
|
||||
|
||||
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
|
||||
if len(tokens) != len(gold):
|
||||
gold = GoldParse.from_annot_tuples(tokens, zip(*gold.orig_annot))
|
||||
assert len(tokens) == len(gold)
|
||||
gold_deps = set()
|
||||
gold_tags = set()
|
||||
|
@ -100,8 +102,7 @@ class Scorer(object):
|
|||
continue
|
||||
gold_i = gold.cand_to_gold[token.i]
|
||||
if gold_i is None:
|
||||
if token.dep_.lower() not in punct_labels:
|
||||
self.tokens.fp += 1
|
||||
self.tokens.fp += 1
|
||||
else:
|
||||
self.tokens.tp += 1
|
||||
cand_tags.add((gold_i, token.tag_))
|
||||
|
|
|
@ -85,6 +85,7 @@ cdef enum symbol_t:
|
|||
SENT_START
|
||||
SPACY
|
||||
PROB
|
||||
LANG
|
||||
|
||||
ADJ
|
||||
ADP
|
||||
|
@ -108,8 +109,9 @@ cdef enum symbol_t:
|
|||
SPACE
|
||||
|
||||
Animacy_anim
|
||||
Animacy_inam
|
||||
Animacy_inan
|
||||
Animacy_hum # U20
|
||||
Animacy_nhum
|
||||
Aspect_freq
|
||||
Aspect_imp
|
||||
Aspect_mod
|
||||
|
@ -393,6 +395,7 @@ cdef enum symbol_t:
|
|||
EVENT
|
||||
WORK_OF_ART
|
||||
LANGUAGE
|
||||
LAW
|
||||
|
||||
DATE
|
||||
TIME
|
||||
|
@ -451,10 +454,9 @@ cdef enum symbol_t:
|
|||
prt
|
||||
punct
|
||||
quantmod
|
||||
relcl
|
||||
rcmod
|
||||
root
|
||||
xcomp
|
||||
|
||||
acl
|
||||
LAW
|
||||
LANG
|
||||
|
|
|
@ -114,8 +114,9 @@ IDS = {
|
|||
"SPACE": SPACE,
|
||||
|
||||
"Animacy_anim": Animacy_anim,
|
||||
"Animacy_inam": Animacy_inam,
|
||||
"Animacy_inam": Animacy_inan,
|
||||
"Animacy_hum": Animacy_hum, # U20
|
||||
"Animacy_nhum": Animacy_nhum,
|
||||
"Aspect_freq": Aspect_freq,
|
||||
"Aspect_imp": Aspect_imp,
|
||||
"Aspect_mod": Aspect_mod,
|
||||
|
@ -458,6 +459,7 @@ IDS = {
|
|||
"punct": punct,
|
||||
"quantmod": quantmod,
|
||||
"rcmod": rcmod,
|
||||
"relcl": relcl,
|
||||
"root": root,
|
||||
"xcomp": xcomp,
|
||||
|
||||
|
|
|
@ -108,7 +108,7 @@ cdef cppclass StateC:
|
|||
ids[1] = this.B(1)
|
||||
ids[2] = this.S(0)
|
||||
ids[3] = this.S(1)
|
||||
ids[4] = this.H(this.S(0))
|
||||
ids[4] = this.S(2)
|
||||
ids[5] = this.L(this.B(0), 1)
|
||||
ids[6] = this.L(this.S(0), 1)
|
||||
ids[7] = this.R(this.S(0), 1)
|
||||
|
|
|
@ -6,16 +6,19 @@ from __future__ import unicode_literals
|
|||
|
||||
from cpython.ref cimport Py_INCREF
|
||||
from cymem.cymem cimport Pool
|
||||
from collections import OrderedDict
|
||||
from collections import OrderedDict, defaultdict, Counter
|
||||
from thinc.extra.search cimport Beam
|
||||
import json
|
||||
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
from .nonproj import is_nonproj_tree
|
||||
from . import nonproj
|
||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||
from ..gold cimport GoldParse, GoldParseC
|
||||
from ..structs cimport TokenC
|
||||
|
||||
# Calculate cost as gold/not gold. We don't use scalar value anyway.
|
||||
cdef int BINARY_COSTS = 1
|
||||
|
||||
DEF NON_MONOTONIC = True
|
||||
DEF USE_BREAK = True
|
||||
|
@ -54,6 +57,8 @@ cdef weight_t push_cost(StateClass stcls, const GoldParseC* gold, int target) no
|
|||
cost += 1
|
||||
if gold.heads[S_i] == target and (NON_MONOTONIC or not stcls.has_head(S_i)):
|
||||
cost += 1
|
||||
if BINARY_COSTS and cost >= 1:
|
||||
return cost
|
||||
cost += Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0
|
||||
return cost
|
||||
|
||||
|
@ -67,6 +72,8 @@ cdef weight_t pop_cost(StateClass stcls, const GoldParseC* gold, int target) nog
|
|||
cost += gold.heads[target] == B_i
|
||||
if gold.heads[B_i] == B_i or gold.heads[B_i] < target:
|
||||
break
|
||||
if BINARY_COSTS and cost >= 1:
|
||||
return cost
|
||||
if Break.is_valid(stcls.c, 0) and Break.move_cost(stcls, gold) == 0:
|
||||
cost += 1
|
||||
return cost
|
||||
|
@ -110,7 +117,8 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
|
|||
cdef class Shift:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and st.B_(0).sent_start != 1
|
||||
sent_start = st._sent[st.B_(0).l_edge].sent_start
|
||||
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and sent_start != 1
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
|
@ -170,7 +178,8 @@ cdef class Reduce:
|
|||
cdef class LeftArc:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return st.B_(0).sent_start != 1
|
||||
sent_start = st._sent[st.B_(0).l_edge].sent_start
|
||||
return sent_start != 1
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
|
@ -205,7 +214,8 @@ cdef class RightArc:
|
|||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
# If there's a (perhaps partial) parse pre-set, don't allow a cycle.
|
||||
return st.B_(0).sent_start != 1 and st.H(st.S(0)) != st.B(0)
|
||||
sent_start = st._sent[st.B_(0).l_edge].sent_start
|
||||
return sent_start != 1 and st.H(st.S(0)) != st.B(0)
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
|
@ -312,39 +322,42 @@ cdef class ArcEager(TransitionSystem):
|
|||
|
||||
@classmethod
|
||||
def get_actions(cls, **kwargs):
|
||||
actions = kwargs.get('actions', OrderedDict((
|
||||
(SHIFT, ['']),
|
||||
(REDUCE, ['']),
|
||||
(RIGHT, []),
|
||||
(LEFT, []),
|
||||
(BREAK, ['ROOT']))
|
||||
))
|
||||
seen_actions = set()
|
||||
min_freq = kwargs.get('min_freq', None)
|
||||
actions = defaultdict(lambda: Counter())
|
||||
actions[SHIFT][''] = 1
|
||||
actions[REDUCE][''] = 1
|
||||
for label in kwargs.get('left_labels', []):
|
||||
if label.upper() != 'ROOT':
|
||||
if (LEFT, label) not in seen_actions:
|
||||
actions[LEFT].append(label)
|
||||
seen_actions.add((LEFT, label))
|
||||
actions[LEFT][label] = 1
|
||||
actions[SHIFT][label] = 1
|
||||
for label in kwargs.get('right_labels', []):
|
||||
if label.upper() != 'ROOT':
|
||||
if (RIGHT, label) not in seen_actions:
|
||||
actions[RIGHT].append(label)
|
||||
seen_actions.add((RIGHT, label))
|
||||
|
||||
actions[RIGHT][label] = 1
|
||||
actions[REDUCE][label] = 1
|
||||
for raw_text, sents in kwargs.get('gold_parses', []):
|
||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||
heads, labels = nonproj.projectivize(heads, labels)
|
||||
for child, head, label in zip(ids, heads, labels):
|
||||
if label.upper() == 'ROOT':
|
||||
if label.upper() == 'ROOT':
|
||||
label = 'ROOT'
|
||||
if label != 'ROOT':
|
||||
if head < child:
|
||||
if (RIGHT, label) not in seen_actions:
|
||||
actions[RIGHT].append(label)
|
||||
seen_actions.add((RIGHT, label))
|
||||
elif head > child:
|
||||
if (LEFT, label) not in seen_actions:
|
||||
actions[LEFT].append(label)
|
||||
seen_actions.add((LEFT, label))
|
||||
if head == child:
|
||||
actions[BREAK][label] += 1
|
||||
elif head < child:
|
||||
actions[RIGHT][label] += 1
|
||||
actions[REDUCE][''] += 1
|
||||
elif head > child:
|
||||
actions[LEFT][label] += 1
|
||||
actions[SHIFT][''] += 1
|
||||
if min_freq is not None:
|
||||
for action, label_freqs in actions.items():
|
||||
for label, freq in list(label_freqs.items()):
|
||||
if freq < min_freq:
|
||||
label_freqs.pop(label)
|
||||
# Ensure these actions are present
|
||||
actions[BREAK].setdefault('ROOT', 0)
|
||||
actions[RIGHT].setdefault('subtok', 0)
|
||||
actions[LEFT].setdefault('subtok', 0)
|
||||
# Used for backoff
|
||||
actions[RIGHT].setdefault('dep', 0)
|
||||
actions[LEFT].setdefault('dep', 0)
|
||||
return actions
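A hedged pure-Python illustration of the structure returned here: one Counter of label frequencies per move type, optionally thresholded by min_freq. Move names are shown as strings rather than the real constants.
from collections import Counter, defaultdict

actions = defaultdict(Counter)
actions['SHIFT'][''] = 1
actions['LEFT'].update(['nsubj', 'nsubj', 'amod'])   # head index greater than child
actions['RIGHT'].update(['dobj'])                    # head index less than child
actions['BREAK']['ROOT'] += 1                        # head == child
min_freq = 2
for move, label_freqs in actions.items():
    for label, freq in list(label_freqs.items()):
        if freq < min_freq:
            label_freqs.pop(label)
# Only labels seen at least min_freq times remain, e.g. actions['LEFT'] == Counter({'nsubj': 2})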
|
||||
|
||||
property action_types:
|
||||
|
@ -376,18 +389,34 @@ cdef class ArcEager(TransitionSystem):
|
|||
def preprocess_gold(self, GoldParse gold):
|
||||
if not self.has_gold(gold):
|
||||
return None
|
||||
for i in range(gold.length):
|
||||
for i, (head, dep) in enumerate(zip(gold.heads, gold.labels)):
|
||||
# Missing values
|
||||
if gold.heads[i] is None or gold.labels[i] is None:
|
||||
if head is None or dep is None:
|
||||
gold.c.heads[i] = i
|
||||
gold.c.has_dep[i] = False
|
||||
else:
|
||||
label = gold.labels[i]
|
||||
if head > i:
|
||||
action = LEFT
|
||||
elif head < i:
|
||||
action = RIGHT
|
||||
else:
|
||||
action = BREAK
|
||||
if dep not in self.labels[action]:
|
||||
if action == BREAK:
|
||||
dep = 'ROOT'
|
||||
elif nonproj.is_decorated(dep):
|
||||
backoff = nonproj.decompose(dep)[0]
|
||||
if backoff in self.labels[action]:
|
||||
dep = backoff
|
||||
else:
|
||||
dep = 'dep'
|
||||
else:
|
||||
dep = 'dep'
|
||||
gold.c.has_dep[i] = True
|
||||
if label.upper() == 'ROOT':
|
||||
label = 'ROOT'
|
||||
gold.c.heads[i] = gold.heads[i]
|
||||
gold.c.labels[i] = self.strings.add(label)
|
||||
if dep.upper() == 'ROOT':
|
||||
dep = 'ROOT'
|
||||
gold.c.heads[i] = head
|
||||
gold.c.labels[i] = self.strings.add(dep)
|
||||
return gold
|
||||
|
||||
def get_beam_parses(self, Beam beam):
|
||||
|
@ -527,8 +556,13 @@ cdef class ArcEager(TransitionSystem):
|
|||
is_valid[i] = False
|
||||
costs[i] = 9000
|
||||
if n_gold < 1:
|
||||
# Check projectivity --- leading cause
|
||||
if is_nonproj_tree(gold.heads):
|
||||
# Check label set --- leading cause
|
||||
label_set = set([self.strings[self.c[i].label] for i in range(self.n_moves)])
|
||||
for label_str in gold.labels:
|
||||
if label_str is not None and label_str not in label_set:
|
||||
raise ValueError("Cannot get gold parser action: unknown label: %s" % label_str)
|
||||
# Check projectivity --- other leading cause
|
||||
if nonproj.is_nonproj_tree(gold.heads):
|
||||
raise ValueError(
|
||||
"Could not find a gold-standard action to supervise the "
|
||||
"dependency parser. Likely cause: the tree is "
|
||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
|||
|
||||
from thinc.typedefs cimport weight_t
|
||||
from thinc.extra.search cimport Beam
|
||||
from collections import OrderedDict
|
||||
from collections import OrderedDict, Counter
|
||||
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
|
@ -64,21 +64,18 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
|
||||
@classmethod
|
||||
def get_actions(cls, **kwargs):
|
||||
actions = kwargs.get('actions', OrderedDict((
|
||||
(MISSING, ['']),
|
||||
(BEGIN, []),
|
||||
(IN, []),
|
||||
(LAST, []),
|
||||
(UNIT, []),
|
||||
(OUT, [''])
|
||||
)))
|
||||
seen_entities = set()
|
||||
actions = {
|
||||
MISSING: Counter(),
|
||||
BEGIN: Counter(),
|
||||
IN: Counter(),
|
||||
LAST: Counter(),
|
||||
UNIT: Counter(),
|
||||
OUT: Counter()
|
||||
}
|
||||
actions[OUT][''] = 1
|
||||
for entity_type in kwargs.get('entity_types', []):
|
||||
if entity_type in seen_entities:
|
||||
continue
|
||||
seen_entities.add(entity_type)
|
||||
for action in (BEGIN, IN, LAST, UNIT):
|
||||
actions[action].append(entity_type)
|
||||
actions[action][entity_type] = 1
|
||||
moves = ('M', 'B', 'I', 'L', 'U')
|
||||
for raw_text, sents in kwargs.get('gold_parses', []):
|
||||
for (ids, words, tags, heads, labels, biluo), _ in sents:
|
||||
|
@ -87,10 +84,8 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
if ner_tag.count('-') != 1:
|
||||
raise ValueError(ner_tag)
|
||||
_, label = ner_tag.split('-')
|
||||
if label not in seen_entities:
|
||||
seen_entities.add(label)
|
||||
for move_str in ('B', 'I', 'L', 'U'):
|
||||
actions[moves.index(move_str)].append(label)
|
||||
for action in (BEGIN, IN, LAST, UNIT):
|
||||
actions[action][label] += 1
|
||||
return actions
|
||||
|
||||
property action_types:
|
||||
|
@ -213,7 +208,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
raise Exception(move)
|
||||
return t
|
||||
|
||||
def add_action(self, int action, label_name):
|
||||
def add_action(self, int action, label_name, freq=None):
|
||||
cdef attr_t label_id
|
||||
if not isinstance(label_name, (int, long)):
|
||||
label_id = self.strings.add(label_name)
|
||||
|
@ -234,6 +229,12 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
|
||||
assert self.c[self.n_moves].label == label_id
|
||||
self.n_moves += 1
|
||||
if self.labels.get(action, []):
|
||||
freq = min(0, min(self.labels[action].values()))
|
||||
self.labels[action][label_name] = freq-1
|
||||
else:
|
||||
self.labels[action] = Counter()
|
||||
self.labels[action][label_name] = -1
|
||||
return 1
|
||||
|
||||
cdef int initialize_state(self, StateC* st) nogil:
|
||||
|
|
|
@ -15,7 +15,7 @@ cdef class Parser:
|
|||
cdef readonly object cfg
|
||||
cdef public object _multitasks
|
||||
|
||||
cdef void _parseC(self, StateC* state,
|
||||
cdef void _parseC(self, StateC** states, int nr_task,
|
||||
const float* feat_weights, const float* bias,
|
||||
const float* hW, const float* hb,
|
||||
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# cython: infer_types=True
|
||||
# cython: cdivision=True
|
||||
# cython: boundscheck=False
|
||||
# cython: profile=True
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
|
@ -28,6 +27,8 @@ from thinc.misc import LayerNorm
|
|||
from thinc.neural.ops import CupyOps
|
||||
from thinc.neural.util import get_array_module
|
||||
from thinc.linalg cimport Vec, VecVec
|
||||
from thinc cimport openblas
|
||||
|
||||
|
||||
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
|
||||
from .._ml import link_vectors_to_models, create_default_optimizer
|
||||
|
@ -266,7 +267,7 @@ cdef class Parser:
|
|||
|
||||
with Model.use_device('cpu'):
|
||||
upper = chain(
|
||||
clone(LayerNorm(Maxout(hidden_width, hidden_width)), depth-1),
|
||||
clone(Maxout(hidden_width, hidden_width), depth-1),
|
||||
zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
|
||||
)
|
||||
|
||||
|
@ -302,7 +303,7 @@ cdef class Parser:
|
|||
"""
|
||||
self.vocab = vocab
|
||||
if moves is True:
|
||||
self.moves = self.TransitionSystem(self.vocab.strings, {})
|
||||
self.moves = self.TransitionSystem(self.vocab.strings)
|
||||
else:
|
||||
self.moves = moves
|
||||
if 'beam_width' not in cfg:
|
||||
|
@ -311,12 +312,7 @@ cdef class Parser:
|
|||
cfg['beam_density'] = util.env_opt('beam_density', 0.0)
|
||||
if 'pretrained_dims' not in cfg:
|
||||
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
|
||||
cfg.setdefault('cnn_maxout_pieces', 3)
|
||||
self.cfg = cfg
|
||||
if 'actions' in self.cfg:
|
||||
for action, labels in self.cfg.get('actions', {}).items():
|
||||
for label in labels:
|
||||
self.moves.add_action(action, label)
|
||||
self.model = model
|
||||
self._multitasks = []
|
||||
|
||||
|
@ -423,69 +419,81 @@ cdef class Parser:
|
|||
cdef int nr_hidden = hidden_weights.shape[0]
|
||||
cdef int nr_task = states.size()
|
||||
with nogil:
|
||||
for i in range(nr_task):
|
||||
self._parseC(states[i],
|
||||
feat_weights, bias, hW, hb,
|
||||
nr_class, nr_hidden, nr_feat, nr_piece)
|
||||
self._parseC(&states[0], nr_task, feat_weights, bias, hW, hb,
|
||||
nr_class, nr_hidden, nr_feat, nr_piece)
|
||||
PyErr_CheckSignals()
|
||||
tokvecs = self.model[0].ops.unflatten(tokvecs,
|
||||
[len(doc) for doc in docs])
|
||||
return state_objs, tokvecs
|
||||
|
||||
cdef void _parseC(self, StateC* state,
|
||||
cdef void _parseC(self, StateC** states, int nr_task,
|
||||
const float* feat_weights, const float* bias,
|
||||
const float* hW, const float* hb,
|
||||
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
|
||||
token_ids = <int*>calloc(nr_feat, sizeof(int))
|
||||
is_valid = <int*>calloc(nr_class, sizeof(int))
|
||||
vectors = <float*>calloc(nr_hidden * nr_piece, sizeof(float))
|
||||
scores = <float*>calloc(nr_class, sizeof(float))
|
||||
vectors = <float*>calloc(nr_hidden * nr_task, sizeof(float))
|
||||
unmaxed = <float*>calloc(nr_hidden * nr_piece, sizeof(float))
|
||||
scores = <float*>calloc(nr_class*nr_task, sizeof(float))
|
||||
if not (token_ids and is_valid and vectors and scores):
|
||||
with gil:
|
||||
PyErr_SetFromErrno(MemoryError)
|
||||
PyErr_CheckSignals()
|
||||
cdef float feature
|
||||
while not state.is_final():
|
||||
state.set_context_tokens(token_ids, nr_feat)
|
||||
memset(vectors, 0, nr_hidden * nr_piece * sizeof(float))
|
||||
memset(scores, 0, nr_class * sizeof(float))
|
||||
sum_state_features(vectors,
|
||||
feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
|
||||
for i in range(nr_hidden * nr_piece):
|
||||
vectors[i] += bias[i]
|
||||
V = vectors
|
||||
W = hW
|
||||
for i in range(nr_hidden):
|
||||
if nr_piece == 1:
|
||||
feature = V[0] if V[0] >= 0. else 0.
|
||||
elif nr_piece == 2:
|
||||
feature = V[0] if V[0] >= V[1] else V[1]
|
||||
else:
|
||||
feature = Vec.max(V, nr_piece)
|
||||
for j in range(nr_class):
|
||||
scores[j] += feature * W[j]
|
||||
W += nr_class
|
||||
V += nr_piece
|
||||
for i in range(nr_class):
|
||||
scores[i] += hb[i]
|
||||
self.moves.set_valid(is_valid, state)
|
||||
guess = arg_max_if_valid(scores, is_valid, nr_class)
|
||||
action = self.moves.c[guess]
|
||||
action.do(state, action.label)
|
||||
state.push_hist(guess)
|
||||
cdef int nr_todo = nr_task
|
||||
cdef int i, j
|
||||
cdef vector[StateC*] unfinished
|
||||
while nr_todo >= 1:
|
||||
memset(vectors, 0, nr_todo * nr_hidden * sizeof(float))
|
||||
memset(scores, 0, nr_todo * nr_class * sizeof(float))
|
||||
for i in range(nr_todo):
|
||||
state = states[i]
|
||||
state.set_context_tokens(token_ids, nr_feat)
|
||||
memset(unmaxed, 0, nr_hidden * nr_piece * sizeof(float))
|
||||
sum_state_features(unmaxed,
|
||||
feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
|
||||
VecVec.add_i(unmaxed,
|
||||
bias, 1., nr_hidden*nr_piece)
|
||||
state_vector = &vectors[i*nr_hidden]
|
||||
for j in range(nr_hidden):
|
||||
index = j * nr_piece
|
||||
which = Vec.arg_max(&unmaxed[index], nr_piece)
|
||||
state_vector[j] = unmaxed[index + which]
|
||||
# Compute hidden-to-output
|
||||
openblas.simple_gemm(scores, nr_todo, nr_class,
|
||||
vectors, nr_todo, nr_hidden, hW, nr_hidden, nr_class, 0, 0)
|
||||
# Add bias
|
||||
for i in range(nr_todo):
|
||||
VecVec.add_i(&scores[i*nr_class],
|
||||
hb, 1., nr_class)
|
||||
# Validate actions, argmax, take action.
|
||||
for i in range(nr_todo):
|
||||
state = states[i]
|
||||
self.moves.set_valid(is_valid, state)
|
||||
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
|
||||
action = self.moves.c[guess]
|
||||
action.do(state, action.label)
|
||||
state.push_hist(guess)
|
||||
if not state.is_final():
|
||||
unfinished.push_back(state)
|
||||
for i in range(unfinished.size()):
|
||||
states[i] = unfinished[i]
|
||||
nr_todo = unfinished.size()
|
||||
unfinished.clear()
|
||||
free(token_ids)
|
||||
free(is_valid)
|
||||
free(vectors)
|
||||
free(unmaxed)
|
||||
free(scores)
|
||||
|
||||
def beam_parse(self, docs, int beam_width=3, float beam_density=0.001):
|
||||
def beam_parse(self, docs, int beam_width=3, float beam_density=0.001,
|
||||
float drop=0.):
|
||||
cdef Beam beam
|
||||
cdef np.ndarray scores
|
||||
cdef Doc doc
|
||||
cdef int nr_class = self.moves.n_moves
|
||||
cuda_stream = util.get_cuda_stream()
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||
docs, cuda_stream, 0.0)
|
||||
docs, cuda_stream, drop)
|
||||
cdef int offset = 0
|
||||
cdef int j = 0
|
||||
cdef int k
|
||||
|
@ -524,8 +532,8 @@ cdef class Parser:
|
|||
n_states += 1
|
||||
if n_states == 0:
|
||||
break
|
||||
vectors = state2vec(token_ids[:n_states])
|
||||
scores = vec2scores(vectors)
|
||||
vectors, _ = state2vec.begin_update(token_ids[:n_states], drop)
|
||||
scores, _ = vec2scores.begin_update(vectors, drop=drop)
|
||||
c_scores = <float*>scores.data
|
||||
for beam in todo:
|
||||
for i in range(beam.size):
|
||||
|
@ -556,7 +564,10 @@ cdef class Parser:
|
|||
for multitask in self._multitasks:
|
||||
multitask.update(docs, golds, drop=drop, sgd=sgd)
|
||||
cuda_stream = util.get_cuda_stream()
|
||||
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
||||
# Chop sequences into lengths of this many transitions, to make the
|
||||
# batch a uniform length.
|
||||
cut_gold = numpy.random.choice(range(20, 100))
|
||||
states, golds, max_steps = self._init_gold_batch(docs, golds, max_length=cut_gold)
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||
drop)
|
||||
todo = [(s, g) for (s, g) in zip(states, golds)
|
||||
|
@ -659,8 +670,7 @@ cdef class Parser:
|
|||
for beam in beams:
|
||||
_cleanup(beam)
|
||||
|
||||
|
||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||
def _init_gold_batch(self, whole_docs, whole_golds, min_length=5, max_length=500):
|
||||
"""Make a square batch, of length equal to the shortest doc. A long
|
||||
doc will get multiple states. Let's say we have a doc of length 2*N,
|
||||
where N is the shortest doc. We'll make two states, one representing
|
||||
|
@ -669,7 +679,7 @@ cdef class Parser:
|
|||
StateClass state
|
||||
Transition action
|
||||
whole_states = self.moves.init_batch(whole_docs)
|
||||
max_length = max(5, min(50, min([len(doc) for doc in whole_docs])))
|
||||
max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs])))
|
||||
max_moves = 0
|
||||
states = []
|
||||
golds = []
|
||||
|
@ -791,6 +801,11 @@ cdef class Parser:
|
|||
for doc in docs:
|
||||
hook(doc)
|
||||
|
||||
@property
|
||||
def labels(self):
|
||||
class_names = [self.moves.get_class_name(i) for i in range(self.moves.n_moves)]
|
||||
return class_names
|
||||
|
||||
@property
|
||||
def tok2vec(self):
|
||||
'''Return the embedding and convolutional layer of the model.'''
|
||||
|
@ -809,9 +824,6 @@ cdef class Parser:
|
|||
for action in self.moves.action_types:
|
||||
added = self.moves.add_action(action, label)
|
||||
if added:
|
||||
# Important that the labels be stored as a list! We need the
|
||||
# order, or the model goes out of synch
|
||||
self.cfg.setdefault('extra_labels', []).append(label)
|
||||
resized = True
|
||||
if self.model not in (True, False, None) and resized:
|
||||
# Weights are stored in (nr_out, nr_in) format, so we're basically
|
||||
|
|
|
@ -9,7 +9,7 @@ from __future__ import unicode_literals
|
|||
|
||||
from copy import copy
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..tokens.doc cimport Doc, set_children_from_heads
|
||||
|
||||
|
||||
DELIMITER = '||'
|
||||
|
@ -74,7 +74,21 @@ def decompose(label):
|
|||
|
||||
|
||||
def is_decorated(label):
|
||||
return label.find(DELIMITER) != -1
|
||||
return DELIMITER in label
|
||||
|
||||
def count_decorated_labels(gold_tuples):
|
||||
freqs = {}
|
||||
for raw_text, sents in gold_tuples:
|
||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||
proj_heads, deco_labels = projectivize(heads, labels)
|
||||
# set the label to ROOT for each root dependent
|
||||
deco_labels = ['ROOT' if head == i else deco_labels[i]
|
||||
for i, head in enumerate(proj_heads)]
|
||||
# count label frequencies
|
||||
for label in deco_labels:
|
||||
if is_decorated(label):
|
||||
freqs[label] = freqs.get(label, 0) + 1
|
||||
return freqs
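A pure-Python illustration of the decorated-label convention counted here: projectivization may rewrite a label as 'label||head_label', and is_decorated()/decompose() detect and split that form. The functions below are simplified stand-ins for the module's own helpers.
DELIMITER = '||'

def is_decorated(label):
    return DELIMITER in label

def decompose(label):
    return label.split(DELIMITER)

assert is_decorated('advmod||conj')
assert decompose('advmod||conj') == ['advmod', 'conj']
assert decompose('nsubj') == ['nsubj']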
|
||||
|
||||
|
||||
def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
|
||||
|
@ -124,8 +138,9 @@ cpdef deprojectivize(Doc doc):
|
|||
if DELIMITER in label:
|
||||
new_label, head_label = label.split(DELIMITER)
|
||||
new_head = _find_new_head(doc[i], head_label)
|
||||
doc[i].head = new_head
|
||||
doc.c[i].head = new_head.i - i
|
||||
doc.c[i].dep = doc.vocab.strings.add(new_label)
|
||||
set_children_from_heads(doc.c, doc.length)
|
||||
return doc
|
||||
|
||||
|
||||
|
@ -191,9 +206,12 @@ def _filter_labels(gold_tuples, cutoff, freqs):
|
|||
for raw_text, sents in gold_tuples:
|
||||
filtered_sents = []
|
||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||
filtered_labels = [decompose(label)[0]
|
||||
if freqs.get(label, cutoff) < cutoff
|
||||
else label for label in labels]
|
||||
filtered_labels = []
|
||||
for label in labels:
|
||||
if is_decorated(label) and freqs.get(label, 0) < cutoff:
|
||||
filtered_labels.append(decompose(label)[0])
|
||||
else:
|
||||
filtered_labels.append(label)
|
||||
filtered_sents.append(
|
||||
((ids, words, tags, heads, filtered_labels, iob), ctnts))
|
||||
filtered.append((raw_text, filtered_sents))
|
||||
|
|
|
@ -42,6 +42,7 @@ cdef class TransitionSystem:
|
|||
cdef public attr_t root_label
|
||||
cdef public freqs
|
||||
cdef init_state_t init_beam_state
|
||||
cdef public object labels
|
||||
|
||||
cdef int initialize_state(self, StateC* state) nogil
|
||||
cdef int finalize_state(self, StateC* state) nogil
|
||||
|
|
|
@ -5,7 +5,7 @@ from __future__ import unicode_literals
|
|||
from cpython.ref cimport Py_INCREF
|
||||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport weight_t
|
||||
from collections import OrderedDict
|
||||
from collections import OrderedDict, Counter
|
||||
import ujson
|
||||
|
||||
from ..structs cimport TokenC
|
||||
|
@ -28,7 +28,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
|||
|
||||
|
||||
cdef class TransitionSystem:
|
||||
def __init__(self, StringStore string_table, labels_by_action):
|
||||
def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
|
||||
self.mem = Pool()
|
||||
self.strings = string_table
|
||||
self.n_moves = 0
|
||||
|
@ -36,21 +36,14 @@ cdef class TransitionSystem:
|
|||
|
||||
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
|
||||
|
||||
for action, label_strs in labels_by_action.items():
|
||||
for label_str in label_strs:
|
||||
self.add_action(int(action), label_str)
|
||||
self.labels = {}
|
||||
if labels_by_action:
|
||||
self.initialize_actions(labels_by_action, min_freq=min_freq)
|
||||
self.root_label = self.strings.add('ROOT')
|
||||
self.init_beam_state = _init_state
|
||||
|
||||
def __reduce__(self):
|
||||
labels_by_action = OrderedDict()
|
||||
cdef Transition t
|
||||
for trans in self.c[:self.n_moves]:
|
||||
label_str = self.strings[trans.label]
|
||||
labels_by_action.setdefault(trans.move, []).append(label_str)
|
||||
return (self.__class__,
|
||||
(self.strings, labels_by_action),
|
||||
None, None)
|
||||
return (self.__class__, (self.strings, self.labels), None, None)
|
||||
|
||||
def init_batch(self, docs):
|
||||
cdef StateClass state
|
||||
|
@ -146,6 +139,22 @@ cdef class TransitionSystem:
|
|||
act = self.c[clas]
|
||||
return self.move_name(act.move, act.label)
|
||||
|
||||
def initialize_actions(self, labels_by_action, min_freq=None):
|
||||
self.labels = {}
|
||||
self.n_moves = 0
|
||||
for action, label_freqs in sorted(labels_by_action.items()):
|
||||
action = int(action)
|
||||
# Make sure we take a copy here, and that we get a Counter
|
||||
self.labels[action] = Counter()
|
||||
# Have to be careful here: Sorting must be stable, or our model
|
||||
# won't be read back in correctly.
|
||||
sorted_labels = [(f, L) for L, f in label_freqs.items()]
|
||||
sorted_labels.sort()
|
||||
sorted_labels.reverse()
|
||||
for freq, label_str in sorted_labels:
|
||||
self.add_action(int(action), label_str)
|
||||
self.labels[action][label_str] = freq
|
||||
|
||||
def add_action(self, int action, label_name):
|
||||
cdef attr_t label_id
|
||||
if not isinstance(label_name, int) and \
|
||||
|
@ -164,6 +173,14 @@ cdef class TransitionSystem:
|
|||
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
|
||||
assert self.c[self.n_moves].label == label_id
|
||||
self.n_moves += 1
|
||||
if self.labels.get(action, []):
|
||||
new_freq = min(self.labels[action].values())
|
||||
else:
|
||||
self.labels[action] = Counter()
|
||||
new_freq = -1
|
||||
if new_freq > 0:
|
||||
new_freq = 0
|
||||
self.labels[action][label_name] = new_freq-1
|
||||
return 1
|
||||
|
||||
def to_disk(self, path, **exclude):
|
||||
|
@ -178,26 +195,18 @@ cdef class TransitionSystem:
|
|||
|
||||
def to_bytes(self, **exclude):
|
||||
transitions = []
|
||||
for trans in self.c[:self.n_moves]:
|
||||
transitions.append({
|
||||
'clas': trans.clas,
|
||||
'move': trans.move,
|
||||
'label': self.strings[trans.label],
|
||||
'name': self.move_name(trans.move, trans.label)
|
||||
})
|
||||
serializers = {
|
||||
'transitions': lambda: json_dumps(transitions),
|
||||
'moves': lambda: json_dumps(self.labels),
|
||||
'strings': lambda: self.strings.to_bytes()
|
||||
}
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
transitions = []
|
||||
labels = {}
|
||||
deserializers = {
|
||||
'transitions': lambda b: transitions.extend(ujson.loads(b)),
|
||||
'moves': lambda b: labels.update(ujson.loads(b)),
|
||||
'strings': lambda b: self.strings.from_bytes(b)
|
||||
}
|
||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||
for trans in transitions:
|
||||
self.add_action(trans['move'], trans['label'])
|
||||
self.initialize_actions(labels)
|
||||
return self
|
||||
|
|
|
@ -19,6 +19,15 @@ def doc(en_tokenizer):
|
|||
return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def doc_not_parsed(en_tokenizer):
|
||||
text = "This is a sentence. This is another sentence. And a third."
|
||||
tokens = en_tokenizer(text)
|
||||
d = get_doc(tokens.vocab, [t.text for t in tokens])
|
||||
d.is_parsed = False
|
||||
return d
|
||||
|
||||
|
||||
def test_spans_sent_spans(doc):
|
||||
sents = list(doc.sents)
|
||||
assert sents[0].start == 0
|
||||
|
@ -34,6 +43,7 @@ def test_spans_root(doc):
|
|||
assert span.root.text == 'sentence'
|
||||
assert span.root.head.text == 'is'
|
||||
|
||||
|
||||
def test_spans_string_fn(doc):
|
||||
span = doc[0:4]
|
||||
assert len(span) == 4
|
||||
|
@ -41,6 +51,7 @@ def test_spans_string_fn(doc):
|
|||
assert span.upper_ == 'THIS IS A SENTENCE'
|
||||
assert span.lower_ == 'this is a sentence'
|
||||
|
||||
|
||||
def test_spans_root2(en_tokenizer):
|
||||
text = "through North and South Carolina"
|
||||
heads = [0, 3, -1, -2, -4]
|
||||
|
@ -49,12 +60,17 @@ def test_spans_root2(en_tokenizer):
|
|||
assert doc[-2:].root.text == 'Carolina'
|
||||
|
||||
|
||||
def test_spans_span_sent(doc):
|
||||
def test_spans_span_sent(doc, doc_not_parsed):
|
||||
"""Test span.sent property"""
|
||||
assert len(list(doc.sents))
|
||||
assert doc[:2].sent.root.text == 'is'
|
||||
assert doc[:2].sent.text == 'This is a sentence .'
|
||||
assert doc[6:7].sent.root.left_edge.text == 'This'
|
||||
# test on manual sbd
|
||||
doc_not_parsed[0].is_sent_start = True
|
||||
doc_not_parsed[5].is_sent_start = True
|
||||
assert doc_not_parsed[1:3].sent == doc_not_parsed[0:5]
|
||||
assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
|
||||
|
||||
|
||||
def test_spans_lca_matrix(en_tokenizer):
|
||||
|
@ -129,7 +145,7 @@ def test_span_to_array(doc):
|
|||
assert arr[0, 1] == len(span[0])
|
||||
|
||||
|
||||
def test_span_as_doc(doc):
|
||||
span = doc[4:10]
|
||||
span_doc = span.as_doc()
|
||||
assert span.text == span_doc.text.strip()
|
||||
#def test_span_as_doc(doc):
|
||||
# span = doc[4:10]
|
||||
# span_doc = span.as_doc()
|
||||
# assert span.text == span_doc.text.strip()
|
||||
|
|
|
@ -1,36 +0,0 @@
|
|||
# coding: utf-8
|
||||
"""Find the min-cost alignment between two tokenizations"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...gold import _min_edit_path as min_edit_path
|
||||
from ...gold import align
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('cand,gold,path', [
|
||||
(["U.S", ".", "policy"], ["U.S.", "policy"], (0, 'MDM')),
|
||||
(["U.N", ".", "policy"], ["U.S.", "policy"], (1, 'SDM')),
|
||||
(["The", "cat", "sat", "down"], ["The", "cat", "sat", "down"], (0, 'MMMM')),
|
||||
(["cat", "sat", "down"], ["The", "cat", "sat", "down"], (1, 'IMMM')),
|
||||
(["The", "cat", "down"], ["The", "cat", "sat", "down"], (1, 'MMIM')),
|
||||
(["The", "cat", "sag", "down"], ["The", "cat", "sat", "down"], (1, 'MMSM'))])
|
||||
def test_gold_lev_align_edit_path(cand, gold, path):
|
||||
assert min_edit_path(cand, gold) == path
|
||||
|
||||
|
||||
def test_gold_lev_align_edit_path2():
|
||||
cand = ["your", "stuff"]
|
||||
gold = ["you", "r", "stuff"]
|
||||
assert min_edit_path(cand, gold) in [(2, 'ISM'), (2, 'SIM')]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('cand,gold,result', [
|
||||
(["U.S", ".", "policy"], ["U.S.", "policy"], [0, None, 1]),
|
||||
(["your", "stuff"], ["you", "r", "stuff"], [None, 2]),
|
||||
(["i", "like", "2", "guys", " ", "well", "id", "just", "come", "straight", "out"],
|
||||
["i", "like", "2", "guys", "well", "i", "d", "just", "come", "straight", "out"],
|
||||
[0, 1, 2, 3, None, 4, None, 7, 8, 9, 10])])
|
||||
def test_gold_lev_align(cand, gold, result):
|
||||
assert align(cand, gold) == result
|
|
@@ -2,9 +2,9 @@
from __future__ import unicode_literals

from ....parts_of_speech import SPACE
from ....compat import unicode_
from ...util import get_doc

import six
import pytest


@@ -24,8 +24,8 @@ def test_tag_names(EN):
    text = "I ate pizzas with anchovies."
    doc = EN(text, disable=['parser'])
    assert type(doc[2].pos) == int
    assert isinstance(doc[2].pos_, six.text_type)
    assert isinstance(doc[2].dep_, six.text_type)
    assert isinstance(doc[2].pos_, unicode_)
    assert isinstance(doc[2].dep_, unicode_)
    assert doc[2].tag_ == u'NNS'
75  spacy/tests/parser/test_arc_eager_oracle.py  Normal file

@@ -0,0 +1,75 @@
from __future__ import unicode_literals
from ...vocab import Vocab
from ...pipeline import DependencyParser
from ...tokens import Doc
from ...gold import GoldParse
from ...syntax.nonproj import projectivize

annot_tuples = [
    (0, 'When', 'WRB', 11, 'advmod', 'O'),
    (1, 'Walter', 'NNP', 2, 'compound', 'B-PERSON'),
    (2, 'Rodgers', 'NNP', 11, 'nsubj', 'L-PERSON'),
    (3, ',', ',', 2, 'punct', 'O'),
    (4, 'our', 'PRP$', 6, 'poss', 'O'),
    (5, 'embedded', 'VBN', 6, 'amod', 'O'),
    (6, 'reporter', 'NN', 2, 'appos', 'O'),
    (7, 'with', 'IN', 6, 'prep', 'O'),
    (8, 'the', 'DT', 10, 'det', 'B-ORG'),
    (9, '3rd', 'NNP', 10, 'compound', 'I-ORG'),
    (10, 'Cavalry', 'NNP', 7, 'pobj', 'L-ORG'),
    (11, 'says', 'VBZ', 44, 'advcl', 'O'),
    (12, 'three', 'CD', 13, 'nummod', 'U-CARDINAL'),
    (13, 'battalions', 'NNS', 16, 'nsubj', 'O'),
    (14, 'of', 'IN', 13, 'prep', 'O'),
    (15, 'troops', 'NNS', 14, 'pobj', 'O'),
    (16, 'are', 'VBP', 11, 'ccomp', 'O'),
    (17, 'on', 'IN', 16, 'prep', 'O'),
    (18, 'the', 'DT', 19, 'det', 'O'),
    (19, 'ground', 'NN', 17, 'pobj', 'O'),
    (20, ',', ',', 17, 'punct', 'O'),
    (21, 'inside', 'IN', 17, 'prep', 'O'),
    (22, 'Baghdad', 'NNP', 21, 'pobj', 'U-GPE'),
    (23, 'itself', 'PRP', 22, 'appos', 'O'),
    (24, ',', ',', 16, 'punct', 'O'),
    (25, 'have', 'VBP', 26, 'aux', 'O'),
    (26, 'taken', 'VBN', 16, 'dep', 'O'),
    (27, 'up', 'RP', 26, 'prt', 'O'),
    (28, 'positions', 'NNS', 26, 'dobj', 'O'),
    (29, 'they', 'PRP', 31, 'nsubj', 'O'),
    (30, "'re", 'VBP', 31, 'aux', 'O'),
    (31, 'going', 'VBG', 26, 'parataxis', 'O'),
    (32, 'to', 'TO', 33, 'aux', 'O'),
    (33, 'spend', 'VB', 31, 'xcomp', 'O'),
    (34, 'the', 'DT', 35, 'det', 'B-TIME'),
    (35, 'night', 'NN', 33, 'dobj', 'L-TIME'),
    (36, 'there', 'RB', 33, 'advmod', 'O'),
    (37, 'presumably', 'RB', 33, 'advmod', 'O'),
    (38, ',', ',', 44, 'punct', 'O'),
    (39, 'how', 'WRB', 40, 'advmod', 'O'),
    (40, 'many', 'JJ', 41, 'amod', 'O'),
    (41, 'soldiers', 'NNS', 44, 'pobj', 'O'),
    (42, 'are', 'VBP', 44, 'aux', 'O'),
    (43, 'we', 'PRP', 44, 'nsubj', 'O'),
    (44, 'talking', 'VBG', 44, 'ROOT', 'O'),
    (45, 'about', 'IN', 44, 'prep', 'O'),
    (46, 'right', 'RB', 47, 'advmod', 'O'),
    (47, 'now', 'RB', 44, 'advmod', 'O'),
    (48, '?', '.', 44, 'punct', 'O')]

def test_get_oracle_actions():
    doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
    parser = DependencyParser(doc.vocab)
    parser.moves.add_action(0, '')
    parser.moves.add_action(1, '')
    parser.moves.add_action(1, '')
    parser.moves.add_action(4, 'ROOT')
    for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
        if head > i:
            parser.moves.add_action(2, dep)
        elif head < i:
            parser.moves.add_action(3, dep)
    ids, words, tags, heads, deps, ents = zip(*annot_tuples)
    heads, deps = projectivize(heads, deps)
    gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
    parser.moves.preprocess_gold(gold)
    actions = parser.moves.get_oracle_sequence(doc, gold)
@@ -13,8 +13,8 @@ from ...vocab import Vocab
    ('a b', 0, 2),
    ('a c', 0, 1),
    ('a b c', 0, 2),
    ('a b b c', 0, 2),
    ('a b b', 0, 2),
    ('a b b c', 0, 3),
    ('a b b', 0, 3),
    ]
)
def test_issue1450_matcher_end_zero_plus(string, start, end):

@@ -54,5 +54,6 @@ def test_issue1450_matcher_end_zero_plus(string, start, end):
    if start is None or end is None:
        assert matches == []
    assert matches[0][1] == start
    assert matches[0][2] == end
    print(matches)
    assert matches[-1][1] == start
    assert matches[-1][2] == end
65  spacy/tests/regression/test_issue1855.py  Normal file

@@ -0,0 +1,65 @@
# coding: utf-8
from __future__ import unicode_literals
import re

from ...matcher import Matcher

import pytest

pattern1 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}]
pattern2 = [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}]
pattern3 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}]
pattern4 = [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
pattern5 = [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]

re_pattern1 = 'AA*'
re_pattern2 = 'A*A'
re_pattern3 = 'AA'
re_pattern4 = 'BA*B'
re_pattern5 = 'B*A*B'

@pytest.fixture
def text():
    return "(ABBAAAAAB)."

@pytest.fixture
def doc(en_tokenizer, text):
    doc = en_tokenizer(' '.join(text))
    return doc

@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern', [
    (pattern1, re_pattern1),
    (pattern2, re_pattern2),
    (pattern3, re_pattern3),
    (pattern4, re_pattern4),
    (pattern5, re_pattern5)])
def test_greedy_matching(doc, text, pattern, re_pattern):
    """
    Test that the greedy matching behavior of the * op
    is consistent with other re implementations
    """
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    for match, re_match in zip(matches, re_matches):
        assert match[1:] == re_match

@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern', [
    (pattern1, re_pattern1),
    (pattern2, re_pattern2),
    (pattern3, re_pattern3),
    (pattern4, re_pattern4),
    (pattern5, re_pattern5)])
def test_match_consuming(doc, text, pattern, re_pattern):
    """
    Test that matcher.__call__ consumes tokens on a match
    similar to re.findall
    """
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    assert len(matches) == len(re_matches)
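These regression tests compare Matcher quantifiers against regular expressions. As a small standalone sketch of that comparison (using the spaCy 2.x-style Matcher.add signature seen in this diff; the pattern label 'AA*' and the toy text are arbitrary):

import re
from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

text = "ABBAAAAAB"
doc = Doc(Vocab(), words=list(text))          # one token per character
pattern = [{'ORTH': 'A', 'OP': '1'}, {'ORTH': 'A', 'OP': '*'}]

matcher = Matcher(doc.vocab)
matcher.add('AA*', None, pattern)
token_matches = [(start, end) for match_id, start, end in matcher(doc)]
re_matches = [m.span() for m in re.finditer('AA*', text)]
# The xfailed tests above assert that these two lists line up.
print(token_matches, re_matches)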
11  spacy/tests/regression/test_issue1889.py  Normal file

@@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
from ...lang.lex_attrs import is_stop
from ...lang.en.stop_words import STOP_WORDS

import pytest


@pytest.mark.parametrize('word', ['the'])
def test_lex_attrs_stop_words_case_sensitivity(word):
    assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
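A quick sketch of the behaviour this regression test pins down: stop-word lookup should not depend on casing. Note that is_stop and STOP_WORDS are internal helpers, so treat the import paths as version-dependent.

from spacy.lang.lex_attrs import is_stop
from spacy.lang.en.stop_words import STOP_WORDS

for word in ('the', 'The', 'THE'):
    print(word, is_stop(word, STOP_WORDS))   # all three results should agree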
@@ -6,7 +6,6 @@ from ...vocab import Vocab
from ...tokens import Doc
from ...matcher import Matcher

@pytest.mark.xfail
def test_issue1945():
    text = "a a a"
    matcher = Matcher(Vocab())
@@ -22,10 +22,9 @@ def test_basic_case():
    assert end == 4


@pytest.mark.xfail
def test_issue850():
    """The problem here is that the variable-length pattern matches the
    succeeding token. We then don't handle the ambiguity correctly."""
    """The variable-length pattern matches the
    succeeding token. Check we handle the ambiguity correctly."""
    matcher = Matcher(Vocab(
        lex_attr_getters={LOWER: lambda string: string.lower()}))
    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
spacy/tests/test_align.py
Normal file
66
spacy/tests/test_align.py
Normal file
|
@ -0,0 +1,66 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
from .._align import align, multi_align
|
||||
|
||||
|
||||
@pytest.mark.parametrize('string1,string2,cost', [
|
||||
('hello', 'hell', 1),
|
||||
('rat', 'cat', 1),
|
||||
('rat', 'rat', 0),
|
||||
('rat', 'catsie', 4),
|
||||
('t', 'catsie', 5),
|
||||
])
|
||||
def test_align_costs(string1, string2, cost):
|
||||
output_cost, i2j, j2i, matrix = align(string1, string2)
|
||||
assert output_cost == cost
|
||||
|
||||
|
||||
@pytest.mark.parametrize('string1,string2,i2j', [
|
||||
('hello', 'hell', [0,1,2,3,-1]),
|
||||
('rat', 'cat', [0,1,2]),
|
||||
('rat', 'rat', [0,1,2]),
|
||||
('rat', 'catsie', [0,1,2]),
|
||||
('t', 'catsie', [2]),
|
||||
])
|
||||
def test_align_i2j(string1, string2, i2j):
|
||||
output_cost, output_i2j, j2i, matrix = align(string1, string2)
|
||||
assert list(output_i2j) == i2j
|
||||
|
||||
|
||||
@pytest.mark.parametrize('string1,string2,j2i', [
|
||||
('hello', 'hell', [0,1,2,3]),
|
||||
('rat', 'cat', [0,1,2]),
|
||||
('rat', 'rat', [0,1,2]),
|
||||
('rat', 'catsie', [0,1,2, -1, -1, -1]),
|
||||
('t', 'catsie', [-1, -1, 0, -1, -1, -1]),
|
||||
])
|
||||
def test_align_i2j(string1, string2, j2i):
|
||||
output_cost, output_i2j, output_j2i, matrix = align(string1, string2)
|
||||
assert list(output_j2i) == j2i
|
||||
|
||||
def test_align_strings():
|
||||
words1 = ['hello', 'this', 'is', 'test!']
|
||||
words2 = ['hellothis', 'is', 'test', '!']
|
||||
cost, i2j, j2i, matrix = align(words1, words2)
|
||||
assert cost == 4
|
||||
assert list(i2j) == [-1, -1, 1, -1]
|
||||
assert list(j2i) == [-1, 2, -1, -1]
|
||||
|
||||
def test_align_many_to_one():
|
||||
words1 = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
|
||||
words2 = ['ab', 'bc', 'e', 'fg', 'h']
|
||||
cost, i2j, j2i, matrix = align(words1, words2)
|
||||
assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4]
|
||||
lengths1 = [len(w) for w in words1]
|
||||
lengths2 = [len(w) for w in words2]
|
||||
i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2)
|
||||
assert i2j_multi[0] == 0
|
||||
assert i2j_multi[1] == 0
|
||||
assert i2j_multi[2] == 1
|
||||
assert i2j_multi[3] == 1
|
||||
assert i2j_multi[3] == 1
|
||||
assert i2j_multi[5] == 3
|
||||
assert i2j_multi[6] == 3
|
||||
|
||||
assert j2i_multi[0] == 1
|
||||
assert j2i_multi[1] == 3
|
|
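A rough usage sketch of the alignment helpers exercised above. spacy._align is an internal module, so the import path and return signature may differ between versions; the calls below simply mirror what the tests do.

from spacy._align import align, multi_align

words1 = ['hello', 'this', 'is', 'test!']
words2 = ['hellothis', 'is', 'test', '!']

cost, i2j, j2i, matrix = align(words1, words2)
# i2j[k] is the index in words2 that words1[k] maps onto, or -1 if there is
# no clean one-to-one match (and j2i is the reverse mapping).
print(cost, list(i2j), list(j2i))

# multi_align resolves the remaining many-to-one cases using token lengths.
lengths1 = [len(w) for w in words1]
lengths2 = [len(w) for w in words2]
i2j_multi, j2i_multi = multi_align(i2j, j2i, lengths1, lengths2)
print(i2j_multi, j2i_multi)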
@@ -3,12 +3,17 @@ from __future__ import unicode_literals

from ..matcher import Matcher, PhraseMatcher
from .util import get_doc
from ..util import get_lang_class
from ..tokens import Doc

import pytest

@pytest.fixture(scope="session")
def en_vocab():
    return get_lang_class('en').Defaults.create_vocab()

@pytest.fixture

@pytest.fixture(scope="session")
def matcher(en_vocab):
    rules = {
        'JS': [[{'ORTH': 'JavaScript'}]],

@@ -21,187 +26,196 @@ def matcher(en_vocab):
    return matcher


def test_matcher_from_api_docs(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{'ORTH': 'test'}]
    assert len(matcher) == 0
    matcher.add('Rule', None, pattern)
    assert len(matcher) == 1
    matcher.remove('Rule')
    assert 'Rule' not in matcher
    matcher.add('Rule', None, pattern)
    assert 'Rule' in matcher
    on_match, patterns = matcher.get('Rule')
    assert len(patterns[0])
#def test_matcher_from_api_docs(en_vocab):
#    matcher = Matcher(en_vocab)
#    pattern = [{'ORTH': 'test'}]
#    assert len(matcher) == 0
#    matcher.add('Rule', None, pattern)
#    assert len(matcher) == 1
#    matcher.remove('Rule')
#    assert 'Rule' not in matcher
#    matcher.add('Rule', None, pattern)
#    assert 'Rule' in matcher
#    on_match, patterns = matcher.get('Rule')
#    assert len(patterns[0])
#
#
#def test_matcher_from_usage_docs(en_vocab):
#    text = "Wow 😀 This is really cool! 😂 😂"
#    doc = get_doc(en_vocab, words=text.split(' '))
#    pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']
#    pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
#
#    def label_sentiment(matcher, doc, i, matches):
#        match_id, start, end = matches[i]
#        if doc.vocab.strings[match_id] == 'HAPPY':
#            doc.sentiment += 0.1
#        span = doc[start : end]
#        token = span.merge()
#        token.vocab[token.text].norm_ = 'happy emoji'
#
#    matcher = Matcher(en_vocab)
#    matcher.add('HAPPY', label_sentiment, *pos_patterns)
#    matches = matcher(doc)
#    assert doc.sentiment != 0
#    assert doc[1].norm_ == 'happy emoji'


def test_matcher_from_usage_docs(en_vocab):
    text = "Wow 😀 This is really cool! 😂 😂"
    doc = get_doc(en_vocab, words=text.split(' '))
    pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']
    pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]

    def label_sentiment(matcher, doc, i, matches):
        match_id, start, end = matches[i]
        if doc.vocab.strings[match_id] == 'HAPPY':
            doc.sentiment += 0.1
        span = doc[start : end]
        token = span.merge()
        token.vocab[token.text].norm_ = 'happy emoji'

    matcher = Matcher(en_vocab)
    matcher.add('HAPPY', label_sentiment, *pos_patterns)
    matches = matcher(doc)
    assert doc.sentiment != 0
    assert doc[1].norm_ == 'happy emoji'


@pytest.mark.parametrize('words', [["Some", "words"]])
def test_matcher_init(en_vocab, words):
    matcher = Matcher(en_vocab)
    doc = get_doc(en_vocab, words)
    assert len(matcher) == 0
    assert matcher(doc) == []


def test_matcher_contains(matcher):
    matcher.add('TEST', None, [{'ORTH': 'test'}])
    assert 'TEST' in matcher
    assert 'TEST2' not in matcher


def test_matcher_no_match(matcher):
    words = ["I", "like", "cheese", "."]
    doc = get_doc(matcher.vocab, words)
    assert matcher(doc) == []


def test_matcher_compile(matcher):
    assert len(matcher) == 3


def test_matcher_match_start(matcher):
    words = ["JavaScript", "is", "good"]
    doc = get_doc(matcher.vocab, words)
    assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)]


def test_matcher_match_end(matcher):
    words = ["I", "like", "java"]
    doc = get_doc(matcher.vocab, words)
    assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)]


def test_matcher_match_middle(matcher):
    words = ["I", "like", "Google", "Now", "best"]
    doc = get_doc(matcher.vocab, words)
    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)]


def test_matcher_match_multi(matcher):
    words = ["I", "like", "Google", "Now", "and", "java", "best"]
    doc = get_doc(matcher.vocab, words)
    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4),
                            (doc.vocab.strings['Java'], 5, 6)]


def test_matcher_empty_dict(en_vocab):
    '''Test matcher allows empty token specs, meaning match on any token.'''
    matcher = Matcher(en_vocab)
    abc = ["a", "b", "c"]
    doc = get_doc(matcher.vocab, abc)
    matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
    matches = matcher(doc)
    assert len(matches) == 1
    assert matches[0][1:] == (0, 3)
    matcher = Matcher(en_vocab)
    matcher.add('A.', None, [{'ORTH': 'a'}, {}])
    matches = matcher(doc)
    assert matches[0][1:] == (0, 2)


def test_matcher_operator_shadow(en_vocab):
    matcher = Matcher(en_vocab)
    abc = ["a", "b", "c"]
    doc = get_doc(matcher.vocab, abc)
    matcher.add('A.C', None, [{'ORTH': 'a'},
                              {"IS_ALPHA": True, "OP": "+"},
                              {'ORTH': 'c'}])
    matches = matcher(doc)
    assert len(matches) == 1
    assert matches[0][1:] == (0, 3)


def test_matcher_phrase_matcher(en_vocab):
    words = ["Google", "Now"]
    doc = get_doc(en_vocab, words)
    matcher = PhraseMatcher(en_vocab)
    matcher.add('COMPANY', None, doc)
    words = ["I", "like", "Google", "Now", "best"]
    doc = get_doc(en_vocab, words)
    assert len(matcher(doc)) == 1


def test_phrase_matcher_length(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    assert len(matcher) == 0
    matcher.add('TEST', None, get_doc(en_vocab, ['test']))
    assert len(matcher) == 1
    matcher.add('TEST2', None, get_doc(en_vocab, ['test2']))
    assert len(matcher) == 2


def test_phrase_matcher_contains(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    matcher.add('TEST', None, get_doc(en_vocab, ['test']))
    assert 'TEST' in matcher
    assert 'TEST2' not in matcher


def test_matcher_match_zero(matcher):
    words1 = 'He said , " some words " ...'.split()
    words2 = 'He said , " some three words " ...'.split()
    pattern1 = [{'ORTH': '"'},
                {'OP': '!', 'IS_PUNCT': True},
                {'OP': '!', 'IS_PUNCT': True},
                {'ORTH': '"'}]
    pattern2 = [{'ORTH': '"'},
                {'IS_PUNCT': True},
                {'IS_PUNCT': True},
                {'IS_PUNCT': True},
                {'ORTH': '"'}]

    matcher.add('Quote', None, pattern1)
    doc = get_doc(matcher.vocab, words1)
    assert len(matcher(doc)) == 1

    doc = get_doc(matcher.vocab, words2)
    assert len(matcher(doc)) == 0
    matcher.add('Quote', None, pattern2)
    assert len(matcher(doc)) == 0


def test_matcher_match_zero_plus(matcher):
    words = 'He said , " some words " ...'.split()
    pattern = [{'ORTH': '"'},
               {'OP': '*', 'IS_PUNCT': False},
               {'ORTH': '"'}]
    matcher.add('Quote', None, pattern)
    doc = get_doc(matcher.vocab, words)
    assert len(matcher(doc)) == 1


def test_matcher_match_one_plus(matcher):
    control = Matcher(matcher.vocab)
    control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
    doc = get_doc(control.vocab, ['Philippe', 'Philippe'])
    m = control(doc)
    assert len(m) == 2
    matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
                                         {'ORTH': 'Philippe', 'OP': '+'}])
    m = matcher(doc)
    assert len(m) == 1

#@pytest.mark.parametrize('words', [["Some", "words"]])
#def test_matcher_init(en_vocab, words):
#    matcher = Matcher(en_vocab)
#    doc = get_doc(en_vocab, words)
#    assert len(matcher) == 0
#    assert matcher(doc) == []
#
#
#def test_matcher_contains(matcher):
#    matcher.add('TEST', None, [{'ORTH': 'test'}])
#    assert 'TEST' in matcher
#    assert 'TEST2' not in matcher
#
#
#def test_matcher_no_match(matcher):
#    words = ["I", "like", "cheese", "."]
#    doc = get_doc(matcher.vocab, words)
#    assert matcher(doc) == []
#
#
#def test_matcher_compile(en_vocab):
#    rules = {
#        'JS': [[{'ORTH': 'JavaScript'}]],
#        'GoogleNow': [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]],
#        'Java': [[{'LOWER': 'java'}]]
#    }
#    matcher = Matcher(en_vocab)
#    for key, patterns in rules.items():
#        matcher.add(key, None, *patterns)
#    assert len(matcher) == 3
#
#
#def test_matcher_match_start(matcher):
#    words = ["JavaScript", "is", "good"]
#    doc = get_doc(matcher.vocab, words)
#    assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)]
#
#
#def test_matcher_match_end(matcher):
#    words = ["I", "like", "java"]
#    doc = get_doc(matcher.vocab, words)
#    assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)]
#
#
#def test_matcher_match_middle(matcher):
#    words = ["I", "like", "Google", "Now", "best"]
#    doc = get_doc(matcher.vocab, words)
#    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)]
#
#
#def test_matcher_match_multi(matcher):
#    words = ["I", "like", "Google", "Now", "and", "java", "best"]
#    doc = get_doc(matcher.vocab, words)
#    assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4),
#                            (doc.vocab.strings['Java'], 5, 6)]
#
#
#def test_matcher_empty_dict(en_vocab):
#    '''Test matcher allows empty token specs, meaning match on any token.'''
#    matcher = Matcher(en_vocab)
#    abc = ["a", "b", "c"]
#    doc = get_doc(matcher.vocab, abc)
#    matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
#    matches = matcher(doc)
#    assert len(matches) == 1
#    assert matches[0][1:] == (0, 3)
#    matcher = Matcher(en_vocab)
#    matcher.add('A.', None, [{'ORTH': 'a'}, {}])
#    matches = matcher(doc)
#    assert matches[0][1:] == (0, 2)
#
#
#def test_matcher_operator_shadow(en_vocab):
#    matcher = Matcher(en_vocab)
#    abc = ["a", "b", "c"]
#    doc = get_doc(matcher.vocab, abc)
#    matcher.add('A.C', None, [{'ORTH': 'a'},
#                              {"IS_ALPHA": True, "OP": "+"},
#                              {'ORTH': 'c'}])
#    matches = matcher(doc)
#    assert len(matches) == 1
#    assert matches[0][1:] == (0, 3)
#
#
#def test_matcher_phrase_matcher(en_vocab):
#    words = ["Google", "Now"]
#    doc = get_doc(en_vocab, words)
#    matcher = PhraseMatcher(en_vocab)
#    matcher.add('COMPANY', None, doc)
#    words = ["I", "like", "Google", "Now", "best"]
#    doc = get_doc(en_vocab, words)
#    assert len(matcher(doc)) == 1
#
#
#def test_phrase_matcher_length(en_vocab):
#    matcher = PhraseMatcher(en_vocab)
#    assert len(matcher) == 0
#    matcher.add('TEST', None, get_doc(en_vocab, ['test']))
#    assert len(matcher) == 1
#    matcher.add('TEST2', None, get_doc(en_vocab, ['test2']))
#    assert len(matcher) == 2
#
#
#def test_phrase_matcher_contains(en_vocab):
#    matcher = PhraseMatcher(en_vocab)
#    matcher.add('TEST', None, get_doc(en_vocab, ['test']))
#    assert 'TEST' in matcher
#    assert 'TEST2' not in matcher
#
#
#def test_matcher_match_zero(matcher):
#    words1 = 'He said , " some words " ...'.split()
#    words2 = 'He said , " some three words " ...'.split()
#    pattern1 = [{'ORTH': '"'},
#                {'OP': '!', 'IS_PUNCT': True},
#                {'OP': '!', 'IS_PUNCT': True},
#                {'ORTH': '"'}]
#    pattern2 = [{'ORTH': '"'},
#                {'IS_PUNCT': True},
#                {'IS_PUNCT': True},
#                {'IS_PUNCT': True},
#                {'ORTH': '"'}]
#
#    matcher.add('Quote', None, pattern1)
#    doc = get_doc(matcher.vocab, words1)
#    assert len(matcher(doc)) == 1
#
#    doc = get_doc(matcher.vocab, words2)
#    assert len(matcher(doc)) == 0
#    matcher.add('Quote', None, pattern2)
#    assert len(matcher(doc)) == 0
#
#
#def test_matcher_match_zero_plus(matcher):
#    words = 'He said , " some words " ...'.split()
#    pattern = [{'ORTH': '"'},
#               {'OP': '*', 'IS_PUNCT': False},
#               {'ORTH': '"'}]
#    matcher = Matcher(matcher.vocab)
#    matcher.add('Quote', None, pattern)
#    doc = get_doc(matcher.vocab, words)
#    assert len(matcher(doc)) == 1
#
#
#def test_matcher_match_one_plus(matcher):
#    control = Matcher(matcher.vocab)
#    control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
#    doc = get_doc(control.vocab, ['Philippe', 'Philippe'])
#    m = control(doc)
#    assert len(m) == 2
#    matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
#                                         {'ORTH': 'Philippe', 'OP': '+'}])
#    m = matcher(doc)
#    assert len(m) == 1
#

def test_operator_combos(matcher):
    cases = [

@@ -252,9 +266,8 @@ def test_matcher_end_zero_plus(matcher):
    )
    nlp = lambda string: Doc(matcher.vocab, words=string.split())
    assert len(matcher(nlp(u'a'))) == 1
    assert len(matcher(nlp(u'a b'))) == 1
    assert len(matcher(nlp(u'a b'))) == 1
    assert len(matcher(nlp(u'a b'))) == 2
    assert len(matcher(nlp(u'a c'))) == 1
    assert len(matcher(nlp(u'a b c'))) == 1
    assert len(matcher(nlp(u'a b b c'))) == 1
    assert len(matcher(nlp(u'a b b'))) == 1
    assert len(matcher(nlp(u'a b c'))) == 2
    assert len(matcher(nlp(u'a b b c'))) == 3
    assert len(matcher(nlp(u'a b b'))) == 3
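The hunks above cover both Matcher and PhraseMatcher behaviour. As a small, self-contained sketch of the PhraseMatcher usage these tests exercise (spaCy 2.x-style add signature; the 'COMPANY' label and example phrase are arbitrary):

from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
matcher = PhraseMatcher(vocab)
# Patterns are given as Doc objects; here one pattern for the phrase "Google Now".
matcher.add('COMPANY', None, Doc(vocab, words=['Google', 'Now']))

doc = Doc(vocab, words=['I', 'like', 'Google', 'Now', 'best'])
for match_id, start, end in matcher(doc):
    print(vocab.strings[match_id], doc[start:end].text)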
44  spacy/tests/test_textcat.py  Normal file

@@ -0,0 +1,44 @@
from __future__ import unicode_literals
import random
import numpy.random

from ..pipeline import TextCategorizer
from ..lang.en import English
from ..vocab import Vocab
from ..tokens import Doc
from ..gold import GoldParse


def test_textcat_learns_multilabel():
    random.seed(0)
    numpy.random.seed(0)
    docs = []
    nlp = English()
    vocab = nlp.vocab
    letters = ['a', 'b', 'c']
    for w1 in letters:
        for w2 in letters:
            cats = {letter: float(w2==letter) for letter in letters}
            docs.append((Doc(vocab, words=['d']*3 + [w1, w2] + ['d']*3), cats))
    random.shuffle(docs)
    model = TextCategorizer(vocab, width=8)
    for letter in letters:
        model.add_label(letter)
    optimizer = model.begin_training()
    for i in range(30):
        losses = {}
        Ys = [GoldParse(doc, cats=cats) for doc, cats in docs]
        Xs = [doc for doc, cats in docs]
        model.update(Xs, Ys, sgd=optimizer, losses=losses)
        random.shuffle(docs)
    for w1 in letters:
        for w2 in letters:
            doc = Doc(vocab, words=['d']*3 + [w1, w2] + ['d']*3)
            truth = {letter: w2==letter for letter in letters}
            model(doc)
            for cat, score in doc.cats.items():
                if not truth[cat]:
                    assert score < 0.5
                else:
                    assert score > 0.5
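The test drives TextCategorizer directly. For comparison, a rough sketch of the same component used through the pipeline API (spaCy 2.x calls; the label names below are illustrative, and an untrained model will only produce arbitrary scores):

from spacy.lang.en import English

nlp = English()
textcat = nlp.create_pipe('textcat')
for label in ('a', 'b', 'c'):
    textcat.add_label(label)
nlp.add_pipe(textcat)
optimizer = nlp.begin_training()

doc = nlp(u'd d d a b d d d')
print(doc.cats)   # per-label scores, the same attribute asserted on above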
@@ -19,6 +19,9 @@ ctypedef fused LexemeOrToken:
    const_TokenC_ptr


cdef int set_children_from_heads(TokenC* tokens, int length) except -1


cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2
@@ -186,6 +186,20 @@ cdef class Doc:
    def _(self):
        return Underscore(Underscore.doc_extensions, self)

    @property
    def is_sentenced(self):
        # Check if the document has sentence boundaries,
        # i.e. at least one token has sent_start in (-1, 1)
        if 'sents' in self.user_hooks:
            return True
        if self.is_parsed:
            return True
        for i in range(self.length):
            if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
                return True
        else:
            return False

    def __getitem__(self, object i):
        """Get a `Token` or `Span` object.
@@ -517,29 +531,23 @@
            >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
        """
        def __get__(self):
            if not self.is_sentenced:
                raise ValueError(
                    "Sentence boundaries unset. You can add the 'sentencizer' "
                    "component to the pipeline with: "
                    "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
                    "Alternatively, add the dependency parser, or set "
                    "sentence boundaries by setting doc[i].sent_start")
            if 'sents' in self.user_hooks:
                yield from self.user_hooks['sents'](self)
                return

            cdef int i
            if not self.is_parsed:
            else:
                start = 0
                for i in range(1, self.length):
                    if self.c[i].sent_start != 0:
                        break
                else:
                    raise ValueError(
                        "Sentence boundaries unset. You can add the 'sentencizer' "
                        "component to the pipeline with: "
                        "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
                        "Alternatively, add the dependency parser, or set "
                        "sentence boundaries by setting doc[i].sent_start")
                start = 0
                for i in range(1, self.length):
                    if self.c[i].sent_start == 1:
                        yield Span(self, start, i)
                        start = i
                if start != self.length:
                    yield Span(self, start, self.length)
                    if self.c[i].sent_start == 1:
                        yield Span(self, start, i)
                        start = i
                if start != self.length:
                    yield Span(self, start, self.length)

    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
        if self.length == 0:
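The error message added here points users at the sentencizer. A minimal sketch of that route, using the pipeline call the message itself recommends (spaCy 2.x API; the example text is arbitrary):

from spacy.lang.en import English

nlp = English()
# No parser in this pipeline, so sentence boundaries come from the sentencizer.
nlp.add_pipe(nlp.create_pipe('sentencizer'))

doc = nlp(u'This is a sentence. This is another one.')
print([sent.text for sent in doc.sents])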
@@ -285,16 +285,42 @@ cdef class Span:
        def __get__(self):
            if 'sent' in self.doc.user_span_hooks:
                return self.doc.user_span_hooks['sent'](self)
            # This should raise if we're not parsed.
            # This should raise if we're not parsed
            # or doesn't have any sbd component :)
            self.doc.sents
            # if doc is parsed we can use the deps to find the sentence
            # otherwise we use the `sent_start` token attribute
            cdef int n = 0
            root = &self.doc.c[self.start]
            while root.head != 0:
                root += root.head
                n += 1
                if n >= self.doc.length:
                    raise RuntimeError
            return self.doc[root.l_edge:root.r_edge + 1]
            cdef int i
            if self.doc.is_parsed:
                root = &self.doc.c[self.start]
                n = 0
                while root.head != 0:
                    root += root.head
                    n += 1
                    if n >= self.doc.length:
                        raise RuntimeError
                return self.doc[root.l_edge:root.r_edge + 1]
            elif self.doc.is_sentenced:
                # find start of the sentence
                start = self.start
                while self.doc.c[start].sent_start != 1 and start > 0:
                    start += -1
                # find end of the sentence
                end = self.end
                n = 0
                while end < self.doc.length and self.doc.c[end].sent_start != 1:
                    end += 1
                    n += 1
                    if n >= self.doc.length:
                        break
                #
                return self.doc[start:end]
            else:
                raise ValueError(
                    "Access to sentence requires either the dependency parse "
                    "or sentence boundaries to be set by setting " +
                    "doc[i].is_sent_start = True")

    property has_vector:
        """RETURNS (bool): Whether a word vector is associated with the object.
@@ -34,11 +34,11 @@ cdef class Token:

    @classmethod
    def get_extension(cls, name):
        return Underscore.token_extensions.get(name)
        return Underscore.span_extensions.get(name)

    @classmethod
    def has_extension(cls, name):
        return name in Underscore.token_extensions
        return name in Underscore.span_extensions

    def __cinit__(self, Vocab vocab, Doc doc, int offset):
        """Construct a `Token` object.
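A small sketch of the classmethods touched in this hunk: registering a custom extension on Token and looking it up again (spaCy 2.x extension API; the attribute name 'is_fruit' is just an example):

from spacy.tokens import Token

Token.set_extension('is_fruit', default=False)
assert Token.has_extension('is_fruit')
print(Token.get_extension('is_fruit'))   # the registered extension spec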
@@ -442,6 +442,29 @@ def decaying(start, stop, decay):
        nr_upd += 1


def minibatch_by_words(items, size, count_words=len):
    '''Create minibatches of a given number of words.'''
    if isinstance(size, int):
        size_ = itertools.repeat(size)
    else:
        size_ = size
    items = iter(items)
    while True:
        batch_size = next(size_)
        batch = []
        while batch_size >= 0:
            try:
                doc, gold = next(items)
            except StopIteration:
                if batch:
                    yield batch
                return
            batch_size -= count_words(doc)
            batch.append((doc, gold))
        if batch:
            yield batch


def itershuffle(iterable, bufsize=1000):
    """Shuffle an iterator. This works by holding `bufsize` items back
    and yielding them sometime later. Obviously, this is not unbiased –
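minibatch_by_words groups (doc, gold) pairs until a running word budget is used up. A rough usage sketch, assuming the helper lands in spacy.util as in this hunk; the toy Doc objects below stand in for real training examples:

from spacy.util import minibatch_by_words
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
train_data = [(Doc(vocab, words=['word'] * n), None) for n in (3, 5, 8, 2, 7)]

# A batch is cut once the running word count for that batch goes negative,
# so each batch holds roughly `size` words' worth of documents.
for batch in minibatch_by_words(train_data, size=10):
    print(sum(len(doc) for doc, gold in batch))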
@@ -457,7 +480,7 @@ def itershuffle(iterable, bufsize=1000):
    try:
        while True:
            for i in range(random.randint(1, bufsize-len(buf))):
                buf.append(iterable.next())
                buf.append(next(iterable))
            random.shuffle(buf)
            for i in range(random.randint(1, bufsize)):
                if buf:
@@ -120,9 +120,6 @@ include ../_includes/_mixins
    | A Practical Real-World Approach to Gaining Actionable Insights
    | from your Data

+card("Practical Machine Learning with Python", "", "Dipanjan Sarkar et al. (Apress, 2017)", "book")
    | A Problem-Solver's Guide to Building Real-World Intelligent Systems

+section("notebooks")
    +h(2, "notebooks") Jupyter notebooks