commit d7229967b0
Author: ines
Date:   2017-04-16 12:07:04 +02:00

48 changed files with 888 additions and 765 deletions


@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 '''Example of training a named entity recognition system from scratch using spaCy
 This example is written to be self-contained and reasonably transparent.
@@ -81,7 +82,7 @@ def load_vocab(path):
 def init_ner_model(vocab, features=None):
     if features is None:
         features = tuple(EntityRecognizer.feature_templates)
-    return BeamEntityRecognizer(vocab, features=features)
+    return EntityRecognizer(vocab, features=features)
 def save_ner_model(model, path):
@@ -99,7 +100,7 @@ def save_ner_model(model, path):
 def load_ner_model(vocab, path):
-    return BeamEntityRecognizer.load(path, vocab)
+    return EntityRecognizer.load(path, vocab)
 class Pipeline(object):
@@ -110,18 +111,21 @@ class Pipeline(object):
             raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
         if not path.is_dir():
             raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
-        vocab = load_vocab(path / 'vocab')
+        vocab = load_vocab(path)
         tokenizer = Tokenizer(vocab, {}, None, None, None)
         ner_model = load_ner_model(vocab, path / 'ner')
         return cls(vocab, tokenizer, ner_model)
-    def __init__(self, vocab=None, tokenizer=None, ner_model=None):
+    def __init__(self, vocab=None, tokenizer=None, entity=None):
         if vocab is None:
-            self.vocab = init_vocab()
+            vocab = init_vocab()
         if tokenizer is None:
             tokenizer = Tokenizer(vocab, {}, None, None, None)
-        if ner_model is None:
-            self.entity = init_ner_model(self.vocab)
+        if entity is None:
+            entity = init_ner_model(self.vocab)
+        self.vocab = vocab
+        self.tokenizer = tokenizer
+        self.entity = entity
         self.pipeline = [self.entity]
     def __call__(self, input_):
@@ -173,7 +177,7 @@ class Pipeline(object):
         save_ner_model(self.entity, path / 'ner')
-def train(nlp, train_examples, dev_examples, nr_epoch=5):
+def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
     next_epoch = train_examples
     print("Iter", "Loss", "P", "R", "F")
     for i in range(nr_epoch):
@@ -186,14 +190,17 @@ def train(nlp, train_examples, dev_examples, nr_epoch=5):
             next_epoch.append((input_, annot))
         random.shuffle(next_epoch)
         scores = nlp.evaluate(dev_examples)
-        precision = '%.2f' % scores['ents_p']
-        recall = '%.2f' % scores['ents_r']
-        f_measure = '%.2f' % scores['ents_f']
-        print(i, int(loss), precision, recall, f_measure)
+        report_scores(i, loss, scores)
     nlp.average_weights()
     scores = nlp.evaluate(dev_examples)
-    print("After averaging")
-    print(scores['ents_p'], scores['ents_r'], scores['ents_f'])
+    report_scores(channels, i+1, loss, scores)
+
+
+def report_scores(i, loss, scores):
+    precision = '%.2f' % scores['ents_p']
+    recall = '%.2f' % scores['ents_r']
+    f_measure = '%.2f' % scores['ents_f']
+    print('%d %s %s %s' % (int(loss), precision, recall, f_measure))
 def read_examples(path):
@@ -221,15 +228,17 @@ def read_examples(path):
     train_loc=("Path to your training data", "positional", None, Path),
     dev_loc=("Path to your development data", "positional", None, Path),
 )
-def main(model_dir, train_loc, dev_loc, nr_epoch=10):
+def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
+         train_loc=None, dev_loc=None, nr_epoch=30):
     train_examples = read_examples(train_loc)
     dev_examples = read_examples(dev_loc)
-    nlp = Pipeline()
-    train(nlp, train_examples, list(dev_examples), nr_epoch)
+    nlp = Pipeline.load(model_dir)
+    train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)
     nlp.save(model_dir)
 if __name__ == '__main__':
-    plac.call(main)
+    main()


@@ -0,0 +1,74 @@
from __future__ import unicode_literals, print_function
import json
import pathlib
import random

import spacy
from spacy.pipeline import EntityRecognizer
from spacy.gold import GoldParse
from spacy.tagger import Tagger

try:
    unicode
except:
    unicode = str


def train_ner(nlp, train_data, output_dir):
    # Add new words to vocab.
    for raw_text, _ in train_data:
        doc = nlp.make_doc(raw_text)
        for word in doc:
            _ = nlp.vocab[word.orth]

    for itn in range(20):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            gold = GoldParse(doc, entities=entity_offsets)
            doc = nlp.make_doc(raw_text)
            nlp.tagger(doc)
            loss = nlp.entity.update(doc, gold)
    nlp.end_training()
    nlp.save_to_directory(output_dir)


def main(model_name, output_directory=None):
    nlp = spacy.load(model_name)

    train_data = [
        (
            "Horses are too tall and they pretend to care about your feelings",
            [(0, 6, 'ANIMAL')],
        ),
        (
            "horses are too tall and they pretend to care about your feelings",
            [(0, 6, 'ANIMAL')]
        ),
        (
            "horses pretend to care about your feelings",
            [(0, 6, 'ANIMAL')]
        ),
        (
            "they pretend to care about your feelings, those horses",
            [(48, 54, 'ANIMAL')]
        )
    ]
    nlp.entity.add_label('ANIMAL')
    if output_directory is not None:
        output_directory = pathlib.Path(output_directory)
    ner = train_ner(nlp, train_data, output_directory)

    doc = nlp('Do you like horses?')
    for ent in doc.ents:
        print(ent.label_, ent.text)
    nlp2 = spacy.load('en', path=output_directory)
    nlp2.entity.add_label('ANIMAL')
    doc2 = nlp2('Do you like horses?')
    for ent in doc2.ents:
        print(ent.label_, ent.text)


if __name__ == '__main__':
    import plac
    plac.call(main)
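For orientation, here is a minimal sketch of what the example above boils down to at the API level. It assumes an installed 'en' model; the sentence and character offsets are taken from the training data above, everything else is illustrative rather than part of the commit:

    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.load('en')
    nlp.entity.add_label('ANIMAL')                      # register the new label first
    doc = nlp.make_doc('horses pretend to care about your feelings')
    nlp.tagger(doc)                                     # the entity model expects tagged input
    gold = GoldParse(doc, entities=[(0, 6, 'ANIMAL')])  # "horses" spans characters 0-6
    loss = nlp.entity.update(doc, gold)                 # one online update against the gold span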

fabfile.py

@@ -14,7 +14,7 @@ VENV_DIR = path.join(PWD, ENV)
 def env(lang='python2.7'):
     if path.exists(VENV_DIR):
         local('rm -rf {env}'.format(env=VENV_DIR))
-    local('virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
+    local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
 def install():


@@ -1,27 +1,13 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals
-import json
 from pathlib import Path
 from .util import set_lang_class, get_lang_class, parse_package_meta
 from .deprecated import resolve_model_name
 from .cli import info
-from . import en
-from . import de
-from . import zh
-from . import es
-from . import it
-from . import hu
-from . import fr
-from . import pt
-from . import nl
-from . import sv
-from . import fi
-from . import bn
-from . import he
-from .about import *
+from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he
 set_lang_class(en.English.lang, en.English)


@@ -14,8 +14,9 @@ from spacy.cli import convert as cli_convert
 class CLI(object):
-    """Command-line interface for spaCy"""
+    """
+    Command-line interface for spaCy
+    """
     commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')
     @plac.annotations(
@@ -29,7 +30,6 @@ class CLI(object):
         can be shortcut, model name or, if --direct flag is set, full model name
         with version.
         """
         cli_download(model, direct)
@@ -44,7 +44,6 @@ class CLI(object):
         either the name of a pip package, or the local path to the model data
         directory. Linking models allows loading them via spacy.load(link_name).
         """
         cli_link(origin, link_name, force)
@@ -58,7 +57,6 @@ class CLI(object):
         speficied as an argument, print model information. Flag --markdown
         prints details in Markdown for easy copy-pasting to GitHub issues.
         """
         cli_info(model, markdown)
@@ -73,7 +71,6 @@ class CLI(object):
         installation files. A new directory will be created in the specified
         output directory, and model data will be copied over.
         """
         cli_package(input_dir, output_dir, force)
@@ -93,7 +90,6 @@ class CLI(object):
         """
         Train a model. Expects data in spaCy's JSON format.
         """
         cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
                   not no_parser, not no_ner, parser_L1)
@@ -108,7 +104,6 @@ class CLI(object):
         """
         Initialize a new model and its data directory.
         """
         cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
     @plac.annotations(
@@ -122,7 +117,6 @@ class CLI(object):
         Convert files into JSON format for use with train command and other
         experiment management functions.
         """
         cli_convert(input_file, output_dir, n_sents, morphology)


@@ -1,3 +1,7 @@
+# coding: utf8
+from __future__ import unicode_literals
 IDS = {
     "": NULL_ATTR,
     "IS_ALPHA": IS_ALPHA,
@@ -92,7 +96,8 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
 def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
-    '''Normalize a dictionary of attributes, converting them to ints.
+    """
+    Normalize a dictionary of attributes, converting them to ints.
     Arguments:
         stringy_attrs (dict):
@@ -105,7 +110,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
         inty_attrs (dict):
             Attributes dictionary with keys and optionally values converted to
             ints.
-    '''
+    """
     inty_attrs = {}
     if _do_deprecated:
         if 'F' in stringy_attrs:
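A small, hedged usage sketch of intify_attrs as documented above (it assumes a Vocab instance is available as vocab; the attribute values are illustrative):

    from spacy.attrs import intify_attrs, LEMMA, IS_ALPHA

    # String keys are mapped through IDS; string values are interned via the strings map.
    attrs = intify_attrs({'LEMMA': 'dog', 'IS_ALPHA': True}, strings_map=vocab.strings)
    assert set(attrs.keys()) == {LEMMA, IS_ALPHA}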


@@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals
 from libc.stdio cimport fopen, fclose, fread, fwrite
 from libc.string cimport memcpy


@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
-import io
 from pathlib import Path
 from .converters import conllu2json


@@ -2,12 +2,12 @@
 from __future__ import unicode_literals
 import json
-from ...gold import read_json_file, merge_sents
 from ... import util
 def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
-    """Convert conllu files into JSON format for use with train cli.
+    """
+    Convert conllu files into JSON format for use with train cli.
     use_morphology parameter enables appending morphology to tags, which is
     useful for languages such as Spanish, where UD tags are not so rich.
     """


@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
-import pip
 import requests
 import os
 import subprocess


@@ -18,7 +18,6 @@ def info(model=None, markdown=False):
         else:
             data['source'] = str(model_path)
         print_info(data, "model " + model, markdown)
     else:
         data = get_spacy_data()
         print_info(data, "spaCy", markdown)
@@ -26,10 +25,8 @@ def info(model=None, markdown=False):
 def print_info(data, title, markdown):
     title = "Info about {title}".format(title=title)
     if markdown:
         util.print_markdown(data, title=title)
     else:
         util.print_table(data, title=title)


@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import pip
 from pathlib import Path
 import importlib
+from ..compat import unicode_, symlink_to
 from .. import util
@@ -20,7 +21,6 @@ def link_package(package_name, link_name, force=False):
     # Python's installation and import rules are very complicated.
     pkg = importlib.import_module(package_name)
     package_path = Path(pkg.__file__).parent.parent
     meta = get_meta(package_path, package_name)
     model_name = package_name + '-' + meta['version']
     model_path = package_path / package_name / model_name
@@ -43,23 +43,17 @@ def symlink(model_path, link_name, force):
     elif link_path.exists():
         link_path.unlink()
-    # Add workaround for Python 2 on Windows (see issue #909)
-    if util.is_python2() and util.is_windows():
-        import subprocess
-        command = ['mklink', '/d', unicode(link_path), unicode(model_path)]
-        try:
-            subprocess.call(command, shell=True)
-        except:
-            # This is quite dirty, but just making sure other Windows-specific
-            # errors are caught so users at least see a proper error message.
-            util.sys_exit(
-                "Creating a symlink in spacy/data failed. You can still import "
-                "the model as a Python package and call its load() method, or "
-                "create the symlink manually:",
-                "{a} --> {b}".format(a=unicode(model_path), b=unicode(link_path)),
-                title="Error: Couldn't link model to '{l}'".format(l=link_name))
-    else:
-        link_path.symlink_to(model_path)
+    try:
+        symlink_to(link_path, model_path)
+    except:
+        # This is quite dirty, but just making sure other errors are caught so
+        # users at least see a proper message.
+        util.sys_exit(
+            "Creating a symlink in spacy/data failed. You can still import "
+            "the model as a Python package and call its load() method, or "
+            "create the symlink manually:",
+            "{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)),
+            title="Error: Couldn't link model to '{l}'".format(l=link_name))
     util.print_msg(
         "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),


@@ -1,20 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
-import json
 import shutil
 import requests
 from pathlib import Path
-import six
-from .. import about
+from ..compat import unicode_, json_dumps
 from .. import util
-if six.PY2:
-    json_dumps = lambda data: json.dumps(data, indent=2).decode("utf8")
-elif six.PY3:
-    json_dumps = lambda data: json.dumps(data, indent=2)
 def package(input_dir, output_dir, force):
     input_path = Path(input_dir)
@@ -32,31 +25,31 @@ def package(input_dir, output_dir, force):
     package_path = main_path / model_name
     create_dirs(package_path, force)
-    shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix())
+    shutil.copytree(unicode_(input_path), unicode_(package_path / model_name_v))
     create_file(main_path / 'meta.json', json_dumps(meta))
     create_file(main_path / 'setup.py', template_setup)
     create_file(main_path / 'MANIFEST.in', template_manifest)
     create_file(package_path / '__init__.py', template_init)
     util.print_msg(
-        main_path.as_posix(),
+        unicode_(main_path),
         "To build the package, run `python setup.py sdist` in that directory.",
         title="Successfully created package {p}".format(p=model_name_v))
 def check_dirs(input_path, output_path):
     if not input_path.exists():
-        util.sys_exit(input_path.as_poisx(), title="Model directory not found")
+        util.sys_exit(unicode_(input_path.as_poisx), title="Model directory not found")
     if not output_path.exists():
-        util.sys_exit(output_path.as_posix(), title="Output directory not found")
+        util.sys_exit(unicode_(output_path), title="Output directory not found")
 def create_dirs(package_path, force):
     if package_path.exists():
         if force:
-            shutil.rmtree(package_path.as_posix())
+            shutil.rmtree(unicode_(package_path.as_posix))
         else:
-            util.sys_exit(package_path.as_posix(),
+            util.sys_exit(unicode_(package_path.as_posix),
                 "Please delete the directory and try again.",
                 title="Package directory already exists")
     Path.mkdir(package_path, parents=True)


@@ -5,8 +5,6 @@ import json
 from pathlib import Path
 from ..scorer import Scorer
-from ..tagger import Tagger
-from ..syntax.parser import Parser
 from ..gold import GoldParse, merge_sents
 from ..gold import read_json_file as read_gold_json
 from .. import util
@@ -60,7 +58,6 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_
     print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
     with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer:
-        loss = 0
         for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
             for doc, gold in epoch:
                 trainer.update(doc, gold)

spacy/compat.py (new file)

@@ -0,0 +1,54 @@
# coding: utf8
from __future__ import unicode_literals

import six
import sys
import ujson

try:
    import cPickle as pickle
except ImportError:
    import pickle

try:
    import copy_reg
except ImportError:
    import copyreg as copy_reg

is_python2 = six.PY2
is_python3 = six.PY3
is_windows = sys.platform.startswith('win')
is_linux = sys.platform.startswith('linux')
is_osx = sys.platform == 'darwin'

if is_python2:
    bytes_ = str
    unicode_ = unicode
    basestring_ = basestring
    input_ = raw_input
    json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
elif is_python3:
    bytes_ = bytes
    unicode_ = str
    basestring_ = str
    input_ = input
    json_dumps = lambda data: ujson.dumps(data, indent=2)


def symlink_to(orig, dest):
    if is_python2 and is_windows:
        import subprocess
        subprocess.call(['mklink', '/d', unicode(orig), unicode(dest)], shell=True)
    else:
        orig.symlink_to(dest)


def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
    return ((python2 == None or python2 == is_python2) and
            (python3 == None or python3 == is_python3) and
            (windows == None or windows == is_windows) and
            (linux == None or linux == is_linux) and
            (osx == None or osx == is_osx))
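A brief, illustrative sketch of how the helpers in this new module are meant to be used (not part of the commit itself):

    from spacy.compat import is_config, json_dumps, unicode_

    # json_dumps returns unicode text on both Python 2 and Python 3
    data = json_dumps({'lang': 'en'})
    assert isinstance(data, unicode_)

    # is_config lets callers branch on interpreter/platform combinations
    if is_config(python2=True, windows=True):
        print('Python 2 on Windows: symlink_to falls back to mklink')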


@@ -1,16 +1,14 @@
+# coding: utf8
+from __future__ import unicode_literals
 from pathlib import Path
 from . import about
 from . import util
 from .cli import download
 from .cli import link
-try:
-    basestring
-except NameError:
-    basestring = str
 def read_lang_data(package):
     tokenization = package.load_json(('tokenizer', 'specials.json'))
     with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
@@ -36,7 +34,8 @@ def align_tokens(ref, indices): # Deprecated, surely?
 def detokenize(token_rules, words): # Deprecated?
-    """To align with treebanks, return a list of "chunks", where a chunk is a
+    """
+    To align with treebanks, return a list of "chunks", where a chunk is a
     sequence of tokens that are separated by whitespace in actual strings. Each
     chunk should be a tuple of token indices, e.g.
@@ -57,10 +56,30 @@ def detokenize(token_rules, words): # Deprecated?
     return positions
-def fix_glove_vectors_loading(overrides):
-    """Special-case hack for loading the GloVe vectors, to support deprecated
-    <1.0 stuff. Phase this out once the data is fixed."""
+def match_best_version(target_name, target_version, path):
+    path = util.ensure_path(path)
+    if path is None or not path.exists():
+        return None
+    matches = []
+    for data_name in path.iterdir():
+        name, version = split_data_name(data_name.parts[-1])
+        if name == target_name:
+            matches.append((tuple(float(v) for v in version.split('.')), data_name))
+    if matches:
+        return Path(max(matches)[1])
+    else:
+        return None
+
+
+def split_data_name(name):
+    return name.split('-', 1) if '-' in name else (name, '')
+
+
+def fix_glove_vectors_loading(overrides):
+    """
+    Special-case hack for loading the GloVe vectors, to support deprecated
+    <1.0 stuff. Phase this out once the data is fixed.
+    """
     if 'data_dir' in overrides and 'path' not in overrides:
         raise ValueError("The argument 'data_dir' has been renamed to 'path'")
     if overrides.get('path') is False:
@@ -68,18 +87,16 @@ def fix_glove_vectors_loading(overrides):
     if overrides.get('path') in (None, True):
         data_path = util.get_data_path()
     else:
-        path = overrides['path']
-        if isinstance(path, basestring):
-            path = Path(path)
+        path = util.ensure_path(overrides['path'])
         data_path = path.parent
     vec_path = None
     if 'add_vectors' not in overrides:
         if 'vectors' in overrides:
-            vec_path = util.match_best_version(overrides['vectors'], None, data_path)
+            vec_path = match_best_version(overrides['vectors'], None, data_path)
             if vec_path is None:
                 return overrides
         else:
-            vec_path = util.match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
+            vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
         if vec_path is not None:
             vec_path = vec_path / 'vocab' / 'vec.bin'
     if vec_path is not None:
@@ -88,13 +105,13 @@ def fix_glove_vectors_loading(overrides):
 def resolve_model_name(name):
-    """If spaCy is loaded with 'de', check if symlink already exists. If
+    """
+    If spaCy is loaded with 'de', check if symlink already exists. If
     not, user have upgraded from older version and have old models installed.
     Check if old model directory exists and if so, return that instead and create
     shortcut link. If English model is found and no shortcut exists, raise error
     and tell user to install new model.
     """
     if name == 'en' or name == 'de':
         versions = ['1.0.0', '1.1.0']
         data_path = Path(util.get_data_path())
@@ -117,9 +134,11 @@ def resolve_model_name(name):
 class ModelDownload():
-    """Replace download modules within en and de with deprecation warning and
+    """
+    Replace download modules within en and de with deprecation warning and
     download default language model (using shortcut). Use classmethods to allow
-    importing ModelDownload as download and calling download.en() etc."""
+    importing ModelDownload as download and calling download.en() etc.
+    """
     @classmethod
     def load(self, lang):
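A short sketch of how the version-matching helpers added above resolve versioned data directories (paths and package names are illustrative):

    from spacy.deprecated import split_data_name, match_best_version

    split_data_name('en_glove_cc_300_1m_vectors-1.1.0')
    # -> ('en_glove_cc_300_1m_vectors', '1.1.0')

    # Given a data directory containing ...-1.0.0 and ...-1.1.0 subdirectories,
    # match_best_version returns the one with the highest numeric version:
    best = match_best_version('en_glove_cc_300_1m_vectors', None, '/path/to/spacy/data')
    # -> Path('/path/to/spacy/data/en_glove_cc_300_1m_vectors-1.1.0')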


@@ -11,12 +11,6 @@ from ..deprecated import fix_glove_vectors_loading
 from .language_data import *
-try:
-    basestring
-except NameError:
-    basestring = str
 class English(Language):
     lang = 'en'


@@ -1,13 +1,11 @@
 # cython: profile=True
+# coding: utf8
 from __future__ import unicode_literals, print_function
 import io
-import json
 import re
-import os
-from os import path
-import ujson as json
+import ujson
+from pathlib import Path
 from .syntax import nonproj
@@ -141,12 +139,13 @@ def _min_edit_path(cand_words, gold_words):
 def read_json_file(loc, docs_filter=None):
-    if path.isdir(loc):
-        for filename in os.listdir(loc):
-            yield from read_json_file(path.join(loc, filename))
+    loc = Path(loc)
+    if loc.is_dir():
+        for filename in loc.iterdir():
+            yield from read_json_file(loc / filename)
     else:
         with io.open(loc, 'r', encoding='utf8') as file_:
-            docs = json.load(file_)
+            docs = ujson.load(file_)
         for doc in docs:
             if docs_filter is not None and not docs_filter(doc):
                 continue
@@ -220,7 +219,8 @@ cdef class GoldParse:
     def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                  deps=None, entities=None, make_projective=False):
-        """Create a GoldParse.
+        """
+        Create a GoldParse.
         Arguments:
             doc (Doc):
@@ -302,7 +302,8 @@ cdef class GoldParse:
         self.heads = proj_heads
     def __len__(self):
-        """Get the number of gold-standard tokens.
+        """
+        Get the number of gold-standard tokens.
         Returns (int): The number of gold-standard tokens.
         """
@@ -310,13 +311,16 @@ cdef class GoldParse:
     @property
     def is_projective(self):
-        """Whether the provided syntactic annotations form a projective dependency
-        tree."""
+        """
+        Whether the provided syntactic annotations form a projective dependency
+        tree.
+        """
         return not nonproj.is_nonproj_tree(self.heads)
 def biluo_tags_from_offsets(doc, entities):
-    '''Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
+    """
+    Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
     scheme (biluo).
     Arguments:
@@ -347,7 +351,7 @@ def biluo_tags_from_offsets(doc, entities):
         tags = biluo_tags_from_offsets(doc, entities)
         assert tags == ['O', 'O', 'U-LOC', 'O']
-    '''
+    """
     starts = {token.idx: token.i for token in doc}
     ends = {token.idx+len(token): token.i for token in doc}
     biluo = ['-' for _ in doc]
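For context, a self-contained sketch of the BILUO encoding this helper produces, mirroring the docstring example above (it assumes an English pipeline loaded as nlp):

    from spacy.gold import biluo_tags_from_offsets

    doc = nlp('I like London.')
    entities = [(7, 13, 'LOC')]                  # character offsets for "London"
    tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ['O', 'O', 'U-LOC', 'O']      # a single-token span gets the U(nit) tag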


@@ -1,39 +1,26 @@
-from __future__ import absolute_import
-from __future__ import unicode_literals
+# coding: utf8
+from __future__ import absolute_import, unicode_literals
-import pathlib
 from contextlib import contextmanager
 import shutil
 import ujson
-try:
-    basestring
-except NameError:
-    basestring = str
-try:
-    unicode
-except NameError:
-    unicode = str
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .tagger import Tagger
 from .matcher import Matcher
-from . import attrs
-from . import orth
-from . import util
-from . import language_data
 from .lemmatizer import Lemmatizer
 from .train import Trainer
-from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
 from .syntax.arc_eager import ArcEager
 from .syntax.ner import BiluoPushDown
+from .compat import unicode_
+from .attrs import IS_STOP
+from . import attrs
+from . import orth
+from . import util
+from . import language_data
 class BaseDefaults(object):
@@ -150,25 +137,15 @@ class BaseDefaults(object):
         return pipeline
     token_match = language_data.TOKEN_MATCH
     prefixes = tuple(language_data.TOKENIZER_PREFIXES)
     suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
     infixes = tuple(language_data.TOKENIZER_INFIXES)
     tag_map = dict(language_data.TAG_MAP)
     tokenizer_exceptions = {}
     parser_features = get_templates('parser')
     entity_features = get_templates('ner')
     tagger_features = Tagger.feature_templates  # TODO -- fix this
     stop_words = set()
     lemma_rules = {}
     lemma_exc = {}
     lemma_index = {}
@@ -202,53 +179,42 @@ class BaseDefaults(object):
 class Language(object):
-    '''A text-processing pipeline. Usually you'll load this once per process, and
+    """
+    A text-processing pipeline. Usually you'll load this once per process, and
     pass the instance around your program.
-    '''
+    """
     Defaults = BaseDefaults
     lang = None
     @classmethod
-    @contextmanager
-    def train(cls, path, gold_tuples, *configs):
-        if isinstance(path, basestring):
-            path = pathlib.Path(path)
-        tagger_cfg, parser_cfg, entity_cfg = configs
-        dep_model_dir = path / 'deps'
-        ner_model_dir = path / 'ner'
-        pos_model_dir = path / 'pos'
-        if dep_model_dir.exists():
-            shutil.rmtree(str(dep_model_dir))
-        if ner_model_dir.exists():
-            shutil.rmtree(str(ner_model_dir))
-        if pos_model_dir.exists():
-            shutil.rmtree(str(pos_model_dir))
-        dep_model_dir.mkdir()
-        ner_model_dir.mkdir()
-        pos_model_dir.mkdir()
+    def setup_directory(cls, path, **configs):
+        for name, config in configs.items():
+            directory = path / name
+            if directory.exists():
+                shutil.rmtree(str(directory))
+            directory.mkdir()
+            with (directory / 'config.json').open('wb') as file_:
+                data = ujson.dumps(config, indent=2)
+                if isinstance(data, unicode_):
+                    data = data.encode('utf8')
+                file_.write(data)
+        if not (path / 'vocab').exists():
+            (path / 'vocab').mkdir()
+
+    @classmethod
+    @contextmanager
+    def train(cls, path, gold_tuples, **configs):
         if parser_cfg['pseudoprojective']:
             # preprocess training data here before ArcEager.get_labels() is called
             gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
-        parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
-        entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
-        with (dep_model_dir / 'config.json').open('wb') as file_:
-            data = ujson.dumps(parser_cfg)
-            if isinstance(data, unicode):
-                data = data.encode('utf8')
-            file_.write(data)
-        with (ner_model_dir / 'config.json').open('wb') as file_:
-            data = ujson.dumps(entity_cfg)
-            if isinstance(data, unicode):
-                data = data.encode('utf8')
-            file_.write(data)
-        with (pos_model_dir / 'config.json').open('wb') as file_:
-            data = ujson.dumps(tagger_cfg)
-            if isinstance(data, unicode):
-                data = data.encode('utf8')
-            file_.write(data)
+        for subdir in ('deps', 'ner', 'pos'):
+            if subdir not in configs:
+                configs[subdir] = {}
+        configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
+        configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
+        cls.setup_directory(path, **configs)
         self = cls(
             path=path,
@@ -269,14 +235,14 @@ class Language(object):
             self.entity = self.Defaults.create_entity(self)
         self.pipeline = self.Defaults.create_pipeline(self)
         yield Trainer(self, gold_tuples)
-        self.end_training(path=path)
+        self.end_training()
+        self.save_to_directory(path, deps=self.parser.cfg, ner=self.entity.cfg,
+                               pos=self.tagger.cfg)
     def __init__(self, **overrides):
         if 'data_dir' in overrides and 'path' not in overrides:
             raise ValueError("The argument 'data_dir' has been renamed to 'path'")
-        path = overrides.get('path', True)
-        if isinstance(path, basestring):
-            path = pathlib.Path(path)
+        path = util.ensure_path(overrides.get('path', True))
         if path is True:
             path = util.get_data_path() / self.lang
         if not path.exists() and 'path' not in overrides:
@@ -322,7 +288,8 @@ class Language(object):
         self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]
     def __call__(self, text, tag=True, parse=True, entity=True):
-        """Apply the pipeline to some text. The text can span multiple sentences,
+        """
+        Apply the pipeline to some text. The text can span multiple sentences,
         and can contain arbtrary whitespace. Alignment into the original string
         is preserved.
@@ -352,7 +319,8 @@ class Language(object):
         return doc
     def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
-        '''Process texts as a stream, and yield Doc objects in order.
+        """
+        Process texts as a stream, and yield Doc objects in order.
         Supports GIL-free multi-threading.
@@ -361,7 +329,7 @@ class Language(object):
             tag (bool)
             parse (bool)
             entity (bool)
-        '''
+        """
         skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
         stream = (self.make_doc(text) for text in texts)
         for proc in self.pipeline:
@@ -373,51 +341,35 @@ class Language(object):
         for doc in stream:
             yield doc
-    def end_training(self, path=None):
-        if path is None:
-            path = self.path
-        elif isinstance(path, basestring):
-            path = pathlib.Path(path)
-        if self.tagger:
-            self.tagger.model.end_training()
-            self.tagger.model.dump(str(path / 'pos' / 'model'))
-        if self.parser:
-            self.parser.model.end_training()
-            self.parser.model.dump(str(path / 'deps' / 'model'))
-        if self.entity:
-            self.entity.model.end_training()
-            self.entity.model.dump(str(path / 'ner' / 'model'))
+    def save_to_directory(self, path):
+        configs = {
+            'pos': self.tagger.cfg if self.tagger else {},
+            'deps': self.parser.cfg if self.parser else {},
+            'ner': self.entity.cfg if self.entity else {},
+        }
+        self.setup_directory(path, **configs)
         strings_loc = path / 'vocab' / 'strings.json'
         with strings_loc.open('w', encoding='utf8') as file_:
             self.vocab.strings.dump(file_)
         self.vocab.dump(path / 'vocab' / 'lexemes.bin')
+        # TODO: Word vectors?
         if self.tagger:
-            tagger_freqs = list(self.tagger.freqs[TAG].items())
-        else:
-            tagger_freqs = []
+            self.tagger.model.dump(str(path / 'pos' / 'model'))
         if self.parser:
-            dep_freqs = list(self.parser.moves.freqs[DEP].items())
-            head_freqs = list(self.parser.moves.freqs[HEAD].items())
-        else:
-            dep_freqs = []
-            head_freqs = []
+            self.parser.model.dump(str(path / 'deps' / 'model'))
         if self.entity:
-            entity_iob_freqs = list(self.entity.moves.freqs[ENT_IOB].items())
-            entity_type_freqs = list(self.entity.moves.freqs[ENT_TYPE].items())
-        else:
-            entity_iob_freqs = []
-            entity_type_freqs = []
-        with (path / 'vocab' / 'serializer.json').open('wb') as file_:
-            data = ujson.dumps([
-                (TAG, tagger_freqs),
-                (DEP, dep_freqs),
-                (ENT_IOB, entity_iob_freqs),
-                (ENT_TYPE, entity_type_freqs),
-                (HEAD, head_freqs)
-            ])
-            if isinstance(data, unicode):
-                data = data.encode('utf8')
-            file_.write(data)
+            self.entity.model.dump(str(path / 'ner' / 'model'))
+
+    def end_training(self, path=None):
+        if self.tagger:
+            self.tagger.model.end_training()
+        if self.parser:
+            self.parser.model.end_training()
+        if self.entity:
+            self.entity.model.end_training()
+        # NB: This is slightly different from before --- we no longer default
+        # to taking nlp.path
+        if path is not None:
+            self.save_to_directory(path)
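A hedged sketch of the directory layout the new save_to_directory/setup_directory pair writes, based on the code above (the output path is illustrative):

    import pathlib
    import spacy

    nlp = spacy.load('en')
    nlp.save_to_directory(pathlib.Path('/tmp/my-model'))
    # Expected layout, following setup_directory and save_to_directory above:
    #   pos/config.json,  pos/model        (tagger)
    #   deps/config.json, deps/model       (parser)
    #   ner/config.json,  ner/model        (entity recognizer)
    #   vocab/strings.json, vocab/lexemes.bin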


@@ -1,13 +1,8 @@
-from __future__ import unicode_literals, print_function
-import codecs
-import pathlib
-import ujson as json
+# coding: utf8
+from __future__ import unicode_literals
 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
-from .symbols import VerbForm_inf, VerbForm_none
-from .symbols import Number_sing
-from .symbols import Degree_pos
+from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
 class Lemmatizer(object):
@@ -38,8 +33,10 @@ class Lemmatizer(object):
         return lemmas
     def is_base_form(self, univ_pos, morphology=None):
-        '''Check whether we're dealing with an uninflected paradigm, so we can
-        avoid lemmatization entirely.'''
+        """
+        Check whether we're dealing with an uninflected paradigm, so we can
+        avoid lemmatization entirely.
+        """
         morphology = {} if morphology is None else morphology
         others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
         true_morph_key = morphology.get('morph', 0)


@@ -1,4 +1,7 @@
 # cython: embedsignature=True
+# coding: utf8
+from __future__ import unicode_literals, print_function
 from libc.math cimport sqrt
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
@@ -9,14 +12,11 @@ from cython.view cimport array as cvarray
 cimport numpy as np
 np.import_array()
 from libc.string cimport memset
+import numpy
 from .orth cimport word_shape
 from .typedefs cimport attr_t, flags_t
-import numpy
 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from .attrs cimport IS_BRACKET
@@ -30,13 +30,15 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 cdef class Lexeme:
-    """An entry in the vocabulary. A Lexeme has no string context --- it's a
+    """
+    An entry in the vocabulary. A Lexeme has no string context --- it's a
     word-type, as opposed to a word token. It therefore has no part-of-speech
     tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
     tag).
     """
     def __init__(self, Vocab vocab, int orth):
-        """Create a Lexeme object.
+        """
+        Create a Lexeme object.
         Arguments:
             vocab (Vocab): The parent vocabulary
@@ -80,7 +82,8 @@ cdef class Lexeme:
         return self.c.orth
     def set_flag(self, attr_id_t flag_id, bint value):
-        """Change the value of a boolean flag.
+        """
+        Change the value of a boolean flag.
         Arguments:
             flag_id (int): The attribute ID of the flag to set.
@@ -89,7 +92,8 @@ cdef class Lexeme:
         Lexeme.c_set_flag(self.c, flag_id, value)
     def check_flag(self, attr_id_t flag_id):
-        """Check the value of a boolean flag.
+        """
+        Check the value of a boolean flag.
         Arguments:
             flag_id (int): The attribute ID of the flag to query.
@@ -98,7 +102,8 @@ cdef class Lexeme:
         return True if Lexeme.c_check_flag(self.c, flag_id) else False
     def similarity(self, other):
-        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+        """
+        Compute a semantic similarity estimate. Defaults to cosine over vectors.
         Arguments:
             other:
@@ -106,7 +111,7 @@ cdef class Lexeme:
                 Token and Lexeme objects.
         Returns:
             score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)


@@ -1,7 +1,10 @@
 # cython: profile=True
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals
+import ujson
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from .attrs cimport attr_id_t
@@ -52,12 +55,6 @@ from .attrs import FLAG36 as L9_ENT
 from .attrs import FLAG35 as L10_ENT
-try:
-    import ujson as json
-except ImportError:
-    import json
 cpdef enum quantifier_t:
     _META
     ONE
@@ -164,7 +161,7 @@ def _convert_strings(token_specs, string_store):
 def merge_phrase(matcher, doc, i, matches):
     '''Callback to merge a phrase on match'''
     ent_id, label, start, end = matches[i]
     span = doc[start : end]
     span.merge(ent_type=label, ent_id=ent_id)
@@ -180,7 +177,8 @@ cdef class Matcher:
     @classmethod
     def load(cls, path, vocab):
-        '''Load the matcher and patterns from a file path.
+        """
+        Load the matcher and patterns from a file path.
         Arguments:
             path (Path):
@@ -189,16 +187,17 @@ cdef class Matcher:
                 The vocabulary that the documents to match over will refer to.
         Returns:
             Matcher: The newly constructed object.
-        '''
+        """
         if (path / 'gazetteer.json').exists():
             with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
-                patterns = json.load(file_)
+                patterns = ujson.load(file_)
         else:
             patterns = {}
         return cls(vocab, patterns)
     def __init__(self, vocab, patterns={}):
-        """Create the Matcher.
+        """
+        Create the Matcher.
         Arguments:
             vocab (Vocab):
@@ -227,7 +226,8 @@ cdef class Matcher:
     def add_entity(self, entity_key, attrs=None, if_exists='raise',
                    acceptor=None, on_match=None):
-        """Add an entity to the matcher.
+        """
+        Add an entity to the matcher.
         Arguments:
             entity_key (unicode or int):
@@ -264,7 +264,8 @@ cdef class Matcher:
         self._callbacks[entity_key] = on_match
     def add_pattern(self, entity_key, token_specs, label=""):
-        """Add a pattern to the matcher.
+        """
+        Add a pattern to the matcher.
         Arguments:
             entity_key (unicode or int):
@@ -307,7 +308,8 @@ cdef class Matcher:
         return entity_key
     def has_entity(self, entity_key):
-        """Check whether the matcher has an entity.
+        """
+        Check whether the matcher has an entity.
         Arguments:
             entity_key (string or int): The entity key to check.
@@ -318,7 +320,8 @@ cdef class Matcher:
         return entity_key in self._entities
     def get_entity(self, entity_key):
-        """Retrieve the attributes stored for an entity.
+        """
+        Retrieve the attributes stored for an entity.
         Arguments:
             entity_key (unicode or int): The entity to retrieve.
@@ -332,7 +335,8 @@ cdef class Matcher:
         return None
     def __call__(self, Doc doc, acceptor=None):
-        """Find all token sequences matching the supplied patterns on the Doc.
+        """
+        Find all token sequences matching the supplied patterns on the Doc.
         Arguments:
             doc (Doc):
@@ -445,7 +449,8 @@ cdef class Matcher:
         return matches
     def pipe(self, docs, batch_size=1000, n_threads=2):
-        """Match a stream of documents, yielding them in turn.
+        """
+        Match a stream of documents, yielding them in turn.
         Arguments:
             docs: A stream of documents.
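A rough usage sketch of the Matcher API whose docstrings are touched above (the entity key, token specs and label are illustrative; the match-tuple unpacking follows merge_phrase above):

    from spacy.matcher import Matcher

    matcher = Matcher(nlp.vocab)
    matcher.add_entity('GoogleNow')
    matcher.add_pattern('GoogleNow', [{'ORTH': 'Google'}, {'ORTH': 'Now'}], label='PRODUCT')
    doc = nlp(u'I like Google Now.')
    for ent_id, label, start, end in matcher(doc):
        print(nlp.vocab.strings[label], doc[start:end].text)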


@@ -1,13 +1,9 @@
 # cython: infer_types
+# coding: utf8
 from __future__ import unicode_literals
 from libc.string cimport memset
-try:
-    import ujson as json
-except ImportError:
-    import json
 from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
 from .attrs cimport POS, IS_SPACE
 from .parts_of_speech import IDS as POS_IDS
@@ -16,7 +12,9 @@ from .attrs import LEMMA, intify_attrs
 def _normalize_props(props):
-    '''Transform deprecated string keys to correct names.'''
+    """
+    Transform deprecated string keys to correct names.
+    """
     out = {}
     for key, value in props.items():
         if key == POS:
@@ -98,13 +96,14 @@ cdef class Morphology:
             flags[0] &= ~(one << flag_id)
     def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
-        '''Add a special-case rule to the morphological analyser. Tokens whose
+        """
+        Add a special-case rule to the morphological analyser. Tokens whose
         tag and orth match the rule will receive the specified properties.
         Arguments:
             tag (unicode): The part-of-speech tag to key the exception.
            orth (unicode): The word-form to key the exception.
-        '''
+        """
         tag = self.strings[tag_str]
         tag_id = self.reverse_index[tag]
         orth = self.strings[orth_str]


@@ -1,8 +0,0 @@
class RegexMerger(object):
    def __init__(self, regexes):
        self.regexes = regexes

    def __call__(self, tokens):
        for tag, entity_type, regex in self.regexes:
            for m in regex.finditer(tokens.string):
                tokens.merge(m.start(), m.end(), tag, m.group(), entity_type)


@@ -1,6 +1,7 @@
+# coding: utf8
 # cython: infer_types=True
-# coding: utf8
 from __future__ import unicode_literals
 import unicodedata
 import re


@@ -1,3 +1,4 @@
+# coding: utf8
 from __future__ import unicode_literals


@@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals
 from .syntax.parser cimport Parser
 from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown
@@ -11,44 +14,40 @@ from .attrs import DEP, ENT_TYPE
 cdef class EntityRecognizer(Parser):
-    """Annotate named entities on Doc objects."""
+    """
+    Annotate named entities on Doc objects.
+    """
     TransitionSystem = BiluoPushDown
     feature_templates = get_feature_templates('ner')
     def add_label(self, label):
-        for action in self.moves.action_types:
-            self.moves.add_action(action, label)
-            if 'actions' in self.cfg:
-                self.cfg['actions'].setdefault(action,
-                    {}).setdefault(label, True)
+        Parser.add_label(self, label)
         if isinstance(label, basestring):
             label = self.vocab.strings[label]
-        # Set label into serializer. Super hacky :(
         for attr, freqs in self.vocab.serializer_freqs:
             if attr == ENT_TYPE and label not in freqs:
                 freqs.append([label, 1])
+        # Super hacky :(
         self.vocab._serializer = None
 cdef class BeamEntityRecognizer(BeamParser):
-    """Annotate named entities on Doc objects."""
+    """
+    Annotate named entities on Doc objects.
+    """
     TransitionSystem = BiluoPushDown
     feature_templates = get_feature_templates('ner')
     def add_label(self, label):
-        for action in self.moves.action_types:
-            self.moves.add_action(action, label)
-            if 'actions' in self.cfg:
-                self.cfg['actions'].setdefault(action,
-                    {}).setdefault(label, True)
+        Parser.add_label(self, label)
         if isinstance(label, basestring):
             label = self.vocab.strings[label]
-        # Set label into serializer. Super hacky :(
         for attr, freqs in self.vocab.serializer_freqs:
             if attr == ENT_TYPE and label not in freqs:
                 freqs.append([label, 1])
+        # Super hacky :(
         self.vocab._serializer = None
@@ -58,11 +57,7 @@ cdef class DependencyParser(Parser):
     feature_templates = get_feature_templates('basic')
     def add_label(self, label):
-        for action in self.moves.action_types:
-            self.moves.add_action(action, label)
-            if 'actions' in self.cfg:
-                self.cfg['actions'].setdefault(action,
-                    {}).setdefault(label, True)
+        Parser.add_label(self, label)
         if isinstance(label, basestring):
             label = self.vocab.strings[label]
         for attr, freqs in self.vocab.serializer_freqs:
@@ -78,11 +73,7 @@ cdef class BeamDependencyParser(BeamParser):
     feature_templates = get_feature_templates('basic')
     def add_label(self, label):
-        for action in self.moves.action_types:
-            self.moves.add_action(action, label)
-            if 'actions' in self.cfg:
-                self.cfg['actions'].setdefault(action,
-                    {}).setdefault(label, True)
+        Parser.add_label(self, label)
         if isinstance(label, basestring):
             label = self.vocab.strings[label]
         for attr, freqs in self.vocab.serializer_freqs:


@@ -1,12 +1,13 @@
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
+# coding: utf8
+from __future__ import division, print_function, unicode_literals
 from .gold import tags_to_entities
 class PRFScore(object):
-    """A precision / recall / F score"""
+    """
+    A precision / recall / F score
+    """
     def __init__(self):
         self.tp = 0
         self.fp = 0
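For reference, these are the standard relations the tp/fp/fn counters above feed into (a sketch, not part of the commit):

    precision = tp / (tp + fp)
    recall    = tp / (tp + fn)
    f_score   = 2 * precision * recall / (precision + recall)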


@ -1,12 +1,11 @@
# cython: infer_types=True # cython: infer_types=True
# coding: utf8
from __future__ import unicode_literals, absolute_import from __future__ import unicode_literals, absolute_import
cimport cython cimport cython
from libc.string cimport memcpy from libc.string cimport memcpy
from libc.stdint cimport uint64_t, uint32_t from libc.stdint cimport uint64_t, uint32_t
from murmurhash.mrmr cimport hash64, hash32 from murmurhash.mrmr cimport hash64, hash32
from preshed.maps cimport map_iter, key_t from preshed.maps cimport map_iter, key_t
from .typedefs cimport hash_t from .typedefs cimport hash_t
@ -73,13 +72,16 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
cdef class StringStore: cdef class StringStore:
'''Map strings to and from integer IDs.''' """
Map strings to and from integer IDs.
"""
def __init__(self, strings=None, freeze=False): def __init__(self, strings=None, freeze=False):
'''Create the StringStore. """
Create the StringStore.
Arguments: Arguments:
strings: A sequence of unicode strings to add to the store. strings: A sequence of unicode strings to add to the store.
''' """
self.mem = Pool() self.mem = Pool()
self._map = PreshMap() self._map = PreshMap()
self._oov = PreshMap() self._oov = PreshMap()
@ -104,7 +106,8 @@ cdef class StringStore:
return (StringStore, (list(self),)) return (StringStore, (list(self),))
def __len__(self): def __len__(self):
"""The number of strings in the store. """
The number of strings in the store.
Returns: Returns:
int The number of strings in the store. int The number of strings in the store.
@ -112,8 +115,9 @@ cdef class StringStore:
return self.size-1 return self.size-1
def __getitem__(self, object string_or_id): def __getitem__(self, object string_or_id):
"""Retrieve a string from a given integer ID, or vice versa. """
Retrieve a string from a given integer ID, or vice versa.
Arguments: Arguments:
string_or_id (bytes or unicode or int): string_or_id (bytes or unicode or int):
The value to encode. The value to encode.
@ -149,17 +153,18 @@ cdef class StringStore:
raise TypeError(type(string_or_id)) raise TypeError(type(string_or_id))
utf8str = self._intern_utf8(byte_string, len(byte_string)) utf8str = self._intern_utf8(byte_string, len(byte_string))
if utf8str is NULL: if utf8str is NULL:
# TODO: We need to use 32 bit here, for compatibility with the # TODO: We need to use 32 bit here, for compatibility with the
# vocabulary values. This makes birthday paradox probabilities # vocabulary values. This makes birthday paradox probabilities
# pretty bad. # pretty bad.
# We could also get unlucky here, and hash into a value that # We could also get unlucky here, and hash into a value that
# collides with the 'real' strings. # collides with the 'real' strings.
return hash32_utf8(byte_string, len(byte_string)) return hash32_utf8(byte_string, len(byte_string))
else: else:
return utf8str - self.c return utf8str - self.c
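The TODO above about "birthday paradox probabilities" can be made concrete: for n distinct out-of-vocabulary strings hashed into N = 2**32 buckets, the chance of at least one collision is roughly 1 - exp(-n*(n-1)/(2*N)). A quick standalone check (plain Python, not part of the library):

    import math

    def oov_collision_probability(n_strings, n_buckets=2 ** 32):
        # Birthday-paradox approximation for >= 1 collision among n_strings hashes.
        return 1.0 - math.exp(-float(n_strings) * (n_strings - 1) / (2.0 * n_buckets))

    print(oov_collision_probability(10000))    # ~0.0116
    print(oov_collision_probability(100000))   # ~0.69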
def __contains__(self, unicode string not None): def __contains__(self, unicode string not None):
"""Check whether a string is in the store. """
Check whether a string is in the store.
Arguments: Arguments:
string (unicode): The string to check. string (unicode): The string to check.
@ -172,7 +177,8 @@ cdef class StringStore:
return self._map.get(key) is not NULL return self._map.get(key) is not NULL
def __iter__(self): def __iter__(self):
"""Iterate over the strings in the store, in order. """
Iterate over the strings in the store, in order.
Yields: unicode A string in the store. Yields: unicode A string in the store.
""" """
@ -230,7 +236,8 @@ cdef class StringStore:
return &self.c[self.size-1] return &self.c[self.size-1]
def dump(self, file_): def dump(self, file_):
"""Save the strings to a JSON file. """
Save the strings to a JSON file.
Arguments: Arguments:
file_ (buffer): The file to save the strings. file_ (buffer): The file to save the strings.
@ -244,7 +251,8 @@ cdef class StringStore:
file_.write(string_data) file_.write(string_data)
def load(self, file_): def load(self, file_):
"""Load the strings from a JSON file. """
Load the strings from a JSON file.
Arguments: Arguments:
file_ (buffer): The file from which to load the strings. file_ (buffer): The file from which to load the strings.


@ -1,3 +1,4 @@
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
IDS = { IDS = {


@ -7,17 +7,17 @@ out of "context") is in features/extractor.pyx
The atomic feature names are listed in a big enum, so that the feature tuples The atomic feature names are listed in a big enum, so that the feature tuples
can refer to them. can refer to them.
""" """
from libc.string cimport memset # coding: utf-8
from __future__ import unicode_literals
from libc.string cimport memset
from itertools import combinations from itertools import combinations
from cymem.cymem cimport Pool
from ..structs cimport TokenC from ..structs cimport TokenC
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
from cymem.cymem cimport Pool
cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
if token is NULL: if token is NULL:


@ -1,29 +1,26 @@
# cython: profile=True # cython: profile=True
# cython: cdivision=True # cython: cdivision=True
# cython: infer_types=True # cython: infer_types=True
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
import ctypes import ctypes
import os from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from ..structs cimport TokenC from cymem.cymem cimport Pool
from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token
from .nonproj import PseudoProjectivity
from .nonproj import is_nonproj_tree
from .transition_system cimport do_func_t, get_cost_func_t from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse from ..gold cimport GoldParse
from ..gold cimport GoldParseC from ..gold cimport GoldParseC
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from ..structs cimport TokenC
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from cymem.cymem cimport Pool
from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token
from .nonproj import PseudoProjectivity
from .nonproj import is_nonproj_tree
DEF NON_MONOTONIC = True DEF NON_MONOTONIC = True
@ -317,17 +314,20 @@ cdef class ArcEager(TransitionSystem):
def get_actions(cls, **kwargs): def get_actions(cls, **kwargs):
actions = kwargs.get('actions', actions = kwargs.get('actions',
{ {
SHIFT: {'': True}, SHIFT: [''],
REDUCE: {'': True}, REDUCE: [''],
RIGHT: {}, RIGHT: [],
LEFT: {}, LEFT: [],
BREAK: {'ROOT': True}}) BREAK: ['ROOT']})
seen_actions = set()
for label in kwargs.get('left_labels', []): for label in kwargs.get('left_labels', []):
if label.upper() != 'ROOT': if label.upper() != 'ROOT':
actions[LEFT][label] = True if (LEFT, label) not in seen_actions:
actions[LEFT].append(label)
for label in kwargs.get('right_labels', []): for label in kwargs.get('right_labels', []):
if label.upper() != 'ROOT': if label.upper() != 'ROOT':
actions[RIGHT][label] = True if (RIGHT, label) not in seen_actions:
actions[RIGHT].append(label)
for raw_text, sents in kwargs.get('gold_parses', []): for raw_text, sents in kwargs.get('gold_parses', []):
for (ids, words, tags, heads, labels, iob), ctnts in sents: for (ids, words, tags, heads, labels, iob), ctnts in sents:
@ -336,9 +336,11 @@ cdef class ArcEager(TransitionSystem):
label = 'ROOT' label = 'ROOT'
if label != 'ROOT': if label != 'ROOT':
if head < child: if head < child:
actions[RIGHT][label] = True if (RIGHT, label) not in seen_actions:
actions[RIGHT].append(label)
elif head > child: elif head > child:
actions[LEFT][label] = True if (LEFT, label) not in seen_actions:
actions[LEFT].append(label)
return actions return actions
property action_types: property action_types:
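The get_actions change above swaps the per-move {label: True} dicts for plain lists, so labels are kept in first-seen order; that order is what keeps the model's class IDs stable across save and reload. A standalone sketch of the same bookkeeping (the move constants are illustrative stand-ins, not the real transition IDs):

    SHIFT, LEFT, RIGHT = 0, 1, 2   # stand-ins for the real transition constants

    def collect_actions(left_labels, right_labels):
        actions = {SHIFT: [''], LEFT: [], RIGHT: []}
        seen = set()
        for move, labels in ((LEFT, left_labels), (RIGHT, right_labels)):
            for label in labels:
                if label.upper() == 'ROOT':
                    continue
                if (move, label) not in seen:
                    seen.add((move, label))
                    actions[move].append(label)
        return actions

    print(collect_actions(['nsubj', 'dobj', 'nsubj'], ['dobj', 'prep']))
    # {0: [''], 1: ['nsubj', 'dobj'], 2: ['dobj', 'prep']}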


@ -1,50 +1,34 @@
"""
MALT-style dependency parser
"""
# cython: profile=True # cython: profile=True
# cython: experimental_cpp_class_def=True # cython: experimental_cpp_class_def=True
# cython: cdivision=True # cython: cdivision=True
# cython: infer_types=True # cython: infer_types=True
""" # coding: utf-8
MALT-style dependency parser
""" from __future__ import unicode_literals, print_function
from __future__ import unicode_literals
cimport cython cimport cython
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from libc.stdint cimport uint32_t, uint64_t from libc.stdint cimport uint32_t, uint64_t
from libc.string cimport memset, memcpy from libc.string cimport memset, memcpy
from libc.stdlib cimport rand from libc.stdlib cimport rand
from libc.math cimport log, exp, isnan, isinf from libc.math cimport log, exp, isnan, isinf
import random
import os.path
from os import path
import shutil
import json
import math
from cymem.cymem cimport Pool, Address from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport real_hash64 as hash64 from murmurhash.mrmr cimport real_hash64 as hash64
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
from util import Config
from thinc.linear.features cimport ConjunctionExtracter from thinc.linear.features cimport ConjunctionExtracter
from thinc.structs cimport FeatureC, ExampleC from thinc.structs cimport FeatureC, ExampleC
from thinc.extra.search cimport Beam, MaxViolation
from thinc.extra.search cimport Beam
from thinc.extra.search cimport MaxViolation
from thinc.extra.eg cimport Example from thinc.extra.eg cimport Example
from thinc.extra.mb cimport Minibatch from thinc.extra.mb cimport Minibatch
from ..structs cimport TokenC from ..structs cimport TokenC
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..strings cimport StringStore from ..strings cimport StringStore
from .transition_system cimport TransitionSystem, Transition from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParse from ..gold cimport GoldParse
from . import _parse_features from . import _parse_features
from ._parse_features cimport CONTEXT_SIZE from ._parse_features cimport CONTEXT_SIZE
from ._parse_features cimport fill_context from ._parse_features cimport fill_context
@ -266,4 +250,3 @@ def is_gold(StateClass state, GoldParse gold, StringStore strings):
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
truth.add((id_, head, dep)) truth.add((id_, head, dep))
return truth == predicted return truth == predicted


@ -1,9 +1,14 @@
from spacy.parts_of_speech cimport NOUN, PROPN, PRON # coding: utf-8
from __future__ import unicode_literals
from ..parts_of_speech cimport NOUN, PROPN, PRON
def english_noun_chunks(obj): def english_noun_chunks(obj):
'''Detect base noun phrases from a dependency parse. """
Works on both Doc and Span.''' Detect base noun phrases from a dependency parse.
Works on both Doc and Span.
"""
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
'attr', 'ROOT', 'root'] 'attr', 'ROOT', 'root']
doc = obj.doc # Ensure works on both Doc and Span. doc = obj.doc # Ensure works on both Doc and Span.


@ -1,17 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from .transition_system cimport Transition
from .transition_system cimport do_func_t
from ..structs cimport TokenC, Entity
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from ..gold cimport GoldParseC
from ..gold cimport GoldParse
from ..attrs cimport ENT_TYPE, ENT_IOB
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
from .transition_system cimport Transition
from .transition_system cimport do_func_t
from ..structs cimport TokenC, Entity
from ..gold cimport GoldParseC
from ..gold cimport GoldParse
from ..attrs cimport ENT_TYPE, ENT_IOB
cdef enum: cdef enum:
@ -21,6 +20,7 @@ cdef enum:
LAST LAST
UNIT UNIT
OUT OUT
ISNT
N_MOVES N_MOVES
@ -31,6 +31,7 @@ MOVE_NAMES[IN] = 'I'
MOVE_NAMES[LAST] = 'L' MOVE_NAMES[LAST] = 'L'
MOVE_NAMES[UNIT] = 'U' MOVE_NAMES[UNIT] = 'U'
MOVE_NAMES[OUT] = 'O' MOVE_NAMES[OUT] = 'O'
MOVE_NAMES[ISNT] = 'x'
cdef do_func_t[N_MOVES] do_funcs cdef do_func_t[N_MOVES] do_funcs
@ -54,16 +55,20 @@ cdef class BiluoPushDown(TransitionSystem):
def get_actions(cls, **kwargs): def get_actions(cls, **kwargs):
actions = kwargs.get('actions', actions = kwargs.get('actions',
{ {
MISSING: {'': True}, MISSING: [''],
BEGIN: {}, BEGIN: [],
IN: {}, IN: [],
LAST: {}, LAST: [],
UNIT: {}, UNIT: [],
OUT: {'': True} OUT: ['']
}) })
seen_entities = set()
for entity_type in kwargs.get('entity_types', []): for entity_type in kwargs.get('entity_types', []):
if entity_type in seen_entities:
continue
seen_entities.add(entity_type)
for action in (BEGIN, IN, LAST, UNIT): for action in (BEGIN, IN, LAST, UNIT):
actions[action][entity_type] = True actions[action].append(entity_type)
moves = ('M', 'B', 'I', 'L', 'U') moves = ('M', 'B', 'I', 'L', 'U')
for raw_text, sents in kwargs.get('gold_parses', []): for raw_text, sents in kwargs.get('gold_parses', []):
for (ids, words, tags, heads, labels, biluo), _ in sents: for (ids, words, tags, heads, labels, biluo), _ in sents:
@ -72,8 +77,10 @@ cdef class BiluoPushDown(TransitionSystem):
if ner_tag.count('-') != 1: if ner_tag.count('-') != 1:
raise ValueError(ner_tag) raise ValueError(ner_tag)
_, label = ner_tag.split('-') _, label = ner_tag.split('-')
for move_str in ('B', 'I', 'L', 'U'): if label not in seen_entities:
actions[moves.index(move_str)][label] = True seen_entities.add(label)
for move_str in ('B', 'I', 'L', 'U'):
actions[moves.index(move_str)].append(label)
return actions return actions
property action_types: property action_types:
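Same idea for the NER transition system: each entity type is appended once, in first-seen order, to the B/I/L/U move lists, with a set guarding against duplicates. In isolation (move IDs here are made up):

    BEGIN, IN, LAST, UNIT = 1, 2, 3, 4   # illustrative stand-ins

    def collect_ner_actions(entity_types):
        actions = {BEGIN: [], IN: [], LAST: [], UNIT: []}
        seen_entities = set()
        for entity_type in entity_types:
            if entity_type in seen_entities:
                continue
            seen_entities.add(entity_type)
            for move in (BEGIN, IN, LAST, UNIT):
                actions[move].append(entity_type)
        return actions

    print(collect_ner_actions(['PERSON', 'ORG', 'PERSON'])[UNIT])   # ['PERSON', 'ORG']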
@ -111,11 +118,17 @@ cdef class BiluoPushDown(TransitionSystem):
label = 0 label = 0
elif '-' in name: elif '-' in name:
move_str, label_str = name.split('-', 1) move_str, label_str = name.split('-', 1)
# Hacky way to denote 'not this entity'
if label_str.startswith('!'):
label_str = label_str[1:]
move_str = 'x'
label = self.strings[label_str] label = self.strings[label_str]
else: else:
move_str = name move_str = name
label = 0 label = 0
move = MOVE_NAMES.index(move_str) move = MOVE_NAMES.index(move_str)
if move == ISNT:
return Transition(clas=0, move=ISNT, label=label, score=0)
for i in range(self.n_moves): for i in range(self.n_moves):
if self.c[i].move == move and self.c[i].label == label: if self.c[i].move == move and self.c[i].label == label:
return self.c[i] return self.c[i]
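The lookup_transition hunk above adds an ISNT move ('x') so gold annotations can express partial supervision of the form "an entity, but not this label", written with a '!' prefix on the label. Just the name-parsing convention, as a standalone sketch:

    def parse_action_name(name):
        # Split names like 'B-ORG' or 'U-!PER' into (move, label);
        # '!' marks "not this label", mapped to the ISNT move 'x' above.
        if '-' not in name:
            return name, None
        move_str, label_str = name.split('-', 1)
        if label_str.startswith('!'):
            label_str = label_str[1:]
            move_str = 'x'
        return move_str, label_str

    print(parse_action_name('U-!PER'))   # ('x', 'PER')
    print(parse_action_name('B-ORG'))    # ('B', 'ORG')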
@ -225,6 +238,9 @@ cdef class Begin:
elif g_act == BEGIN: elif g_act == BEGIN:
# B, Gold B --> Label match # B, Gold B --> Label match
return label != g_tag return label != g_tag
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return label == g_tag
else: else:
# B, Gold I --> False (P) # B, Gold I --> False (P)
# B, Gold L --> False (P) # B, Gold L --> False (P)
@ -359,6 +375,9 @@ cdef class Unit:
elif g_act == UNIT: elif g_act == UNIT:
# U, Gold U --> True iff tag match # U, Gold U --> True iff tag match
return label != g_tag return label != g_tag
# Support partial supervision in the form of "not this label"
elif g_act == ISNT:
return label == g_tag
else: else:
# U, Gold B --> False # U, Gold B --> False
# U, Gold I --> False # U, Gold I --> False
@ -388,7 +407,7 @@ cdef class Out:
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label cdef int g_tag = gold.ner[s.B(0)].label
if g_act == MISSING: if g_act == MISSING or g_act == ISNT:
return 0 return 0
elif g_act == BEGIN: elif g_act == BEGIN:
# O, Gold B --> False # O, Gold B --> False


@ -1,8 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from copy import copy from copy import copy
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from spacy.attrs import DEP, HEAD from ..attrs import DEP, HEAD
def ancestors(tokenid, heads): def ancestors(tokenid, heads):
@ -201,5 +202,3 @@ class PseudoProjectivity:
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts)) filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
filtered.append((raw_text, filtered_sents)) filtered.append((raw_text, filtered_sents))
return filtered return filtered


@ -1,58 +1,46 @@
# cython: infer_types=True
""" """
MALT-style dependency parser MALT-style dependency parser
""" """
# coding: utf-8
# cython: infer_types=True
from __future__ import unicode_literals from __future__ import unicode_literals
from collections import Counter
import ujson
cimport cython cimport cython
cimport cython.parallel cimport cython.parallel
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals from cpython.exc cimport PyErr_CheckSignals
from libc.stdint cimport uint32_t, uint64_t from libc.stdint cimport uint32_t, uint64_t
from libc.string cimport memset, memcpy from libc.string cimport memset, memcpy
from libc.stdlib cimport malloc, calloc, free from libc.stdlib cimport malloc, calloc, free
import os.path
from collections import Counter
from os import path
import shutil
import json
import sys
from .nonproj import PseudoProjectivity
from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
from thinc.linear.avgtron cimport AveragedPerceptron from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec from thinc.linalg cimport VecVec
from thinc.structs cimport SparseArrayC from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
from thinc.extra.eg cimport Example
from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
from preshed.maps cimport MapStruct from preshed.maps cimport MapStruct
from preshed.maps cimport map_get from preshed.maps cimport map_get
from thinc.structs cimport FeatureC
from thinc.structs cimport ExampleC
from thinc.extra.eg cimport Example
from util import Config
from ..structs cimport TokenC
from ..tokens.doc cimport Doc
from ..strings cimport StringStore
from .transition_system import OracleError
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParse
from . import _parse_features from . import _parse_features
from ._parse_features cimport CONTEXT_SIZE from ._parse_features cimport CONTEXT_SIZE
from ._parse_features cimport fill_context from ._parse_features cimport fill_context
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ._state cimport StateC from ._state cimport StateC
from .nonproj import PseudoProjectivity
from .transition_system import OracleError
from .transition_system cimport TransitionSystem, Transition
from ..structs cimport TokenC
from ..tokens.doc cimport Doc
from ..strings cimport StringStore
from ..gold cimport GoldParse
USE_FTRL = True
USE_FTRL = False
DEBUG = False DEBUG = False
def set_debug(val): def set_debug(val):
global DEBUG global DEBUG
@ -80,7 +68,9 @@ cdef class ParserModel(AveragedPerceptron):
return nr_feat return nr_feat
def update(self, Example eg, itn=0): def update(self, Example eg, itn=0):
'''Does regression on negative cost. Sort of cute?''' """
Does regression on negative cost. Sort of cute?
"""
self.time += 1 self.time += 1
cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class) cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
cdef int guess = eg.guess cdef int guess = eg.guess
@ -132,10 +122,13 @@ cdef class ParserModel(AveragedPerceptron):
cdef class Parser: cdef class Parser:
"""Base class of the DependencyParser and EntityRecognizer.""" """
Base class of the DependencyParser and EntityRecognizer.
"""
@classmethod @classmethod
def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg): def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
"""Load the statistical model from the supplied path. """
Load the statistical model from the supplied path.
Arguments: Arguments:
path (Path): path (Path):
@ -148,10 +141,16 @@ cdef class Parser:
The newly constructed object. The newly constructed object.
""" """
with (path / 'config.json').open() as file_: with (path / 'config.json').open() as file_:
cfg = json.load(file_) cfg = ujson.load(file_)
# TODO: remove this shim when we don't have to support older data # TODO: remove this shim when we don't have to support older data
if 'labels' in cfg and 'actions' not in cfg: if 'labels' in cfg and 'actions' not in cfg:
cfg['actions'] = cfg.pop('labels') cfg['actions'] = cfg.pop('labels')
# TODO: remove this shim when we don't have to support older data
for action_name, labels in dict(cfg['actions']).items():
# We need this to be sorted
if isinstance(labels, dict):
labels = list(sorted(labels.keys()))
cfg['actions'][action_name] = labels
self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg) self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg)
if (path / 'model').exists(): if (path / 'model').exists():
self.model.load(str(path / 'model')) self.model.load(str(path / 'model'))
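The shim in load() above converts older configs, where each action mapped to a {label: True} dict, into sorted lists so the class ordering is deterministic when the model is rebuilt. The conversion in isolation (the sample cfg is made up):

    cfg = {'actions': {'LEFT': {'nsubj': True, 'dobj': True}, 'SHIFT': {'': True}}}

    for action_name, labels in dict(cfg['actions']).items():
        if isinstance(labels, dict):
            # Old format: {label: True}; new format wants an ordered list.
            cfg['actions'][action_name] = list(sorted(labels.keys()))

    print(cfg['actions'])   # e.g. {'LEFT': ['dobj', 'nsubj'], 'SHIFT': ['']}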
@ -161,7 +160,8 @@ cdef class Parser:
return self return self
def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg): def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
"""Create a Parser. """
Create a Parser.
Arguments: Arguments:
vocab (Vocab): vocab (Vocab):
@ -186,12 +186,18 @@ cdef class Parser:
self.model.learn_rate = cfg.get('learn_rate', 0.001) self.model.learn_rate = cfg.get('learn_rate', 0.001)
self.cfg = cfg self.cfg = cfg
# TODO: This is a pretty hacky fix to the problem of adding more
# labels. The issue is they come in out of order, if labels are
# added during training
for label in cfg.get('extra_labels', []):
self.add_label(label)
def __reduce__(self): def __reduce__(self):
return (Parser, (self.vocab, self.moves, self.model), None, None) return (Parser, (self.vocab, self.moves, self.model), None, None)
def __call__(self, Doc tokens): def __call__(self, Doc tokens):
"""Apply the entity recognizer, setting the annotations onto the Doc object. """
Apply the entity recognizer, setting the annotations onto the Doc object.
Arguments: Arguments:
doc (Doc): The document to be processed. doc (Doc): The document to be processed.
@ -208,7 +214,8 @@ cdef class Parser:
self.moves.finalize_doc(tokens) self.moves.finalize_doc(tokens)
def pipe(self, stream, int batch_size=1000, int n_threads=2): def pipe(self, stream, int batch_size=1000, int n_threads=2):
"""Process a stream of documents. """
Process a stream of documents.
Arguments: Arguments:
stream: The sequence of documents to process. stream: The sequence of documents to process.
@ -296,7 +303,8 @@ cdef class Parser:
return 0 return 0
def update(self, Doc tokens, GoldParse gold, itn=0): def update(self, Doc tokens, GoldParse gold, itn=0):
"""Update the statistical model. """
Update the statistical model.
Arguments: Arguments:
doc (Doc): doc (Doc):
@ -334,15 +342,17 @@ cdef class Parser:
self.moves.finalize_state(stcls.c) self.moves.finalize_state(stcls.c)
return loss return loss
def step_through(self, Doc doc): def step_through(self, Doc doc, GoldParse gold=None):
"""Set up a stepwise state, to introspect and control the transition sequence. """
Set up a stepwise state, to introspect and control the transition sequence.
Arguments: Arguments:
doc (Doc): The document to step through. doc (Doc): The document to step through.
gold (GoldParse): Optional gold parse
Returns (StepwiseState): Returns (StepwiseState):
A state object, to step through the annotation process. A state object, to step through the annotation process.
""" """
return StepwiseState(self, doc) return StepwiseState(self, doc, gold=gold)
def from_transition_sequence(self, Doc doc, sequence): def from_transition_sequence(self, Doc doc, sequence):
"""Control the annotations on a document by specifying a transition sequence """Control the annotations on a document by specifying a transition sequence
@ -360,18 +370,28 @@ cdef class Parser:
def add_label(self, label): def add_label(self, label):
# Doesn't set label into serializer -- subclasses override it to do that. # Doesn't set label into serializer -- subclasses override it to do that.
for action in self.moves.action_types: for action in self.moves.action_types:
self.moves.add_action(action, label) added = self.moves.add_action(action, label)
if added:
# Important that the labels be stored as a list! We need the
# order, or the model goes out of synch
self.cfg.setdefault('extra_labels', []).append(label)
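Together with the extra_labels loop added to __init__ above, this means labels added mid-training are recorded in the config and re-applied in the same order on reload. A plain-Python sketch of that round trip (LabelRecorder is an invented stand-in, not a spaCy class):

    class LabelRecorder(object):
        def __init__(self, cfg=None):
            self.cfg = dict(cfg or {})
            self.known = list(self.cfg.get('actions', []))
            # Re-apply labels added during a previous run, preserving order.
            for label in list(self.cfg.get('extra_labels', [])):
                self.add_label(label)

        def add_label(self, label):
            added = label not in self.known
            if added:
                self.known.append(label)
                extra = self.cfg.setdefault('extra_labels', [])
                if label not in extra:
                    extra.append(label)
            return added

    r = LabelRecorder({'actions': ['PERSON']})
    r.add_label('PRODUCT')
    r2 = LabelRecorder(r.cfg)   # "reload": PRODUCT comes back in the same slot
    print(r2.known)             # ['PERSON', 'PRODUCT']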
cdef class StepwiseState: cdef class StepwiseState:
cdef readonly StateClass stcls cdef readonly StateClass stcls
cdef readonly Example eg cdef readonly Example eg
cdef readonly Doc doc cdef readonly Doc doc
cdef readonly GoldParse gold
cdef readonly Parser parser cdef readonly Parser parser
def __init__(self, Parser parser, Doc doc): def __init__(self, Parser parser, Doc doc, GoldParse gold=None):
self.parser = parser self.parser = parser
self.doc = doc self.doc = doc
if gold is not None:
self.gold = gold
self.parser.moves.preprocess_gold(self.gold)
else:
self.gold = GoldParse(doc)
self.stcls = StateClass.init(doc.c, doc.length) self.stcls = StateClass.init(doc.c, doc.length)
self.parser.moves.initialize_state(self.stcls.c) self.parser.moves.initialize_state(self.stcls.c)
self.eg = Example( self.eg = Example(
@ -406,6 +426,24 @@ cdef class StepwiseState:
return [self.doc.vocab.strings[self.stcls.c._sent[i].dep] return [self.doc.vocab.strings[self.stcls.c._sent[i].dep]
for i in range(self.stcls.c.length)] for i in range(self.stcls.c.length)]
@property
def costs(self):
"""
Find the action-costs for the current state.
"""
if not self.gold:
raise ValueError("Can't set costs: No GoldParse provided")
self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs,
self.stcls, self.gold)
costs = {}
for i in range(self.parser.moves.n_moves):
if not self.eg.c.is_valid[i]:
continue
transition = self.parser.moves.c[i]
name = self.parser.moves.move_name(transition.move, transition.label)
costs[name] = self.eg.c.costs[i]
return costs
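With the gold argument to step_through and the costs property above, you can drive a parse by hand and see which actions the oracle considers zero-cost. A usage sketch, assuming spaCy 1.x with an English model installed and per-token BILUO tags for the entities argument (the tags below are made up):

    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.load('en')
    doc = nlp.tokenizer(u'Best is in London')
    gold = GoldParse(doc, entities=['U-PERSON', 'O', 'O', 'U-GPE'])

    state = nlp.entity.step_through(doc, gold=gold)
    print(state.costs)   # e.g. {'U-PERSON': 0.0, 'B-PERSON': 1.0, 'O': 1.0, ...}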
def predict(self): def predict(self):
self.eg.reset() self.eg.reset()
self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features, self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features,


@ -1,5 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
from libc.string cimport memcpy, memset from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
from ..vocab cimport EMPTY_LEXEME from ..vocab cimport EMPTY_LEXEME
from ..structs cimport Entity from ..structs cimport Entity
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
@ -28,6 +32,6 @@ cdef class StateClass:
top = words[self.S(0)] + '_%d' % self.S_(0).head top = words[self.S(0)] + '_%d' % self.S_(0).head
second = words[self.S(1)] + '_%d' % self.S_(1).head second = words[self.S(1)] + '_%d' % self.S_(1).head
third = words[self.S(2)] + '_%d' % self.S_(2).head third = words[self.S(2)] + '_%d' % self.S_(2).head
n0 = words[self.B(0)] n0 = words[self.B(0)]
n1 = words[self.B(1)] n1 = words[self.B(1)]
return ' '.join((third, second, top, '|', n0, n1)) return ' '.join((third, second, top, '|', n0, n1))


@ -1,4 +1,8 @@
# cython: infer_types=True # cython: infer_types=True
# coding: utf-8
from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from collections import defaultdict from collections import defaultdict
@ -6,7 +10,6 @@ from collections import defaultdict
from ..structs cimport TokenC from ..structs cimport TokenC
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
cdef weight_t MIN_SCORE = -90000 cdef weight_t MIN_SCORE = -90000
@ -32,7 +35,7 @@ cdef class TransitionSystem:
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition)) self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
for action, label_strs in sorted(labels_by_action.items()): for action, label_strs in sorted(labels_by_action.items()):
for label_str in sorted(label_strs): for label_str in label_strs:
self.add_action(int(action), label_str) self.add_action(int(action), label_str)
self.root_label = self.strings['ROOT'] self.root_label = self.strings['ROOT']
self.freqs = {} if _freqs is None else _freqs self.freqs = {} if _freqs is None else _freqs


@ -1,18 +0,0 @@
from os import path
import json
class Config(object):
def __init__(self, **kwargs):
for key, value in kwargs.items():
setattr(self, key, value)
def get(self, attr, default=None):
return self.__dict__.get(attr, default)
@classmethod
def write(cls, model_dir, name, **kwargs):
open(path.join(model_dir, '%s.json' % name), 'w').write(json.dumps(kwargs))
@classmethod
def read(cls, model_dir, name):
return cls(**json.load(open(path.join(model_dir, '%s.json' % name))))


@ -1,5 +1,7 @@
import json # coding: utf8
import pathlib from __future__ import unicode_literals
import ujson
from collections import defaultdict from collections import defaultdict
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
@ -12,8 +14,8 @@ from thinc.linalg cimport VecVec
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
from .attrs cimport TAG from .attrs cimport TAG
from .gold cimport GoldParse from .gold cimport GoldParse
from .attrs cimport * from .attrs cimport *
from . import util
cpdef enum: cpdef enum:
@ -106,10 +108,13 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
cdef class Tagger: cdef class Tagger:
"""Annotate part-of-speech tags on Doc objects.""" """
Annotate part-of-speech tags on Doc objects.
"""
@classmethod @classmethod
def load(cls, path, vocab, require=False): def load(cls, path, vocab, require=False):
"""Load the statistical model from the supplied path. """
Load the statistical model from the supplied path.
Arguments: Arguments:
path (Path): path (Path):
@ -123,10 +128,10 @@ cdef class Tagger:
""" """
# TODO: Change this to expect config.json when we don't have to # TODO: Change this to expect config.json when we don't have to
# support old data. # support old data.
path = path if not isinstance(path, basestring) else pathlib.Path(path) path = util.ensure_path(path)
if (path / 'templates.json').exists(): if (path / 'templates.json').exists():
with (path / 'templates.json').open('r', encoding='utf8') as file_: with (path / 'templates.json').open('r', encoding='utf8') as file_:
templates = json.load(file_) templates = ujson.load(file_)
elif require: elif require:
raise IOError( raise IOError(
"Required file %s/templates.json not found when loading Tagger" % str(path)) "Required file %s/templates.json not found when loading Tagger" % str(path))
@ -142,7 +147,8 @@ cdef class Tagger:
return self return self
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg): def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
"""Create a Tagger. """
Create a Tagger.
Arguments: Arguments:
vocab (Vocab): vocab (Vocab):
@ -180,7 +186,8 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length tokens._py_tokens = [None] * tokens.length
def __call__(self, Doc tokens): def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object. """
Apply the tagger, setting the POS tags onto the Doc object.
Arguments: Arguments:
doc (Doc): The tokens to be tagged. doc (Doc): The tokens to be tagged.
@ -208,7 +215,8 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length tokens._py_tokens = [None] * tokens.length
def pipe(self, stream, batch_size=1000, n_threads=2): def pipe(self, stream, batch_size=1000, n_threads=2):
"""Tag a stream of documents. """
Tag a stream of documents.
Arguments: Arguments:
stream: The sequence of documents to tag. stream: The sequence of documents to tag.
@ -225,7 +233,8 @@ cdef class Tagger:
yield doc yield doc
def update(self, Doc tokens, GoldParse gold, itn=0): def update(self, Doc tokens, GoldParse gold, itn=0):
"""Update the statistical model, with tags supplied for the given document. """
Update the statistical model, with tags supplied for the given document.
Arguments: Arguments:
doc (Doc): doc (Doc):


@ -1,17 +1,11 @@
# cython: embedsignature=True # cython: embedsignature=True
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import pathlib import ujson
from cython.operator cimport dereference as deref from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc from cython.operator cimport preincrement as preinc
try:
import ujson as json
except ImportError:
import json
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
@ -23,12 +17,15 @@ from .tokens.doc cimport Doc
cdef class Tokenizer: cdef class Tokenizer:
"""Segment text, and create Doc objects with the discovered segment boundaries.""" """
Segment text, and create Doc objects with the discovered segment boundaries.
"""
@classmethod @classmethod
def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
infix_finditer=None, token_match=None): infix_finditer=None, token_match=None):
'''Load a Tokenizer, reading unsupplied components from the path. """
Load a Tokenizer, reading unsupplied components from the path.
Arguments: Arguments:
path (Path): path (Path):
The path to load from. The path to load from.
@ -45,13 +42,11 @@ cdef class Tokenizer:
infix_finditer: infix_finditer:
Signature of re.compile(string).finditer Signature of re.compile(string).finditer
Returns Tokenizer Returns Tokenizer
''' """
if isinstance(path, basestring): path = util.ensure_path(path)
path = pathlib.Path(path)
if rules is None: if rules is None:
with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_: with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
rules = json.load(file_) rules = ujson.load(file_)
if prefix_search in (None, True): if prefix_search in (None, True):
with (path / 'tokenizer' / 'prefix.txt').open() as file_: with (path / 'tokenizer' / 'prefix.txt').open() as file_:
entries = file_.read().split('\n') entries = file_.read().split('\n')
@ -67,8 +62,9 @@ cdef class Tokenizer:
return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match) return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)
def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None): def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
'''Create a Tokenizer, to create Doc objects given unicode text. """
Create a Tokenizer, to create Doc objects given unicode text.
Arguments: Arguments:
vocab (Vocab): vocab (Vocab):
A storage container for lexical types. A storage container for lexical types.
@ -85,7 +81,7 @@ cdef class Tokenizer:
to find infixes. to find infixes.
token_match: token_match:
A boolean function matching strings that become tokens. A boolean function matching strings that become tokens.
''' """
self.mem = Pool() self.mem = Pool()
self._cache = PreshMap() self._cache = PreshMap()
self._specials = PreshMap() self._specials = PreshMap()
@ -107,7 +103,7 @@ cdef class Tokenizer:
self.token_match) self.token_match)
return (self.__class__, args, None, None) return (self.__class__, args, None, None)
cpdef Doc tokens_from_list(self, list strings): cpdef Doc tokens_from_list(self, list strings):
return Doc(self.vocab, words=strings) return Doc(self.vocab, words=strings)
#raise NotImplementedError( #raise NotImplementedError(
@ -117,7 +113,8 @@ cdef class Tokenizer:
@cython.boundscheck(False) @cython.boundscheck(False)
def __call__(self, unicode string): def __call__(self, unicode string):
"""Tokenize a string. """
Tokenize a string.
Arguments: Arguments:
string (unicode): The string to tokenize. string (unicode): The string to tokenize.
@ -170,7 +167,8 @@ cdef class Tokenizer:
return tokens return tokens
def pipe(self, texts, batch_size=1000, n_threads=2): def pipe(self, texts, batch_size=1000, n_threads=2):
"""Tokenize a stream of texts. """
Tokenize a stream of texts.
Arguments: Arguments:
texts: A sequence of unicode texts. texts: A sequence of unicode texts.
@ -270,7 +268,7 @@ cdef class Tokenizer:
cache_hit = self._try_cache(hash_string(string), tokens) cache_hit = self._try_cache(hash_string(string), tokens)
if cache_hit: if cache_hit:
pass pass
elif self.token_match and self.token_match(string): elif self.token_match and self.token_match(string):
# We're always saying 'no' to spaces here -- the caller will # We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original. # fix up the outermost one, with reference to the original.
# See Issue #859 # See Issue #859
@ -324,7 +322,8 @@ cdef class Tokenizer:
self._cache.set(key, cached) self._cache.set(key, cached)
def find_infix(self, unicode string): def find_infix(self, unicode string):
"""Find internal split points of the string, such as hyphens. """
Find internal split points of the string, such as hyphens.
string (unicode): The string to segment. string (unicode): The string to segment.
@ -337,7 +336,8 @@ cdef class Tokenizer:
return list(self.infix_finditer(string)) return list(self.infix_finditer(string))
def find_prefix(self, unicode string): def find_prefix(self, unicode string):
"""Find the length of a prefix that should be segmented from the string, """
Find the length of a prefix that should be segmented from the string,
or None if no prefix rules match. or None if no prefix rules match.
Arguments: Arguments:
@ -350,7 +350,8 @@ cdef class Tokenizer:
return (match.end() - match.start()) if match is not None else 0 return (match.end() - match.start()) if match is not None else 0
def find_suffix(self, unicode string): def find_suffix(self, unicode string):
"""Find the length of a suffix that should be segmented from the string, """
Find the length of a suffix that should be segmented from the string,
or None if no suffix rules match. or None if no suffix rules match.
Arguments: Arguments:
@ -363,13 +364,15 @@ cdef class Tokenizer:
return (match.end() - match.start()) if match is not None else 0 return (match.end() - match.start()) if match is not None else 0
def _load_special_tokenization(self, special_cases): def _load_special_tokenization(self, special_cases):
'''Add special-case tokenization rules. """
''' Add special-case tokenization rules.
"""
for chunk, substrings in sorted(special_cases.items()): for chunk, substrings in sorted(special_cases.items()):
self.add_special_case(chunk, substrings) self.add_special_case(chunk, substrings)
def add_special_case(self, unicode string, substrings): def add_special_case(self, unicode string, substrings):
'''Add a special-case tokenization rule. """
Add a special-case tokenization rule.
Arguments: Arguments:
string (unicode): The string to specially tokenize. string (unicode): The string to specially tokenize.
@ -378,7 +381,7 @@ cdef class Tokenizer:
attributes. The ORTH fields of the attributes must exactly match attributes. The ORTH fields of the attributes must exactly match
the string when they are concatenated. the string when they are concatenated.
Returns None Returns None
''' """
substrings = list(substrings) substrings = list(substrings)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = len(substrings) cached.length = len(substrings)


@ -1,15 +1,18 @@
# coding: utf8
from __future__ import unicode_literals
cimport cython cimport cython
cimport numpy as np
import numpy
import numpy.linalg
import struct
from libc.string cimport memcpy, memset from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
from libc.math cimport sqrt from libc.math cimport sqrt
import numpy from .span cimport Span
import numpy.linalg from .token cimport Token
import struct
cimport numpy as np
import six
import warnings
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from ..lexeme cimport EMPTY_LEXEME from ..lexeme cimport EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t from ..typedefs cimport attr_t, flags_t
@ -19,11 +22,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
from ..parts_of_speech cimport univ_pos_t from ..parts_of_speech cimport univ_pos_t
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from .span cimport Span
from .token cimport Token
from ..serialize.bits cimport BitArray from ..serialize.bits cimport BitArray
from ..util import normalize_slice from ..util import normalize_slice
from ..syntax.iterators import CHUNKERS from ..syntax.iterators import CHUNKERS
from ..compat import is_config
DEF PADDING = 5 DEF PADDING = 5
@ -76,7 +78,7 @@ cdef class Doc:
""" """
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
''' """
Create a Doc object. Create a Doc object.
Aside: Implementation Aside: Implementation
@ -97,7 +99,7 @@ cdef class Doc:
A list of boolean values, of the same length as words. True A list of boolean values, of the same length as words. True
means that the word is followed by a space, False means it is not. means that the word is followed by a space, False means it is not.
If None, defaults to [True]*len(words) If None, defaults to [True]*len(words)
''' """
self.vocab = vocab self.vocab = vocab
size = 20 size = 20
self.mem = Pool() self.mem = Pool()
@ -158,7 +160,7 @@ cdef class Doc:
self.is_parsed = True self.is_parsed = True
def __getitem__(self, object i): def __getitem__(self, object i):
''' """
doc[i] doc[i]
Get the Token object at position i, where i is an integer. Get the Token object at position i, where i is an integer.
Negative indexing is supported, and follows the usual Python Negative indexing is supported, and follows the usual Python
@ -172,7 +174,7 @@ cdef class Doc:
are not supported, as `Span` objects must be contiguous (cannot have gaps). are not supported, as `Span` objects must be contiguous (cannot have gaps).
You can use negative indices and open-ended ranges, which have their You can use negative indices and open-ended ranges, which have their
normal Python semantics. normal Python semantics.
''' """
if isinstance(i, slice): if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step) start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
return Span(self, start, stop, label=0) return Span(self, start, stop, label=0)
@ -186,7 +188,7 @@ cdef class Doc:
return Token.cinit(self.vocab, &self.c[i], i, self) return Token.cinit(self.vocab, &self.c[i], i, self)
def __iter__(self): def __iter__(self):
''' """
for token in doc for token in doc
Iterate over `Token` objects, from which the annotations can Iterate over `Token` objects, from which the annotations can
be easily accessed. This is the main way of accessing Token be easily accessed. This is the main way of accessing Token
@ -194,7 +196,7 @@ cdef class Doc:
Python. If faster-than-Python speeds are required, you can Python. If faster-than-Python speeds are required, you can
instead access the annotations as a numpy array, or access the instead access the annotations as a numpy array, or access the
underlying C data directly from Cython. underlying C data directly from Cython.
''' """
cdef int i cdef int i
for i in range(self.length): for i in range(self.length):
if self._py_tokens[i] is not None: if self._py_tokens[i] is not None:
@ -203,10 +205,10 @@ cdef class Doc:
yield Token.cinit(self.vocab, &self.c[i], i, self) yield Token.cinit(self.vocab, &self.c[i], i, self)
def __len__(self): def __len__(self):
''' """
len(doc) len(doc)
The number of tokens in the document. The number of tokens in the document.
''' """
return self.length return self.length
def __unicode__(self): def __unicode__(self):
@ -216,7 +218,7 @@ cdef class Doc:
return u''.join([t.text_with_ws for t in self]).encode('utf-8') return u''.join([t.text_with_ws for t in self]).encode('utf-8')
def __str__(self): def __str__(self):
if six.PY3: if is_config(python3=True):
return self.__unicode__() return self.__unicode__()
return self.__bytes__() return self.__bytes__()
@ -228,7 +230,8 @@ cdef class Doc:
return self return self
def similarity(self, other): def similarity(self, other):
'''Make a semantic similarity estimate. The default estimate is cosine """
Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors. similarity using an average of word vectors.
Arguments: Arguments:
@ -237,7 +240,7 @@ cdef class Doc:
Return: Return:
score (float): A scalar similarity score. Higher is more similar. score (float): A scalar similarity score. Higher is more similar.
''' """
if 'similarity' in self.user_hooks: if 'similarity' in self.user_hooks:
return self.user_hooks['similarity'](self, other) return self.user_hooks['similarity'](self, other)
if self.vector_norm == 0 or other.vector_norm == 0: if self.vector_norm == 0 or other.vector_norm == 0:
@ -245,9 +248,9 @@ cdef class Doc:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
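The fallback branch is plain cosine similarity over the averaged vectors: sim(a, b) = a.b / (||a|| * ||b||). A tiny standalone check of the same formula:

    import numpy

    def cosine(a, b):
        # Dot product over the product of the norms, as in the return above.
        a = numpy.asarray(a, dtype='float32')
        b = numpy.asarray(b, dtype='float32')
        return numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b))

    print(cosine([1.0, 2.0], [1.0, 2.0]))   # 1.0
    print(cosine([1.0, 0.0], [0.0, 1.0]))   # 0.0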
property has_vector: property has_vector:
''' """
A boolean value indicating whether a word vector is associated with the object. A boolean value indicating whether a word vector is associated with the object.
''' """
def __get__(self): def __get__(self):
if 'has_vector' in self.user_hooks: if 'has_vector' in self.user_hooks:
return self.user_hooks['has_vector'](self) return self.user_hooks['has_vector'](self)
@ -255,11 +258,11 @@ cdef class Doc:
return any(token.has_vector for token in self) return any(token.has_vector for token in self)
property vector: property vector:
''' """
A real-valued meaning representation. Defaults to an average of the token vectors. A real-valued meaning representation. Defaults to an average of the token vectors.
Type: numpy.ndarray[ndim=1, dtype='float32'] Type: numpy.ndarray[ndim=1, dtype='float32']
''' """
def __get__(self): def __get__(self):
if 'vector' in self.user_hooks: if 'vector' in self.user_hooks:
return self.user_hooks['vector'](self) return self.user_hooks['vector'](self)
@ -294,17 +297,21 @@ cdef class Doc:
return self.text return self.text
property text: property text:
'''A unicode representation of the document text.''' """
A unicode representation of the document text.
"""
def __get__(self): def __get__(self):
return u''.join(t.text_with_ws for t in self) return u''.join(t.text_with_ws for t in self)
property text_with_ws: property text_with_ws:
'''An alias of Doc.text, provided for duck-type compatibility with Span and Token.''' """
An alias of Doc.text, provided for duck-type compatibility with Span and Token.
"""
def __get__(self): def __get__(self):
return self.text return self.text
property ents: property ents:
''' """
Yields named-entity `Span` objects, if the entity recognizer Yields named-entity `Span` objects, if the entity recognizer
has been applied to the document. Iterate over the span to get has been applied to the document. Iterate over the span to get
individual Token objects, or access the label: individual Token objects, or access the label:
@ -318,7 +325,7 @@ cdef class Doc:
assert ents[0].label_ == 'PERSON' assert ents[0].label_ == 'PERSON'
assert ents[0].orth_ == 'Best' assert ents[0].orth_ == 'Best'
assert ents[0].text == 'Mr. Best' assert ents[0].text == 'Mr. Best'
''' """
def __get__(self): def __get__(self):
cdef int i cdef int i
cdef const TokenC* token cdef const TokenC* token
@ -382,13 +389,13 @@ cdef class Doc:
self.c[start].ent_iob = 3 self.c[start].ent_iob = 3
property noun_chunks: property noun_chunks:
''' """
Yields base noun-phrase #[code Span] objects, if the document Yields base noun-phrase #[code Span] objects, if the document
has been syntactically parsed. A base noun phrase, or has been syntactically parsed. A base noun phrase, or
'NP chunk', is a noun phrase that does not permit other NPs to 'NP chunk', is a noun phrase that does not permit other NPs to
be nested within it so no NP-level coordination, no prepositional be nested within it so no NP-level coordination, no prepositional
phrases, and no relative clauses. For example: phrases, and no relative clauses.
''' """
def __get__(self): def __get__(self):
if not self.is_parsed: if not self.is_parsed:
raise ValueError( raise ValueError(
@ -496,7 +503,8 @@ cdef class Doc:
return output return output
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
"""Produce a dict of {attribute (int): count (ints)} frequencies, keyed """
Produce a dict of {attribute (int): count (ints)} frequencies, keyed
by the values of the given attribute ID. by the values of the given attribute ID.
Example: Example:
@ -563,8 +571,9 @@ cdef class Doc:
self.c[i] = parsed[i] self.c[i] = parsed[i]
def from_array(self, attrs, array): def from_array(self, attrs, array):
'''Write to a `Doc` object, from an `(M, N)` array of attributes. """
''' Write to a `Doc` object, from an `(M, N)` array of attributes.
"""
cdef int i, col cdef int i, col
cdef attr_id_t attr_id cdef attr_id_t attr_id
cdef TokenC* tokens = self.c cdef TokenC* tokens = self.c
@ -603,19 +612,23 @@ cdef class Doc:
return self return self
def to_bytes(self): def to_bytes(self):
'''Serialize, producing a byte string.''' """
Serialize, producing a byte string.
"""
byte_string = self.vocab.serializer.pack(self) byte_string = self.vocab.serializer.pack(self)
cdef uint32_t length = len(byte_string) cdef uint32_t length = len(byte_string)
return struct.pack('I', length) + byte_string return struct.pack('I', length) + byte_string
def from_bytes(self, data): def from_bytes(self, data):
'''Deserialize, loading from bytes.''' """
Deserialize, loading from bytes.
"""
self.vocab.serializer.unpack_into(data[4:], self) self.vocab.serializer.unpack_into(data[4:], self)
return self return self
@staticmethod @staticmethod
def read_bytes(file_): def read_bytes(file_):
''' """
A static method, used to read serialized #[code Doc] objects from A static method, used to read serialized #[code Doc] objects from
a file. For example: a file. For example:
@ -630,7 +643,7 @@ cdef class Doc:
for byte_string in Doc.read_bytes(file_): for byte_string in Doc.read_bytes(file_):
docs.append(Doc(nlp.vocab).from_bytes(byte_string)) docs.append(Doc(nlp.vocab).from_bytes(byte_string))
assert len(docs) == 2 assert len(docs) == 2
''' """
keep_reading = True keep_reading = True
while keep_reading: while keep_reading:
try: try:
@ -644,7 +657,8 @@ cdef class Doc:
yield n_bytes_str + data yield n_bytes_str + data
def merge(self, int start_idx, int end_idx, *args, **attributes): def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at doc.text[start_idx : end_idx] """
Retokenize the document, such that the span at doc.text[start_idx : end_idx]
is merged into a single token. If start_idx and end_idx do not mark start is merged into a single token. If start_idx and end_idx do not mark start
and end token boundaries, the document remains unchanged. and end token boundaries, the document remains unchanged.
@ -658,7 +672,6 @@ cdef class Doc:
token (Token): token (Token):
The newly merged token, or None if the start and end indices did The newly merged token, or None if the start and end indices did
not fall at token boundaries. not fall at token boundaries.
""" """
cdef unicode tag, lemma, ent_type cdef unicode tag, lemma, ent_type
if len(args) == 3: if len(args) == 3:


@ -1,26 +1,31 @@
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from collections import defaultdict from collections import defaultdict
cimport numpy as np
import numpy import numpy
import numpy.linalg import numpy.linalg
cimport numpy as np
from libc.math cimport sqrt from libc.math cimport sqrt
import six
from .doc cimport token_by_start, token_by_end
from ..structs cimport TokenC, LexemeC from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t, hash_t from ..typedefs cimport flags_t, attr_t, hash_t
from ..attrs cimport attr_id_t from ..attrs cimport attr_id_t
from ..parts_of_speech cimport univ_pos_t from ..parts_of_speech cimport univ_pos_t
from ..util import normalize_slice from ..util import normalize_slice
from .doc cimport token_by_start, token_by_end
from ..attrs cimport IS_PUNCT, IS_SPACE from ..attrs cimport IS_PUNCT, IS_SPACE
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from ..compat import is_config
cdef class Span: cdef class Span:
"""A slice from a Doc object.""" """
A slice from a Doc object.
"""
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None, def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
vector_norm=None): vector_norm=None):
'''Create a Span object from the slice doc[start : end] """
Create a Span object from the slice doc[start : end]
Arguments: Arguments:
doc (Doc): The parent document. doc (Doc): The parent document.
@ -30,7 +35,7 @@ cdef class Span:
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
Returns: Returns:
Span The newly constructed object. Span The newly constructed object.
''' """
if not (0 <= start <= end <= len(doc)): if not (0 <= start <= end <= len(doc)):
raise IndexError raise IndexError
@ -68,7 +73,7 @@ cdef class Span:
return self.end - self.start return self.end - self.start
def __repr__(self): def __repr__(self):
if six.PY3: if is_config(python3=True):
return self.text return self.text
return self.text.encode('utf-8') return self.text.encode('utf-8')
@ -89,7 +94,8 @@ cdef class Span:
yield self.doc[i] yield self.doc[i]
def merge(self, *args, **attributes): def merge(self, *args, **attributes):
"""Retokenize the document, such that the span is merged into a single token. """
Retokenize the document, such that the span is merged into a single token.
Arguments: Arguments:
**attributes: **attributes:
@ -102,7 +108,8 @@ cdef class Span:
return self.doc.merge(self.start_char, self.end_char, *args, **attributes) return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
def similarity(self, other): def similarity(self, other):
'''Make a semantic similarity estimate. The default estimate is cosine """
Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors. similarity using an average of word vectors.
Arguments: Arguments:
@ -111,7 +118,7 @@ cdef class Span:
Return: Return:
score (float): A scalar similarity score. Higher is more similar. score (float): A scalar similarity score. Higher is more similar.
''' """
if 'similarity' in self.doc.user_span_hooks: if 'similarity' in self.doc.user_span_hooks:
self.doc.user_span_hooks['similarity'](self, other) self.doc.user_span_hooks['similarity'](self, other)
if self.vector_norm == 0.0 or other.vector_norm == 0.0: if self.vector_norm == 0.0 or other.vector_norm == 0.0:
@ -133,11 +140,12 @@ cdef class Span:
self.end = end + 1 self.end = end + 1
property sent: property sent:
'''The sentence span that this span is a part of. """
The sentence span that this span is a part of.
Returns: Returns:
Span The sentence this is part of. Span The sentence this is part of.
''' """
def __get__(self): def __get__(self):
if 'sent' in self.doc.user_span_hooks: if 'sent' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sent'](self) return self.doc.user_span_hooks['sent'](self)
@ -198,13 +206,13 @@ cdef class Span:
return u''.join([t.text_with_ws for t in self]) return u''.join([t.text_with_ws for t in self])
property noun_chunks: property noun_chunks:
''' """
Yields base noun-phrase #[code Span] objects, if the document Yields base noun-phrase #[code Span] objects, if the document
has been syntactically parsed. A base noun phrase, or has been syntactically parsed. A base noun phrase, or
'NP chunk', is a noun phrase that does not permit other NPs to 'NP chunk', is a noun phrase that does not permit other NPs to
be nested within it so no NP-level coordination, no prepositional be nested within it so no NP-level coordination, no prepositional
phrases, and no relative clauses. For example: phrases, and no relative clauses. For example:
''' """
def __get__(self): def __get__(self):
if not self.doc.is_parsed: if not self.doc.is_parsed:
raise ValueError( raise ValueError(
@ -223,17 +231,16 @@ cdef class Span:
yield span yield span
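
A minimal sketch of the noun_chunks property in use, assuming a parsed document from a loaded English pipeline (as the code above shows, a ValueError is raised if the document has not been parsed); the sentence and span boundaries are illustrative only:

    import spacy
    nlp = spacy.load('en')                        # assumes the English model is installed
    doc = nlp(u'The quick brown fox jumps over the lazy dog.')
    span = doc[0:5]
    for np in span.noun_chunks:                   # yields base noun-phrase Spans
        print(np.text, np.root.text)
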
property root: property root:
"""The token within the span that's highest in the parse tree. If there's a tie, the earlist is prefered. """
The token within the span that's highest in the parse tree. If there's a
tie, the earliest is preferred.
Returns: Returns:
Token: The root token. Token: The root token.
i.e. has the i.e. has the shortest path to the root of the sentence (or is the root
shortest path to the root of the sentence (or is the root itself). itself). If multiple words are equally high in the tree, the first word
is taken. For example:
If multiple words are equally high in the tree, the first word is taken.
For example:
>>> toks = nlp(u'I like New York in Autumn.') >>> toks = nlp(u'I like New York in Autumn.')
@ -303,7 +310,8 @@ cdef class Span:
return self.doc[root] return self.doc[root]
property lefts: property lefts:
"""Tokens that are to the left of the span, whose head is within the Span. """
Tokens that are to the left of the span, whose head is within the Span.
Yields: Token A left-child of a token of the span. Yields: Token A left-child of a token of the span.
""" """
@ -314,7 +322,8 @@ cdef class Span:
yield left yield left
property rights: property rights:
"""Tokens that are to the right of the Span, whose head is within the Span. """
Tokens that are to the right of the Span, whose head is within the Span.
Yields: Token A right-child of a token of the span. Yields: Token A right-child of a token of the span.
""" """
@ -325,7 +334,8 @@ cdef class Span:
yield right yield right
property subtree: property subtree:
"""Tokens that descend from tokens in the span, but fall outside it. """
Tokens that descend from tokens in the span, but fall outside it.
Yields: Token A descendant of a token within the span. Yields: Token A descendant of a token within the span.
""" """
@ -337,7 +347,9 @@ cdef class Span:
yield from word.subtree yield from word.subtree
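
The lefts, rights and subtree properties above can be combined to see how a span attaches to the rest of the parse; a hedged sketch, with the pipeline load and sentence assumed:

    import spacy
    nlp = spacy.load('en')                        # assumes the English model is installed
    doc = nlp(u'I like New York in Autumn.')
    ny = doc[2:4]                                 # the span "New York"
    print([t.text for t in ny.lefts])             # tokens left of the span whose head is inside it
    print([t.text for t in ny.rights])            # tokens right of the span whose head is inside it
    print([t.text for t in ny.subtree])           # descendants of the span's tokens
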
property ent_id: property ent_id:
'''An (integer) entity ID. Usually assigned by patterns in the Matcher.''' """
An (integer) entity ID. Usually assigned by patterns in the Matcher.
"""
def __get__(self): def __get__(self):
return self.root.ent_id return self.root.ent_id
@ -345,9 +357,11 @@ cdef class Span:
# TODO # TODO
raise NotImplementedError( raise NotImplementedError(
"Can't yet set ent_id from Span. Vote for this feature on the issue " "Can't yet set ent_id from Span. Vote for this feature on the issue "
"tracker: http://github.com/spacy-io/spaCy") "tracker: http://github.com/explosion/spaCy/issues")
property ent_id_: property ent_id_:
'''A (string) entity ID. Usually assigned by patterns in the Matcher.''' """
A (string) entity ID. Usually assigned by patterns in the Matcher.
"""
def __get__(self): def __get__(self):
return self.root.ent_id_ return self.root.ent_id_
@ -355,7 +369,7 @@ cdef class Span:
# TODO # TODO
raise NotImplementedError( raise NotImplementedError(
"Can't yet set ent_id_ from Span. Vote for this feature on the issue " "Can't yet set ent_id_ from Span. Vote for this feature on the issue "
"tracker: http://github.com/spacy-io/spaCy") "tracker: http://github.com/explosion/spaCy/issues")
property orth_: property orth_:
def __get__(self): def __get__(self):
@ -397,5 +411,5 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
raise RuntimeError( raise RuntimeError(
"Array bounds exceeded while searching for root word. This likely " "Array bounds exceeded while searching for root word. This likely "
"means the parse tree is in an invalid state. Please report this " "means the parse tree is in an invalid state. Please report this "
"issue here: http://github.com/honnibal/spaCy/") "issue here: http://github.com/explosion/spaCy/issues")
return n return n

View File

@ -1,5 +1,5 @@
# coding: utf8
# cython: infer_types=True # cython: infer_types=True
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from libc.string cimport memcpy from libc.string cimport memcpy
@ -8,20 +8,15 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free
from cython.view cimport array as cvarray from cython.view cimport array as cvarray
cimport numpy as np cimport numpy as np
np.import_array() np.import_array()
import numpy import numpy
import six
from ..typedefs cimport hash_t from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from .. import parts_of_speech from .. import parts_of_speech
from ..attrs cimport LEMMA from ..attrs cimport LEMMA
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP from ..attrs cimport POS, LEMMA, TAG, DEP
from ..parts_of_speech cimport CCONJ, PUNCT from ..parts_of_speech cimport CCONJ, PUNCT
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET from ..attrs cimport IS_BRACKET
from ..attrs cimport IS_QUOTE from ..attrs cimport IS_QUOTE
@ -29,12 +24,13 @@ from ..attrs cimport IS_LEFT_PUNCT
from ..attrs cimport IS_RIGHT_PUNCT from ..attrs cimport IS_RIGHT_PUNCT
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport IS_OOV from ..attrs cimport IS_OOV
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from ..compat import is_config
cdef class Token: cdef class Token:
"""An individual token --- i.e. a word, punctuation symbol, whitespace, etc. """
An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
""" """
def __cinit__(self, Vocab vocab, Doc doc, int offset): def __cinit__(self, Vocab vocab, Doc doc, int offset):
self.vocab = vocab self.vocab = vocab
@ -46,7 +42,9 @@ cdef class Token:
return hash((self.doc, self.i)) return hash((self.doc, self.i))
def __len__(self): def __len__(self):
'''Number of unicode characters in token.text''' """
Number of unicode characters in token.text.
"""
return self.c.lex.length return self.c.lex.length
def __unicode__(self): def __unicode__(self):
@ -56,7 +54,7 @@ cdef class Token:
return self.text.encode('utf8') return self.text.encode('utf8')
def __str__(self): def __str__(self):
if six.PY3: if is_config(python3=True):
return self.__unicode__() return self.__unicode__()
return self.__bytes__() return self.__bytes__()
@ -83,27 +81,30 @@ cdef class Token:
raise ValueError(op) raise ValueError(op)
cpdef bint check_flag(self, attr_id_t flag_id) except -1: cpdef bint check_flag(self, attr_id_t flag_id) except -1:
'''Check the value of a boolean flag. """
Check the value of a boolean flag.
Arguments: Arguments:
flag_id (int): The ID of the flag attribute. flag_id (int): The ID of the flag attribute.
Returns: Returns:
is_set (bool): Whether the flag is set. is_set (bool): Whether the flag is set.
''' """
return Lexeme.c_check_flag(self.c.lex, flag_id) return Lexeme.c_check_flag(self.c.lex, flag_id)
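
Flag IDs for check_flag are the boolean attribute constants from spacy.attrs; a short sketch, assuming a loaded English pipeline and this example sentence:

    import spacy
    from spacy.attrs import IS_PUNCT, IS_TITLE
    nlp = spacy.load('en')                        # assumes the English model is installed
    doc = nlp(u'Give it back! He pleaded.')
    print(doc[3].check_flag(IS_PUNCT))            # True -- "!"
    print(doc[4].check_flag(IS_TITLE))            # True -- "He"
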
def nbor(self, int i=1): def nbor(self, int i=1):
'''Get a neighboring token. """
Get a neighboring token.
Arguments: Arguments:
i (int): The relative position of the token to get. Defaults to 1. i (int): The relative position of the token to get. Defaults to 1.
Returns: Returns:
neighbor (Token): The token at position self.doc[self.i+i] neighbor (Token): The token at position self.doc[self.i+i]
''' """
return self.doc[self.i+i] return self.doc[self.i+i]
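
A quick sketch of nbor with the same assumed example document:

    import spacy
    nlp = spacy.load('en')                        # assumes the English model is installed
    doc = nlp(u'Give it back! He pleaded.')
    token = doc[3]                                # "!"
    print(token.nbor().text)                      # "He" -- defaults to i=1
    print(token.nbor(-1).text)                    # "back"
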
def similarity(self, other): def similarity(self, other):
'''Compute a semantic similarity estimate. Defaults to cosine over vectors. """
Compute a semantic similarity estimate. Defaults to cosine over vectors.
Arguments: Arguments:
other: other:
@ -111,7 +112,7 @@ cdef class Token:
Token and Lexeme objects. Token and Lexeme objects.
Returns: Returns:
score (float): A scalar similarity score. Higher is more similar. score (float): A scalar similarity score. Higher is more similar.
''' """
if 'similarity' in self.doc.user_token_hooks: if 'similarity' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['similarity'](self) return self.doc.user_token_hooks['similarity'](self)
if self.vector_norm == 0 or other.vector_norm == 0: if self.vector_norm == 0 or other.vector_norm == 0:
@ -209,9 +210,9 @@ cdef class Token:
self.c.dep = label self.c.dep = label
property has_vector: property has_vector:
''' """
A boolean value indicating whether a word vector is associated with the object. A boolean value indicating whether a word vector is associated with the object.
''' """
def __get__(self): def __get__(self):
if 'has_vector' in self.doc.user_token_hooks: if 'has_vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['has_vector'](self) return self.doc.user_token_hooks['has_vector'](self)
@ -223,11 +224,11 @@ cdef class Token:
return False return False
property vector: property vector:
''' """
A real-valued meaning representation. A real-valued meaning representation.
Type: numpy.ndarray[ndim=1, dtype='float32'] Type: numpy.ndarray[ndim=1, dtype='float32']
''' """
def __get__(self): def __get__(self):
if 'vector' in self.doc.user_token_hooks: if 'vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['vector'](self) return self.doc.user_token_hooks['vector'](self)
@ -245,6 +246,7 @@ cdef class Token:
property repvec: property repvec:
def __get__(self): def __get__(self):
raise AttributeError("repvec was renamed to vector in v0.100") raise AttributeError("repvec was renamed to vector in v0.100")
property has_repvec: property has_repvec:
def __get__(self): def __get__(self):
raise AttributeError("has_repvec was renamed to has_vector in v0.100") raise AttributeError("has_repvec was renamed to has_vector in v0.100")
@ -265,7 +267,8 @@ cdef class Token:
property lefts: property lefts:
def __get__(self): def __get__(self):
"""The leftward immediate children of the word, in the syntactic """
The leftward immediate children of the word, in the syntactic
dependency parse. dependency parse.
""" """
cdef int nr_iter = 0 cdef int nr_iter = 0
@ -282,8 +285,10 @@ cdef class Token:
property rights: property rights:
def __get__(self): def __get__(self):
"""The rightward immediate children of the word, in the syntactic """
dependency parse.""" The rightward immediate children of the word, in the syntactic
dependency parse.
"""
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
tokens = [] tokens = []
cdef int nr_iter = 0 cdef int nr_iter = 0
@ -300,19 +305,21 @@ cdef class Token:
yield t yield t
property children: property children:
'''A sequence of the token's immediate syntactic children. """
A sequence of the token's immediate syntactic children.
Yields: Token A child token such that child.head==self Yields: Token A child token such that child.head==self
''' """
def __get__(self): def __get__(self):
yield from self.lefts yield from self.lefts
yield from self.rights yield from self.rights
property subtree: property subtree:
'''A sequence of all the token's syntactic descendents. """
A sequence of all the token's syntactic descendants.
Yields: Token A descendant token such that self.is_ancestor(descendant) Yields: Token A descendant token such that self.is_ancestor(descendant)
''' """
def __get__(self): def __get__(self):
for word in self.lefts: for word in self.lefts:
yield from word.subtree yield from word.subtree
@ -321,26 +328,29 @@ cdef class Token:
yield from word.subtree yield from word.subtree
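
The children and subtree properties give the immediate and transitive structure below a token; a rough sketch with an assumed example sentence:

    import spacy
    nlp = spacy.load('en')                        # assumes the English model is installed
    doc = nlp(u'I like New York in Autumn.')
    like = doc[1]
    print([t.text for t in like.children])        # immediate left and right children
    print([t.text for t in like.subtree])         # "like" together with all its descendants
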
property left_edge: property left_edge:
'''The leftmost token of this token's syntactic descendents. """
The leftmost token of this token's syntactic descendants.
Returns: Token The first token such that self.is_ancestor(token) Returns: Token The first token such that self.is_ancestor(token)
''' """
def __get__(self): def __get__(self):
return self.doc[self.c.l_edge] return self.doc[self.c.l_edge]
property right_edge: property right_edge:
'''The rightmost token of this token's syntactic descendents. """
The rightmost token of this token's syntactic descendants.
Returns: Token The last token such that self.is_ancestor(token) Returns: Token The last token such that self.is_ancestor(token)
''' """
def __get__(self): def __get__(self):
return self.doc[self.c.r_edge] return self.doc[self.c.r_edge]
property ancestors: property ancestors:
'''A sequence of this token's syntactic ancestors. """
A sequence of this token's syntactic ancestors.
Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self) Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
''' """
def __get__(self): def __get__(self):
cdef const TokenC* head_ptr = self.c cdef const TokenC* head_ptr = self.c
# guard against infinite loop, no token can have # guard against infinite loop, no token can have
@ -356,25 +366,29 @@ cdef class Token:
return self.is_ancestor(descendant) return self.is_ancestor(descendant)
def is_ancestor(self, descendant): def is_ancestor(self, descendant):
'''Check whether this token is a parent, grandparent, etc. of another """
Check whether this token is a parent, grandparent, etc. of another
in the dependency tree. in the dependency tree.
Arguments: Arguments:
descendant (Token): Another token. descendant (Token): Another token.
Returns: Returns:
is_ancestor (bool): Whether this token is the ancestor of the descendant. is_ancestor (bool): Whether this token is the ancestor of the descendant.
''' """
if self.doc is not descendant.doc: if self.doc is not descendant.doc:
return False return False
return any( ancestor.i == self.i for ancestor in descendant.ancestors ) return any( ancestor.i == self.i for ancestor in descendant.ancestors )
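
is_ancestor and the ancestors property are complementary: one checks downwards, the other walks up the head chain. A hedged sketch, with pipeline and parse assumed:

    import spacy
    nlp = spacy.load('en')                        # assumes the English model is installed
    doc = nlp(u'I like New York in Autumn.')
    like, york = doc[1], doc[3]
    print(like.is_ancestor(york))                 # True if "like" dominates "York" in the parse
    print([t.text for t in york.ancestors])       # head chain from "York" up to the sentence root
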
property head: property head:
'''The syntactic parent, or "governor", of this token. """
The syntactic parent, or "governor", of this token.
Returns: Token Returns: Token
''' """
def __get__(self): def __get__(self):
"""The token predicted by the parser to be the head of the current token.""" """
The token predicted by the parser to be the head of the current token.
"""
return self.doc[self.i + self.c.head] return self.doc[self.i + self.c.head]
def __set__(self, Token new_head): def __set__(self, Token new_head):
# this function sets the head of self to new_head # this function sets the head of self to new_head
@ -467,10 +481,11 @@ cdef class Token:
self.c.head = rel_newhead_i self.c.head = rel_newhead_i
property conjuncts: property conjuncts:
'''A sequence of coordinated tokens, including the token itself. """
A sequence of coordinated tokens, including the token itself.
Yields: Token A coordinated token Yields: Token A coordinated token
''' """
def __get__(self): def __get__(self):
"""Get a list of conjoined words.""" """Get a list of conjoined words."""
cdef Token word cdef Token word
@ -501,7 +516,9 @@ cdef class Token:
return iob_strings[self.c.ent_iob] return iob_strings[self.c.ent_iob]
property ent_id: property ent_id:
'''An (integer) entity ID. Usually assigned by patterns in the Matcher.''' """
An (integer) entity ID. Usually assigned by patterns in the Matcher.
"""
def __get__(self): def __get__(self):
return self.c.ent_id return self.c.ent_id
@ -509,7 +526,9 @@ cdef class Token:
self.c.ent_id = key self.c.ent_id = key
property ent_id_: property ent_id_:
'''A (string) entity ID. Usually assigned by patterns in the Matcher.''' """
A (string) entity ID. Usually assigned by patterns in the Matcher.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.ent_id] return self.vocab.strings[self.c.ent_id]

View File

@ -1,15 +1,16 @@
from __future__ import absolute_import # coding: utf8
from __future__ import unicode_literals from __future__ import absolute_import, unicode_literals
import random import random
import tqdm import tqdm
from .gold import GoldParse from .gold import GoldParse, merge_sents
from .scorer import Scorer from .scorer import Scorer
from .gold import merge_sents
class Trainer(object): class Trainer(object):
'''Manage training of an NLP pipeline.''' """
Manage training of an NLP pipeline.
"""
def __init__(self, nlp, gold_tuples): def __init__(self, nlp, gold_tuples):
self.nlp = nlp self.nlp = nlp
self.gold_tuples = gold_tuples self.gold_tuples = gold_tuples

View File

@ -1,29 +1,18 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
import os
import io import io
import json import ujson
import re import re
import os.path from pathlib import Path
import pathlib
import sys import sys
import textwrap import textwrap
from .compat import basestring_, unicode_, input_
try:
basestring
except NameError:
basestring = str
try:
raw_input
except NameError: # Python 3
raw_input = input
LANGUAGES = {} LANGUAGES = {}
_data_path = pathlib.Path(__file__).parent / 'data' _data_path = Path(__file__).parent / 'data'
def set_lang_class(name, cls): def set_lang_class(name, cls):
@ -47,9 +36,14 @@ def get_data_path(require_exists=True):
def set_data_path(path): def set_data_path(path):
global _data_path global _data_path
if isinstance(path, basestring): _data_path = ensure_path(path)
path = pathlib.Path(path)
_data_path = path
def ensure_path(path):
if isinstance(path, basestring_):
return Path(path)
else:
return path
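
The new ensure_path helper centralises the string-vs-Path handling that used to be inlined elsewhere in this module; a minimal sketch of the intended call pattern (the example path is arbitrary):

    from pathlib import Path
    from spacy.util import ensure_path            # added in this commit
    print(ensure_path('/tmp/model'))              # plain strings become Path objects
    print(ensure_path(Path('/tmp/model')))        # Path objects pass through unchanged
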
def or_(val1, val2): def or_(val1, val2):
@ -61,41 +55,8 @@ def or_(val1, val2):
return val2 return val2
def match_best_version(target_name, target_version, path):
path = path if not isinstance(path, basestring) else pathlib.Path(path)
if path is None or not path.exists():
return None
matches = []
for data_name in path.iterdir():
name, version = split_data_name(data_name.parts[-1])
if name == target_name and constraint_match(target_version, version):
matches.append((tuple(float(v) for v in version.split('.')), data_name))
if matches:
return pathlib.Path(max(matches)[1])
else:
return None
def split_data_name(name):
return name.split('-', 1) if '-' in name else (name, '')
def constraint_match(constraint_string, version):
# From http://github.com/spacy-io/sputnik
if not constraint_string:
return True
constraints = [c.strip() for c in constraint_string.split(',') if c.strip()]
for c in constraints:
if not re.match(r'[><=][=]?\d+(\.\d+)*', c):
raise ValueError('invalid constraint: %s' % c)
return all(semver.match(version, c) for c in constraints)
def read_regex(path): def read_regex(path):
path = path if not isinstance(path, basestring) else pathlib.Path(path) path = ensure_path(path)
with path.open() as file_: with path.open() as file_:
entries = file_.read().split('\n') entries = file_.read().split('\n')
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
@ -152,21 +113,11 @@ def check_renamed_kwargs(renamed, kwargs):
raise TypeError("Keyword argument %s now renamed to %s" % (old, new)) raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
def is_windows():
"""Check if user is on Windows."""
return sys.platform.startswith('win')
def is_python2():
"""Check if Python 2 is used."""
return sys.version.startswith('2.')
def parse_package_meta(package_path, package, require=True): def parse_package_meta(package_path, package, require=True):
location = os.path.join(str(package_path), package, 'meta.json') location = package_path / package / 'meta.json'
if os.path.isfile(location): if location.is_file():
with io.open(location, encoding='utf8') as f: with location.open('r', encoding='utf8') as f:
meta = json.load(f) meta = ujson.load(f)
return meta return meta
elif require: elif require:
raise IOError("Could not read meta.json from %s" % location) raise IOError("Could not read meta.json from %s" % location)
@ -181,7 +132,7 @@ def get_raw_input(description, default=False):
additional = ' (default: {d})'.format(d=default) if default else '' additional = ' (default: {d})'.format(d=default) if default else ''
prompt = ' {d}{a}: '.format(d=description, a=additional) prompt = ' {d}{a}: '.format(d=description, a=additional)
user_input = raw_input(prompt) user_input = input_(prompt)
return user_input return user_input
@ -209,10 +160,9 @@ def print_markdown(data, **kwargs):
which will be converted to a list of tuples.""" which will be converted to a list of tuples."""
def excl_value(value): def excl_value(value):
# don't print value if it contains absolute path of directory # don't print value if it contains absolute path of directory (i.e.
# (i.e. personal info that shouldn't need to be shared) # personal info). Other conditions can be included here if necessary.
# other conditions can be included here if necessary if unicode_(Path(__file__).parent) in value:
if str(pathlib.Path(__file__).parent) in value:
return True return True
if type(data) == dict: if type(data) == dict:

View File

@ -1,41 +1,29 @@
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import bz2
import ujson
import re
from libc.string cimport memset from libc.string cimport memset
from libc.stdint cimport int32_t from libc.stdint cimport int32_t
from libc.math cimport sqrt from libc.math cimport sqrt
from cymem.cymem cimport Address
from pathlib import Path
import bz2
import ujson as json
import re
try:
import cPickle as pickle
except ImportError:
import pickle
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .strings cimport hash_string from .strings cimport hash_string
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .cfile cimport CFile, StringCFile from .cfile cimport CFile, StringCFile
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .tokens.token cimport Token from .tokens.token cimport Token
from . import attrs
from . import symbols
from cymem.cymem cimport Address
from .serialize.packer cimport Packer from .serialize.packer cimport Packer
from .attrs cimport PROB, LANG from .attrs cimport PROB, LANG
from .compat import copy_reg, pickle
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from . import util from . import util
from . import attrs
from . import symbols
try:
import copy_reg
except ImportError:
import copyreg as copy_reg
DEF MAX_VEC_SIZE = 100000 DEF MAX_VEC_SIZE = 100000
@ -48,8 +36,9 @@ EMPTY_LEXEME.vector = EMPTY_VEC
cdef class Vocab: cdef class Vocab:
'''A map container for a language's LexemeC structs. """
''' A map container for a language's LexemeC structs.
"""
@classmethod @classmethod
def load(cls, path, lex_attr_getters=None, lemmatizer=True, def load(cls, path, lex_attr_getters=None, lemmatizer=True,
tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs): tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
@ -72,8 +61,7 @@ cdef class Vocab:
Returns: Returns:
Vocab: The newly constructed vocab object. Vocab: The newly constructed vocab object.
""" """
if isinstance(path, basestring): path = util.ensure_path(path)
path = Path(path)
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs) util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
if 'vectors' in deprecated_kwargs: if 'vectors' in deprecated_kwargs:
raise AttributeError( raise AttributeError(
@ -81,7 +69,7 @@ cdef class Vocab:
"Install vectors after loading.") "Install vectors after loading.")
if tag_map is True and (path / 'vocab' / 'tag_map.json').exists(): if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_: with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
tag_map = json.load(file_) tag_map = ujson.load(file_)
elif tag_map is True: elif tag_map is True:
tag_map = None tag_map = None
if lex_attr_getters is not None \ if lex_attr_getters is not None \
@ -94,12 +82,12 @@ cdef class Vocab:
lemmatizer = Lemmatizer.load(path) lemmatizer = Lemmatizer.load(path)
if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists(): if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_: with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_:
serializer_freqs = json.load(file_) serializer_freqs = ujson.load(file_)
else: else:
serializer_freqs = None serializer_freqs = None
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_: with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
strings_list = json.load(file_) strings_list = ujson.load(file_)
cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map, cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
lemmatizer=lemmatizer, serializer_freqs=serializer_freqs, lemmatizer=lemmatizer, serializer_freqs=serializer_freqs,
strings=strings_list) strings=strings_list)
@ -108,7 +96,8 @@ cdef class Vocab:
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
serializer_freqs=None, strings=tuple(), **deprecated_kwargs): serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
'''Create the vocabulary. """
Create the vocabulary.
lex_attr_getters (dict): lex_attr_getters (dict):
A dictionary mapping attribute IDs to functions to compute them. A dictionary mapping attribute IDs to functions to compute them.
@ -123,7 +112,7 @@ cdef class Vocab:
Returns: Returns:
Vocab: The newly constructed vocab object. Vocab: The newly constructed vocab object.
''' """
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs) util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
@ -172,17 +161,19 @@ cdef class Vocab:
return langfunc('_') if langfunc else '' return langfunc('_') if langfunc else ''
def __len__(self): def __len__(self):
"""The current number of lexemes stored.""" """
The current number of lexemes stored.
"""
return self.length return self.length
def resize_vectors(self, int new_size): def resize_vectors(self, int new_size):
''' """
Set vectors_length to a new size, and allocate more memory for the Lexeme Set vectors_length to a new size, and allocate more memory for the Lexeme
vectors if necessary. The memory will be zeroed. vectors if necessary. The memory will be zeroed.
Arguments: Arguments:
new_size (int): The new size of the vectors. new_size (int): The new size of the vectors.
''' """
cdef hash_t key cdef hash_t key
cdef size_t addr cdef size_t addr
if new_size > self.vectors_length: if new_size > self.vectors_length:
@ -193,7 +184,8 @@ cdef class Vocab:
self.vectors_length = new_size self.vectors_length = new_size
def add_flag(self, flag_getter, int flag_id=-1): def add_flag(self, flag_getter, int flag_id=-1):
'''Set a new boolean flag to words in the vocabulary. """
Set a new boolean flag to words in the vocabulary.
The flag_setter function will be called over the words currently in the The flag_setter function will be called over the words currently in the
vocab, and then applied to new words as they occur. You'll then be able vocab, and then applied to new words as they occur. You'll then be able
@ -213,7 +205,7 @@ cdef class Vocab:
Returns: Returns:
flag_id (int): The integer ID by which the flag value can be checked. flag_id (int): The integer ID by which the flag value can be checked.
''' """
if flag_id == -1: if flag_id == -1:
for bit in range(1, 64): for bit in range(1, 64):
if bit not in self.lex_attr_getters: if bit not in self.lex_attr_getters:
@ -234,9 +226,11 @@ cdef class Vocab:
return flag_id return flag_id
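
A sketch of add_flag with a custom getter, assuming a loaded pipeline; the flag, term list and sentence here are purely illustrative:

    import spacy
    nlp = spacy.load('en')                        # assumes the English model is installed
    MY_TERMS = set([u'spacy', u'cython'])         # hypothetical terminology list
    def is_my_term(text):
        return text.lower() in MY_TERMS
    MY_FLAG = nlp.vocab.add_flag(is_my_term)      # returns the bit assigned to the flag
    doc = nlp(u'spaCy is written in Cython.')
    print(doc[0].check_flag(MY_FLAG))             # True
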
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme """
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.''' is the lexicon's own memory, the lexeme is saved in the lexicon.
"""
if string == u'': if string == u'':
return &EMPTY_LEXEME return &EMPTY_LEXEME
cdef LexemeC* lex cdef LexemeC* lex
@ -252,9 +246,11 @@ cdef class Vocab:
return self._new_lexeme(mem, string) return self._new_lexeme(mem, string)
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme """
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.''' is the lexicon's own memory, the lexeme is saved in the lexicon.
"""
if orth == 0: if orth == 0:
return &EMPTY_LEXEME return &EMPTY_LEXEME
cdef LexemeC* lex cdef LexemeC* lex
@ -297,30 +293,33 @@ cdef class Vocab:
self.length += 1 self.length += 1
def __contains__(self, unicode string): def __contains__(self, unicode string):
'''Check whether the string has an entry in the vocabulary. """
Check whether the string has an entry in the vocabulary.
Arguments: Arguments:
string (unicode): The ID string. string (unicode): The ID string.
Returns: Returns:
bool Whether the string has an entry in the vocabulary. bool Whether the string has an entry in the vocabulary.
''' """
key = hash_string(string) key = hash_string(string)
lex = self._by_hash.get(key) lex = self._by_hash.get(key)
return lex is not NULL return lex is not NULL
def __iter__(self): def __iter__(self):
'''Iterate over the lexemes in the vocabulary. """
Iterate over the lexemes in the vocabulary.
Yields: Lexeme An entry in the vocabulary. Yields: Lexeme An entry in the vocabulary.
''' """
cdef attr_t orth cdef attr_t orth
cdef size_t addr cdef size_t addr
for orth, addr in self._by_orth.items(): for orth, addr in self._by_orth.items():
yield Lexeme(self, orth) yield Lexeme(self, orth)
def __getitem__(self, id_or_string): def __getitem__(self, id_or_string):
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously """
Retrieve a lexeme, given an int ID or a unicode string. If a previously
unseen unicode string is given, a new lexeme is created and stored. unseen unicode string is given, a new lexeme is created and stored.
Arguments: Arguments:
@ -332,7 +331,7 @@ cdef class Vocab:
Returns: Returns:
lexeme (Lexeme): The lexeme indicated by the given ID. lexeme (Lexeme): The lexeme indicated by the given ID.
''' """
cdef attr_t orth cdef attr_t orth
if type(id_or_string) == unicode: if type(id_or_string) == unicode:
orth = self.strings[id_or_string] orth = self.strings[id_or_string]
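
The __contains__, __iter__ and __getitem__ methods above make the vocab behave like a mapping from strings or integer IDs to Lexeme objects; a short sketch, with the pipeline load assumed:

    import spacy
    nlp = spacy.load('en')                        # assumes the English model is installed
    print(u'apple' in nlp.vocab)                  # membership test by string
    lexeme = nlp.vocab[u'apple']                  # look up (or create) a Lexeme by string
    same = nlp.vocab[lexeme.orth]                 # ...or by its integer ID
    print(lexeme.orth_, lexeme.is_alpha, len(nlp.vocab))
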
@ -355,7 +354,8 @@ cdef class Vocab:
return tokens return tokens
def dump(self, loc=None): def dump(self, loc=None):
"""Save the lexemes binary data to the given location, or """
Save the lexemes binary data to the given location, or
return a byte-string with the data if loc is None. return a byte-string with the data if loc is None.
Arguments: Arguments:
@ -392,14 +392,15 @@ cdef class Vocab:
return fp.string_data() return fp.string_data()
def load_lexemes(self, loc): def load_lexemes(self, loc):
'''Load the binary vocabulary data from the given location. """
Load the binary vocabulary data from the given location.
Arguments: Arguments:
loc (Path): The path to load from. loc (Path): The path to load from.
Returns: Returns:
None None
''' """
fp = CFile(loc, 'rb', fp = CFile(loc, 'rb',
on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc)) on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
cdef LexemeC* lexeme = NULL cdef LexemeC* lexeme = NULL
@ -440,8 +441,9 @@ cdef class Vocab:
fp.close() fp.close()
def _deserialize_lexemes(self, CFile fp): def _deserialize_lexemes(self, CFile fp):
'''Load the binary vocabulary data from the given CFile. """
''' Load the binary vocabulary data from the given CFile.
"""
cdef LexemeC* lexeme = NULL cdef LexemeC* lexeme = NULL
cdef hash_t key cdef hash_t key
cdef unicode py_str cdef unicode py_str
@ -494,13 +496,14 @@ cdef class Vocab:
fp.close() fp.close()
def dump_vectors(self, out_loc): def dump_vectors(self, out_loc):
'''Save the word vectors to a binary file. """
Save the word vectors to a binary file.
Arguments: Arguments:
loc (Path): The path to save to. loc (Path): The path to save to.
Returns: Returns:
None None
''' """
cdef int32_t vec_len = self.vectors_length cdef int32_t vec_len = self.vectors_length
cdef int32_t word_len cdef int32_t word_len
cdef bytes word_str cdef bytes word_str
@ -522,7 +525,8 @@ cdef class Vocab:
out_file.close() out_file.close()
def load_vectors(self, file_): def load_vectors(self, file_):
"""Load vectors from a text-based file. """
Load vectors from a text-based file.
Arguments: Arguments:
file_ (buffer): The file to read from. Entries should be separated by newlines, file_ (buffer): The file to read from. Entries should be separated by newlines,
@ -561,7 +565,8 @@ cdef class Vocab:
return vec_len return vec_len
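
A hedged sketch of load_vectors with a small hand-written vectors file; the file name is hypothetical and the per-line layout is assumed here to be the word followed by its whitespace-separated values:

    import io
    import spacy
    nlp = spacy.load('en')                        # assumes the English model is installed
    # Hypothetical vectors file, e.g.:
    #   apple 0.1 0.2 0.3
    #   orange 0.2 0.1 0.4
    with io.open('my_vectors.txt', 'r', encoding='utf8') as file_:
        vec_len = nlp.vocab.load_vectors(file_)
    print(vec_len)                                # the inferred vector width, here 3
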
def load_vectors_from_bin_loc(self, loc): def load_vectors_from_bin_loc(self, loc):
"""Load vectors from the location of a binary file. """
Load vectors from the location of a binary file.
Arguments: Arguments:
loc (unicode): The path of the binary file to load from. loc (unicode): The path of the binary file to load from.