Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 17:36:30 +03:00

Commit d7229967b0: Merge branch 'master' of https://github.com/explosion/spaCy
@@ -1,3 +1,4 @@
#!/usr/bin/env python
'''Example of training a named entity recognition system from scratch using spaCy

This example is written to be self-contained and reasonably transparent.

@@ -81,7 +82,7 @@ def load_vocab(path):
def init_ner_model(vocab, features=None):
    if features is None:
        features = tuple(EntityRecognizer.feature_templates)
    return BeamEntityRecognizer(vocab, features=features)
    return EntityRecognizer(vocab, features=features)


def save_ner_model(model, path):

@@ -99,7 +100,7 @@ def save_ner_model(model, path):


def load_ner_model(vocab, path):
    return BeamEntityRecognizer.load(path, vocab)
    return EntityRecognizer.load(path, vocab)


class Pipeline(object):

@@ -110,18 +111,21 @@ class Pipeline(object):
            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
        if not path.is_dir():
            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
        vocab = load_vocab(path / 'vocab')
        vocab = load_vocab(path)
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        ner_model = load_ner_model(vocab, path / 'ner')
        return cls(vocab, tokenizer, ner_model)

    def __init__(self, vocab=None, tokenizer=None, ner_model=None):
    def __init__(self, vocab=None, tokenizer=None, entity=None):
        if vocab is None:
            self.vocab = init_vocab()
            vocab = init_vocab()
        if tokenizer is None:
            tokenizer = Tokenizer(vocab, {}, None, None, None)
        if ner_model is None:
            self.entity = init_ner_model(self.vocab)
        if entity is None:
            entity = init_ner_model(self.vocab)
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.entity = entity
        self.pipeline = [self.entity]

    def __call__(self, input_):

@@ -173,7 +177,7 @@ class Pipeline(object):
        save_ner_model(self.entity, path / 'ner')


def train(nlp, train_examples, dev_examples, nr_epoch=5):
def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
    next_epoch = train_examples
    print("Iter", "Loss", "P", "R", "F")
    for i in range(nr_epoch):

@@ -186,14 +190,17 @@ def train(nlp, train_examples, dev_examples, nr_epoch=5):
                next_epoch.append((input_, annot))
        random.shuffle(next_epoch)
        scores = nlp.evaluate(dev_examples)
        precision = '%.2f' % scores['ents_p']
        recall = '%.2f' % scores['ents_r']
        f_measure = '%.2f' % scores['ents_f']
        print(i, int(loss), precision, recall, f_measure)
        report_scores(i, loss, scores)
    nlp.average_weights()
    scores = nlp.evaluate(dev_examples)
    print("After averaging")
    print(scores['ents_p'], scores['ents_r'], scores['ents_f'])
    report_scores(channels, i+1, loss, scores)


def report_scores(i, loss, scores):
    precision = '%.2f' % scores['ents_p']
    recall = '%.2f' % scores['ents_r']
    f_measure = '%.2f' % scores['ents_f']
    print('%d %s %s %s' % (int(loss), precision, recall, f_measure))


def read_examples(path):

@@ -221,15 +228,17 @@ def read_examples(path):
    train_loc=("Path to your training data", "positional", None, Path),
    dev_loc=("Path to your development data", "positional", None, Path),
)
def main(model_dir, train_loc, dev_loc, nr_epoch=10):
def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
         train_loc=None, dev_loc=None, nr_epoch=30):

    train_examples = read_examples(train_loc)
    dev_examples = read_examples(dev_loc)
    nlp = Pipeline()
    nlp = Pipeline.load(model_dir)

    train(nlp, train_examples, list(dev_examples), nr_epoch)
    train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)

    nlp.save(model_dir)


if __name__ == '__main__':
    plac.call(main)
    main()
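For orientation, a hypothetical end-to-end use of the standalone Pipeline class shown above; the file paths are invented, and the train() call uses the signature without the ctx argument:

nlp = Pipeline.load('/path/to/model')
train_examples = list(read_examples('train.json'))
dev_examples = list(read_examples('dev.json'))
# Run the training loop and then persist the vocab, tokenizer and NER model.
train(nlp, train_examples, dev_examples, nr_epoch=10)
nlp.save('/path/to/model')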
examples/training/train_new_entity_type.py (new file, 74 lines)
@@ -0,0 +1,74 @@
from __future__ import unicode_literals, print_function
import json
import pathlib
import random

import spacy
from spacy.pipeline import EntityRecognizer
from spacy.gold import GoldParse
from spacy.tagger import Tagger


try:
    unicode
except:
    unicode = str


def train_ner(nlp, train_data, output_dir):
    # Add new words to vocab.
    for raw_text, _ in train_data:
        doc = nlp.make_doc(raw_text)
        for word in doc:
            _ = nlp.vocab[word.orth]

    for itn in range(20):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            gold = GoldParse(doc, entities=entity_offsets)
            doc = nlp.make_doc(raw_text)
            nlp.tagger(doc)
            loss = nlp.entity.update(doc, gold)
    nlp.end_training()
    nlp.save_to_directory(output_dir)


def main(model_name, output_directory=None):
    nlp = spacy.load(model_name)

    train_data = [
        (
            "Horses are too tall and they pretend to care about your feelings",
            [(0, 6, 'ANIMAL')],
        ),
        (
            "horses are too tall and they pretend to care about your feelings",
            [(0, 6, 'ANIMAL')]
        ),
        (
            "horses pretend to care about your feelings",
            [(0, 6, 'ANIMAL')]
        ),
        (
            "they pretend to care about your feelings, those horses",
            [(48, 54, 'ANIMAL')]
        )
    ]
    nlp.entity.add_label('ANIMAL')
    if output_directory is not None:
        output_directory = pathlib.Path(output_directory)
    ner = train_ner(nlp, train_data, output_directory)

    doc = nlp('Do you like horses?')
    for ent in doc.ents:
        print(ent.label_, ent.text)
    nlp2 = spacy.load('en', path=output_directory)
    nlp2.entity.add_label('ANIMAL')
    doc2 = nlp2('Do you like horses?')
    for ent in doc2.ents:
        print(ent.label_, ent.text)


if __name__ == '__main__':
    import plac
    plac.call(main)
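One thing worth noting in the new example: inside the update loop, GoldParse is built from the doc left over from the previous iteration, before make_doc runs for the current text. A small sketch of the more conventional ordering, offered here only as an illustration and not as part of the commit:

for itn in range(20):
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        # Build the Doc first, then align the gold entity offsets to that same Doc.
        doc = nlp.make_doc(raw_text)
        gold = GoldParse(doc, entities=entity_offsets)
        nlp.tagger(doc)
        loss = nlp.entity.update(doc, gold)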
fabfile.py (vendored)
@@ -14,7 +14,7 @@ VENV_DIR = path.join(PWD, ENV)
def env(lang='python2.7'):
    if path.exists(VENV_DIR):
        local('rm -rf {env}'.format(env=VENV_DIR))
    local('virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
    local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))


def install():
@@ -1,27 +1,13 @@
# coding: utf8
from __future__ import unicode_literals, print_function
from __future__ import unicode_literals

import json
from pathlib import Path

from .util import set_lang_class, get_lang_class, parse_package_meta
from .deprecated import resolve_model_name
from .cli import info

from . import en
from . import de
from . import zh
from . import es
from . import it
from . import hu
from . import fr
from . import pt
from . import nl
from . import sv
from . import fi
from . import bn
from . import he

from .about import *
from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he


set_lang_class(en.English.lang, en.English)
@@ -14,8 +14,9 @@ from spacy.cli import convert as cli_convert


class CLI(object):
    """Command-line interface for spaCy"""

    """
    Command-line interface for spaCy
    """
    commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')

    @plac.annotations(

@@ -29,7 +30,6 @@ class CLI(object):
        can be shortcut, model name or, if --direct flag is set, full model name
        with version.
        """

        cli_download(model, direct)


@@ -44,7 +44,6 @@ class CLI(object):
        either the name of a pip package, or the local path to the model data
        directory. Linking models allows loading them via spacy.load(link_name).
        """

        cli_link(origin, link_name, force)


@@ -58,7 +57,6 @@ class CLI(object):
        speficied as an argument, print model information. Flag --markdown
        prints details in Markdown for easy copy-pasting to GitHub issues.
        """

        cli_info(model, markdown)


@@ -73,7 +71,6 @@ class CLI(object):
        installation files. A new directory will be created in the specified
        output directory, and model data will be copied over.
        """

        cli_package(input_dir, output_dir, force)


@@ -93,7 +90,6 @@ class CLI(object):
        """
        Train a model. Expects data in spaCy's JSON format.
        """

        cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
                  not no_parser, not no_ner, parser_L1)


@@ -108,7 +104,6 @@ class CLI(object):
        """
        Initialize a new model and its data directory.
        """

        cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)

    @plac.annotations(

@@ -122,7 +117,6 @@ class CLI(object):
        Convert files into JSON format for use with train command and other
        experiment management functions.
        """

        cli_convert(input_file, output_dir, n_sents, morphology)
@@ -1,3 +1,7 @@
# coding: utf8
from __future__ import unicode_literals


IDS = {
    "": NULL_ATTR,
    "IS_ALPHA": IS_ALPHA,

@@ -92,7 +96,8 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]


def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    '''Normalize a dictionary of attributes, converting them to ints.
    """
    Normalize a dictionary of attributes, converting them to ints.

    Arguments:
        stringy_attrs (dict):

@@ -105,7 +110,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
        inty_attrs (dict):
            Attributes dictionary with keys and optionally values converted to
            ints.
    '''
    """
    inty_attrs = {}
    if _do_deprecated:
        if 'F' in stringy_attrs:
@@ -1,3 +1,6 @@
# coding: utf8
from __future__ import unicode_literals

from libc.stdio cimport fopen, fclose, fread, fwrite
from libc.string cimport memcpy
@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals

import io
from pathlib import Path

from .converters import conllu2json
@@ -2,12 +2,12 @@
from __future__ import unicode_literals

import json
from ...gold import read_json_file, merge_sents
from ... import util


def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
    """Convert conllu files into JSON format for use with train cli.
    """
    Convert conllu files into JSON format for use with train cli.
    use_morphology parameter enables appending morphology to tags, which is
    useful for languages such as Spanish, where UD tags are not so rich.
    """
@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals

import pip
import requests
import os
import subprocess
@ -18,7 +18,6 @@ def info(model=None, markdown=False):
|
|||
else:
|
||||
data['source'] = str(model_path)
|
||||
print_info(data, "model " + model, markdown)
|
||||
|
||||
else:
|
||||
data = get_spacy_data()
|
||||
print_info(data, "spaCy", markdown)
|
||||
|
@ -26,10 +25,8 @@ def info(model=None, markdown=False):
|
|||
|
||||
def print_info(data, title, markdown):
|
||||
title = "Info about {title}".format(title=title)
|
||||
|
||||
if markdown:
|
||||
util.print_markdown(data, title=title)
|
||||
|
||||
else:
|
||||
util.print_table(data, title=title)
|
||||
|
||||
|
|
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import pip
from pathlib import Path
import importlib
from ..compat import unicode_, symlink_to
from .. import util


@@ -20,7 +21,6 @@ def link_package(package_name, link_name, force=False):
    # Python's installation and import rules are very complicated.
    pkg = importlib.import_module(package_name)
    package_path = Path(pkg.__file__).parent.parent

    meta = get_meta(package_path, package_name)
    model_name = package_name + '-' + meta['version']
    model_path = package_path / package_name / model_name

@@ -43,23 +43,17 @@ def symlink(model_path, link_name, force):
    elif link_path.exists():
        link_path.unlink()

    # Add workaround for Python 2 on Windows (see issue #909)
    if util.is_python2() and util.is_windows():
        import subprocess
        command = ['mklink', '/d', unicode(link_path), unicode(model_path)]
        try:
            subprocess.call(command, shell=True)
        except:
            # This is quite dirty, but just making sure other Windows-specific
            # errors are caught so users at least see a proper error message.
            util.sys_exit(
                "Creating a symlink in spacy/data failed. You can still import "
                "the model as a Python package and call its load() method, or "
                "create the symlink manually:",
                "{a} --> {b}".format(a=unicode(model_path), b=unicode(link_path)),
                title="Error: Couldn't link model to '{l}'".format(l=link_name))
    else:
        link_path.symlink_to(model_path)
    try:
        symlink_to(link_path, model_path)
    except:
        # This is quite dirty, but just making sure other errors are caught so
        # users at least see a proper message.
        util.sys_exit(
            "Creating a symlink in spacy/data failed. You can still import "
            "the model as a Python package and call its load() method, or "
            "create the symlink manually:",
            "{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)),
            title="Error: Couldn't link model to '{l}'".format(l=link_name))

    util.print_msg(
        "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),
@@ -1,20 +1,13 @@
# coding: utf8
from __future__ import unicode_literals

import json
import shutil
import requests
from pathlib import Path

import six

from .. import about
from ..compat import unicode_, json_dumps
from .. import util

if six.PY2:
    json_dumps = lambda data: json.dumps(data, indent=2).decode("utf8")
elif six.PY3:
    json_dumps = lambda data: json.dumps(data, indent=2)

def package(input_dir, output_dir, force):
    input_path = Path(input_dir)

@@ -32,31 +25,31 @@ def package(input_dir, output_dir, force):
    package_path = main_path / model_name

    create_dirs(package_path, force)
    shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix())
    shutil.copytree(unicode_(input_path), unicode_(package_path / model_name_v))
    create_file(main_path / 'meta.json', json_dumps(meta))
    create_file(main_path / 'setup.py', template_setup)
    create_file(main_path / 'MANIFEST.in', template_manifest)
    create_file(package_path / '__init__.py', template_init)

    util.print_msg(
        main_path.as_posix(),
        unicode_(main_path),
        "To build the package, run `python setup.py sdist` in that directory.",
        title="Successfully created package {p}".format(p=model_name_v))


def check_dirs(input_path, output_path):
    if not input_path.exists():
        util.sys_exit(input_path.as_poisx(), title="Model directory not found")
        util.sys_exit(unicode_(input_path.as_poisx), title="Model directory not found")
    if not output_path.exists():
        util.sys_exit(output_path.as_posix(), title="Output directory not found")
        util.sys_exit(unicode_(output_path), title="Output directory not found")


def create_dirs(package_path, force):
    if package_path.exists():
        if force:
            shutil.rmtree(package_path.as_posix())
            shutil.rmtree(unicode_(package_path.as_posix))
        else:
            util.sys_exit(package_path.as_posix(),
            util.sys_exit(unicode_(package_path.as_posix),
                "Please delete the directory and try again.",
                title="Package directory already exists")
    Path.mkdir(package_path, parents=True)
@ -5,8 +5,6 @@ import json
|
|||
from pathlib import Path
|
||||
|
||||
from ..scorer import Scorer
|
||||
from ..tagger import Tagger
|
||||
from ..syntax.parser import Parser
|
||||
from ..gold import GoldParse, merge_sents
|
||||
from ..gold import read_json_file as read_gold_json
|
||||
from .. import util
|
||||
|
@ -60,7 +58,6 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_
|
|||
print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
|
||||
|
||||
with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer:
|
||||
loss = 0
|
||||
for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
|
||||
for doc, gold in epoch:
|
||||
trainer.update(doc, gold)
|
||||
spacy/compat.py (new file, 54 lines)
@@ -0,0 +1,54 @@
# coding: utf8
from __future__ import unicode_literals

import six
import sys
import ujson

try:
    import cPickle as pickle
except ImportError:
    import pickle

try:
    import copy_reg
except ImportError:
    import copyreg as copy_reg


is_python2 = six.PY2
is_python3 = six.PY3
is_windows = sys.platform.startswith('win')
is_linux = sys.platform.startswith('linux')
is_osx = sys.platform == 'darwin'


if is_python2:
    bytes_ = str
    unicode_ = unicode
    basestring_ = basestring
    input_ = raw_input
    json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')

elif is_python3:
    bytes_ = bytes
    unicode_ = str
    basestring_ = str
    input_ = input
    json_dumps = lambda data: ujson.dumps(data, indent=2)


def symlink_to(orig, dest):
    if is_python2 and is_windows:
        import subprocess
        subprocess.call(['mklink', '/d', unicode(orig), unicode(dest)], shell=True)
    else:
        orig.symlink_to(dest)


def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
    return ((python2 == None or python2 == is_python2) and
            (python3 == None or python3 == is_python3) and
            (windows == None or windows == is_windows) and
            (linux == None or linux == is_linux) and
            (osx == None or osx == is_osx))
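A rough usage sketch, not part of the diff, of the helpers that spacy/compat.py introduces; the model path used for the symlink is invented:

from pathlib import Path
from spacy.compat import is_config, json_dumps, symlink_to

# Branch on interpreter/OS combinations without scattering six/sys checks.
if is_config(python2=True, windows=True):
    print('Python 2 on Windows code path')

# Consistent JSON serialization across Python 2 and 3.
meta = json_dumps({'lang': 'en', 'version': '1.0.0'})

# Create a spacy/data shortcut link portably (uses mklink on Python 2 + Windows).
symlink_to(Path('spacy/data/en'), Path('/usr/local/lib/models/en_core_web_md'))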
@@ -1,16 +1,14 @@
# coding: utf8
from __future__ import unicode_literals

from pathlib import Path

from . import about
from . import util
from .cli import download
from .cli import link


try:
    basestring
except NameError:
    basestring = str


def read_lang_data(package):
    tokenization = package.load_json(('tokenizer', 'specials.json'))
    with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:

@@ -36,7 +34,8 @@ def align_tokens(ref, indices): # Deprecated, surely?


def detokenize(token_rules, words): # Deprecated?
    """To align with treebanks, return a list of "chunks", where a chunk is a
    """
    To align with treebanks, return a list of "chunks", where a chunk is a
    sequence of tokens that are separated by whitespace in actual strings. Each
    chunk should be a tuple of token indices, e.g.

@@ -57,10 +56,30 @@ def detokenize(token_rules, words): # Deprecated?
    return positions


def fix_glove_vectors_loading(overrides):
    """Special-case hack for loading the GloVe vectors, to support deprecated
    <1.0 stuff. Phase this out once the data is fixed."""
def match_best_version(target_name, target_version, path):
    path = util.ensure_path(path)
    if path is None or not path.exists():
        return None
    matches = []
    for data_name in path.iterdir():
        name, version = split_data_name(data_name.parts[-1])
        if name == target_name:
            matches.append((tuple(float(v) for v in version.split('.')), data_name))
    if matches:
        return Path(max(matches)[1])
    else:
        return None


def split_data_name(name):
    return name.split('-', 1) if '-' in name else (name, '')


def fix_glove_vectors_loading(overrides):
    """
    Special-case hack for loading the GloVe vectors, to support deprecated
    <1.0 stuff. Phase this out once the data is fixed.
    """
    if 'data_dir' in overrides and 'path' not in overrides:
        raise ValueError("The argument 'data_dir' has been renamed to 'path'")
    if overrides.get('path') is False:

@@ -68,18 +87,16 @@ def fix_glove_vectors_loading(overrides):
    if overrides.get('path') in (None, True):
        data_path = util.get_data_path()
    else:
        path = overrides['path']
        if isinstance(path, basestring):
            path = Path(path)
        path = util.ensure_path(overrides['path'])
        data_path = path.parent
    vec_path = None
    if 'add_vectors' not in overrides:
        if 'vectors' in overrides:
            vec_path = util.match_best_version(overrides['vectors'], None, data_path)
            vec_path = match_best_version(overrides['vectors'], None, data_path)
            if vec_path is None:
                return overrides
        else:
            vec_path = util.match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
            vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
            if vec_path is not None:
                vec_path = vec_path / 'vocab' / 'vec.bin'
    if vec_path is not None:

@@ -88,13 +105,13 @@ def fix_glove_vectors_loading(overrides):


def resolve_model_name(name):
    """If spaCy is loaded with 'de', check if symlink already exists. If
    """
    If spaCy is loaded with 'de', check if symlink already exists. If
    not, user have upgraded from older version and have old models installed.
    Check if old model directory exists and if so, return that instead and create
    shortcut link. If English model is found and no shortcut exists, raise error
    and tell user to install new model.
    """

    if name == 'en' or name == 'de':
        versions = ['1.0.0', '1.1.0']
        data_path = Path(util.get_data_path())

@@ -117,9 +134,11 @@ def resolve_model_name(name):


class ModelDownload():
    """Replace download modules within en and de with deprecation warning and
    """
    Replace download modules within en and de with deprecation warning and
    download default language model (using shortcut). Use classmethods to allow
    importing ModelDownload as download and calling download.en() etc."""
    importing ModelDownload as download and calling download.en() etc.
    """

    @classmethod
    def load(self, lang):
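For orientation, a small sketch of what the version-matching helpers added above do; the directory and package names are invented:

def split_data_name(name):
    # Same one-liner as the helper above: split '<name>-<version>' once.
    return name.split('-', 1) if '-' in name else (name, '')

print(split_data_name('en_core_web_sm-1.2.0'))   # ('en_core_web_sm', '1.2.0')
print(split_data_name('vocab'))                  # ('vocab', '')

# match_best_version() then compares the parsed version tuples and returns the
# path of the highest matching version found under the given data directory,
# e.g. en_core_web_sm-1.2.0 when both 1.1.0 and 1.2.0 are present.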
@@ -11,12 +11,6 @@ from ..deprecated import fix_glove_vectors_loading
from .language_data import *


try:
    basestring
except NameError:
    basestring = str


class English(Language):
    lang = 'en'
@@ -1,13 +1,11 @@
# cython: profile=True
# coding: utf8
from __future__ import unicode_literals, print_function

import io
import json
import re
import os
from os import path

import ujson as json
import ujson
from pathlib import Path

from .syntax import nonproj


@@ -141,12 +139,13 @@ def _min_edit_path(cand_words, gold_words):


def read_json_file(loc, docs_filter=None):
    if path.isdir(loc):
        for filename in os.listdir(loc):
            yield from read_json_file(path.join(loc, filename))
    loc = Path(loc)
    if loc.is_dir():
        for filename in loc.iterdir():
            yield from read_json_file(loc / filename)
    else:
        with io.open(loc, 'r', encoding='utf8') as file_:
            docs = json.load(file_)
            docs = ujson.load(file_)
        for doc in docs:
            if docs_filter is not None and not docs_filter(doc):
                continue

@@ -220,7 +219,8 @@ cdef class GoldParse:

    def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                 deps=None, entities=None, make_projective=False):
        """Create a GoldParse.
        """
        Create a GoldParse.

        Arguments:
            doc (Doc):

@@ -302,7 +302,8 @@ cdef class GoldParse:
            self.heads = proj_heads

    def __len__(self):
        """Get the number of gold-standard tokens.
        """
        Get the number of gold-standard tokens.

        Returns (int): The number of gold-standard tokens.
        """

@@ -310,13 +311,16 @@ cdef class GoldParse:

    @property
    def is_projective(self):
        """Whether the provided syntactic annotations form a projective dependency
        tree."""
        """
        Whether the provided syntactic annotations form a projective dependency
        tree.
        """
        return not nonproj.is_nonproj_tree(self.heads)


def biluo_tags_from_offsets(doc, entities):
    '''Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
    """
    Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
    scheme (biluo).

    Arguments:

@@ -347,7 +351,7 @@ def biluo_tags_from_offsets(doc, entities):
        tags = biluo_tags_from_offsets(doc, entities)

        assert tags == ['O', 'O', 'U-LOC', 'O']
    '''
    """
    starts = {token.idx: token.i for token in doc}
    ends = {token.idx+len(token): token.i for token in doc}
    biluo = ['-' for _ in doc]
@@ -1,39 +1,26 @@
from __future__ import absolute_import
from __future__ import unicode_literals
import pathlib
# coding: utf8
from __future__ import absolute_import, unicode_literals
from contextlib import contextmanager
import shutil

import ujson


try:
    basestring
except NameError:
    basestring = str

try:
    unicode
except NameError:
    unicode = str

from .tokenizer import Tokenizer
from .vocab import Vocab
from .tagger import Tagger
from .matcher import Matcher
from . import attrs
from . import orth
from . import util
from . import language_data
from .lemmatizer import Lemmatizer
from .train import Trainer

from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
from .syntax.parser import get_templates
from .syntax.nonproj import PseudoProjectivity
from .pipeline import DependencyParser, EntityRecognizer
from .syntax.arc_eager import ArcEager
from .syntax.ner import BiluoPushDown
from .compat import unicode_
from .attrs import IS_STOP
from . import attrs
from . import orth
from . import util
from . import language_data


class BaseDefaults(object):

@@ -150,25 +137,15 @@ class BaseDefaults(object):
        return pipeline

    token_match = language_data.TOKEN_MATCH

    prefixes = tuple(language_data.TOKENIZER_PREFIXES)

    suffixes = tuple(language_data.TOKENIZER_SUFFIXES)

    infixes = tuple(language_data.TOKENIZER_INFIXES)

    tag_map = dict(language_data.TAG_MAP)

    tokenizer_exceptions = {}

    parser_features = get_templates('parser')

    entity_features = get_templates('ner')

    tagger_features = Tagger.feature_templates  # TODO -- fix this

    stop_words = set()

    lemma_rules = {}
    lemma_exc = {}
    lemma_index = {}

@@ -202,53 +179,42 @@ class BaseDefaults(object):


class Language(object):
    '''A text-processing pipeline. Usually you'll load this once per process, and
    """
    A text-processing pipeline. Usually you'll load this once per process, and
    pass the instance around your program.
    '''
    """
    Defaults = BaseDefaults
    lang = None

    @classmethod
    @contextmanager
    def train(cls, path, gold_tuples, *configs):
        if isinstance(path, basestring):
            path = pathlib.Path(path)
        tagger_cfg, parser_cfg, entity_cfg = configs
        dep_model_dir = path / 'deps'
        ner_model_dir = path / 'ner'
        pos_model_dir = path / 'pos'
        if dep_model_dir.exists():
            shutil.rmtree(str(dep_model_dir))
        if ner_model_dir.exists():
            shutil.rmtree(str(ner_model_dir))
        if pos_model_dir.exists():
            shutil.rmtree(str(pos_model_dir))
        dep_model_dir.mkdir()
        ner_model_dir.mkdir()
        pos_model_dir.mkdir()
    def setup_directory(cls, path, **configs):
        for name, config in configs.items():
            directory = path / name
            if directory.exists():
                shutil.rmtree(str(directory))
            directory.mkdir()
            with (directory / 'config.json').open('wb') as file_:
                data = ujson.dumps(config, indent=2)
                if isinstance(data, unicode_):
                    data = data.encode('utf8')
                file_.write(data)
        if not (path / 'vocab').exists():
            (path / 'vocab').mkdir()

    @classmethod
    @contextmanager
    def train(cls, path, gold_tuples, **configs):
        if parser_cfg['pseudoprojective']:
            # preprocess training data here before ArcEager.get_labels() is called
            gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)

        parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
        entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
        for subdir in ('deps', 'ner', 'pos'):
            if subdir not in configs:
                configs[subdir] = {}
        configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
        configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)

        with (dep_model_dir / 'config.json').open('wb') as file_:
            data = ujson.dumps(parser_cfg)
            if isinstance(data, unicode):
                data = data.encode('utf8')
            file_.write(data)
        with (ner_model_dir / 'config.json').open('wb') as file_:
            data = ujson.dumps(entity_cfg)
            if isinstance(data, unicode):
                data = data.encode('utf8')
            file_.write(data)
        with (pos_model_dir / 'config.json').open('wb') as file_:
            data = ujson.dumps(tagger_cfg)
            if isinstance(data, unicode):
                data = data.encode('utf8')
            file_.write(data)
        cls.setup_directory(path, **configs)

        self = cls(
            path=path,

@@ -269,14 +235,14 @@ class Language(object):
            self.entity = self.Defaults.create_entity(self)
        self.pipeline = self.Defaults.create_pipeline(self)
        yield Trainer(self, gold_tuples)
        self.end_training(path=path)
        self.end_training()
        self.save_to_directory(path, deps=self.parser.cfg, ner=self.entity.cfg,
                               pos=self.tagger.cfg)

    def __init__(self, **overrides):
        if 'data_dir' in overrides and 'path' not in overrides:
            raise ValueError("The argument 'data_dir' has been renamed to 'path'")
        path = overrides.get('path', True)
        if isinstance(path, basestring):
            path = pathlib.Path(path)
        path = util.ensure_path(overrides.get('path', True))
        if path is True:
            path = util.get_data_path() / self.lang
            if not path.exists() and 'path' not in overrides:

@@ -322,7 +288,8 @@ class Language(object):
        self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]

    def __call__(self, text, tag=True, parse=True, entity=True):
        """Apply the pipeline to some text. The text can span multiple sentences,
        """
        Apply the pipeline to some text. The text can span multiple sentences,
        and can contain arbtrary whitespace. Alignment into the original string
        is preserved.

@@ -352,7 +319,8 @@ class Language(object):
        return doc

    def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
        '''Process texts as a stream, and yield Doc objects in order.
        """
        Process texts as a stream, and yield Doc objects in order.

        Supports GIL-free multi-threading.

@@ -361,7 +329,7 @@ class Language(object):
        tag (bool)
        parse (bool)
        entity (bool)
        '''
        """
        skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
        stream = (self.make_doc(text) for text in texts)
        for proc in self.pipeline:

@@ -373,51 +341,35 @@ class Language(object):
        for doc in stream:
            yield doc

    def end_training(self, path=None):
        if path is None:
            path = self.path
        elif isinstance(path, basestring):
            path = pathlib.Path(path)
    def save_to_directory(self, path):
        configs = {
            'pos': self.tagger.cfg if self.tagger else {},
            'deps': self.parser.cfg if self.parser else {},
            'ner': self.entity.cfg if self.entity else {},
        }

        if self.tagger:
            self.tagger.model.end_training()
            self.tagger.model.dump(str(path / 'pos' / 'model'))
        if self.parser:
            self.parser.model.end_training()
            self.parser.model.dump(str(path / 'deps' / 'model'))
        if self.entity:
            self.entity.model.end_training()
            self.entity.model.dump(str(path / 'ner' / 'model'))
        self.setup_directory(path, **configs)

        strings_loc = path / 'vocab' / 'strings.json'
        with strings_loc.open('w', encoding='utf8') as file_:
            self.vocab.strings.dump(file_)
        self.vocab.dump(path / 'vocab' / 'lexemes.bin')

        # TODO: Word vectors?
        if self.tagger:
            tagger_freqs = list(self.tagger.freqs[TAG].items())
        else:
            tagger_freqs = []
            self.tagger.model.dump(str(path / 'pos' / 'model'))
        if self.parser:
            dep_freqs = list(self.parser.moves.freqs[DEP].items())
            head_freqs = list(self.parser.moves.freqs[HEAD].items())
        else:
            dep_freqs = []
            head_freqs = []
            self.parser.model.dump(str(path / 'deps' / 'model'))
        if self.entity:
            entity_iob_freqs = list(self.entity.moves.freqs[ENT_IOB].items())
            entity_type_freqs = list(self.entity.moves.freqs[ENT_TYPE].items())
        else:
            entity_iob_freqs = []
            entity_type_freqs = []
        with (path / 'vocab' / 'serializer.json').open('wb') as file_:
            data = ujson.dumps([
                (TAG, tagger_freqs),
                (DEP, dep_freqs),
                (ENT_IOB, entity_iob_freqs),
                (ENT_TYPE, entity_type_freqs),
                (HEAD, head_freqs)
            ])
            if isinstance(data, unicode):
                data = data.encode('utf8')
            file_.write(data)
            self.entity.model.dump(str(path / 'ner' / 'model'))

    def end_training(self, path=None):
        if self.tagger:
            self.tagger.model.end_training()
        if self.parser:
            self.parser.model.end_training()
        if self.entity:
            self.entity.model.end_training()
        # NB: This is slightly different from before --- we no longer default
        # to taking nlp.path
        if path is not None:
            self.save_to_directory(path)
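A rough usage sketch of the reorganised persistence API above (paths invented; assumes a trained spaCy 1.x pipeline in nlp):

# Finalise the averaged weights first, then write configs, vocab and the
# tagger/parser/NER models to a directory in one call.
nlp.end_training()
nlp.save_to_directory('/tmp/my_model')

# end_training(path) remains as a convenience that saves when a path is given;
# unlike before, it no longer falls back to nlp.path.
nlp.end_training('/tmp/my_model')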
@@ -1,13 +1,8 @@
from __future__ import unicode_literals, print_function
import codecs
import pathlib

import ujson as json
# coding: utf8
from __future__ import unicode_literals

from .symbols import POS, NOUN, VERB, ADJ, PUNCT
from .symbols import VerbForm_inf, VerbForm_none
from .symbols import Number_sing
from .symbols import Degree_pos
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos


class Lemmatizer(object):

@@ -38,8 +33,10 @@ class Lemmatizer(object):
        return lemmas

    def is_base_form(self, univ_pos, morphology=None):
        '''Check whether we're dealing with an uninflected paradigm, so we can
        avoid lemmatization entirely.'''
        """
        Check whether we're dealing with an uninflected paradigm, so we can
        avoid lemmatization entirely.
        """
        morphology = {} if morphology is None else morphology
        others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
        true_morph_key = morphology.get('morph', 0)
@@ -1,4 +1,7 @@
# cython: embedsignature=True
# coding: utf8
from __future__ import unicode_literals, print_function

from libc.math cimport sqrt
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool

@@ -9,14 +12,11 @@ from cython.view cimport array as cvarray
cimport numpy as np
np.import_array()



from libc.string cimport memset
import numpy

from .orth cimport word_shape
from .typedefs cimport attr_t, flags_t
import numpy

from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET

@@ -30,13 +30,15 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))


cdef class Lexeme:
    """An entry in the vocabulary. A Lexeme has no string context --- it's a
    """
    An entry in the vocabulary. A Lexeme has no string context --- it's a
    word-type, as opposed to a word token. It therefore has no part-of-speech
    tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
    tag).
    """
    def __init__(self, Vocab vocab, int orth):
        """Create a Lexeme object.
        """
        Create a Lexeme object.

        Arguments:
            vocab (Vocab): The parent vocabulary

@@ -80,7 +82,8 @@ cdef class Lexeme:
        return self.c.orth

    def set_flag(self, attr_id_t flag_id, bint value):
        """Change the value of a boolean flag.
        """
        Change the value of a boolean flag.

        Arguments:
            flag_id (int): The attribute ID of the flag to set.

@@ -89,7 +92,8 @@ cdef class Lexeme:
        Lexeme.c_set_flag(self.c, flag_id, value)

    def check_flag(self, attr_id_t flag_id):
        """Check the value of a boolean flag.
        """
        Check the value of a boolean flag.

        Arguments:
            flag_id (int): The attribute ID of the flag to query.

@@ -98,7 +102,8 @@ cdef class Lexeme:
        return True if Lexeme.c_check_flag(self.c, flag_id) else False

    def similarity(self, other):
        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
        """
        Compute a semantic similarity estimate. Defaults to cosine over vectors.

        Arguments:
            other:

@@ -106,7 +111,7 @@ cdef class Lexeme:
            Token and Lexeme objects.
        Returns:
            score (float): A scalar similarity score. Higher is more similar.
        '''
        """
        if self.vector_norm == 0 or other.vector_norm == 0:
            return 0.0
        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
@@ -1,7 +1,10 @@
# cython: profile=True
# cython: infer_types=True
# coding: utf8
from __future__ import unicode_literals

import ujson

from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .attrs cimport attr_id_t

@@ -52,12 +55,6 @@ from .attrs import FLAG36 as L9_ENT
from .attrs import FLAG35 as L10_ENT


try:
    import ujson as json
except ImportError:
    import json


cpdef enum quantifier_t:
    _META
    ONE

@@ -164,7 +161,7 @@ def _convert_strings(token_specs, string_store):
def merge_phrase(matcher, doc, i, matches):
    '''Callback to merge a phrase on match'''
    ent_id, label, start, end = matches[i]
    span = doc[start : end]
    span = doc[start : end]
    span.merge(ent_type=label, ent_id=ent_id)


@@ -180,7 +177,8 @@ cdef class Matcher:

    @classmethod
    def load(cls, path, vocab):
        '''Load the matcher and patterns from a file path.
        """
        Load the matcher and patterns from a file path.

        Arguments:
            path (Path):

@@ -189,16 +187,17 @@ cdef class Matcher:
            The vocabulary that the documents to match over will refer to.
        Returns:
            Matcher: The newly constructed object.
        '''
        """
        if (path / 'gazetteer.json').exists():
            with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
                patterns = json.load(file_)
                patterns = ujson.load(file_)
        else:
            patterns = {}
        return cls(vocab, patterns)

    def __init__(self, vocab, patterns={}):
        """Create the Matcher.
        """
        Create the Matcher.

        Arguments:
            vocab (Vocab):

@@ -227,7 +226,8 @@ cdef class Matcher:

    def add_entity(self, entity_key, attrs=None, if_exists='raise',
                   acceptor=None, on_match=None):
        """Add an entity to the matcher.
        """
        Add an entity to the matcher.

        Arguments:
            entity_key (unicode or int):

@@ -264,7 +264,8 @@ cdef class Matcher:
        self._callbacks[entity_key] = on_match

    def add_pattern(self, entity_key, token_specs, label=""):
        """Add a pattern to the matcher.
        """
        Add a pattern to the matcher.

        Arguments:
            entity_key (unicode or int):

@@ -307,7 +308,8 @@ cdef class Matcher:
        return entity_key

    def has_entity(self, entity_key):
        """Check whether the matcher has an entity.
        """
        Check whether the matcher has an entity.

        Arguments:
            entity_key (string or int): The entity key to check.

@@ -318,7 +320,8 @@ cdef class Matcher:
        return entity_key in self._entities

    def get_entity(self, entity_key):
        """Retrieve the attributes stored for an entity.
        """
        Retrieve the attributes stored for an entity.

        Arguments:
            entity_key (unicode or int): The entity to retrieve.

@@ -332,7 +335,8 @@ cdef class Matcher:
        return None

    def __call__(self, Doc doc, acceptor=None):
        """Find all token sequences matching the supplied patterns on the Doc.
        """
        Find all token sequences matching the supplied patterns on the Doc.

        Arguments:
            doc (Doc):

@@ -445,7 +449,8 @@ cdef class Matcher:
        return matches

    def pipe(self, docs, batch_size=1000, n_threads=2):
        """Match a stream of documents, yielding them in turn.
        """
        Match a stream of documents, yielding them in turn.

        Arguments:
            docs: A stream of documents.
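As a rough sketch of how the spaCy 1.x Matcher API documented above is typically driven; the entity key and pattern are invented for the example:

import spacy
from spacy.matcher import Matcher
from spacy.attrs import LOWER

nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)

# Register an entity key, then attach a token pattern to it.
matcher.add_entity('GoogleNow')
matcher.add_pattern('GoogleNow', [{LOWER: 'google'}, {LOWER: 'now'}], label='PRODUCT')

doc = nlp(u'I prefer Google Now over anything else.')
for ent_id, label, start, end in matcher(doc):
    print(doc[start:end].text)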
@@ -1,13 +1,9 @@
# cython: infer_types
# coding: utf8
from __future__ import unicode_literals

from libc.string cimport memset

try:
    import ujson as json
except ImportError:
    import json

from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
from .attrs cimport POS, IS_SPACE
from .parts_of_speech import IDS as POS_IDS

@@ -16,7 +12,9 @@ from .attrs import LEMMA, intify_attrs


def _normalize_props(props):
    '''Transform deprecated string keys to correct names.'''
    """
    Transform deprecated string keys to correct names.
    """
    out = {}
    for key, value in props.items():
        if key == POS:

@@ -98,13 +96,14 @@ cdef class Morphology:
            flags[0] &= ~(one << flag_id)

    def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
        '''Add a special-case rule to the morphological analyser. Tokens whose
        """
        Add a special-case rule to the morphological analyser. Tokens whose
        tag and orth match the rule will receive the specified properties.

        Arguments:
            tag (unicode): The part-of-speech tag to key the exception.
            orth (unicode): The word-form to key the exception.
        '''
        """
        tag = self.strings[tag_str]
        tag_id = self.reverse_index[tag]
        orth = self.strings[orth_str]
@@ -1,8 +0,0 @@
class RegexMerger(object):
    def __init__(self, regexes):
        self.regexes = regexes

    def __call__(self, tokens):
        for tag, entity_type, regex in self.regexes:
            for m in regex.finditer(tokens.string):
                tokens.merge(m.start(), m.end(), tag, m.group(), entity_type)
@@ -1,6 +1,7 @@
# coding: utf8
# cython: infer_types=True
# coding: utf8
from __future__ import unicode_literals

import unicodedata
import re
@@ -1,3 +1,4 @@
# coding: utf8
from __future__ import unicode_literals
@@ -1,3 +1,6 @@
# coding: utf8
from __future__ import unicode_literals

from .syntax.parser cimport Parser
from .syntax.beam_parser cimport BeamParser
from .syntax.ner cimport BiluoPushDown

@@ -11,44 +14,40 @@ from .attrs import DEP, ENT_TYPE


cdef class EntityRecognizer(Parser):
    """Annotate named entities on Doc objects."""
    """
    Annotate named entities on Doc objects.
    """
    TransitionSystem = BiluoPushDown

    feature_templates = get_feature_templates('ner')

    def add_label(self, label):
        for action in self.moves.action_types:
            self.moves.add_action(action, label)
            if 'actions' in self.cfg:
                self.cfg['actions'].setdefault(action,
                    {}).setdefault(label, True)
        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
        # Set label into serializer. Super hacky :(
        for attr, freqs in self.vocab.serializer_freqs:
            if attr == ENT_TYPE and label not in freqs:
                freqs.append([label, 1])
        # Super hacky :(
        self.vocab._serializer = None


cdef class BeamEntityRecognizer(BeamParser):
    """Annotate named entities on Doc objects."""
    """
    Annotate named entities on Doc objects.
    """
    TransitionSystem = BiluoPushDown

    feature_templates = get_feature_templates('ner')

    def add_label(self, label):
        for action in self.moves.action_types:
            self.moves.add_action(action, label)
            if 'actions' in self.cfg:
                self.cfg['actions'].setdefault(action,
                    {}).setdefault(label, True)
        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
        # Set label into serializer. Super hacky :(
        for attr, freqs in self.vocab.serializer_freqs:
            if attr == ENT_TYPE and label not in freqs:
                freqs.append([label, 1])
        # Super hacky :(
        self.vocab._serializer = None


@@ -58,11 +57,7 @@ cdef class DependencyParser(Parser):
    feature_templates = get_feature_templates('basic')

    def add_label(self, label):
        for action in self.moves.action_types:
            self.moves.add_action(action, label)
            if 'actions' in self.cfg:
                self.cfg['actions'].setdefault(action,
                    {}).setdefault(label, True)
        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
        for attr, freqs in self.vocab.serializer_freqs:

@@ -78,11 +73,7 @@ cdef class BeamDependencyParser(BeamParser):
    feature_templates = get_feature_templates('basic')

    def add_label(self, label):
        for action in self.moves.action_types:
            self.moves.add_action(action, label)
            if 'actions' in self.cfg:
                self.cfg['actions'].setdefault(action,
                    {}).setdefault(label, True)
        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
        for attr, freqs in self.vocab.serializer_freqs:
@@ -1,12 +1,13 @@
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
# coding: utf8
from __future__ import division, print_function, unicode_literals

from .gold import tags_to_entities


class PRFScore(object):
    """A precision / recall / F score"""
    """
    A precision / recall / F score
    """
    def __init__(self):
        self.tp = 0
        self.fp = 0
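For reference, a minimal sketch of the precision/recall/F arithmetic a PRFScore-style accumulator performs; the 1e-100 guard against division by zero is an assumption here, not a quote from the diff:

class PRF(object):
    # Accumulate true positives, false positives and false negatives,
    # then derive precision, recall and F1.
    def __init__(self):
        self.tp = 0
        self.fp = 0
        self.fn = 0

    @property
    def precision(self):
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
    def recall(self):
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
    def fscore(self):
        p, r = self.precision, self.recall
        return 2 * p * r / (p + r + 1e-100)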
@@ -1,12 +1,11 @@
# cython: infer_types=True
# coding: utf8
from __future__ import unicode_literals, absolute_import

cimport cython
from libc.string cimport memcpy
from libc.stdint cimport uint64_t, uint32_t

from murmurhash.mrmr cimport hash64, hash32

from preshed.maps cimport map_iter, key_t

from .typedefs cimport hash_t

@@ -73,13 +72,16 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex


cdef class StringStore:
    '''Map strings to and from integer IDs.'''
    """
    Map strings to and from integer IDs.
    """
    def __init__(self, strings=None, freeze=False):
        '''Create the StringStore.
        """
        Create the StringStore.

        Arguments:
            strings: A sequence of unicode strings to add to the store.
        '''
        """
        self.mem = Pool()
        self._map = PreshMap()
        self._oov = PreshMap()

@@ -104,7 +106,8 @@ cdef class StringStore:
        return (StringStore, (list(self),))

    def __len__(self):
        """The number of strings in the store.
        """
        The number of strings in the store.

        Returns:
            int The number of strings in the store.

@@ -112,8 +115,9 @@ cdef class StringStore:
        return self.size-1

    def __getitem__(self, object string_or_id):
        """Retrieve a string from a given integer ID, or vice versa.

        """
        Retrieve a string from a given integer ID, or vice versa.

        Arguments:
            string_or_id (bytes or unicode or int):
                The value to encode.

@@ -149,17 +153,18 @@ cdef class StringStore:
            raise TypeError(type(string_or_id))
        utf8str = self._intern_utf8(byte_string, len(byte_string))
        if utf8str is NULL:
            # TODO: We need to use 32 bit here, for compatibility with the
            # TODO: We need to use 32 bit here, for compatibility with the
            # vocabulary values. This makes birthday paradox probabilities
            # pretty bad.
            # We could also get unlucky here, and hash into a value that
            # collides with the 'real' strings.
            # collides with the 'real' strings.
            return hash32_utf8(byte_string, len(byte_string))
        else:
            return utf8str - self.c

    def __contains__(self, unicode string not None):
        """Check whether a string is in the store.
        """
        Check whether a string is in the store.

        Arguments:
            string (unicode): The string to check.

@@ -172,7 +177,8 @@ cdef class StringStore:
        return self._map.get(key) is not NULL

    def __iter__(self):
        """Iterate over the strings in the store, in order.
        """
        Iterate over the strings in the store, in order.

        Yields: unicode A string in the store.
        """

@@ -230,7 +236,8 @@ cdef class StringStore:
        return &self.c[self.size-1]

    def dump(self, file_):
        """Save the strings to a JSON file.
        """
        Save the strings to a JSON file.

        Arguments:
            file_ (buffer): The file to save the strings.

@@ -244,7 +251,8 @@ cdef class StringStore:
        file_.write(string_data)

    def load(self, file_):
        """Load the strings from a JSON file.
        """
        Load the strings from a JSON file.

        Arguments:
            file_ (buffer): The file from which to load the strings.
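A small usage sketch of the StringStore behaviour described by the docstrings above (spaCy 1.x API, shown here as an assumption about typical use):

from spacy.strings import StringStore

store = StringStore()
hello_id = store[u'hello']          # interning a string returns its integer ID
assert store[hello_id] == u'hello'  # ...and the ID maps back to the string
assert u'hello' in store
for s in store:                     # iterate over the interned strings in order
    print(s)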
@@ -1,3 +1,4 @@
# coding: utf8
from __future__ import unicode_literals

IDS = {
@@ -7,17 +7,17 @@ out of "context") is in features/extractor.pyx
The atomic feature names are listed in a big enum, so that the feature tuples
can refer to them.
"""
from libc.string cimport memset
# coding: utf-8
from __future__ import unicode_literals

from libc.string cimport memset
from itertools import combinations
from cymem.cymem cimport Pool

from ..structs cimport TokenC

from .stateclass cimport StateClass
from ._state cimport StateC

from cymem.cymem cimport Pool


cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
    if token is NULL:
@@ -1,29 +1,26 @@
# cython: profile=True
# cython: cdivision=True
# cython: infer_types=True
# coding: utf-8
from __future__ import unicode_literals

from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF

import ctypes
import os

from ..structs cimport TokenC
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from cymem.cymem cimport Pool

from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token
from .nonproj import PseudoProjectivity
from .nonproj import is_nonproj_tree
from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
from ..lexeme cimport Lexeme

from libc.stdint cimport uint32_t
from libc.string cimport memcpy

from cymem.cymem cimport Pool
from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token
from .nonproj import PseudoProjectivity
from .nonproj import is_nonproj_tree
from ..structs cimport TokenC


DEF NON_MONOTONIC = True

@@ -317,17 +314,20 @@ cdef class ArcEager(TransitionSystem):
    def get_actions(cls, **kwargs):
        actions = kwargs.get('actions',
            {
                SHIFT: {'': True},
                REDUCE: {'': True},
                RIGHT: {},
                LEFT: {},
                BREAK: {'ROOT': True}})
                SHIFT: [''],
                REDUCE: [''],
                RIGHT: [],
                LEFT: [],
                BREAK: ['ROOT']})
        seen_actions = set()
        for label in kwargs.get('left_labels', []):
            if label.upper() != 'ROOT':
                actions[LEFT][label] = True
                if (LEFT, label) not in seen_actions:
                    actions[LEFT].append(label)
        for label in kwargs.get('right_labels', []):
            if label.upper() != 'ROOT':
                actions[RIGHT][label] = True
                if (RIGHT, label) not in seen_actions:
                    actions[RIGHT].append(label)

        for raw_text, sents in kwargs.get('gold_parses', []):
            for (ids, words, tags, heads, labels, iob), ctnts in sents:

@@ -336,9 +336,11 @@ cdef class ArcEager(TransitionSystem):
                        label = 'ROOT'
                    if label != 'ROOT':
                        if head < child:
                            actions[RIGHT][label] = True
                            if (RIGHT, label) not in seen_actions:
                                actions[RIGHT].append(label)
                        elif head > child:
                            actions[LEFT][label] = True
                            if (LEFT, label) not in seen_actions:
                                actions[LEFT].append(label)
        return actions

    property action_types:
|
|
@ -1,50 +1,34 @@
|
|||
"""
|
||||
MALT-style dependency parser
|
||||
"""
|
||||
# cython: profile=True
|
||||
# cython: experimental_cpp_class_def=True
|
||||
# cython: cdivision=True
|
||||
# cython: infer_types=True
|
||||
"""
|
||||
MALT-style dependency parser
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
# coding: utf-8
|
||||
|
||||
from __future__ import unicode_literals, print_function
|
||||
cimport cython
|
||||
|
||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||
|
||||
from libc.stdint cimport uint32_t, uint64_t
|
||||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport rand
|
||||
from libc.math cimport log, exp, isnan, isinf
|
||||
import random
|
||||
import os.path
|
||||
from os import path
|
||||
import shutil
|
||||
import json
|
||||
import math
|
||||
|
||||
from cymem.cymem cimport Pool, Address
|
||||
from murmurhash.mrmr cimport real_hash64 as hash64
|
||||
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
||||
|
||||
|
||||
from util import Config
|
||||
|
||||
from thinc.linear.features cimport ConjunctionExtracter
|
||||
from thinc.structs cimport FeatureC, ExampleC
|
||||
|
||||
from thinc.extra.search cimport Beam
|
||||
from thinc.extra.search cimport MaxViolation
|
||||
from thinc.extra.search cimport Beam, MaxViolation
|
||||
from thinc.extra.eg cimport Example
|
||||
from thinc.extra.mb cimport Minibatch
|
||||
|
||||
from ..structs cimport TokenC
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..strings cimport StringStore
|
||||
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
|
||||
from ..gold cimport GoldParse
|
||||
|
||||
from . import _parse_features
|
||||
from ._parse_features cimport CONTEXT_SIZE
|
||||
from ._parse_features cimport fill_context
|
||||
|
@ -266,4 +250,3 @@ def is_gold(StateClass state, GoldParse gold, StringStore strings):
|
|||
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
|
||||
truth.add((id_, head, dep))
|
||||
return truth == predicted
|
||||
|
||||
|
|
|
@ -1,9 +1,14 @@
from spacy.parts_of_speech cimport NOUN, PROPN, PRON
# coding: utf-8
from __future__ import unicode_literals

from ..parts_of_speech cimport NOUN, PROPN, PRON


def english_noun_chunks(obj):
'''Detect base noun phrases from a dependency parse.
Works on both Doc and Span.'''
"""
Detect base noun phrases from a dependency parse.
Works on both Doc and Span.
"""
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
'attr', 'ROOT', 'root']
doc = obj.doc # Ensure works on both Doc and Span.
|
|
|
@ -1,17 +1,16 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .transition_system cimport Transition
|
||||
from .transition_system cimport do_func_t
|
||||
|
||||
from ..structs cimport TokenC, Entity
|
||||
|
||||
from thinc.typedefs cimport weight_t
|
||||
from ..gold cimport GoldParseC
|
||||
from ..gold cimport GoldParse
|
||||
from ..attrs cimport ENT_TYPE, ENT_IOB
|
||||
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
from .transition_system cimport Transition
|
||||
from .transition_system cimport do_func_t
|
||||
from ..structs cimport TokenC, Entity
|
||||
from ..gold cimport GoldParseC
|
||||
from ..gold cimport GoldParse
|
||||
from ..attrs cimport ENT_TYPE, ENT_IOB
|
||||
|
||||
|
||||
cdef enum:
|
||||
|
@ -21,6 +20,7 @@ cdef enum:
|
|||
LAST
|
||||
UNIT
|
||||
OUT
|
||||
ISNT
|
||||
N_MOVES
|
||||
|
||||
|
||||
|
@ -31,6 +31,7 @@ MOVE_NAMES[IN] = 'I'
|
|||
MOVE_NAMES[LAST] = 'L'
|
||||
MOVE_NAMES[UNIT] = 'U'
|
||||
MOVE_NAMES[OUT] = 'O'
|
||||
MOVE_NAMES[ISNT] = 'x'
|
||||
|
||||
|
||||
cdef do_func_t[N_MOVES] do_funcs
|
||||
|
@ -54,16 +55,20 @@ cdef class BiluoPushDown(TransitionSystem):
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
{
MISSING: {'': True},
BEGIN: {},
IN: {},
LAST: {},
UNIT: {},
OUT: {'': True}
MISSING: [''],
BEGIN: [],
IN: [],
LAST: [],
UNIT: [],
OUT: ['']
})
seen_entities = set()
for entity_type in kwargs.get('entity_types', []):
if entity_type in seen_entities:
continue
seen_entities.add(entity_type)
for action in (BEGIN, IN, LAST, UNIT):
actions[action][entity_type] = True
actions[action].append(entity_type)
moves = ('M', 'B', 'I', 'L', 'U')
for raw_text, sents in kwargs.get('gold_parses', []):
for (ids, words, tags, heads, labels, biluo), _ in sents:
|
@ -72,8 +77,10 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
if ner_tag.count('-') != 1:
|
||||
raise ValueError(ner_tag)
|
||||
_, label = ner_tag.split('-')
|
||||
for move_str in ('B', 'I', 'L', 'U'):
|
||||
actions[moves.index(move_str)][label] = True
|
||||
if label not in seen_entities:
|
||||
seen_entities.add(label)
|
||||
for move_str in ('B', 'I', 'L', 'U'):
|
||||
actions[moves.index(move_str)].append(label)
|
||||
return actions
|
||||
|
||||
property action_types:
|
||||
|
@ -111,11 +118,17 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
label = 0
|
||||
elif '-' in name:
|
||||
move_str, label_str = name.split('-', 1)
|
||||
# Hacky way to denote 'not this entity'
|
||||
if label_str.startswith('!'):
|
||||
label_str = label_str[1:]
|
||||
move_str = 'x'
|
||||
label = self.strings[label_str]
|
||||
else:
|
||||
move_str = name
|
||||
label = 0
|
||||
move = MOVE_NAMES.index(move_str)
|
||||
if move == ISNT:
|
||||
return Transition(clas=0, move=ISNT, label=label, score=0)
|
||||
for i in range(self.n_moves):
|
||||
if self.c[i].move == move and self.c[i].label == label:
|
||||
return self.c[i]
|
||||
|
@ -225,6 +238,9 @@ cdef class Begin:
|
|||
elif g_act == BEGIN:
|
||||
# B, Gold B --> Label match
|
||||
return label != g_tag
|
||||
# Support partial supervision in the form of "not this label"
|
||||
elif g_act == ISNT:
|
||||
return label == g_tag
|
||||
else:
|
||||
# B, Gold I --> False (P)
|
||||
# B, Gold L --> False (P)
|
||||
|
@ -359,6 +375,9 @@ cdef class Unit:
|
|||
elif g_act == UNIT:
|
||||
# U, Gold U --> True iff tag match
|
||||
return label != g_tag
|
||||
# Support partial supervision in the form of "not this label"
|
||||
elif g_act == ISNT:
|
||||
return label == g_tag
|
||||
else:
|
||||
# U, Gold B --> False
|
||||
# U, Gold I --> False
|
||||
|
@ -388,7 +407,7 @@ cdef class Out:
|
|||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
|
||||
if g_act == MISSING:
|
||||
if g_act == MISSING or g_act == ISNT:
|
||||
return 0
|
||||
elif g_act == BEGIN:
|
||||
# O, Gold B --> False
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
from copy import copy
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
from spacy.attrs import DEP, HEAD
|
||||
from ..attrs import DEP, HEAD
|
||||
|
||||
|
||||
def ancestors(tokenid, heads):
|
||||
|
@ -201,5 +202,3 @@ class PseudoProjectivity:
|
|||
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
|
||||
filtered.append((raw_text, filtered_sents))
|
||||
return filtered
|
||||
|
||||
|
||||
|
|
|
@ -1,58 +1,46 @@
|
|||
# cython: infer_types=True
|
||||
"""
|
||||
MALT-style dependency parser
|
||||
"""
|
||||
# coding: utf-8
|
||||
# cython: infer_types=True
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from collections import Counter
|
||||
import ujson
|
||||
|
||||
cimport cython
|
||||
cimport cython.parallel
|
||||
|
||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||
from cpython.exc cimport PyErr_CheckSignals
|
||||
|
||||
from libc.stdint cimport uint32_t, uint64_t
|
||||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport malloc, calloc, free
|
||||
|
||||
import os.path
|
||||
from collections import Counter
|
||||
from os import path
|
||||
import shutil
|
||||
import json
|
||||
import sys
|
||||
from .nonproj import PseudoProjectivity
|
||||
|
||||
from cymem.cymem cimport Pool, Address
|
||||
from murmurhash.mrmr cimport hash64
|
||||
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
||||
from thinc.linear.avgtron cimport AveragedPerceptron
|
||||
from thinc.linalg cimport VecVec
|
||||
from thinc.structs cimport SparseArrayC
|
||||
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
|
||||
from thinc.extra.eg cimport Example
|
||||
from cymem.cymem cimport Pool, Address
|
||||
from murmurhash.mrmr cimport hash64
|
||||
from preshed.maps cimport MapStruct
|
||||
from preshed.maps cimport map_get
|
||||
|
||||
from thinc.structs cimport FeatureC
|
||||
from thinc.structs cimport ExampleC
|
||||
from thinc.extra.eg cimport Example
|
||||
|
||||
from util import Config
|
||||
|
||||
from ..structs cimport TokenC
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..strings cimport StringStore
|
||||
|
||||
from .transition_system import OracleError
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
|
||||
from ..gold cimport GoldParse
|
||||
|
||||
from . import _parse_features
|
||||
from ._parse_features cimport CONTEXT_SIZE
|
||||
from ._parse_features cimport fill_context
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
from .nonproj import PseudoProjectivity
|
||||
from .transition_system import OracleError
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
from ..structs cimport TokenC
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..strings cimport StringStore
|
||||
from ..gold cimport GoldParse
|
||||
|
||||
USE_FTRL = True
|
||||
|
||||
USE_FTRL = False
|
||||
DEBUG = False
|
||||
def set_debug(val):
|
||||
global DEBUG
|
||||
|
@ -80,7 +68,9 @@ cdef class ParserModel(AveragedPerceptron):
|
|||
return nr_feat
|
||||
|
||||
def update(self, Example eg, itn=0):
|
||||
'''Does regression on negative cost. Sort of cute?'''
|
||||
"""
|
||||
Does regression on negative cost. Sort of cute?
|
||||
"""
|
||||
self.time += 1
|
||||
cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
|
||||
cdef int guess = eg.guess
|
||||
|
@ -132,10 +122,13 @@ cdef class ParserModel(AveragedPerceptron):
|
|||
|
||||
|
||||
cdef class Parser:
|
||||
"""Base class of the DependencyParser and EntityRecognizer."""
|
||||
"""
|
||||
Base class of the DependencyParser and EntityRecognizer.
|
||||
"""
|
||||
@classmethod
|
||||
def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
|
||||
"""Load the statistical model from the supplied path.
|
||||
"""
|
||||
Load the statistical model from the supplied path.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
|
@ -148,10 +141,16 @@ cdef class Parser:
|
|||
The newly constructed object.
|
||||
"""
|
||||
with (path / 'config.json').open() as file_:
|
||||
cfg = json.load(file_)
|
||||
cfg = ujson.load(file_)
|
||||
# TODO: remove this shim when we don't have to support older data
|
||||
if 'labels' in cfg and 'actions' not in cfg:
|
||||
cfg['actions'] = cfg.pop('labels')
|
||||
# TODO: remove this shim when we don't have to support older data
|
||||
for action_name, labels in dict(cfg['actions']).items():
|
||||
# We need this to be sorted
|
||||
if isinstance(labels, dict):
|
||||
labels = list(sorted(labels.keys()))
|
||||
cfg['actions'][action_name] = labels
|
||||
self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg)
|
||||
if (path / 'model').exists():
|
||||
self.model.load(str(path / 'model'))
|
||||
|
@ -161,7 +160,8 @@ cdef class Parser:
|
|||
return self
|
||||
|
||||
def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
|
||||
"""Create a Parser.
|
||||
"""
|
||||
Create a Parser.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
|
@ -186,12 +186,18 @@ cdef class Parser:
|
|||
self.model.learn_rate = cfg.get('learn_rate', 0.001)
|
||||
|
||||
self.cfg = cfg
|
||||
# TODO: This is a pretty hacky fix to the problem of adding more
|
||||
# labels. The issue is they come in out of order, if labels are
|
||||
# added during training
|
||||
for label in cfg.get('extra_labels', []):
|
||||
self.add_label(label)
|
||||
|
||||
def __reduce__(self):
|
||||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||
|
||||
def __call__(self, Doc tokens):
|
||||
"""Apply the entity recognizer, setting the annotations onto the Doc object.
|
||||
"""
|
||||
Apply the entity recognizer, setting the annotations onto the Doc object.
|
||||
|
||||
Arguments:
|
||||
doc (Doc): The document to be processed.
|
||||
|
@ -208,7 +214,8 @@ cdef class Parser:
|
|||
self.moves.finalize_doc(tokens)
|
||||
|
||||
def pipe(self, stream, int batch_size=1000, int n_threads=2):
|
||||
"""Process a stream of documents.
|
||||
"""
|
||||
Process a stream of documents.
|
||||
|
||||
Arguments:
|
||||
stream: The sequence of documents to process.
|
||||
|
@ -296,7 +303,8 @@ cdef class Parser:
|
|||
return 0
|
||||
|
||||
def update(self, Doc tokens, GoldParse gold, itn=0):
|
||||
"""Update the statistical model.
|
||||
"""
|
||||
Update the statistical model.
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
|
@ -334,15 +342,17 @@ cdef class Parser:
|
|||
self.moves.finalize_state(stcls.c)
|
||||
return loss
|
||||
|
||||
def step_through(self, Doc doc):
|
||||
"""Set up a stepwise state, to introspect and control the transition sequence.
|
||||
def step_through(self, Doc doc, GoldParse gold=None):
|
||||
"""
|
||||
Set up a stepwise state, to introspect and control the transition sequence.
|
||||
|
||||
Arguments:
|
||||
doc (Doc): The document to step through.
|
||||
gold (GoldParse): Optional gold parse
|
||||
Returns (StepwiseState):
|
||||
A state object, to step through the annotation process.
|
||||
"""
|
||||
return StepwiseState(self, doc)
|
||||
return StepwiseState(self, doc, gold=gold)
|
||||
|
||||
def from_transition_sequence(self, Doc doc, sequence):
|
||||
"""Control the annotations on a document by specifying a transition sequence
|
||||
|
@ -360,18 +370,28 @@ cdef class Parser:
|
|||
def add_label(self, label):
|
||||
# Doesn't set label into serializer -- subclasses override it to do that.
|
||||
for action in self.moves.action_types:
|
||||
self.moves.add_action(action, label)
|
||||
added = self.moves.add_action(action, label)
|
||||
if added:
|
||||
# Important that the labels be stored as a list! We need the
|
||||
# order, or the model goes out of synch
|
||||
self.cfg.setdefault('extra_labels', []).append(label)
|
||||
|
||||
|
||||
cdef class StepwiseState:
|
||||
cdef readonly StateClass stcls
|
||||
cdef readonly Example eg
|
||||
cdef readonly Doc doc
|
||||
cdef readonly GoldParse gold
|
||||
cdef readonly Parser parser
|
||||
|
||||
def __init__(self, Parser parser, Doc doc):
|
||||
def __init__(self, Parser parser, Doc doc, GoldParse gold=None):
|
||||
self.parser = parser
|
||||
self.doc = doc
|
||||
if gold is not None:
|
||||
self.gold = gold
|
||||
self.parser.moves.preprocess_gold(self.gold)
|
||||
else:
|
||||
self.gold = GoldParse(doc)
|
||||
self.stcls = StateClass.init(doc.c, doc.length)
|
||||
self.parser.moves.initialize_state(self.stcls.c)
|
||||
self.eg = Example(
|
||||
|
@ -406,6 +426,24 @@ cdef class StepwiseState:
return [self.doc.vocab.strings[self.stcls.c._sent[i].dep]
for i in range(self.stcls.c.length)]

@property
def costs(self):
"""
Find the action-costs for the current state.
"""
if not self.gold:
raise ValueError("Can't set costs: No GoldParse provided")
self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs,
self.stcls, self.gold)
costs = {}
for i in range(self.parser.moves.n_moves):
if not self.eg.c.is_valid[i]:
continue
transition = self.parser.moves.c[i]
name = self.parser.moves.move_name(transition.move, transition.label)
costs[name] = self.eg.c.costs[i]
return costs

def predict(self):
self.eg.reset()
self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features,
|
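With the new gold argument to step_through() and the costs property above, the oracle cost of every currently valid action can be inspected while stepping the parser. A rough sketch, assuming a loaded pipeline `nlp` whose parser exposes step_through() as in this diff (sentence and annotations are illustrative):

# Sketch only: assumes StepwiseState exposes .costs and .transition() as shown above.
from spacy.gold import GoldParse

doc = nlp(u'I like London')
gold = GoldParse(doc, heads=[1, 1, 1], deps=['nsubj', 'ROOT', 'dobj'])

state = nlp.parser.step_through(doc, gold)
costs = state.costs                      # e.g. {'S': 0.0, 'L-nsubj': 2.0, ...}
zero_cost = [name for name, cost in costs.items() if cost == 0]
state.transition(zero_cost[0])           # follow a gold-consistent action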
@ -1,5 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals

from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t

from ..vocab cimport EMPTY_LEXEME
from ..structs cimport Entity
from ..lexeme cimport Lexeme

@ -28,6 +32,6 @@ cdef class StateClass:
top = words[self.S(0)] + '_%d' % self.S_(0).head
second = words[self.S(1)] + '_%d' % self.S_(1).head
third = words[self.S(2)] + '_%d' % self.S_(2).head
n0 = words[self.B(0)]
n1 = words[self.B(1)]
n0 = words[self.B(0)]
n1 = words[self.B(1)]
return ' '.join((third, second, top, '|', n0, n1))
|
|
|
@ -1,4 +1,8 @@
# cython: infer_types=True
# coding: utf-8
from __future__ import unicode_literals

from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from collections import defaultdict

@ -6,7 +10,6 @@ from collections import defaultdict
from ..structs cimport TokenC
from .stateclass cimport StateClass
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF


cdef weight_t MIN_SCORE = -90000

@ -32,7 +35,7 @@ cdef class TransitionSystem:
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))

for action, label_strs in sorted(labels_by_action.items()):
for label_str in sorted(label_strs):
for label_str in label_strs:
self.add_action(int(action), label_str)
self.root_label = self.strings['ROOT']
self.freqs = {} if _freqs is None else _freqs
|
|
|
@ -1,18 +0,0 @@
|
|||
from os import path
|
||||
import json
|
||||
|
||||
class Config(object):
|
||||
def __init__(self, **kwargs):
|
||||
for key, value in kwargs.items():
|
||||
setattr(self, key, value)
|
||||
|
||||
def get(self, attr, default=None):
|
||||
return self.__dict__.get(attr, default)
|
||||
|
||||
@classmethod
|
||||
def write(cls, model_dir, name, **kwargs):
|
||||
open(path.join(model_dir, '%s.json' % name), 'w').write(json.dumps(kwargs))
|
||||
|
||||
@classmethod
|
||||
def read(cls, model_dir, name):
|
||||
return cls(**json.load(open(path.join(model_dir, '%s.json' % name))))
|
|
@ -1,5 +1,7 @@
|
|||
import json
|
||||
import pathlib
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import ujson
|
||||
from collections import defaultdict
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
|
@ -12,8 +14,8 @@ from thinc.linalg cimport VecVec
|
|||
from .tokens.doc cimport Doc
|
||||
from .attrs cimport TAG
|
||||
from .gold cimport GoldParse
|
||||
|
||||
from .attrs cimport *
|
||||
from . import util
|
||||
|
||||
|
||||
cpdef enum:
|
||||
|
@ -106,10 +108,13 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
|||
|
||||
|
||||
cdef class Tagger:
|
||||
"""Annotate part-of-speech tags on Doc objects."""
|
||||
"""
|
||||
Annotate part-of-speech tags on Doc objects.
|
||||
"""
|
||||
@classmethod
|
||||
def load(cls, path, vocab, require=False):
|
||||
"""Load the statistical model from the supplied path.
|
||||
"""
|
||||
Load the statistical model from the supplied path.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
|
@ -123,10 +128,10 @@ cdef class Tagger:
|
|||
"""
|
||||
# TODO: Change this to expect config.json when we don't have to
|
||||
# support old data.
|
||||
path = path if not isinstance(path, basestring) else pathlib.Path(path)
|
||||
path = util.ensure_path(path)
|
||||
if (path / 'templates.json').exists():
|
||||
with (path / 'templates.json').open('r', encoding='utf8') as file_:
|
||||
templates = json.load(file_)
|
||||
templates = ujson.load(file_)
|
||||
elif require:
|
||||
raise IOError(
|
||||
"Required file %s/templates.json not found when loading Tagger" % str(path))
|
||||
|
@ -142,7 +147,8 @@ cdef class Tagger:
|
|||
return self
|
||||
|
||||
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
|
||||
"""Create a Tagger.
|
||||
"""
|
||||
Create a Tagger.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
|
@ -180,7 +186,8 @@ cdef class Tagger:
|
|||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
def __call__(self, Doc tokens):
|
||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
||||
"""
|
||||
Apply the tagger, setting the POS tags onto the Doc object.
|
||||
|
||||
Arguments:
|
||||
doc (Doc): The tokens to be tagged.
|
||||
|
@ -208,7 +215,8 @@ cdef class Tagger:
|
|||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
def pipe(self, stream, batch_size=1000, n_threads=2):
|
||||
"""Tag a stream of documents.
|
||||
"""
|
||||
Tag a stream of documents.
|
||||
|
||||
Arguments:
|
||||
stream: The sequence of documents to tag.
|
||||
|
@ -225,7 +233,8 @@ cdef class Tagger:
|
|||
yield doc
|
||||
|
||||
def update(self, Doc tokens, GoldParse gold, itn=0):
|
||||
"""Update the statistical model, with tags supplied for the given document.
|
||||
"""
|
||||
Update the statistical model, with tags supplied for the given document.
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
|
|
|
@ -1,17 +1,11 @@
|
|||
# cython: embedsignature=True
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pathlib
|
||||
import ujson
|
||||
|
||||
from cython.operator cimport dereference as deref
|
||||
from cython.operator cimport preincrement as preinc
|
||||
|
||||
try:
|
||||
import ujson as json
|
||||
except ImportError:
|
||||
import json
|
||||
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
|
||||
|
@ -23,12 +17,15 @@ from .tokens.doc cimport Doc
|
|||
|
||||
|
||||
cdef class Tokenizer:
|
||||
"""Segment text, and create Doc objects with the discovered segment boundaries."""
|
||||
"""
|
||||
Segment text, and create Doc objects with the discovered segment boundaries.
|
||||
"""
|
||||
@classmethod
|
||||
def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
|
||||
infix_finditer=None, token_match=None):
|
||||
'''Load a Tokenizer, reading unsupplied components from the path.
|
||||
|
||||
"""
|
||||
Load a Tokenizer, reading unsupplied components from the path.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
The path to load from.
|
||||
|
@ -45,13 +42,11 @@ cdef class Tokenizer:
|
|||
infix_finditer:
|
||||
Signature of re.compile(string).finditer
|
||||
Returns Tokenizer
|
||||
'''
|
||||
if isinstance(path, basestring):
|
||||
path = pathlib.Path(path)
|
||||
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
if rules is None:
|
||||
with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
|
||||
rules = json.load(file_)
|
||||
rules = ujson.load(file_)
|
||||
if prefix_search in (None, True):
|
||||
with (path / 'tokenizer' / 'prefix.txt').open() as file_:
|
||||
entries = file_.read().split('\n')
|
||||
|
@ -67,8 +62,9 @@ cdef class Tokenizer:
|
|||
return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)
|
||||
|
||||
def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
|
||||
'''Create a Tokenizer, to create Doc objects given unicode text.
|
||||
|
||||
"""
|
||||
Create a Tokenizer, to create Doc objects given unicode text.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
A storage container for lexical types.
|
||||
|
@ -85,7 +81,7 @@ cdef class Tokenizer:
|
|||
to find infixes.
|
||||
token_match:
|
||||
A boolean function matching strings that become tokens.
|
||||
'''
|
||||
"""
|
||||
self.mem = Pool()
|
||||
self._cache = PreshMap()
|
||||
self._specials = PreshMap()
|
||||
|
@ -107,7 +103,7 @@ cdef class Tokenizer:
|
|||
self.token_match)
|
||||
|
||||
return (self.__class__, args, None, None)
|
||||
|
||||
|
||||
cpdef Doc tokens_from_list(self, list strings):
|
||||
return Doc(self.vocab, words=strings)
|
||||
#raise NotImplementedError(
|
||||
|
@ -117,7 +113,8 @@ cdef class Tokenizer:
|
|||
|
||||
@cython.boundscheck(False)
|
||||
def __call__(self, unicode string):
|
||||
"""Tokenize a string.
|
||||
"""
|
||||
Tokenize a string.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to tokenize.
|
||||
|
@ -170,7 +167,8 @@ cdef class Tokenizer:
|
|||
return tokens
|
||||
|
||||
def pipe(self, texts, batch_size=1000, n_threads=2):
|
||||
"""Tokenize a stream of texts.
|
||||
"""
|
||||
Tokenize a stream of texts.
|
||||
|
||||
Arguments:
|
||||
texts: A sequence of unicode texts.
|
||||
|
@ -270,7 +268,7 @@ cdef class Tokenizer:
|
|||
cache_hit = self._try_cache(hash_string(string), tokens)
|
||||
if cache_hit:
|
||||
pass
|
||||
elif self.token_match and self.token_match(string):
|
||||
elif self.token_match and self.token_match(string):
|
||||
# We're always saying 'no' to spaces here -- the caller will
|
||||
# fix up the outermost one, with reference to the original.
|
||||
# See Issue #859
|
||||
|
@ -324,7 +322,8 @@ cdef class Tokenizer:
|
|||
self._cache.set(key, cached)
|
||||
|
||||
def find_infix(self, unicode string):
|
||||
"""Find internal split points of the string, such as hyphens.
|
||||
"""
|
||||
Find internal split points of the string, such as hyphens.
|
||||
|
||||
string (unicode): The string to segment.
|
||||
|
||||
|
@ -337,7 +336,8 @@ cdef class Tokenizer:
|
|||
return list(self.infix_finditer(string))
|
||||
|
||||
def find_prefix(self, unicode string):
|
||||
"""Find the length of a prefix that should be segmented from the string,
|
||||
"""
|
||||
Find the length of a prefix that should be segmented from the string,
|
||||
or None if no prefix rules match.
|
||||
|
||||
Arguments:
|
||||
|
@ -350,7 +350,8 @@ cdef class Tokenizer:
|
|||
return (match.end() - match.start()) if match is not None else 0
|
||||
|
||||
def find_suffix(self, unicode string):
|
||||
"""Find the length of a suffix that should be segmented from the string,
|
||||
"""
|
||||
Find the length of a suffix that should be segmented from the string,
|
||||
or None if no suffix rules match.
|
||||
|
||||
Arguments:
|
||||
|
@ -363,13 +364,15 @@ cdef class Tokenizer:
|
|||
return (match.end() - match.start()) if match is not None else 0
|
||||
|
||||
def _load_special_tokenization(self, special_cases):
|
||||
'''Add special-case tokenization rules.
|
||||
'''
|
||||
"""
|
||||
Add special-case tokenization rules.
|
||||
"""
|
||||
for chunk, substrings in sorted(special_cases.items()):
|
||||
self.add_special_case(chunk, substrings)
|
||||
|
||||
|
||||
def add_special_case(self, unicode string, substrings):
|
||||
'''Add a special-case tokenization rule.
|
||||
"""
|
||||
Add a special-case tokenization rule.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to specially tokenize.
|
||||
|
@ -378,7 +381,7 @@ cdef class Tokenizer:
|
|||
attributes. The ORTH fields of the attributes must exactly match
|
||||
the string when they are concatenated.
|
||||
Returns None
|
||||
'''
|
||||
"""
|
||||
substrings = list(substrings)
|
||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
cached.length = len(substrings)
|
||||
|
|
|
@ -1,15 +1,18 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
cimport cython
|
||||
cimport numpy as np
|
||||
import numpy
|
||||
import numpy.linalg
|
||||
import struct
|
||||
|
||||
from libc.string cimport memcpy, memset
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.math cimport sqrt
|
||||
|
||||
import numpy
|
||||
import numpy.linalg
|
||||
import struct
|
||||
cimport numpy as np
|
||||
import six
|
||||
import warnings
|
||||
|
||||
from .span cimport Span
|
||||
from .token cimport Token
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..lexeme cimport EMPTY_LEXEME
|
||||
from ..typedefs cimport attr_t, flags_t
|
||||
|
@ -19,11 +22,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
|||
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
|
||||
from ..parts_of_speech cimport univ_pos_t
|
||||
from ..lexeme cimport Lexeme
|
||||
from .span cimport Span
|
||||
from .token cimport Token
|
||||
from ..serialize.bits cimport BitArray
|
||||
from ..util import normalize_slice
|
||||
from ..syntax.iterators import CHUNKERS
|
||||
from ..compat import is_config
|
||||
|
||||
|
||||
DEF PADDING = 5
|
||||
|
@ -76,7 +78,7 @@ cdef class Doc:
|
|||
|
||||
"""
|
||||
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
|
||||
'''
|
||||
"""
|
||||
Create a Doc object.
|
||||
|
||||
Aside: Implementation
|
||||
|
@ -97,7 +99,7 @@ cdef class Doc:
|
|||
A list of boolean values, of the same length as words. True
|
||||
means that the word is followed by a space, False means it is not.
|
||||
If None, defaults to [True]*len(words)
|
||||
'''
|
||||
"""
|
||||
self.vocab = vocab
|
||||
size = 20
|
||||
self.mem = Pool()
|
||||
|
@ -158,7 +160,7 @@ cdef class Doc:
|
|||
self.is_parsed = True
|
||||
|
||||
def __getitem__(self, object i):
|
||||
'''
|
||||
"""
|
||||
doc[i]
|
||||
Get the Token object at position i, where i is an integer.
|
||||
Negative indexing is supported, and follows the usual Python
|
||||
|
@ -172,7 +174,7 @@ cdef class Doc:
|
|||
are not supported, as `Span` objects must be contiguous (cannot have gaps).
|
||||
You can use negative indices and open-ended ranges, which have their
|
||||
normal Python semantics.
|
||||
'''
|
||||
"""
|
||||
if isinstance(i, slice):
|
||||
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
|
||||
return Span(self, start, stop, label=0)
|
||||
|
@ -186,7 +188,7 @@ cdef class Doc:
|
|||
return Token.cinit(self.vocab, &self.c[i], i, self)
|
||||
|
||||
def __iter__(self):
|
||||
'''
|
||||
"""
|
||||
for token in doc
|
||||
Iterate over `Token` objects, from which the annotations can
|
||||
be easily accessed. This is the main way of accessing Token
|
||||
|
@ -194,7 +196,7 @@ cdef class Doc:
|
|||
Python. If faster-than-Python speeds are required, you can
|
||||
instead access the annotations as a numpy array, or access the
|
||||
underlying C data directly from Cython.
|
||||
'''
|
||||
"""
|
||||
cdef int i
|
||||
for i in range(self.length):
|
||||
if self._py_tokens[i] is not None:
|
||||
|
@ -203,10 +205,10 @@ cdef class Doc:
|
|||
yield Token.cinit(self.vocab, &self.c[i], i, self)
|
||||
|
||||
def __len__(self):
|
||||
'''
|
||||
"""
|
||||
len(doc)
|
||||
The number of tokens in the document.
|
||||
'''
|
||||
"""
|
||||
return self.length
|
||||
|
||||
def __unicode__(self):
|
||||
|
@ -216,7 +218,7 @@ cdef class Doc:
|
|||
return u''.join([t.text_with_ws for t in self]).encode('utf-8')
|
||||
|
||||
def __str__(self):
|
||||
if six.PY3:
|
||||
if is_config(python3=True):
|
||||
return self.__unicode__()
|
||||
return self.__bytes__()
|
||||
|
||||
|
@ -228,7 +230,8 @@ cdef class Doc:
|
|||
return self
|
||||
|
||||
def similarity(self, other):
|
||||
'''Make a semantic similarity estimate. The default estimate is cosine
|
||||
"""
|
||||
Make a semantic similarity estimate. The default estimate is cosine
|
||||
similarity using an average of word vectors.
|
||||
|
||||
Arguments:
|
||||
|
@ -237,7 +240,7 @@ cdef class Doc:
|
|||
|
||||
Return:
|
||||
score (float): A scalar similarity score. Higher is more similar.
|
||||
'''
|
||||
"""
|
||||
if 'similarity' in self.user_hooks:
|
||||
return self.user_hooks['similarity'](self, other)
|
||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||
|
@ -245,9 +248,9 @@ cdef class Doc:
|
|||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
|
||||
property has_vector:
|
||||
'''
|
||||
"""
|
||||
A boolean value indicating whether a word vector is associated with the object.
|
||||
'''
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'has_vector' in self.user_hooks:
|
||||
return self.user_hooks['has_vector'](self)
|
||||
|
@ -255,11 +258,11 @@ cdef class Doc:
|
|||
return any(token.has_vector for token in self)
|
||||
|
||||
property vector:
|
||||
'''
|
||||
"""
|
||||
A real-valued meaning representation. Defaults to an average of the token vectors.
|
||||
|
||||
Type: numpy.ndarray[ndim=1, dtype='float32']
|
||||
'''
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector' in self.user_hooks:
|
||||
return self.user_hooks['vector'](self)
|
||||
|
@ -294,17 +297,21 @@ cdef class Doc:
|
|||
return self.text
|
||||
|
||||
property text:
|
||||
'''A unicode representation of the document text.'''
|
||||
"""
|
||||
A unicode representation of the document text.
|
||||
"""
|
||||
def __get__(self):
|
||||
return u''.join(t.text_with_ws for t in self)
|
||||
|
||||
property text_with_ws:
|
||||
'''An alias of Doc.text, provided for duck-type compatibility with Span and Token.'''
|
||||
"""
|
||||
An alias of Doc.text, provided for duck-type compatibility with Span and Token.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.text
|
||||
|
||||
property ents:
|
||||
'''
|
||||
"""
|
||||
Yields named-entity `Span` objects, if the entity recognizer
|
||||
has been applied to the document. Iterate over the span to get
|
||||
individual Token objects, or access the label:
|
||||
|
@ -318,7 +325,7 @@ cdef class Doc:
|
|||
assert ents[0].label_ == 'PERSON'
|
||||
assert ents[0].orth_ == 'Best'
|
||||
assert ents[0].text == 'Mr. Best'
|
||||
'''
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef int i
|
||||
cdef const TokenC* token
|
||||
|
@ -382,13 +389,13 @@ cdef class Doc:
|
|||
self.c[start].ent_iob = 3
|
||||
|
||||
property noun_chunks:
|
||||
'''
|
||||
"""
|
||||
Yields base noun-phrase #[code Span] objects, if the document
|
||||
has been syntactically parsed. A base noun phrase, or
|
||||
'NP chunk', is a noun phrase that does not permit other NPs to
|
||||
be nested within it – so no NP-level coordination, no prepositional
|
||||
phrases, and no relative clauses. For example:
|
||||
'''
|
||||
phrases, and no relative clauses.
|
||||
"""
|
||||
def __get__(self):
|
||||
if not self.is_parsed:
|
||||
raise ValueError(
|
||||
|
@ -496,7 +503,8 @@ cdef class Doc:
|
|||
return output
|
||||
|
||||
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
|
||||
"""Produce a dict of {attribute (int): count (ints)} frequencies, keyed
|
||||
"""
|
||||
Produce a dict of {attribute (int): count (ints)} frequencies, keyed
|
||||
by the values of the given attribute ID.
|
||||
|
||||
Example:
|
||||
|
@ -563,8 +571,9 @@ cdef class Doc:
|
|||
self.c[i] = parsed[i]
|
||||
|
||||
def from_array(self, attrs, array):
|
||||
'''Write to a `Doc` object, from an `(M, N)` array of attributes.
|
||||
'''
|
||||
"""
|
||||
Write to a `Doc` object, from an `(M, N)` array of attributes.
|
||||
"""
|
||||
cdef int i, col
|
||||
cdef attr_id_t attr_id
|
||||
cdef TokenC* tokens = self.c
|
||||
|
@ -603,19 +612,23 @@ cdef class Doc:
|
|||
return self
|
||||
|
||||
def to_bytes(self):
|
||||
'''Serialize, producing a byte string.'''
|
||||
"""
|
||||
Serialize, producing a byte string.
|
||||
"""
|
||||
byte_string = self.vocab.serializer.pack(self)
|
||||
cdef uint32_t length = len(byte_string)
|
||||
return struct.pack('I', length) + byte_string
|
||||
|
||||
def from_bytes(self, data):
|
||||
'''Deserialize, loading from bytes.'''
|
||||
"""
|
||||
Deserialize, loading from bytes.
|
||||
"""
|
||||
self.vocab.serializer.unpack_into(data[4:], self)
|
||||
return self
|
||||
|
||||
@staticmethod
|
||||
def read_bytes(file_):
|
||||
'''
|
||||
"""
|
||||
A static method, used to read serialized #[code Doc] objects from
|
||||
a file. For example:
|
||||
|
||||
|
@ -630,7 +643,7 @@ cdef class Doc:
|
|||
for byte_string in Doc.read_bytes(file_):
|
||||
docs.append(Doc(nlp.vocab).from_bytes(byte_string))
|
||||
assert len(docs) == 2
|
||||
'''
|
||||
"""
|
||||
keep_reading = True
|
||||
while keep_reading:
|
||||
try:
|
||||
|
@ -644,7 +657,8 @@ cdef class Doc:
|
|||
yield n_bytes_str + data
|
||||
|
||||
def merge(self, int start_idx, int end_idx, *args, **attributes):
|
||||
"""Retokenize the document, such that the span at doc.text[start_idx : end_idx]
|
||||
"""
|
||||
Retokenize the document, such that the span at doc.text[start_idx : end_idx]
|
||||
is merged into a single token. If start_idx and end_idx do not mark start
|
||||
and end token boundaries, the document remains unchanged.
|
||||
|
||||
|
@ -658,7 +672,6 @@ cdef class Doc:
|
|||
token (Token):
|
||||
The newly merged token, or None if the start and end indices did
|
||||
not fall at token boundaries.
|
||||
|
||||
"""
|
||||
cdef unicode tag, lemma, ent_type
|
||||
if len(args) == 3:
|
||||
|
|
|
@ -1,26 +1,31 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
from collections import defaultdict
|
||||
|
||||
cimport numpy as np
|
||||
import numpy
|
||||
import numpy.linalg
|
||||
cimport numpy as np
|
||||
from libc.math cimport sqrt
|
||||
import six
|
||||
|
||||
from .doc cimport token_by_start, token_by_end
|
||||
from ..structs cimport TokenC, LexemeC
|
||||
from ..typedefs cimport flags_t, attr_t, hash_t
|
||||
from ..attrs cimport attr_id_t
|
||||
from ..parts_of_speech cimport univ_pos_t
|
||||
from ..util import normalize_slice
|
||||
from .doc cimport token_by_start, token_by_end
|
||||
from ..attrs cimport IS_PUNCT, IS_SPACE
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..compat import is_config
|
||||
|
||||
|
||||
cdef class Span:
|
||||
"""A slice from a Doc object."""
|
||||
"""
|
||||
A slice from a Doc object.
|
||||
"""
|
||||
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
|
||||
vector_norm=None):
|
||||
'''Create a Span object from the slice doc[start : end]
|
||||
"""
|
||||
Create a Span object from the slice doc[start : end]
|
||||
|
||||
Arguments:
|
||||
doc (Doc): The parent document.
|
||||
|
@ -30,7 +35,7 @@ cdef class Span:
|
|||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
||||
Returns:
|
||||
Span The newly constructed object.
|
||||
'''
|
||||
"""
|
||||
if not (0 <= start <= end <= len(doc)):
|
||||
raise IndexError
|
||||
|
||||
|
@ -68,7 +73,7 @@ cdef class Span:
|
|||
return self.end - self.start
|
||||
|
||||
def __repr__(self):
|
||||
if six.PY3:
|
||||
if is_config(python3=True):
|
||||
return self.text
|
||||
return self.text.encode('utf-8')
|
||||
|
||||
|
@ -89,7 +94,8 @@ cdef class Span:
|
|||
yield self.doc[i]
|
||||
|
||||
def merge(self, *args, **attributes):
|
||||
"""Retokenize the document, such that the span is merged into a single token.
|
||||
"""
|
||||
Retokenize the document, such that the span is merged into a single token.
|
||||
|
||||
Arguments:
|
||||
**attributes:
|
||||
|
@ -102,7 +108,8 @@ cdef class Span:
|
|||
return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
|
||||
|
||||
def similarity(self, other):
|
||||
'''Make a semantic similarity estimate. The default estimate is cosine
|
||||
"""
|
||||
Make a semantic similarity estimate. The default estimate is cosine
|
||||
similarity using an average of word vectors.
|
||||
|
||||
Arguments:
|
||||
|
@ -111,7 +118,7 @@ cdef class Span:
|
|||
|
||||
Return:
|
||||
score (float): A scalar similarity score. Higher is more similar.
|
||||
'''
|
||||
"""
|
||||
if 'similarity' in self.doc.user_span_hooks:
|
||||
self.doc.user_span_hooks['similarity'](self, other)
|
||||
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
|
||||
|
@ -133,11 +140,12 @@ cdef class Span:
|
|||
self.end = end + 1
|
||||
|
||||
property sent:
|
||||
'''The sentence span that this span is a part of.
|
||||
"""
|
||||
The sentence span that this span is a part of.
|
||||
|
||||
Returns:
|
||||
Span The sentence this is part of.
|
||||
'''
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'sent' in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks['sent'](self)
|
||||
|
@ -198,13 +206,13 @@ cdef class Span:
|
|||
return u''.join([t.text_with_ws for t in self])
|
||||
|
||||
property noun_chunks:
|
||||
'''
|
||||
"""
|
||||
Yields base noun-phrase #[code Span] objects, if the document
|
||||
has been syntactically parsed. A base noun phrase, or
|
||||
'NP chunk', is a noun phrase that does not permit other NPs to
|
||||
be nested within it – so no NP-level coordination, no prepositional
|
||||
phrases, and no relative clauses. For example:
|
||||
'''
|
||||
"""
|
||||
def __get__(self):
|
||||
if not self.doc.is_parsed:
|
||||
raise ValueError(
|
||||
|
@ -223,17 +231,16 @@ cdef class Span:
yield span

property root:
"""The token within the span that's highest in the parse tree. If there's a tie, the earliest is preferred.
"""
The token within the span that's highest in the parse tree. If there's a
tie, the earliest is preferred.

Returns:
Token: The root token.

i.e. has the
shortest path to the root of the sentence (or is the root itself).

If multiple words are equally high in the tree, the first word is taken.

For example:
i.e. has the shortest path to the root of the sentence (or is the root
itself). If multiple words are equally high in the tree, the first word
is taken. For example:

>>> toks = nlp(u'I like New York in Autumn.')
|
||||
|
@ -303,7 +310,8 @@ cdef class Span:
|
|||
return self.doc[root]
|
||||
|
||||
property lefts:
|
||||
"""Tokens that are to the left of the span, whose head is within the Span.
|
||||
"""
|
||||
Tokens that are to the left of the span, whose head is within the Span.
|
||||
|
||||
Yields: Token A left-child of a token of the span.
|
||||
"""
|
||||
|
@ -314,7 +322,8 @@ cdef class Span:
|
|||
yield left
|
||||
|
||||
property rights:
|
||||
"""Tokens that are to the right of the Span, whose head is within the Span.
|
||||
"""
|
||||
Tokens that are to the right of the Span, whose head is within the Span.
|
||||
|
||||
Yields: Token A right-child of a token of the span.
|
||||
"""
|
||||
|
@ -325,7 +334,8 @@ cdef class Span:
|
|||
yield right
|
||||
|
||||
property subtree:
|
||||
"""Tokens that descend from tokens in the span, but fall outside it.
|
||||
"""
|
||||
Tokens that descend from tokens in the span, but fall outside it.
|
||||
|
||||
Yields: Token A descendant of a token within the span.
|
||||
"""
|
||||
|
@ -337,7 +347,9 @@ cdef class Span:
|
|||
yield from word.subtree
|
||||
|
||||
property ent_id:
|
||||
'''An (integer) entity ID. Usually assigned by patterns in the Matcher.'''
|
||||
"""
|
||||
An (integer) entity ID. Usually assigned by patterns in the Matcher.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.root.ent_id
|
||||
|
||||
|
@ -345,9 +357,11 @@ cdef class Span:
|
|||
# TODO
|
||||
raise NotImplementedError(
|
||||
"Can't yet set ent_id from Span. Vote for this feature on the issue "
|
||||
"tracker: http://github.com/spacy-io/spaCy")
|
||||
"tracker: http://github.com/explosion/spaCy/issues")
|
||||
property ent_id_:
|
||||
'''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
|
||||
"""
|
||||
A (string) entity ID. Usually assigned by patterns in the Matcher.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.root.ent_id_
|
||||
|
||||
|
@ -355,7 +369,7 @@ cdef class Span:
|
|||
# TODO
|
||||
raise NotImplementedError(
|
||||
"Can't yet set ent_id_ from Span. Vote for this feature on the issue "
|
||||
"tracker: http://github.com/spacy-io/spaCy")
|
||||
"tracker: http://github.com/explosion/spaCy/issues")
|
||||
|
||||
property orth_:
|
||||
def __get__(self):
|
||||
|
@ -397,5 +411,5 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
|
|||
raise RuntimeError(
|
||||
"Array bounds exceeded while searching for root word. This likely "
|
||||
"means the parse tree is in an invalid state. Please report this "
|
||||
"issue here: http://github.com/honnibal/spaCy/")
|
||||
"issue here: http://github.com/explosion/spaCy/issues")
|
||||
return n
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# coding: utf8
|
||||
# cython: infer_types=True
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from libc.string cimport memcpy
|
||||
|
@ -8,20 +8,15 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free
|
|||
from cython.view cimport array as cvarray
|
||||
cimport numpy as np
|
||||
np.import_array()
|
||||
|
||||
import numpy
|
||||
import six
|
||||
|
||||
|
||||
from ..typedefs cimport hash_t
|
||||
from ..lexeme cimport Lexeme
|
||||
from .. import parts_of_speech
|
||||
|
||||
from ..attrs cimport LEMMA
|
||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from ..attrs cimport POS, LEMMA, TAG, DEP
|
||||
from ..parts_of_speech cimport CCONJ, PUNCT
|
||||
|
||||
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||
from ..attrs cimport IS_BRACKET
|
||||
from ..attrs cimport IS_QUOTE
|
||||
|
@ -29,12 +24,13 @@ from ..attrs cimport IS_LEFT_PUNCT
|
|||
from ..attrs cimport IS_RIGHT_PUNCT
|
||||
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||
from ..attrs cimport IS_OOV
|
||||
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..compat import is_config
|
||||
|
||||
|
||||
cdef class Token:
|
||||
"""An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
|
||||
"""
|
||||
An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
|
||||
"""
|
||||
def __cinit__(self, Vocab vocab, Doc doc, int offset):
|
||||
self.vocab = vocab
|
||||
|
@ -46,7 +42,9 @@ cdef class Token:
|
|||
return hash((self.doc, self.i))
|
||||
|
||||
def __len__(self):
|
||||
'''Number of unicode characters in token.text'''
|
||||
"""
|
||||
Number of unicode characters in token.text.
|
||||
"""
|
||||
return self.c.lex.length
|
||||
|
||||
def __unicode__(self):
|
||||
|
@ -56,7 +54,7 @@ cdef class Token:
|
|||
return self.text.encode('utf8')
|
||||
|
||||
def __str__(self):
|
||||
if six.PY3:
|
||||
if is_config(python3=True):
|
||||
return self.__unicode__()
|
||||
return self.__bytes__()
|
||||
|
||||
|
@ -83,27 +81,30 @@ cdef class Token:
|
|||
raise ValueError(op)
|
||||
|
||||
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
|
||||
'''Check the value of a boolean flag.
|
||||
"""
|
||||
Check the value of a boolean flag.
|
||||
|
||||
Arguments:
|
||||
flag_id (int): The ID of the flag attribute.
|
||||
Returns:
|
||||
is_set (bool): Whether the flag is set.
|
||||
'''
|
||||
"""
|
||||
return Lexeme.c_check_flag(self.c.lex, flag_id)
|
||||
|
||||
def nbor(self, int i=1):
|
||||
'''Get a neighboring token.
|
||||
"""
|
||||
Get a neighboring token.
|
||||
|
||||
Arguments:
|
||||
i (int): The relative position of the token to get. Defaults to 1.
|
||||
Returns:
|
||||
neighbor (Token): The token at position self.doc[self.i+i]
|
||||
'''
|
||||
"""
|
||||
return self.doc[self.i+i]
|
||||
|
||||
def similarity(self, other):
|
||||
'''Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
"""
|
||||
Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
|
||||
Arguments:
|
||||
other:
|
||||
|
@ -111,7 +112,7 @@ cdef class Token:
|
|||
Token and Lexeme objects.
|
||||
Returns:
|
||||
score (float): A scalar similarity score. Higher is more similar.
|
||||
'''
|
||||
"""
|
||||
if 'similarity' in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks['similarity'](self)
|
||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||
|
@ -209,9 +210,9 @@ cdef class Token:
|
|||
self.c.dep = label
|
||||
|
||||
property has_vector:
|
||||
'''
|
||||
"""
|
||||
A boolean value indicating whether a word vector is associated with the object.
|
||||
'''
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'has_vector' in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks['has_vector'](self)
|
||||
|
@ -223,11 +224,11 @@ cdef class Token:
|
|||
return False
|
||||
|
||||
property vector:
|
||||
'''
|
||||
"""
|
||||
A real-valued meaning representation.
|
||||
|
||||
Type: numpy.ndarray[ndim=1, dtype='float32']
|
||||
'''
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector' in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks['vector'](self)
|
||||
|
@ -245,6 +246,7 @@ cdef class Token:
|
|||
property repvec:
|
||||
def __get__(self):
|
||||
raise AttributeError("repvec was renamed to vector in v0.100")
|
||||
|
||||
property has_repvec:
|
||||
def __get__(self):
|
||||
raise AttributeError("has_repvec was renamed to has_vector in v0.100")
|
||||
|
@ -265,7 +267,8 @@ cdef class Token:
|
|||
|
||||
property lefts:
|
||||
def __get__(self):
|
||||
"""The leftward immediate children of the word, in the syntactic
|
||||
"""
|
||||
The leftward immediate children of the word, in the syntactic
|
||||
dependency parse.
|
||||
"""
|
||||
cdef int nr_iter = 0
|
||||
|
@ -282,8 +285,10 @@ cdef class Token:
|
|||
|
||||
property rights:
|
||||
def __get__(self):
|
||||
"""The rightward immediate children of the word, in the syntactic
|
||||
dependency parse."""
|
||||
"""
|
||||
The rightward immediate children of the word, in the syntactic
|
||||
dependency parse.
|
||||
"""
|
||||
cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
|
||||
tokens = []
|
||||
cdef int nr_iter = 0
|
||||
|
@ -300,19 +305,21 @@ cdef class Token:
|
|||
yield t
|
||||
|
||||
property children:
|
||||
'''A sequence of the token's immediate syntactic children.
|
||||
"""
|
||||
A sequence of the token's immediate syntactic children.
|
||||
|
||||
Yields: Token A child token such that child.head==self
|
||||
'''
|
||||
"""
|
||||
def __get__(self):
|
||||
yield from self.lefts
|
||||
yield from self.rights
|
||||
|
||||
property subtree:
|
||||
'''A sequence of all the token's syntactic descendents.
|
||||
"""
|
||||
A sequence of all the token's syntactic descendents.
|
||||
|
||||
Yields: Token A descendent token such that self.is_ancestor(descendent)
|
||||
'''
|
||||
"""
|
||||
def __get__(self):
|
||||
for word in self.lefts:
|
||||
yield from word.subtree
|
||||
|
@ -321,26 +328,29 @@ cdef class Token:
|
|||
yield from word.subtree
|
||||
|
||||
property left_edge:
|
||||
'''The leftmost token of this token's syntactic descendents.
|
||||
"""
|
||||
The leftmost token of this token's syntactic descendents.
|
||||
|
||||
Returns: Token The first token such that self.is_ancestor(token)
|
||||
'''
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.doc[self.c.l_edge]
|
||||
|
||||
property right_edge:
|
||||
'''The rightmost token of this token's syntactic descendents.
|
||||
"""
|
||||
The rightmost token of this token's syntactic descendents.
|
||||
|
||||
Returns: Token The last token such that self.is_ancestor(token)
|
||||
'''
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.doc[self.c.r_edge]
|
||||
|
||||
property ancestors:
|
||||
'''A sequence of this token's syntactic ancestors.
|
||||
"""
|
||||
A sequence of this token's syntactic ancestors.
|
||||
|
||||
Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
|
||||
'''
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef const TokenC* head_ptr = self.c
|
||||
# guard against infinite loop, no token can have
|
||||
|
@ -356,25 +366,29 @@ cdef class Token:
|
|||
return self.is_ancestor(descendant)
|
||||
|
||||
def is_ancestor(self, descendant):
|
||||
'''Check whether this token is a parent, grandparent, etc. of another
|
||||
"""
|
||||
Check whether this token is a parent, grandparent, etc. of another
|
||||
in the dependency tree.
|
||||
|
||||
Arguments:
|
||||
descendant (Token): Another token.
|
||||
Returns:
|
||||
is_ancestor (bool): Whether this token is the ancestor of the descendant.
|
||||
'''
|
||||
"""
|
||||
if self.doc is not descendant.doc:
|
||||
return False
|
||||
return any( ancestor.i == self.i for ancestor in descendant.ancestors )
|
||||
|
||||
property head:
|
||||
'''The syntactic parent, or "governor", of this token.
|
||||
"""
|
||||
The syntactic parent, or "governor", of this token.
|
||||
|
||||
Returns: Token
|
||||
'''
|
||||
"""
|
||||
def __get__(self):
|
||||
"""The token predicted by the parser to be the head of the current token."""
|
||||
"""
|
||||
The token predicted by the parser to be the head of the current token.
|
||||
"""
|
||||
return self.doc[self.i + self.c.head]
|
||||
def __set__(self, Token new_head):
|
||||
# this function sets the head of self to new_head
|
||||
|
@ -467,10 +481,11 @@ cdef class Token:
|
|||
self.c.head = rel_newhead_i
|
||||
|
||||
property conjuncts:
|
||||
'''A sequence of coordinated tokens, including the token itself.
|
||||
"""
|
||||
A sequence of coordinated tokens, including the token itself.
|
||||
|
||||
Yields: Token A coordinated token
|
||||
'''
|
||||
"""
|
||||
def __get__(self):
|
||||
"""Get a list of conjoined words."""
|
||||
cdef Token word
|
||||
|
@ -501,7 +516,9 @@ cdef class Token:
|
|||
return iob_strings[self.c.ent_iob]
|
||||
|
||||
property ent_id:
|
||||
'''An (integer) entity ID. Usually assigned by patterns in the Matcher.'''
|
||||
"""
|
||||
An (integer) entity ID. Usually assigned by patterns in the Matcher.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.ent_id
|
||||
|
||||
|
@ -509,7 +526,9 @@ cdef class Token:
|
|||
self.c.ent_id = key
|
||||
|
||||
property ent_id_:
|
||||
'''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
|
||||
"""
|
||||
A (string) entity ID. Usually assigned by patterns in the Matcher.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.ent_id]
|
||||
|
||||
|
|
|
@ -1,15 +1,16 @@
from __future__ import absolute_import
from __future__ import unicode_literals
# coding: utf8
from __future__ import absolute_import, unicode_literals

import random
import tqdm
from .gold import GoldParse
from .gold import GoldParse, merge_sents
from .scorer import Scorer
from .gold import merge_sents


class Trainer(object):
'''Manage training of an NLP pipeline.'''
"""
Manage training of an NLP pipeline.
"""
def __init__(self, nlp, gold_tuples):
self.nlp = nlp
self.gold_tuples = gold_tuples
|
|
|
@@ -1,29 +1,18 @@
# coding: utf8
from __future__ import unicode_literals, print_function
import os

import io
import json
import ujson
import re
import os.path
import pathlib
from pathlib import Path
import sys
import textwrap


try:
basestring
except NameError:
basestring = str


try:
raw_input
except NameError: # Python 3
raw_input = input
from .compat import basestring_, unicode_, input_


LANGUAGES = {}
_data_path = pathlib.Path(__file__).parent / 'data'
_data_path = Path(__file__).parent / 'data'


def set_lang_class(name, cls):

@@ -47,9 +36,14 @@ def get_data_path(require_exists=True):

def set_data_path(path):
global _data_path
if isinstance(path, basestring):
path = pathlib.Path(path)
_data_path = path
_data_path = ensure_path(path)


def ensure_path(path):
if isinstance(path, basestring_):
return Path(path)
else:
return path
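Note (not part of the commit): the new ensure_path helper replaces the scattered isinstance(path, basestring) checks. A brief usage sketch; the example path is invented.

    # Hedged sketch: ensure_path accepts a string or a Path and hands back a Path,
    # so callers no longer need to branch on the input type.
    from pathlib import Path

    assert ensure_path('/tmp/spacy-data') == Path('/tmp/spacy-data')        # hypothetical path
    assert ensure_path(Path('/tmp/spacy-data')) == Path('/tmp/spacy-data')
    assert ensure_path(None) is None    # non-string inputs pass through unchanged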
def or_(val1, val2):

@@ -61,41 +55,8 @@ def or_(val1, val2):
return val2


def match_best_version(target_name, target_version, path):
path = path if not isinstance(path, basestring) else pathlib.Path(path)
if path is None or not path.exists():
return None
matches = []
for data_name in path.iterdir():
name, version = split_data_name(data_name.parts[-1])
if name == target_name and constraint_match(target_version, version):
matches.append((tuple(float(v) for v in version.split('.')), data_name))
if matches:
return pathlib.Path(max(matches)[1])
else:
return None


def split_data_name(name):
return name.split('-', 1) if '-' in name else (name, '')


def constraint_match(constraint_string, version):
# From http://github.com/spacy-io/sputnik
if not constraint_string:
return True

constraints = [c.strip() for c in constraint_string.split(',') if c.strip()]

for c in constraints:
if not re.match(r'[><=][=]?\d+(\.\d+)*', c):
raise ValueError('invalid constraint: %s' % c)

return all(semver.match(version, c) for c in constraints)


def read_regex(path):
path = path if not isinstance(path, basestring) else pathlib.Path(path)
path = ensure_path(path)
with path.open() as file_:
entries = file_.read().split('\n')
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
@@ -152,21 +113,11 @@ def check_renamed_kwargs(renamed, kwargs):
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))


def is_windows():
"""Check if user is on Windows."""
return sys.platform.startswith('win')


def is_python2():
"""Check if Python 2 is used."""
return sys.version.startswith('2.')


def parse_package_meta(package_path, package, require=True):
location = os.path.join(str(package_path), package, 'meta.json')
if os.path.isfile(location):
with io.open(location, encoding='utf8') as f:
meta = json.load(f)
location = package_path / package / 'meta.json'
if location.is_file():
with location.open('r', encoding='utf8') as f:
meta = ujson.load(f)
return meta
elif require:
raise IOError("Could not read meta.json from %s" % location)
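Note (not part of the commit): after this hunk, parse_package_meta works on pathlib.Path objects and reads with ujson instead of os.path/io/json. A hedged call sketch; the data directory and package name below are made up.

    # Hedged sketch: both arguments are hypothetical; the point is that
    # package_path is now expected to be a Path, not a string.
    from pathlib import Path

    meta = parse_package_meta(Path('/usr/local/share/spacy/data'), 'en_core_web_sm',
                              require=False)
    if meta is not None:
        print(meta.get('version'))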
@@ -181,7 +132,7 @@ def get_raw_input(description, default=False):

additional = ' (default: {d})'.format(d=default) if default else ''
prompt = '    {d}{a}: '.format(d=description, a=additional)
user_input = raw_input(prompt)
user_input = input_(prompt)
return user_input


@@ -209,10 +160,9 @@ def print_markdown(data, **kwargs):
which will be converted to a list of tuples."""

def excl_value(value):
# don't print value if it contains absolute path of directory
# (i.e. personal info that shouldn't need to be shared)
# other conditions can be included here if necessary
if str(pathlib.Path(__file__).parent) in value:
# don't print value if it contains absolute path of directory (i.e.
# personal info). Other conditions can be included here if necessary.
if unicode_(Path(__file__).parent) in value:
return True

if type(data) == dict:
spacy/vocab.pyx
@@ -1,41 +1,29 @@
# coding: utf8
from __future__ import unicode_literals

import bz2
import ujson
import re

from libc.string cimport memset
from libc.stdint cimport int32_t
from libc.math cimport sqrt

from pathlib import Path
import bz2
import ujson as json
import re

try:
import cPickle as pickle
except ImportError:
import pickle

from cymem.cymem cimport Address
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme
from .strings cimport hash_string
from .typedefs cimport attr_t
from .cfile cimport CFile, StringCFile
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .tokens.token cimport Token

from . import attrs
from . import symbols

from cymem.cymem cimport Address
from .serialize.packer cimport Packer
from .attrs cimport PROB, LANG

from .compat import copy_reg, pickle
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from . import util

try:
import copy_reg
except ImportError:
import copyreg as copy_reg
from . import attrs
from . import symbols


DEF MAX_VEC_SIZE = 100000
@@ -48,8 +36,9 @@ EMPTY_LEXEME.vector = EMPTY_VEC


cdef class Vocab:
'''A map container for a language's LexemeC structs.
'''
"""
A map container for a language's LexemeC structs.
"""
@classmethod
def load(cls, path, lex_attr_getters=None, lemmatizer=True,
tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):

@@ -72,8 +61,7 @@ cdef class Vocab:
Returns:
Vocab: The newly constructed vocab object.
"""
if isinstance(path, basestring):
path = Path(path)
path = util.ensure_path(path)
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
if 'vectors' in deprecated_kwargs:
raise AttributeError(

@@ -81,7 +69,7 @@ cdef class Vocab:
"Install vectors after loading.")
if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
tag_map = json.load(file_)
tag_map = ujson.load(file_)
elif tag_map is True:
tag_map = None
if lex_attr_getters is not None \

@@ -94,12 +82,12 @@ cdef class Vocab:
lemmatizer = Lemmatizer.load(path)
if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_:
serializer_freqs = json.load(file_)
serializer_freqs = ujson.load(file_)
else:
serializer_freqs = None

with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
strings_list = json.load(file_)
strings_list = ujson.load(file_)
cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
lemmatizer=lemmatizer, serializer_freqs=serializer_freqs,
strings=strings_list)
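Note (not part of the commit): with util.ensure_path in place, Vocab.load accepts either a string or a Path. A hedged sketch; the data directory below is invented.

    # Hedged sketch: the path is hypothetical; both call forms should be
    # equivalent after this change, since strings are normalised to a Path.
    from pathlib import Path
    from spacy.vocab import Vocab

    vocab = Vocab.load('/usr/local/share/spacy/data/en-1.1.0')
    vocab = Vocab.load(Path('/usr/local/share/spacy/data/en-1.1.0'))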
@@ -108,7 +96,8 @@ cdef class Vocab:

def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
'''Create the vocabulary.
"""
Create the vocabulary.

lex_attr_getters (dict):
A dictionary mapping attribute IDs to functions to compute them.

@@ -123,7 +112,7 @@ cdef class Vocab:

Returns:
Vocab: The newly constructed vocab object.
'''
"""
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)

lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}

@@ -172,17 +161,19 @@ cdef class Vocab:
return langfunc('_') if langfunc else ''

def __len__(self):
"""The current number of lexemes stored."""
"""
The current number of lexemes stored.
"""
return self.length

def resize_vectors(self, int new_size):
'''
"""
Set vectors_length to a new size, and allocate more memory for the Lexeme
vectors if necessary. The memory will be zeroed.

Arguments:
new_size (int): The new size of the vectors.
'''
"""
cdef hash_t key
cdef size_t addr
if new_size > self.vectors_length:

@@ -193,7 +184,8 @@ cdef class Vocab:
self.vectors_length = new_size
def add_flag(self, flag_getter, int flag_id=-1):
'''Set a new boolean flag to words in the vocabulary.
"""
Set a new boolean flag to words in the vocabulary.

The flag_setter function will be called over the words currently in the
vocab, and then applied to new words as they occur. You'll then be able

@@ -213,7 +205,7 @@ cdef class Vocab:

Returns:
flag_id (int): The integer ID by which the flag value can be checked.
'''
"""
if flag_id == -1:
for bit in range(1, 64):
if bit not in self.lex_attr_getters:

@@ -234,9 +226,11 @@ cdef class Vocab:
return flag_id
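Note (not part of the commit): the add_flag docstring above describes installing a custom boolean flag on the vocabulary. A hedged sketch of that pattern; the flag name and word list are invented, and 'vocab' is assumed to be an already-loaded Vocab.

    # Hedged sketch: IS_COLOUR and the getter below are made-up examples.
    colours = set([u'red', u'green', u'blue'])
    is_colour_getter = lambda string: string.lower() in colours

    IS_COLOUR = vocab.add_flag(is_colour_getter)    # returns the assigned bit ID
    assert vocab[u'red'].check_flag(IS_COLOUR)      # existing and new lexemes both carry the flag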
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
"""
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
is the lexicon's own memory, the lexeme is saved in the lexicon.
"""
if string == u'':
return &EMPTY_LEXEME
cdef LexemeC* lex

@@ -252,9 +246,11 @@ cdef class Vocab:
return self._new_lexeme(mem, string)

cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
"""
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
is the lexicon's own memory, the lexeme is saved in the lexicon.
"""
if orth == 0:
return &EMPTY_LEXEME
cdef LexemeC* lex
@@ -297,30 +293,33 @@ cdef class Vocab:
self.length += 1

def __contains__(self, unicode string):
'''Check whether the string has an entry in the vocabulary.
"""
Check whether the string has an entry in the vocabulary.

Arguments:
string (unicode): The ID string.

Returns:
bool Whether the string has an entry in the vocabulary.
'''
"""
key = hash_string(string)
lex = self._by_hash.get(key)
return lex is not NULL

def __iter__(self):
'''Iterate over the lexemes in the vocabulary.
"""
Iterate over the lexemes in the vocabulary.

Yields: Lexeme An entry in the vocabulary.
'''
"""
cdef attr_t orth
cdef size_t addr
for orth, addr in self._by_orth.items():
yield Lexeme(self, orth)

def __getitem__(self, id_or_string):
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
"""
Retrieve a lexeme, given an int ID or a unicode string. If a previously
unseen unicode string is given, a new lexeme is created and stored.

Arguments:

@@ -332,7 +331,7 @@ cdef class Vocab:

Returns:
lexeme (Lexeme): The lexeme indicated by the given ID.
'''
"""
cdef attr_t orth
if type(id_or_string) == unicode:
orth = self.strings[id_or_string]
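Note (not part of the commit): the __contains__, __iter__ and __getitem__ docstrings above describe the vocab's dict-like interface. A hedged sketch of that interface; the word is arbitrary and 'vocab' is assumed to be an already-loaded Vocab.

    # Hedged sketch: the lookups below mirror the docstrings above.
    lexeme = vocab[u'coffee']            # unseen strings create and store a new lexeme
    same = vocab[lexeme.orth]            # integer IDs retrieve the same entry
    assert u'coffee' in vocab            # membership check via __contains__
    n_lexemes = sum(1 for _ in vocab)    # __iter__ yields Lexeme objects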
@@ -355,7 +354,8 @@ cdef class Vocab:
return tokens

def dump(self, loc=None):
"""Save the lexemes binary data to the given location, or
"""
Save the lexemes binary data to the given location, or
return a byte-string with the data if loc is None.

Arguments:

@@ -392,14 +392,15 @@ cdef class Vocab:
return fp.string_data()

def load_lexemes(self, loc):
'''Load the binary vocabulary data from the given location.
"""
Load the binary vocabulary data from the given location.

Arguments:
loc (Path): The path to load from.

Returns:
None
'''
"""
fp = CFile(loc, 'rb',
on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
cdef LexemeC* lexeme = NULL

@@ -440,8 +441,9 @@ cdef class Vocab:
fp.close()

def _deserialize_lexemes(self, CFile fp):
'''Load the binary vocabulary data from the given CFile.
'''
"""
Load the binary vocabulary data from the given CFile.
"""
cdef LexemeC* lexeme = NULL
cdef hash_t key
cdef unicode py_str

@@ -494,13 +496,14 @@ cdef class Vocab:
fp.close()

def dump_vectors(self, out_loc):
'''Save the word vectors to a binary file.
"""
Save the word vectors to a binary file.

Arguments:
loc (Path): The path to save to.
Returns:
None
'''
"""
cdef int32_t vec_len = self.vectors_length
cdef int32_t word_len
cdef bytes word_str

@@ -522,7 +525,8 @@ cdef class Vocab:
out_file.close()

def load_vectors(self, file_):
"""Load vectors from a text-based file.
"""
Load vectors from a text-based file.

Arguments:
file_ (buffer): The file to read from. Entries should be separated by newlines,

@@ -561,7 +565,8 @@ cdef class Vocab:
return vec_len

def load_vectors_from_bin_loc(self, loc):
"""Load vectors from the location of a binary file.
"""
Load vectors from the location of a binary file.

Arguments:
loc (unicode): The path of the binary file to load from.
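Note (not part of the commit): the dump/load methods above make up the vocab's serialisation surface. A hedged round-trip sketch; the file paths are invented, and in a real pipeline these files live under the model's data directory.

    # Hedged sketch: dump() writes the lexeme binary data, load_lexemes() reads
    # it back, and load_vectors() reads newline-separated entries from an open
    # file object and returns the vector width it found.
    import io

    vocab.dump('/tmp/lexemes.bin')
    vocab.load_lexemes('/tmp/lexemes.bin')
    with io.open('/tmp/vectors.txt', encoding='utf8') as file_:
        vec_len = vocab.load_vectors(file_)
    print(vec_len)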