Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-26 01:46:28 +03:00

Merge branch 'master' of https://github.com/explosion/spaCy

Commit d7229967b0
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 '''Example of training a named entity recognition system from scratch using spaCy

 This example is written to be self-contained and reasonably transparent.
@@ -81,7 +82,7 @@ def load_vocab(path):
 def init_ner_model(vocab, features=None):
     if features is None:
         features = tuple(EntityRecognizer.feature_templates)
-    return BeamEntityRecognizer(vocab, features=features)
+    return EntityRecognizer(vocab, features=features)


 def save_ner_model(model, path):
@@ -99,7 +100,7 @@ def save_ner_model(model, path):


 def load_ner_model(vocab, path):
-    return BeamEntityRecognizer.load(path, vocab)
+    return EntityRecognizer.load(path, vocab)


 class Pipeline(object):
@@ -110,18 +111,21 @@ class Pipeline(object):
             raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
         if not path.is_dir():
             raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
-        vocab = load_vocab(path / 'vocab')
+        vocab = load_vocab(path)
         tokenizer = Tokenizer(vocab, {}, None, None, None)
         ner_model = load_ner_model(vocab, path / 'ner')
         return cls(vocab, tokenizer, ner_model)

-    def __init__(self, vocab=None, tokenizer=None, ner_model=None):
+    def __init__(self, vocab=None, tokenizer=None, entity=None):
         if vocab is None:
-            self.vocab = init_vocab()
+            vocab = init_vocab()
         if tokenizer is None:
             tokenizer = Tokenizer(vocab, {}, None, None, None)
-        if ner_model is None:
-            self.entity = init_ner_model(self.vocab)
+        if entity is None:
+            entity = init_ner_model(self.vocab)
+        self.vocab = vocab
+        self.tokenizer = tokenizer
+        self.entity = entity
         self.pipeline = [self.entity]

     def __call__(self, input_):
@@ -173,7 +177,7 @@ class Pipeline(object):
         save_ner_model(self.entity, path / 'ner')


-def train(nlp, train_examples, dev_examples, nr_epoch=5):
+def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
     next_epoch = train_examples
     print("Iter", "Loss", "P", "R", "F")
     for i in range(nr_epoch):
@@ -186,14 +190,17 @@ def train(nlp, train_examples, dev_examples, nr_epoch=5):
                 next_epoch.append((input_, annot))
         random.shuffle(next_epoch)
         scores = nlp.evaluate(dev_examples)
-        precision = '%.2f' % scores['ents_p']
-        recall = '%.2f' % scores['ents_r']
-        f_measure = '%.2f' % scores['ents_f']
-        print(i, int(loss), precision, recall, f_measure)
+        report_scores(i, loss, scores)
     nlp.average_weights()
     scores = nlp.evaluate(dev_examples)
-    print("After averaging")
-    print(scores['ents_p'], scores['ents_r'], scores['ents_f'])
+    report_scores(channels, i+1, loss, scores)
+
+
+def report_scores(i, loss, scores):
+    precision = '%.2f' % scores['ents_p']
+    recall = '%.2f' % scores['ents_r']
+    f_measure = '%.2f' % scores['ents_f']
+    print('%d %s %s %s' % (int(loss), precision, recall, f_measure))


 def read_examples(path):
@@ -221,15 +228,17 @@ def read_examples(path):
     train_loc=("Path to your training data", "positional", None, Path),
     dev_loc=("Path to your development data", "positional", None, Path),
 )
-def main(model_dir, train_loc, dev_loc, nr_epoch=10):
+def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
+         train_loc=None, dev_loc=None, nr_epoch=30):
     train_examples = read_examples(train_loc)
     dev_examples = read_examples(dev_loc)
-    nlp = Pipeline()
+    nlp = Pipeline.load(model_dir)

-    train(nlp, train_examples, list(dev_examples), nr_epoch)
+    train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)

     nlp.save(model_dir)


 if __name__ == '__main__':
-    plac.call(main)
+    main()
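Note: the report_scores() helper introduced in this file prints one line per iteration from the ents_p, ents_r and ents_f fields of the evaluation dict. A minimal standalone sketch of the output format, using made-up numbers rather than results from a real run:

    scores = {'ents_p': 78.4, 'ents_r': 75.1, 'ents_f': 76.7}   # illustrative evaluation output
    loss = 125.0
    print('%d %s %s %s' % (int(loss), '%.2f' % scores['ents_p'],
                           '%.2f' % scores['ents_r'], '%.2f' % scores['ents_f']))
    # -> 125 78.40 75.10 76.70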
74
examples/training/train_new_entity_type.py
Normal file
74
examples/training/train_new_entity_type.py
Normal file
|
@ -0,0 +1,74 @@
|
||||||
|
from __future__ import unicode_literals, print_function
|
||||||
|
import json
|
||||||
|
import pathlib
|
||||||
|
import random
|
||||||
|
|
||||||
|
import spacy
|
||||||
|
from spacy.pipeline import EntityRecognizer
|
||||||
|
from spacy.gold import GoldParse
|
||||||
|
from spacy.tagger import Tagger
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
unicode
|
||||||
|
except:
|
||||||
|
unicode = str
|
||||||
|
|
||||||
|
|
||||||
|
def train_ner(nlp, train_data, output_dir):
|
||||||
|
# Add new words to vocab.
|
||||||
|
for raw_text, _ in train_data:
|
||||||
|
doc = nlp.make_doc(raw_text)
|
||||||
|
for word in doc:
|
||||||
|
_ = nlp.vocab[word.orth]
|
||||||
|
|
||||||
|
for itn in range(20):
|
||||||
|
random.shuffle(train_data)
|
||||||
|
for raw_text, entity_offsets in train_data:
|
||||||
|
gold = GoldParse(doc, entities=entity_offsets)
|
||||||
|
doc = nlp.make_doc(raw_text)
|
||||||
|
nlp.tagger(doc)
|
||||||
|
loss = nlp.entity.update(doc, gold)
|
||||||
|
nlp.end_training()
|
||||||
|
nlp.save_to_directory(output_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def main(model_name, output_directory=None):
|
||||||
|
nlp = spacy.load(model_name)
|
||||||
|
|
||||||
|
train_data = [
|
||||||
|
(
|
||||||
|
"Horses are too tall and they pretend to care about your feelings",
|
||||||
|
[(0, 6, 'ANIMAL')],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"horses are too tall and they pretend to care about your feelings",
|
||||||
|
[(0, 6, 'ANIMAL')]
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"horses pretend to care about your feelings",
|
||||||
|
[(0, 6, 'ANIMAL')]
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"they pretend to care about your feelings, those horses",
|
||||||
|
[(48, 54, 'ANIMAL')]
|
||||||
|
)
|
||||||
|
]
|
||||||
|
nlp.entity.add_label('ANIMAL')
|
||||||
|
if output_directory is not None:
|
||||||
|
output_directory = pathlib.Path(output_directory)
|
||||||
|
ner = train_ner(nlp, train_data, output_directory)
|
||||||
|
|
||||||
|
doc = nlp('Do you like horses?')
|
||||||
|
for ent in doc.ents:
|
||||||
|
print(ent.label_, ent.text)
|
||||||
|
nlp2 = spacy.load('en', path=output_directory)
|
||||||
|
nlp2.entity.add_label('ANIMAL')
|
||||||
|
doc2 = nlp2('Do you like horses?')
|
||||||
|
for ent in doc2.ents:
|
||||||
|
print(ent.label_, ent.text)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
import plac
|
||||||
|
plac.call(main)
|
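The new example is plac-driven, so it can be run directly once spaCy and an English model are installed. A minimal sketch of loading the saved model afterwards; the output path is an illustrative choice, not something the example mandates:

    import spacy

    # Assumes the example above was run as:
    #   python examples/training/train_new_entity_type.py en /tmp/animal_model
    nlp = spacy.load('en', path='/tmp/animal_model')
    nlp.entity.add_label('ANIMAL')   # the label is re-added after loading, as in the example itself
    doc = nlp('Do you like horses?')
    print([(ent.label_, ent.text) for ent in doc.ents])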
fabfile.py (vendored, 2 lines changed)
@@ -14,7 +14,7 @@ VENV_DIR = path.join(PWD, ENV)
 def env(lang='python2.7'):
     if path.exists(VENV_DIR):
         local('rm -rf {env}'.format(env=VENV_DIR))
-    local('virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
+    local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))


 def install():
@@ -1,27 +1,13 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals

-import json
 from pathlib import Path

 from .util import set_lang_class, get_lang_class, parse_package_meta
 from .deprecated import resolve_model_name
 from .cli import info

-from . import en
-from . import de
-from . import zh
-from . import es
-from . import it
-from . import hu
-from . import fr
-from . import pt
-from . import nl
-from . import sv
-from . import fi
-from . import bn
-from . import he
-
-from .about import *
+from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he


 set_lang_class(en.English.lang, en.English)
@@ -14,8 +14,9 @@ from spacy.cli import convert as cli_convert


 class CLI(object):
-    """Command-line interface for spaCy"""
+    """
+    Command-line interface for spaCy
+    """
     commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')

     @plac.annotations(
@@ -29,7 +30,6 @@ class CLI(object):
         can be shortcut, model name or, if --direct flag is set, full model name
         with version.
         """
-
         cli_download(model, direct)


@@ -44,7 +44,6 @@ class CLI(object):
         either the name of a pip package, or the local path to the model data
         directory. Linking models allows loading them via spacy.load(link_name).
         """
-
         cli_link(origin, link_name, force)


@@ -58,7 +57,6 @@ class CLI(object):
         speficied as an argument, print model information. Flag --markdown
         prints details in Markdown for easy copy-pasting to GitHub issues.
         """
-
         cli_info(model, markdown)


@@ -73,7 +71,6 @@ class CLI(object):
         installation files. A new directory will be created in the specified
         output directory, and model data will be copied over.
         """
-
         cli_package(input_dir, output_dir, force)


@@ -93,7 +90,6 @@ class CLI(object):
         """
         Train a model. Expects data in spaCy's JSON format.
         """
-
         cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
                   not no_parser, not no_ner, parser_L1)

@@ -108,7 +104,6 @@ class CLI(object):
         """
         Initialize a new model and its data directory.
         """
-
         cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)

     @plac.annotations(
@@ -122,7 +117,6 @@ class CLI(object):
         Convert files into JSON format for use with train command and other
         experiment management functions.
         """
-
         cli_convert(input_file, output_dir, n_sents, morphology)
@@ -1,3 +1,7 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
 IDS = {
     "": NULL_ATTR,
     "IS_ALPHA": IS_ALPHA,
@@ -92,7 +96,8 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]


 def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
-    '''Normalize a dictionary of attributes, converting them to ints.
+    """
+    Normalize a dictionary of attributes, converting them to ints.

     Arguments:
         stringy_attrs (dict):
@@ -105,7 +110,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
         inty_attrs (dict):
             Attributes dictionary with keys and optionally values converted to
             ints.
-    '''
+    """
    inty_attrs = {}
     if _do_deprecated:
         if 'F' in stringy_attrs:
@@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 from libc.stdio cimport fopen, fclose, fread, fwrite
 from libc.string cimport memcpy

@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-import io
 from pathlib import Path

 from .converters import conllu2json

@@ -2,12 +2,12 @@
 from __future__ import unicode_literals

 import json
-from ...gold import read_json_file, merge_sents
 from ... import util


 def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
-    """Convert conllu files into JSON format for use with train cli.
+    """
+    Convert conllu files into JSON format for use with train cli.
     use_morphology parameter enables appending morphology to tags, which is
     useful for languages such as Spanish, where UD tags are not so rich.
     """
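The converter keeps its signature, so it can still be called the same way from Python. A minimal sketch, assuming the import path matches the relative import shown above and using illustrative file paths:

    from spacy.cli.converters import conllu2json

    # Writes spaCy's JSON training format: n_sents sentences per document,
    # with morphology optionally appended to the tags.
    conllu2json('train.conllu', 'train.json', n_sents=10, use_morphology=False)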
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-import pip
 import requests
 import os
 import subprocess

@@ -18,7 +18,6 @@ def info(model=None, markdown=False):
         else:
             data['source'] = str(model_path)
         print_info(data, "model " + model, markdown)
-
     else:
         data = get_spacy_data()
         print_info(data, "spaCy", markdown)
@@ -26,10 +25,8 @@ def info(model=None, markdown=False):


 def print_info(data, title, markdown):
     title = "Info about {title}".format(title=title)
-
     if markdown:
         util.print_markdown(data, title=title)
-
     else:
         util.print_table(data, title=title)

@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import pip
 from pathlib import Path
 import importlib
+from ..compat import unicode_, symlink_to
 from .. import util


@@ -20,7 +21,6 @@ def link_package(package_name, link_name, force=False):
     # Python's installation and import rules are very complicated.
     pkg = importlib.import_module(package_name)
     package_path = Path(pkg.__file__).parent.parent
-
     meta = get_meta(package_path, package_name)
     model_name = package_name + '-' + meta['version']
     model_path = package_path / package_name / model_name
@@ -43,23 +43,17 @@ def symlink(model_path, link_name, force):
     elif link_path.exists():
         link_path.unlink()

-    # Add workaround for Python 2 on Windows (see issue #909)
-    if util.is_python2() and util.is_windows():
-        import subprocess
-        command = ['mklink', '/d', unicode(link_path), unicode(model_path)]
-        try:
-            subprocess.call(command, shell=True)
-        except:
-            # This is quite dirty, but just making sure other Windows-specific
-            # errors are caught so users at least see a proper error message.
-            util.sys_exit(
-                "Creating a symlink in spacy/data failed. You can still import "
-                "the model as a Python package and call its load() method, or "
-                "create the symlink manually:",
-                "{a} --> {b}".format(a=unicode(model_path), b=unicode(link_path)),
-                title="Error: Couldn't link model to '{l}'".format(l=link_name))
-    else:
-        link_path.symlink_to(model_path)
+    try:
+        symlink_to(link_path, model_path)
+    except:
+        # This is quite dirty, but just making sure other errors are caught so
+        # users at least see a proper message.
+        util.sys_exit(
+            "Creating a symlink in spacy/data failed. You can still import "
+            "the model as a Python package and call its load() method, or "
+            "create the symlink manually:",
+            "{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)),
+            title="Error: Couldn't link model to '{l}'".format(l=link_name))

     util.print_msg(
         "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),
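The Windows mklink fallback now lives in the new spacy/compat.py module (shown in full further down in this diff) instead of inline in the link command. A small sketch of calling the helper directly, with the argument order used above (link location first, target second); both paths are hypothetical:

    from pathlib import Path
    from spacy.compat import symlink_to

    model_path = Path('/tmp/en_example_model-1.0.0')   # hypothetical model directory
    link_path = Path('/tmp/spacy-data/en_example')     # hypothetical link location
    # Uses 'mklink /d' via subprocess on Python 2 + Windows, Path.symlink_to everywhere else.
    symlink_to(link_path, model_path)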
@@ -1,20 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals

-import json
 import shutil
 import requests
 from pathlib import Path

-import six
+from ..compat import unicode_, json_dumps

-from .. import about
 from .. import util

-if six.PY2:
-    json_dumps = lambda data: json.dumps(data, indent=2).decode("utf8")
-elif six.PY3:
-    json_dumps = lambda data: json.dumps(data, indent=2)


 def package(input_dir, output_dir, force):
     input_path = Path(input_dir)
@@ -32,31 +25,31 @@ def package(input_dir, output_dir, force):
     package_path = main_path / model_name

     create_dirs(package_path, force)
-    shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix())
+    shutil.copytree(unicode_(input_path), unicode_(package_path / model_name_v))
     create_file(main_path / 'meta.json', json_dumps(meta))
     create_file(main_path / 'setup.py', template_setup)
     create_file(main_path / 'MANIFEST.in', template_manifest)
     create_file(package_path / '__init__.py', template_init)

     util.print_msg(
-        main_path.as_posix(),
+        unicode_(main_path),
         "To build the package, run `python setup.py sdist` in that directory.",
         title="Successfully created package {p}".format(p=model_name_v))


 def check_dirs(input_path, output_path):
     if not input_path.exists():
-        util.sys_exit(input_path.as_poisx(), title="Model directory not found")
+        util.sys_exit(unicode_(input_path.as_poisx), title="Model directory not found")
     if not output_path.exists():
-        util.sys_exit(output_path.as_posix(), title="Output directory not found")
+        util.sys_exit(unicode_(output_path), title="Output directory not found")


 def create_dirs(package_path, force):
     if package_path.exists():
         if force:
-            shutil.rmtree(package_path.as_posix())
+            shutil.rmtree(unicode_(package_path.as_posix))
         else:
-            util.sys_exit(package_path.as_posix(),
+            util.sys_exit(unicode_(package_path.as_posix),
                 "Please delete the directory and try again.",
                 title="Package directory already exists")
     Path.mkdir(package_path, parents=True)
@@ -5,8 +5,6 @@ import json
 from pathlib import Path

 from ..scorer import Scorer
-from ..tagger import Tagger
-from ..syntax.parser import Parser
 from ..gold import GoldParse, merge_sents
 from ..gold import read_json_file as read_gold_json
 from .. import util
@@ -60,7 +58,6 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_
     print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")

     with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer:
-        loss = 0
         for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
             for doc, gold in epoch:
                 trainer.update(doc, gold)
spacy/compat.py (new file, 54 lines)
@@ -0,0 +1,54 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import six
+import sys
+import ujson
+
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
+try:
+    import copy_reg
+except ImportError:
+    import copyreg as copy_reg
+
+
+is_python2 = six.PY2
+is_python3 = six.PY3
+is_windows = sys.platform.startswith('win')
+is_linux = sys.platform.startswith('linux')
+is_osx = sys.platform == 'darwin'
+
+
+if is_python2:
+    bytes_ = str
+    unicode_ = unicode
+    basestring_ = basestring
+    input_ = raw_input
+    json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')
+
+elif is_python3:
+    bytes_ = bytes
+    unicode_ = str
+    basestring_ = str
+    input_ = input
+    json_dumps = lambda data: ujson.dumps(data, indent=2)
+
+
+def symlink_to(orig, dest):
+    if is_python2 and is_windows:
+        import subprocess
+        subprocess.call(['mklink', '/d', unicode(orig), unicode(dest)], shell=True)
+    else:
+        orig.symlink_to(dest)
+
+
+def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
+    return ((python2 == None or python2 == is_python2) and
+            (python3 == None or python3 == is_python3) and
+            (windows == None or windows == is_windows) and
+            (linux == None or linux == is_linux) and
+            (osx == None or osx == is_osx))
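Typical use of the new compat module is to branch on interpreter and platform once, instead of scattering six checks through the callers. A small sketch of how the CLI modules above can rely on it:

    from spacy.compat import is_config, unicode_, json_dumps

    if is_config(python2=True, windows=True):
        # e.g. take the mklink-based symlink path on Python 2 + Windows
        pass

    meta = {'name': 'my_model', 'version': '1.0.0'}   # illustrative data
    text = json_dumps(meta)           # returns unicode on both Python 2 and Python 3
    assert isinstance(text, unicode_)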
@@ -1,16 +1,14 @@
+# coding: utf8
+from __future__ import unicode_literals

 from pathlib import Path

 from . import about
 from . import util
 from .cli import download
 from .cli import link

-try:
-    basestring
-except NameError:
-    basestring = str


 def read_lang_data(package):
     tokenization = package.load_json(('tokenizer', 'specials.json'))
     with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
@@ -36,7 +34,8 @@ def align_tokens(ref, indices):  # Deprecated, surely?


 def detokenize(token_rules, words):  # Deprecated?
-    """To align with treebanks, return a list of "chunks", where a chunk is a
+    """
+    To align with treebanks, return a list of "chunks", where a chunk is a
     sequence of tokens that are separated by whitespace in actual strings. Each
     chunk should be a tuple of token indices, e.g.

@@ -57,10 +56,30 @@ def detokenize(token_rules, words):  # Deprecated?
     return positions


-def fix_glove_vectors_loading(overrides):
-    """Special-case hack for loading the GloVe vectors, to support deprecated
-    <1.0 stuff. Phase this out once the data is fixed."""
+def match_best_version(target_name, target_version, path):
+    path = util.ensure_path(path)
+    if path is None or not path.exists():
+        return None
+    matches = []
+    for data_name in path.iterdir():
+        name, version = split_data_name(data_name.parts[-1])
+        if name == target_name:
+            matches.append((tuple(float(v) for v in version.split('.')), data_name))
+    if matches:
+        return Path(max(matches)[1])
+    else:
+        return None
+
+
+def split_data_name(name):
+    return name.split('-', 1) if '-' in name else (name, '')
+
+
+def fix_glove_vectors_loading(overrides):
+    """
+    Special-case hack for loading the GloVe vectors, to support deprecated
+    <1.0 stuff. Phase this out once the data is fixed.
+    """
     if 'data_dir' in overrides and 'path' not in overrides:
         raise ValueError("The argument 'data_dir' has been renamed to 'path'")
     if overrides.get('path') is False:
@@ -68,18 +87,16 @@ def fix_glove_vectors_loading(overrides):
     if overrides.get('path') in (None, True):
         data_path = util.get_data_path()
     else:
-        path = overrides['path']
-        if isinstance(path, basestring):
-            path = Path(path)
+        path = util.ensure_path(overrides['path'])
         data_path = path.parent
     vec_path = None
     if 'add_vectors' not in overrides:
         if 'vectors' in overrides:
-            vec_path = util.match_best_version(overrides['vectors'], None, data_path)
+            vec_path = match_best_version(overrides['vectors'], None, data_path)
             if vec_path is None:
                 return overrides
         else:
-            vec_path = util.match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
+            vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path)
         if vec_path is not None:
             vec_path = vec_path / 'vocab' / 'vec.bin'
     if vec_path is not None:
@@ -88,13 +105,13 @@ def fix_glove_vectors_loading(overrides):


 def resolve_model_name(name):
-    """If spaCy is loaded with 'de', check if symlink already exists. If
+    """
+    If spaCy is loaded with 'de', check if symlink already exists. If
     not, user have upgraded from older version and have old models installed.
     Check if old model directory exists and if so, return that instead and create
     shortcut link. If English model is found and no shortcut exists, raise error
     and tell user to install new model.
     """

     if name == 'en' or name == 'de':
         versions = ['1.0.0', '1.1.0']
         data_path = Path(util.get_data_path())
@@ -117,9 +134,11 @@ def resolve_model_name(name):


 class ModelDownload():
-    """Replace download modules within en and de with deprecation warning and
+    """
+    Replace download modules within en and de with deprecation warning and
     download default language model (using shortcut). Use classmethods to allow
-    importing ModelDownload as download and calling download.en() etc."""
+    importing ModelDownload as download and calling download.en() etc.
+    """

     @classmethod
     def load(self, lang):
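The new match_best_version() simply picks the data directory whose name matches the target and carries the highest dotted version. A standalone sketch of that selection logic, runnable without a spaCy data directory; the candidate names are illustrative and the helper mirrors split_data_name from the diff:

    from pathlib import Path

    def split_data_name(name):
        return name.split('-', 1) if '-' in name else (name, '')

    # Directory names as they might appear under spacy/data (illustrative).
    candidates = ['en_glove_cc_300_1m_vectors-1.0.0',
                  'en_glove_cc_300_1m_vectors-1.1.0',
                  'de-1.0.0']
    matches = []
    for data_name in candidates:
        name, version = split_data_name(data_name)
        if name == 'en_glove_cc_300_1m_vectors':
            matches.append((tuple(float(v) for v in version.split('.')), data_name))
    print(Path(max(matches)[1]))   # -> en_glove_cc_300_1m_vectors-1.1.0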
@@ -11,12 +11,6 @@ from ..deprecated import fix_glove_vectors_loading
 from .language_data import *


-try:
-    basestring
-except NameError:
-    basestring = str
-
-
 class English(Language):
     lang = 'en'

@@ -1,13 +1,11 @@
 # cython: profile=True
+# coding: utf8
 from __future__ import unicode_literals, print_function

 import io
-import json
 import re
-import os
-from os import path
-
-import ujson as json
+import ujson
+from pathlib import Path

 from .syntax import nonproj

@@ -141,12 +139,13 @@ def _min_edit_path(cand_words, gold_words):


 def read_json_file(loc, docs_filter=None):
-    if path.isdir(loc):
-        for filename in os.listdir(loc):
-            yield from read_json_file(path.join(loc, filename))
+    loc = Path(loc)
+    if loc.is_dir():
+        for filename in loc.iterdir():
+            yield from read_json_file(loc / filename)
     else:
         with io.open(loc, 'r', encoding='utf8') as file_:
-            docs = json.load(file_)
+            docs = ujson.load(file_)
         for doc in docs:
             if docs_filter is not None and not docs_filter(doc):
                 continue
@@ -220,7 +219,8 @@ cdef class GoldParse:

     def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                  deps=None, entities=None, make_projective=False):
-        """Create a GoldParse.
+        """
+        Create a GoldParse.

         Arguments:
             doc (Doc):
@@ -302,7 +302,8 @@ cdef class GoldParse:
             self.heads = proj_heads

     def __len__(self):
-        """Get the number of gold-standard tokens.
+        """
+        Get the number of gold-standard tokens.

         Returns (int): The number of gold-standard tokens.
         """
@@ -310,13 +311,16 @@ cdef class GoldParse:

     @property
     def is_projective(self):
-        """Whether the provided syntactic annotations form a projective dependency
-        tree."""
+        """
+        Whether the provided syntactic annotations form a projective dependency
+        tree.
+        """
         return not nonproj.is_nonproj_tree(self.heads)


 def biluo_tags_from_offsets(doc, entities):
-    '''Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
+    """
+    Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
     scheme (biluo).

     Arguments:
@@ -347,7 +351,7 @@ def biluo_tags_from_offsets(doc, entities):
         tags = biluo_tags_from_offsets(doc, entities)

         assert tags == ['O', 'O', 'U-LOC', 'O']
-    '''
+    """
     starts = {token.idx: token.i for token in doc}
     ends = {token.idx+len(token): token.i for token in doc}
     biluo = ['-' for _ in doc]
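For orientation on the docstring being reformatted here: the BILUO scheme marks a single-token entity as U- and a multi-token entity as B-, I-, L-, with O for tokens outside any entity. A tiny illustration of the tag sequences the function is expected to produce (hand-written examples of the scheme, not a call into spaCy):

    # "I like London ."          -> single-token entity  -> ['O', 'O', 'U-LOC', 'O']
    # "I like New York City ."   -> three-token entity   -> ['O', 'O', 'B-LOC', 'I-LOC', 'L-LOC', 'O']
    single = ['O', 'O', 'U-LOC', 'O']
    multi = ['O', 'O', 'B-LOC', 'I-LOC', 'L-LOC', 'O']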
@@ -1,39 +1,26 @@
-from __future__ import absolute_import
-from __future__ import unicode_literals
-import pathlib
+# coding: utf8
+from __future__ import absolute_import, unicode_literals
 from contextlib import contextmanager
 import shutil

 import ujson


-try:
-    basestring
-except NameError:
-    basestring = str
-
-try:
-    unicode
-except NameError:
-    unicode = str
-
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .tagger import Tagger
 from .matcher import Matcher
-from . import attrs
-from . import orth
-from . import util
-from . import language_data
 from .lemmatizer import Lemmatizer
 from .train import Trainer

-from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
 from .syntax.arc_eager import ArcEager
 from .syntax.ner import BiluoPushDown
+from .compat import unicode_
+from .attrs import IS_STOP
+from . import attrs
+from . import orth
+from . import util
+from . import language_data


 class BaseDefaults(object):
@@ -150,25 +137,15 @@ class BaseDefaults(object):
         return pipeline

     token_match = language_data.TOKEN_MATCH
-
     prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-
     suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-
     infixes = tuple(language_data.TOKENIZER_INFIXES)
-
     tag_map = dict(language_data.TAG_MAP)
-
     tokenizer_exceptions = {}
-
     parser_features = get_templates('parser')
-
     entity_features = get_templates('ner')
-
     tagger_features = Tagger.feature_templates  # TODO -- fix this
-
     stop_words = set()
-
     lemma_rules = {}
     lemma_exc = {}
     lemma_index = {}
@@ -202,53 +179,42 @@ class BaseDefaults(object):


 class Language(object):
-    '''A text-processing pipeline. Usually you'll load this once per process, and
+    """
+    A text-processing pipeline. Usually you'll load this once per process, and
     pass the instance around your program.
-    '''
+    """
     Defaults = BaseDefaults
     lang = None

     @classmethod
-    @contextmanager
-    def train(cls, path, gold_tuples, *configs):
-        if isinstance(path, basestring):
-            path = pathlib.Path(path)
-        tagger_cfg, parser_cfg, entity_cfg = configs
-        dep_model_dir = path / 'deps'
-        ner_model_dir = path / 'ner'
-        pos_model_dir = path / 'pos'
-        if dep_model_dir.exists():
-            shutil.rmtree(str(dep_model_dir))
-        if ner_model_dir.exists():
-            shutil.rmtree(str(ner_model_dir))
-        if pos_model_dir.exists():
-            shutil.rmtree(str(pos_model_dir))
-        dep_model_dir.mkdir()
-        ner_model_dir.mkdir()
-        pos_model_dir.mkdir()
+    def setup_directory(cls, path, **configs):
+        for name, config in configs.items():
+            directory = path / name
+            if directory.exists():
+                shutil.rmtree(str(directory))
+            directory.mkdir()
+            with (directory / 'config.json').open('wb') as file_:
+                data = ujson.dumps(config, indent=2)
+                if isinstance(data, unicode_):
+                    data = data.encode('utf8')
+                file_.write(data)
+        if not (path / 'vocab').exists():
+            (path / 'vocab').mkdir()

+    @classmethod
+    @contextmanager
+    def train(cls, path, gold_tuples, **configs):
         if parser_cfg['pseudoprojective']:
             # preprocess training data here before ArcEager.get_labels() is called
             gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)

-        parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
-        entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
+        for subdir in ('deps', 'ner', 'pos'):
+            if subdir not in configs:
+                configs[subdir] = {}
+        configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
+        configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)

-        with (dep_model_dir / 'config.json').open('wb') as file_:
-            data = ujson.dumps(parser_cfg)
-            if isinstance(data, unicode):
-                data = data.encode('utf8')
-            file_.write(data)
-        with (ner_model_dir / 'config.json').open('wb') as file_:
-            data = ujson.dumps(entity_cfg)
-            if isinstance(data, unicode):
-                data = data.encode('utf8')
-            file_.write(data)
-        with (pos_model_dir / 'config.json').open('wb') as file_:
-            data = ujson.dumps(tagger_cfg)
-            if isinstance(data, unicode):
-                data = data.encode('utf8')
-            file_.write(data)
+        cls.setup_directory(path, **configs)

         self = cls(
             path=path,
@@ -269,14 +235,14 @@ class Language(object):
         self.entity = self.Defaults.create_entity(self)
         self.pipeline = self.Defaults.create_pipeline(self)
         yield Trainer(self, gold_tuples)
-        self.end_training(path=path)
+        self.end_training()
+        self.save_to_directory(path, deps=self.parser.cfg, ner=self.entity.cfg,
+                               pos=self.tagger.cfg)

     def __init__(self, **overrides):
         if 'data_dir' in overrides and 'path' not in overrides:
             raise ValueError("The argument 'data_dir' has been renamed to 'path'")
-        path = overrides.get('path', True)
-        if isinstance(path, basestring):
-            path = pathlib.Path(path)
+        path = util.ensure_path(overrides.get('path', True))
         if path is True:
             path = util.get_data_path() / self.lang
         if not path.exists() and 'path' not in overrides:
@@ -322,7 +288,8 @@ class Language(object):
         self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]

     def __call__(self, text, tag=True, parse=True, entity=True):
-        """Apply the pipeline to some text. The text can span multiple sentences,
+        """
+        Apply the pipeline to some text. The text can span multiple sentences,
         and can contain arbtrary whitespace. Alignment into the original string
         is preserved.

@@ -352,7 +319,8 @@ class Language(object):
         return doc

     def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
-        '''Process texts as a stream, and yield Doc objects in order.
+        """
+        Process texts as a stream, and yield Doc objects in order.

         Supports GIL-free multi-threading.

@@ -361,7 +329,7 @@ class Language(object):
             tag (bool)
             parse (bool)
             entity (bool)
-        '''
+        """
         skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
         stream = (self.make_doc(text) for text in texts)
         for proc in self.pipeline:
@@ -373,51 +341,35 @@ class Language(object):
         for doc in stream:
             yield doc

-    def end_training(self, path=None):
-        if path is None:
-            path = self.path
-        elif isinstance(path, basestring):
-            path = pathlib.Path(path)
-
-        if self.tagger:
-            self.tagger.model.end_training()
-            self.tagger.model.dump(str(path / 'pos' / 'model'))
-        if self.parser:
-            self.parser.model.end_training()
-            self.parser.model.dump(str(path / 'deps' / 'model'))
-        if self.entity:
-            self.entity.model.end_training()
-            self.entity.model.dump(str(path / 'ner' / 'model'))
+    def save_to_directory(self, path):
+        configs = {
+            'pos': self.tagger.cfg if self.tagger else {},
+            'deps': self.parser.cfg if self.parser else {},
+            'ner': self.entity.cfg if self.entity else {},
+        }
+
+        self.setup_directory(path, **configs)

         strings_loc = path / 'vocab' / 'strings.json'
         with strings_loc.open('w', encoding='utf8') as file_:
             self.vocab.strings.dump(file_)
         self.vocab.dump(path / 'vocab' / 'lexemes.bin')
+        # TODO: Word vectors?
         if self.tagger:
-            tagger_freqs = list(self.tagger.freqs[TAG].items())
-        else:
-            tagger_freqs = []
+            self.tagger.model.dump(str(path / 'pos' / 'model'))
         if self.parser:
-            dep_freqs = list(self.parser.moves.freqs[DEP].items())
-            head_freqs = list(self.parser.moves.freqs[HEAD].items())
-        else:
-            dep_freqs = []
-            head_freqs = []
+            self.parser.model.dump(str(path / 'deps' / 'model'))
         if self.entity:
-            entity_iob_freqs = list(self.entity.moves.freqs[ENT_IOB].items())
-            entity_type_freqs = list(self.entity.moves.freqs[ENT_TYPE].items())
-        else:
-            entity_iob_freqs = []
-            entity_type_freqs = []
-        with (path / 'vocab' / 'serializer.json').open('wb') as file_:
-            data = ujson.dumps([
-                (TAG, tagger_freqs),
-                (DEP, dep_freqs),
-                (ENT_IOB, entity_iob_freqs),
-                (ENT_TYPE, entity_type_freqs),
-                (HEAD, head_freqs)
-            ])
-            if isinstance(data, unicode):
-                data = data.encode('utf8')
-            file_.write(data)
+            self.entity.model.dump(str(path / 'ner' / 'model'))
+
+    def end_training(self, path=None):
+        if self.tagger:
+            self.tagger.model.end_training()
+        if self.parser:
+            self.parser.model.end_training()
+        if self.entity:
+            self.entity.model.end_training()
+        # NB: This is slightly different from before --- we no longer default
+        # to taking nlp.path
+        if path is not None:
+            self.save_to_directory(path)
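The net effect of this refactor is a split of responsibilities: setup_directory() now writes the per-component config.json files, save_to_directory() persists the vocab and the component models, and end_training() only finalises the weights (saving afterwards only if a path is passed). A minimal sketch of how a caller might use the new split, assuming nlp is an already-trained Language pipeline and using an illustrative output path:

    from pathlib import Path

    output_path = Path('/tmp/my_spacy_model')   # illustrative location
    nlp.end_training()                          # finalise/average the component weights
    nlp.save_to_directory(output_path)          # writes pos/, deps/, ner/ configs and models plus vocab/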
@@ -1,13 +1,8 @@
-from __future__ import unicode_literals, print_function
-import codecs
-import pathlib
-
-import ujson as json
+# coding: utf8
+from __future__ import unicode_literals

 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
-from .symbols import VerbForm_inf, VerbForm_none
-from .symbols import Number_sing
-from .symbols import Degree_pos
+from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos


 class Lemmatizer(object):
@@ -38,8 +33,10 @@ class Lemmatizer(object):
         return lemmas

     def is_base_form(self, univ_pos, morphology=None):
-        '''Check whether we're dealing with an uninflected paradigm, so we can
-        avoid lemmatization entirely.'''
+        """
+        Check whether we're dealing with an uninflected paradigm, so we can
+        avoid lemmatization entirely.
+        """
         morphology = {} if morphology is None else morphology
         others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
         true_morph_key = morphology.get('morph', 0)
@@ -1,4 +1,7 @@
 # cython: embedsignature=True
+# coding: utf8
+from __future__ import unicode_literals, print_function
+
 from libc.math cimport sqrt
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
@@ -9,14 +12,11 @@ from cython.view cimport array as cvarray
 cimport numpy as np
 np.import_array()

-
-
 from libc.string cimport memset
+import numpy

 from .orth cimport word_shape
 from .typedefs cimport attr_t, flags_t
-import numpy

 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from .attrs cimport IS_BRACKET
@@ -30,13 +30,15 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))


 cdef class Lexeme:
-    """An entry in the vocabulary. A Lexeme has no string context --- it's a
+    """
+    An entry in the vocabulary. A Lexeme has no string context --- it's a
     word-type, as opposed to a word token. It therefore has no part-of-speech
     tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
     tag).
     """
     def __init__(self, Vocab vocab, int orth):
-        """Create a Lexeme object.
+        """
+        Create a Lexeme object.

         Arguments:
             vocab (Vocab): The parent vocabulary
@@ -80,7 +82,8 @@ cdef class Lexeme:
         return self.c.orth

     def set_flag(self, attr_id_t flag_id, bint value):
-        """Change the value of a boolean flag.
+        """
+        Change the value of a boolean flag.

         Arguments:
             flag_id (int): The attribute ID of the flag to set.
@@ -89,7 +92,8 @@ cdef class Lexeme:
         Lexeme.c_set_flag(self.c, flag_id, value)

     def check_flag(self, attr_id_t flag_id):
-        """Check the value of a boolean flag.
+        """
+        Check the value of a boolean flag.

         Arguments:
             flag_id (int): The attribute ID of the flag to query.
@@ -98,7 +102,8 @@ cdef class Lexeme:
         return True if Lexeme.c_check_flag(self.c, flag_id) else False

     def similarity(self, other):
-        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+        """
+        Compute a semantic similarity estimate. Defaults to cosine over vectors.

         Arguments:
             other:
@@ -106,7 +111,7 @@ cdef class Lexeme:
             Token and Lexeme objects.
         Returns:
             score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
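The similarity() change above is docstring-only; the behaviour is still the cosine of the two vectors, with a 0.0 guard when either vector has zero norm. A standalone numpy sketch of the same computation, with made-up vectors for illustration:

    import numpy

    def cosine(a, b):
        # Mirrors Lexeme.similarity: return 0.0 if either vector has zero norm.
        norm_a = numpy.linalg.norm(a)
        norm_b = numpy.linalg.norm(b)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return numpy.dot(a, b) / (norm_a * norm_b)

    print(cosine(numpy.array([1.0, 0.0]), numpy.array([1.0, 1.0])))  # ~0.707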
@@ -1,7 +1,10 @@
 # cython: profile=True
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals

+import ujson
+
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from .attrs cimport attr_id_t
@@ -52,12 +55,6 @@ from .attrs import FLAG36 as L9_ENT
 from .attrs import FLAG35 as L10_ENT


-try:
-    import ujson as json
-except ImportError:
-    import json
-
-
 cpdef enum quantifier_t:
     _META
     ONE
@@ -164,7 +161,7 @@ def _convert_strings(token_specs, string_store):
 def merge_phrase(matcher, doc, i, matches):
     '''Callback to merge a phrase on match'''
     ent_id, label, start, end = matches[i]
     span = doc[start : end]
     span.merge(ent_type=label, ent_id=ent_id)


@@ -180,7 +177,8 @@ cdef class Matcher:

     @classmethod
     def load(cls, path, vocab):
-        '''Load the matcher and patterns from a file path.
+        """
+        Load the matcher and patterns from a file path.

         Arguments:
             path (Path):
@@ -189,16 +187,17 @@ cdef class Matcher:
             The vocabulary that the documents to match over will refer to.
         Returns:
             Matcher: The newly constructed object.
-        '''
+        """
         if (path / 'gazetteer.json').exists():
             with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
-                patterns = json.load(file_)
+                patterns = ujson.load(file_)
         else:
             patterns = {}
         return cls(vocab, patterns)

     def __init__(self, vocab, patterns={}):
-        """Create the Matcher.
+        """
+        Create the Matcher.

         Arguments:
             vocab (Vocab):
@@ -227,7 +226,8 @@ cdef class Matcher:

     def add_entity(self, entity_key, attrs=None, if_exists='raise',
                    acceptor=None, on_match=None):
-        """Add an entity to the matcher.
+        """
+        Add an entity to the matcher.

         Arguments:
             entity_key (unicode or int):
@@ -264,7 +264,8 @@ cdef class Matcher:
         self._callbacks[entity_key] = on_match

     def add_pattern(self, entity_key, token_specs, label=""):
|
||||||
"""Add a pattern to the matcher.
|
"""
|
||||||
|
Add a pattern to the matcher.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
entity_key (unicode or int):
|
entity_key (unicode or int):
|
||||||
|
@ -307,7 +308,8 @@ cdef class Matcher:
|
||||||
return entity_key
|
return entity_key
|
||||||
|
|
||||||
def has_entity(self, entity_key):
|
def has_entity(self, entity_key):
|
||||||
"""Check whether the matcher has an entity.
|
"""
|
||||||
|
Check whether the matcher has an entity.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
entity_key (string or int): The entity key to check.
|
entity_key (string or int): The entity key to check.
|
||||||
|
@ -318,7 +320,8 @@ cdef class Matcher:
|
||||||
return entity_key in self._entities
|
return entity_key in self._entities
|
||||||
|
|
||||||
def get_entity(self, entity_key):
|
def get_entity(self, entity_key):
|
||||||
"""Retrieve the attributes stored for an entity.
|
"""
|
||||||
|
Retrieve the attributes stored for an entity.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
entity_key (unicode or int): The entity to retrieve.
|
entity_key (unicode or int): The entity to retrieve.
|
||||||
|
@ -332,7 +335,8 @@ cdef class Matcher:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def __call__(self, Doc doc, acceptor=None):
|
def __call__(self, Doc doc, acceptor=None):
|
||||||
"""Find all token sequences matching the supplied patterns on the Doc.
|
"""
|
||||||
|
Find all token sequences matching the supplied patterns on the Doc.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
doc (Doc):
|
doc (Doc):
|
||||||
|
@ -445,7 +449,8 @@ cdef class Matcher:
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
def pipe(self, docs, batch_size=1000, n_threads=2):
|
def pipe(self, docs, batch_size=1000, n_threads=2):
|
||||||
"""Match a stream of documents, yielding them in turn.
|
"""
|
||||||
|
Match a stream of documents, yielding them in turn.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
docs: A stream of documents.
|
docs: A stream of documents.
|
||||||
|
|
|
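
The Matcher hunks above document load(), add_entity(), add_pattern() and __call__(), which yields (ent_id, label, start, end) tuples (see merge_phrase). A hedged usage sketch of that 1.x-era API; the pipeline load, the ORTH attribute and the example pattern are assumptions, not taken from this diff:

import spacy
from spacy.attrs import ORTH
from spacy.matcher import Matcher

nlp = spacy.load('en')                              # assumes the English model is installed
matcher = Matcher(nlp.vocab)
matcher.add_entity('GoogleNow')                     # register the entity key first
matcher.add_pattern('GoogleNow',                    # then attach one or more token patterns
                    [{ORTH: 'Google'}, {ORTH: 'Now'}],
                    label='PRODUCT')

doc = nlp(u'I like Google Now better than the alternatives.')
for ent_id, label, start, end in matcher(doc):      # matches are (ent_id, label, start, end)
    print(doc[start:end].text, nlp.vocab.strings[label])
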
@ -1,13 +1,9 @@
|
||||||
# cython: infer_types
|
# cython: infer_types
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
|
|
||||||
try:
|
|
||||||
import ujson as json
|
|
||||||
except ImportError:
|
|
||||||
import json
|
|
||||||
|
|
||||||
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
|
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
|
||||||
from .attrs cimport POS, IS_SPACE
|
from .attrs cimport POS, IS_SPACE
|
||||||
from .parts_of_speech import IDS as POS_IDS
|
from .parts_of_speech import IDS as POS_IDS
|
||||||
|
@ -16,7 +12,9 @@ from .attrs import LEMMA, intify_attrs
|
||||||
|
|
||||||
|
|
||||||
def _normalize_props(props):
|
def _normalize_props(props):
|
||||||
'''Transform deprecated string keys to correct names.'''
|
"""
|
||||||
|
Transform deprecated string keys to correct names.
|
||||||
|
"""
|
||||||
out = {}
|
out = {}
|
||||||
for key, value in props.items():
|
for key, value in props.items():
|
||||||
if key == POS:
|
if key == POS:
|
||||||
|
@ -98,13 +96,14 @@ cdef class Morphology:
|
||||||
flags[0] &= ~(one << flag_id)
|
flags[0] &= ~(one << flag_id)
|
||||||
|
|
||||||
def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
|
def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
|
||||||
'''Add a special-case rule to the morphological analyser. Tokens whose
|
"""
|
||||||
|
Add a special-case rule to the morphological analyser. Tokens whose
|
||||||
tag and orth match the rule will receive the specified properties.
|
tag and orth match the rule will receive the specified properties.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
tag (unicode): The part-of-speech tag to key the exception.
|
tag (unicode): The part-of-speech tag to key the exception.
|
||||||
orth (unicode): The word-form to key the exception.
|
orth (unicode): The word-form to key the exception.
|
||||||
'''
|
"""
|
||||||
tag = self.strings[tag_str]
|
tag = self.strings[tag_str]
|
||||||
tag_id = self.reverse_index[tag]
|
tag_id = self.reverse_index[tag]
|
||||||
orth = self.strings[orth_str]
|
orth = self.strings[orth_str]
|
||||||
|
|
|
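
add_special_case() above keys an exception on a (tag, orth) pair and assigns the stored properties to matching tokens. A plain-dict stand-in for that lookup, with illustrative attribute names rather than the real Cython internals:

special_cases = {}

def add_special_case(tag, orth, attrs):
    # Tokens whose tag and orth match the rule receive the stored properties.
    special_cases[(tag, orth)] = attrs

def assign_tag(token_props, tag, orth):
    token_props.update(special_cases.get((tag, orth), {}))
    return token_props

add_special_case('VBZ', 'goes', {'lemma': 'go'})
print(assign_tag({'orth': 'goes'}, 'VBZ', 'goes'))  # {'orth': 'goes', 'lemma': 'go'}
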
@ -1,8 +0,0 @@
|
||||||
class RegexMerger(object):
|
|
||||||
def __init__(self, regexes):
|
|
||||||
self.regexes = regexes
|
|
||||||
|
|
||||||
def __call__(self, tokens):
|
|
||||||
for tag, entity_type, regex in self.regexes:
|
|
||||||
for m in regex.finditer(tokens.string):
|
|
||||||
tokens.merge(m.start(), m.end(), tag, m.group(), entity_type)
|
|
|
@ -1,6 +1,7 @@
|
||||||
# coding: utf8
|
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .syntax.parser cimport Parser
|
from .syntax.parser cimport Parser
|
||||||
from .syntax.beam_parser cimport BeamParser
|
from .syntax.beam_parser cimport BeamParser
|
||||||
from .syntax.ner cimport BiluoPushDown
|
from .syntax.ner cimport BiluoPushDown
|
||||||
|
@ -11,44 +14,40 @@ from .attrs import DEP, ENT_TYPE
|
||||||
|
|
||||||
|
|
||||||
cdef class EntityRecognizer(Parser):
|
cdef class EntityRecognizer(Parser):
|
||||||
"""Annotate named entities on Doc objects."""
|
"""
|
||||||
|
Annotate named entities on Doc objects.
|
||||||
|
"""
|
||||||
TransitionSystem = BiluoPushDown
|
TransitionSystem = BiluoPushDown
|
||||||
|
|
||||||
feature_templates = get_feature_templates('ner')
|
feature_templates = get_feature_templates('ner')
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
for action in self.moves.action_types:
|
Parser.add_label(self, label)
|
||||||
self.moves.add_action(action, label)
|
|
||||||
if 'actions' in self.cfg:
|
|
||||||
self.cfg['actions'].setdefault(action,
|
|
||||||
{}).setdefault(label, True)
|
|
||||||
if isinstance(label, basestring):
|
if isinstance(label, basestring):
|
||||||
label = self.vocab.strings[label]
|
label = self.vocab.strings[label]
|
||||||
|
# Set label into serializer. Super hacky :(
|
||||||
for attr, freqs in self.vocab.serializer_freqs:
|
for attr, freqs in self.vocab.serializer_freqs:
|
||||||
if attr == ENT_TYPE and label not in freqs:
|
if attr == ENT_TYPE and label not in freqs:
|
||||||
freqs.append([label, 1])
|
freqs.append([label, 1])
|
||||||
# Super hacky :(
|
|
||||||
self.vocab._serializer = None
|
self.vocab._serializer = None
|
||||||
|
|
||||||
|
|
||||||
cdef class BeamEntityRecognizer(BeamParser):
|
cdef class BeamEntityRecognizer(BeamParser):
|
||||||
"""Annotate named entities on Doc objects."""
|
"""
|
||||||
|
Annotate named entities on Doc objects.
|
||||||
|
"""
|
||||||
TransitionSystem = BiluoPushDown
|
TransitionSystem = BiluoPushDown
|
||||||
|
|
||||||
feature_templates = get_feature_templates('ner')
|
feature_templates = get_feature_templates('ner')
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
for action in self.moves.action_types:
|
Parser.add_label(self, label)
|
||||||
self.moves.add_action(action, label)
|
|
||||||
if 'actions' in self.cfg:
|
|
||||||
self.cfg['actions'].setdefault(action,
|
|
||||||
{}).setdefault(label, True)
|
|
||||||
if isinstance(label, basestring):
|
if isinstance(label, basestring):
|
||||||
label = self.vocab.strings[label]
|
label = self.vocab.strings[label]
|
||||||
|
# Set label into serializer. Super hacky :(
|
||||||
for attr, freqs in self.vocab.serializer_freqs:
|
for attr, freqs in self.vocab.serializer_freqs:
|
||||||
if attr == ENT_TYPE and label not in freqs:
|
if attr == ENT_TYPE and label not in freqs:
|
||||||
freqs.append([label, 1])
|
freqs.append([label, 1])
|
||||||
# Super hacky :(
|
|
||||||
self.vocab._serializer = None
|
self.vocab._serializer = None
|
||||||
|
|
||||||
|
|
||||||
|
@ -58,11 +57,7 @@ cdef class DependencyParser(Parser):
|
||||||
feature_templates = get_feature_templates('basic')
|
feature_templates = get_feature_templates('basic')
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
for action in self.moves.action_types:
|
Parser.add_label(self, label)
|
||||||
self.moves.add_action(action, label)
|
|
||||||
if 'actions' in self.cfg:
|
|
||||||
self.cfg['actions'].setdefault(action,
|
|
||||||
{}).setdefault(label, True)
|
|
||||||
if isinstance(label, basestring):
|
if isinstance(label, basestring):
|
||||||
label = self.vocab.strings[label]
|
label = self.vocab.strings[label]
|
||||||
for attr, freqs in self.vocab.serializer_freqs:
|
for attr, freqs in self.vocab.serializer_freqs:
|
||||||
|
@ -78,11 +73,7 @@ cdef class BeamDependencyParser(BeamParser):
|
||||||
feature_templates = get_feature_templates('basic')
|
feature_templates = get_feature_templates('basic')
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
for action in self.moves.action_types:
|
Parser.add_label(self, label)
|
||||||
self.moves.add_action(action, label)
|
|
||||||
if 'actions' in self.cfg:
|
|
||||||
self.cfg['actions'].setdefault(action,
|
|
||||||
{}).setdefault(label, True)
|
|
||||||
if isinstance(label, basestring):
|
if isinstance(label, basestring):
|
||||||
label = self.vocab.strings[label]
|
label = self.vocab.strings[label]
|
||||||
for attr, freqs in self.vocab.serializer_freqs:
|
for attr, freqs in self.vocab.serializer_freqs:
|
||||||
|
|
|
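
The add_label() refactor above delegates the transition bookkeeping to Parser.add_label and then patches the new label into the vocab's serializer frequencies. A hedged usage sketch; the loaded pipeline and the 'EVENT' label are assumptions:

import spacy

nlp = spacy.load('en')          # assumes the English model is installed
ner = nlp.entity                # the EntityRecognizer shown in this hunk
ner.add_label('EVENT')          # adds the label for every move type via Parser.add_label,
                                # then records it in the serializer freqs
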
@ -1,12 +1,13 @@
|
||||||
from __future__ import division
|
# coding: utf8
|
||||||
from __future__ import print_function
|
from __future__ import division, print_function, unicode_literals
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from .gold import tags_to_entities
|
from .gold import tags_to_entities
|
||||||
|
|
||||||
|
|
||||||
class PRFScore(object):
|
class PRFScore(object):
|
||||||
"""A precision / recall / F score"""
|
"""
|
||||||
|
A precision / recall / F score
|
||||||
|
"""
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.tp = 0
|
self.tp = 0
|
||||||
self.fp = 0
|
self.fp = 0
|
||||||
|
|
|
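
PRFScore above accumulates true-positive, false-positive and false-negative counts; precision, recall and F1 then follow the usual definitions. A standalone sketch of that arithmetic:

from __future__ import division

def prf(tp, fp, fn):
    p = tp / (tp + fp) if (tp + fp) else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    f = 2 * p * r / (p + r) if (p + r) else 0.0
    return p, r, f

print(prf(tp=8, fp=2, fn=4))  # (0.8, 0.666..., 0.727...)
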
@ -1,12 +1,11 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals, absolute_import
|
from __future__ import unicode_literals, absolute_import
|
||||||
|
|
||||||
cimport cython
|
cimport cython
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
from libc.stdint cimport uint64_t, uint32_t
|
from libc.stdint cimport uint64_t, uint32_t
|
||||||
|
|
||||||
from murmurhash.mrmr cimport hash64, hash32
|
from murmurhash.mrmr cimport hash64, hash32
|
||||||
|
|
||||||
from preshed.maps cimport map_iter, key_t
|
from preshed.maps cimport map_iter, key_t
|
||||||
|
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
|
@ -73,13 +72,16 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
|
||||||
|
|
||||||
|
|
||||||
cdef class StringStore:
|
cdef class StringStore:
|
||||||
'''Map strings to and from integer IDs.'''
|
"""
|
||||||
|
Map strings to and from integer IDs.
|
||||||
|
"""
|
||||||
def __init__(self, strings=None, freeze=False):
|
def __init__(self, strings=None, freeze=False):
|
||||||
'''Create the StringStore.
|
"""
|
||||||
|
Create the StringStore.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
strings: A sequence of unicode strings to add to the store.
|
strings: A sequence of unicode strings to add to the store.
|
||||||
'''
|
"""
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._map = PreshMap()
|
self._map = PreshMap()
|
||||||
self._oov = PreshMap()
|
self._oov = PreshMap()
|
||||||
|
@ -104,7 +106,8 @@ cdef class StringStore:
|
||||||
return (StringStore, (list(self),))
|
return (StringStore, (list(self),))
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""The number of strings in the store.
|
"""
|
||||||
|
The number of strings in the store.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
int The number of strings in the store.
|
int The number of strings in the store.
|
||||||
|
@ -112,8 +115,9 @@ cdef class StringStore:
|
||||||
return self.size-1
|
return self.size-1
|
||||||
|
|
||||||
def __getitem__(self, object string_or_id):
|
def __getitem__(self, object string_or_id):
|
||||||
"""Retrieve a string from a given integer ID, or vice versa.
|
"""
|
||||||
|
Retrieve a string from a given integer ID, or vice versa.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
string_or_id (bytes or unicode or int):
|
string_or_id (bytes or unicode or int):
|
||||||
The value to encode.
|
The value to encode.
|
||||||
|
@ -149,17 +153,18 @@ cdef class StringStore:
|
||||||
raise TypeError(type(string_or_id))
|
raise TypeError(type(string_or_id))
|
||||||
utf8str = self._intern_utf8(byte_string, len(byte_string))
|
utf8str = self._intern_utf8(byte_string, len(byte_string))
|
||||||
if utf8str is NULL:
|
if utf8str is NULL:
|
||||||
# TODO: We need to use 32 bit here, for compatibility with the
|
# TODO: We need to use 32 bit here, for compatibility with the
|
||||||
# vocabulary values. This makes birthday paradox probabilities
|
# vocabulary values. This makes birthday paradox probabilities
|
||||||
# pretty bad.
|
# pretty bad.
|
||||||
# We could also get unlucky here, and hash into a value that
|
# We could also get unlucky here, and hash into a value that
|
||||||
# collides with the 'real' strings.
|
# collides with the 'real' strings.
|
||||||
return hash32_utf8(byte_string, len(byte_string))
|
return hash32_utf8(byte_string, len(byte_string))
|
||||||
else:
|
else:
|
||||||
return utf8str - self.c
|
return utf8str - self.c
|
||||||
|
|
||||||
def __contains__(self, unicode string not None):
|
def __contains__(self, unicode string not None):
|
||||||
"""Check whether a string is in the store.
|
"""
|
||||||
|
Check whether a string is in the store.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
string (unicode): The string to check.
|
string (unicode): The string to check.
|
||||||
|
@ -172,7 +177,8 @@ cdef class StringStore:
|
||||||
return self._map.get(key) is not NULL
|
return self._map.get(key) is not NULL
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
"""Iterate over the strings in the store, in order.
|
"""
|
||||||
|
Iterate over the strings in the store, in order.
|
||||||
|
|
||||||
Yields: unicode A string in the store.
|
Yields: unicode A string in the store.
|
||||||
"""
|
"""
|
||||||
|
@ -230,7 +236,8 @@ cdef class StringStore:
|
||||||
return &self.c[self.size-1]
|
return &self.c[self.size-1]
|
||||||
|
|
||||||
def dump(self, file_):
|
def dump(self, file_):
|
||||||
"""Save the strings to a JSON file.
|
"""
|
||||||
|
Save the strings to a JSON file.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
file_ (buffer): The file to save the strings.
|
file_ (buffer): The file to save the strings.
|
||||||
|
@ -244,7 +251,8 @@ cdef class StringStore:
|
||||||
file_.write(string_data)
|
file_.write(string_data)
|
||||||
|
|
||||||
def load(self, file_):
|
def load(self, file_):
|
||||||
"""Load the strings from a JSON file.
|
"""
|
||||||
|
Load the strings from a JSON file.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
file_ (buffer): The file from which to load the strings.
|
file_ (buffer): The file from which to load the strings.
|
||||||
|
|
|
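
The StringStore docstrings above describe a two-way mapping: __getitem__ encodes a unicode string to an integer ID and decodes an ID back to the string. A short usage sketch based on those docstrings:

from spacy.strings import StringStore

strings = StringStore([u'apple', u'orange'])  # seed the store with a sequence of strings
apple_id = strings[u'apple']                  # unicode -> int
assert strings[apple_id] == u'apple'          # int -> unicode
assert u'orange' in strings                   # __contains__ checks membership
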
@ -1,3 +1,4 @@
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
IDS = {
|
IDS = {
|
||||||
|
|
|
@ -7,17 +7,17 @@ out of "context") is in features/extractor.pyx
|
||||||
The atomic feature names are listed in a big enum, so that the feature tuples
|
The atomic feature names are listed in a big enum, so that the feature tuples
|
||||||
can refer to them.
|
can refer to them.
|
||||||
"""
|
"""
|
||||||
from libc.string cimport memset
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from libc.string cimport memset
|
||||||
from itertools import combinations
|
from itertools import combinations
|
||||||
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
|
cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
|
||||||
if token is NULL:
|
if token is NULL:
|
||||||
|
|
|
@ -1,29 +1,26 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
# cython: cdivision=True
|
# cython: cdivision=True
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||||
|
|
||||||
import ctypes
|
import ctypes
|
||||||
import os
|
from libc.stdint cimport uint32_t
|
||||||
|
from libc.string cimport memcpy
|
||||||
from ..structs cimport TokenC
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
|
from .stateclass cimport StateClass
|
||||||
|
from ._state cimport StateC, is_space_token
|
||||||
|
from .nonproj import PseudoProjectivity
|
||||||
|
from .nonproj import is_nonproj_tree
|
||||||
from .transition_system cimport do_func_t, get_cost_func_t
|
from .transition_system cimport do_func_t, get_cost_func_t
|
||||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
|
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
|
from ..structs cimport TokenC
|
||||||
from libc.stdint cimport uint32_t
|
|
||||||
from libc.string cimport memcpy
|
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
|
||||||
from .stateclass cimport StateClass
|
|
||||||
from ._state cimport StateC, is_space_token
|
|
||||||
from .nonproj import PseudoProjectivity
|
|
||||||
from .nonproj import is_nonproj_tree
|
|
||||||
|
|
||||||
|
|
||||||
DEF NON_MONOTONIC = True
|
DEF NON_MONOTONIC = True
|
||||||
|
@ -317,17 +314,20 @@ cdef class ArcEager(TransitionSystem):
|
||||||
def get_actions(cls, **kwargs):
|
def get_actions(cls, **kwargs):
|
||||||
actions = kwargs.get('actions',
|
actions = kwargs.get('actions',
|
||||||
{
|
{
|
||||||
SHIFT: {'': True},
|
SHIFT: [''],
|
||||||
REDUCE: {'': True},
|
REDUCE: [''],
|
||||||
RIGHT: {},
|
RIGHT: [],
|
||||||
LEFT: {},
|
LEFT: [],
|
||||||
BREAK: {'ROOT': True}})
|
BREAK: ['ROOT']})
|
||||||
|
seen_actions = set()
|
||||||
for label in kwargs.get('left_labels', []):
|
for label in kwargs.get('left_labels', []):
|
||||||
if label.upper() != 'ROOT':
|
if label.upper() != 'ROOT':
|
||||||
actions[LEFT][label] = True
|
if (LEFT, label) not in seen_actions:
|
||||||
|
actions[LEFT].append(label)
|
||||||
for label in kwargs.get('right_labels', []):
|
for label in kwargs.get('right_labels', []):
|
||||||
if label.upper() != 'ROOT':
|
if label.upper() != 'ROOT':
|
||||||
actions[RIGHT][label] = True
|
if (RIGHT, label) not in seen_actions:
|
||||||
|
actions[RIGHT].append(label)
|
||||||
|
|
||||||
for raw_text, sents in kwargs.get('gold_parses', []):
|
for raw_text, sents in kwargs.get('gold_parses', []):
|
||||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||||
|
@ -336,9 +336,11 @@ cdef class ArcEager(TransitionSystem):
|
||||||
label = 'ROOT'
|
label = 'ROOT'
|
||||||
if label != 'ROOT':
|
if label != 'ROOT':
|
||||||
if head < child:
|
if head < child:
|
||||||
actions[RIGHT][label] = True
|
if (RIGHT, label) not in seen_actions:
|
||||||
|
actions[RIGHT].append(label)
|
||||||
elif head > child:
|
elif head > child:
|
||||||
actions[LEFT][label] = True
|
if (LEFT, label) not in seen_actions:
|
||||||
|
actions[LEFT].append(label)
|
||||||
return actions
|
return actions
|
||||||
|
|
||||||
property action_types:
|
property action_types:
|
||||||
|
|
|
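
The get_actions() change above replaces {label: True} dicts with ordered lists, guarded by a seen_actions set so a label is appended at most once per move. Keeping the labels as ordered lists matters because the model's class indices must line up with the action order (see the add_label and TransitionSystem changes later in this commit). A plain-Python sketch of that pattern:

def add_action(actions, seen, move, label):
    # Append each (move, label) pair only once, preserving insertion order.
    if (move, label) not in seen:
        seen.add((move, label))
        actions[move].append(label)

actions = {'LEFT': [], 'RIGHT': []}
seen = set()
for move, label in [('LEFT', 'nsubj'), ('RIGHT', 'dobj'), ('LEFT', 'nsubj')]:
    add_action(actions, seen, move, label)

print(actions)  # {'LEFT': ['nsubj'], 'RIGHT': ['dobj']} -- ordered, no duplicates
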
@ -1,50 +1,34 @@
|
||||||
|
"""
|
||||||
|
MALT-style dependency parser
|
||||||
|
"""
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
# cython: experimental_cpp_class_def=True
|
# cython: experimental_cpp_class_def=True
|
||||||
# cython: cdivision=True
|
# cython: cdivision=True
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
"""
|
# coding: utf-8
|
||||||
MALT-style dependency parser
|
|
||||||
"""
|
from __future__ import unicode_literals, print_function
|
||||||
from __future__ import unicode_literals
|
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||||
|
|
||||||
from libc.stdint cimport uint32_t, uint64_t
|
from libc.stdint cimport uint32_t, uint64_t
|
||||||
from libc.string cimport memset, memcpy
|
from libc.string cimport memset, memcpy
|
||||||
from libc.stdlib cimport rand
|
from libc.stdlib cimport rand
|
||||||
from libc.math cimport log, exp, isnan, isinf
|
from libc.math cimport log, exp, isnan, isinf
|
||||||
import random
|
|
||||||
import os.path
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import json
|
|
||||||
import math
|
|
||||||
|
|
||||||
from cymem.cymem cimport Pool, Address
|
from cymem.cymem cimport Pool, Address
|
||||||
from murmurhash.mrmr cimport real_hash64 as hash64
|
from murmurhash.mrmr cimport real_hash64 as hash64
|
||||||
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
||||||
|
|
||||||
|
|
||||||
from util import Config
|
|
||||||
|
|
||||||
from thinc.linear.features cimport ConjunctionExtracter
|
from thinc.linear.features cimport ConjunctionExtracter
|
||||||
from thinc.structs cimport FeatureC, ExampleC
|
from thinc.structs cimport FeatureC, ExampleC
|
||||||
|
from thinc.extra.search cimport Beam, MaxViolation
|
||||||
from thinc.extra.search cimport Beam
|
|
||||||
from thinc.extra.search cimport MaxViolation
|
|
||||||
from thinc.extra.eg cimport Example
|
from thinc.extra.eg cimport Example
|
||||||
from thinc.extra.mb cimport Minibatch
|
from thinc.extra.mb cimport Minibatch
|
||||||
|
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..strings cimport StringStore
|
from ..strings cimport StringStore
|
||||||
|
|
||||||
from .transition_system cimport TransitionSystem, Transition
|
from .transition_system cimport TransitionSystem, Transition
|
||||||
|
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
|
|
||||||
from . import _parse_features
|
from . import _parse_features
|
||||||
from ._parse_features cimport CONTEXT_SIZE
|
from ._parse_features cimport CONTEXT_SIZE
|
||||||
from ._parse_features cimport fill_context
|
from ._parse_features cimport fill_context
|
||||||
|
@ -266,4 +250,3 @@ def is_gold(StateClass state, GoldParse gold, StringStore strings):
|
||||||
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
|
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
|
||||||
truth.add((id_, head, dep))
|
truth.add((id_, head, dep))
|
||||||
return truth == predicted
|
return truth == predicted
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,14 @@
|
||||||
from spacy.parts_of_speech cimport NOUN, PROPN, PRON
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..parts_of_speech cimport NOUN, PROPN, PRON
|
||||||
|
|
||||||
|
|
||||||
def english_noun_chunks(obj):
|
def english_noun_chunks(obj):
|
||||||
'''Detect base noun phrases from a dependency parse.
|
"""
|
||||||
Works on both Doc and Span.'''
|
Detect base noun phrases from a dependency parse.
|
||||||
|
Works on both Doc and Span.
|
||||||
|
"""
|
||||||
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
|
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
|
||||||
'attr', 'ROOT', 'root']
|
'attr', 'ROOT', 'root']
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = obj.doc # Ensure works on both Doc and Span.
|
||||||
|
|
|
@ -1,17 +1,16 @@
|
||||||
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .transition_system cimport Transition
|
|
||||||
from .transition_system cimport do_func_t
|
|
||||||
|
|
||||||
from ..structs cimport TokenC, Entity
|
|
||||||
|
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
from ..gold cimport GoldParseC
|
|
||||||
from ..gold cimport GoldParse
|
|
||||||
from ..attrs cimport ENT_TYPE, ENT_IOB
|
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
|
from .transition_system cimport Transition
|
||||||
|
from .transition_system cimport do_func_t
|
||||||
|
from ..structs cimport TokenC, Entity
|
||||||
|
from ..gold cimport GoldParseC
|
||||||
|
from ..gold cimport GoldParse
|
||||||
|
from ..attrs cimport ENT_TYPE, ENT_IOB
|
||||||
|
|
||||||
|
|
||||||
cdef enum:
|
cdef enum:
|
||||||
|
@ -21,6 +20,7 @@ cdef enum:
|
||||||
LAST
|
LAST
|
||||||
UNIT
|
UNIT
|
||||||
OUT
|
OUT
|
||||||
|
ISNT
|
||||||
N_MOVES
|
N_MOVES
|
||||||
|
|
||||||
|
|
||||||
|
@ -31,6 +31,7 @@ MOVE_NAMES[IN] = 'I'
|
||||||
MOVE_NAMES[LAST] = 'L'
|
MOVE_NAMES[LAST] = 'L'
|
||||||
MOVE_NAMES[UNIT] = 'U'
|
MOVE_NAMES[UNIT] = 'U'
|
||||||
MOVE_NAMES[OUT] = 'O'
|
MOVE_NAMES[OUT] = 'O'
|
||||||
|
MOVE_NAMES[ISNT] = 'x'
|
||||||
|
|
||||||
|
|
||||||
cdef do_func_t[N_MOVES] do_funcs
|
cdef do_func_t[N_MOVES] do_funcs
|
||||||
|
@ -54,16 +55,20 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
def get_actions(cls, **kwargs):
|
def get_actions(cls, **kwargs):
|
||||||
actions = kwargs.get('actions',
|
actions = kwargs.get('actions',
|
||||||
{
|
{
|
||||||
MISSING: {'': True},
|
MISSING: [''],
|
||||||
BEGIN: {},
|
BEGIN: [],
|
||||||
IN: {},
|
IN: [],
|
||||||
LAST: {},
|
LAST: [],
|
||||||
UNIT: {},
|
UNIT: [],
|
||||||
OUT: {'': True}
|
OUT: ['']
|
||||||
})
|
})
|
||||||
|
seen_entities = set()
|
||||||
for entity_type in kwargs.get('entity_types', []):
|
for entity_type in kwargs.get('entity_types', []):
|
||||||
|
if entity_type in seen_entities:
|
||||||
|
continue
|
||||||
|
seen_entities.add(entity_type)
|
||||||
for action in (BEGIN, IN, LAST, UNIT):
|
for action in (BEGIN, IN, LAST, UNIT):
|
||||||
actions[action][entity_type] = True
|
actions[action].append(entity_type)
|
||||||
moves = ('M', 'B', 'I', 'L', 'U')
|
moves = ('M', 'B', 'I', 'L', 'U')
|
||||||
for raw_text, sents in kwargs.get('gold_parses', []):
|
for raw_text, sents in kwargs.get('gold_parses', []):
|
||||||
for (ids, words, tags, heads, labels, biluo), _ in sents:
|
for (ids, words, tags, heads, labels, biluo), _ in sents:
|
||||||
|
@ -72,8 +77,10 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
if ner_tag.count('-') != 1:
|
if ner_tag.count('-') != 1:
|
||||||
raise ValueError(ner_tag)
|
raise ValueError(ner_tag)
|
||||||
_, label = ner_tag.split('-')
|
_, label = ner_tag.split('-')
|
||||||
for move_str in ('B', 'I', 'L', 'U'):
|
if label not in seen_entities:
|
||||||
actions[moves.index(move_str)][label] = True
|
seen_entities.add(label)
|
||||||
|
for move_str in ('B', 'I', 'L', 'U'):
|
||||||
|
actions[moves.index(move_str)].append(label)
|
||||||
return actions
|
return actions
|
||||||
|
|
||||||
property action_types:
|
property action_types:
|
||||||
|
@ -111,11 +118,17 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
label = 0
|
label = 0
|
||||||
elif '-' in name:
|
elif '-' in name:
|
||||||
move_str, label_str = name.split('-', 1)
|
move_str, label_str = name.split('-', 1)
|
||||||
|
# Hacky way to denote 'not this entity'
|
||||||
|
if label_str.startswith('!'):
|
||||||
|
label_str = label_str[1:]
|
||||||
|
move_str = 'x'
|
||||||
label = self.strings[label_str]
|
label = self.strings[label_str]
|
||||||
else:
|
else:
|
||||||
move_str = name
|
move_str = name
|
||||||
label = 0
|
label = 0
|
||||||
move = MOVE_NAMES.index(move_str)
|
move = MOVE_NAMES.index(move_str)
|
||||||
|
if move == ISNT:
|
||||||
|
return Transition(clas=0, move=ISNT, label=label, score=0)
|
||||||
for i in range(self.n_moves):
|
for i in range(self.n_moves):
|
||||||
if self.c[i].move == move and self.c[i].label == label:
|
if self.c[i].move == move and self.c[i].label == label:
|
||||||
return self.c[i]
|
return self.c[i]
|
||||||
|
@ -225,6 +238,9 @@ cdef class Begin:
|
||||||
elif g_act == BEGIN:
|
elif g_act == BEGIN:
|
||||||
# B, Gold B --> Label match
|
# B, Gold B --> Label match
|
||||||
return label != g_tag
|
return label != g_tag
|
||||||
|
# Support partial supervision in the form of "not this label"
|
||||||
|
elif g_act == ISNT:
|
||||||
|
return label == g_tag
|
||||||
else:
|
else:
|
||||||
# B, Gold I --> False (P)
|
# B, Gold I --> False (P)
|
||||||
# B, Gold L --> False (P)
|
# B, Gold L --> False (P)
|
||||||
|
@ -359,6 +375,9 @@ cdef class Unit:
|
||||||
elif g_act == UNIT:
|
elif g_act == UNIT:
|
||||||
# U, Gold U --> True iff tag match
|
# U, Gold U --> True iff tag match
|
||||||
return label != g_tag
|
return label != g_tag
|
||||||
|
# Support partial supervision in the form of "not this label"
|
||||||
|
elif g_act == ISNT:
|
||||||
|
return label == g_tag
|
||||||
else:
|
else:
|
||||||
# U, Gold B --> False
|
# U, Gold B --> False
|
||||||
# U, Gold I --> False
|
# U, Gold I --> False
|
||||||
|
@ -388,7 +407,7 @@ cdef class Out:
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef int g_tag = gold.ner[s.B(0)].label
|
cdef int g_tag = gold.ner[s.B(0)].label
|
||||||
|
|
||||||
if g_act == MISSING:
|
if g_act == MISSING or g_act == ISNT:
|
||||||
return 0
|
return 0
|
||||||
elif g_act == BEGIN:
|
elif g_act == BEGIN:
|
||||||
# O, Gold B --> False
|
# O, Gold B --> False
|
||||||
|
|
|
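
The ISNT move added above ('x' in MOVE_NAMES) supports partial supervision: a gold label prefixed with '!' means "this span is not that entity type", and the Begin/Unit/Out costs are adjusted accordingly. A plain-Python sketch of the name parsing introduced in lookup_transition:

MOVE_NAMES = ['M', 'B', 'I', 'L', 'U', 'O', 'x']   # 'x' is the new ISNT move

def parse_move(name):
    # 'U-PERSON' -> a normal unit entity; 'U-!PERSON' -> "not a PERSON" (ISNT).
    if '-' in name:
        move_str, label = name.split('-', 1)
        if label.startswith('!'):
            label = label[1:]
            move_str = 'x'
        return move_str, label
    return name, None

print(parse_move('U-PERSON'))    # ('U', 'PERSON')
print(parse_move('U-!PERSON'))   # ('x', 'PERSON') -- negative supervision
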
@ -1,8 +1,9 @@
|
||||||
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from spacy.attrs import DEP, HEAD
|
from ..attrs import DEP, HEAD
|
||||||
|
|
||||||
|
|
||||||
def ancestors(tokenid, heads):
|
def ancestors(tokenid, heads):
|
||||||
|
@ -201,5 +202,3 @@ class PseudoProjectivity:
|
||||||
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
|
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
|
||||||
filtered.append((raw_text, filtered_sents))
|
filtered.append((raw_text, filtered_sents))
|
||||||
return filtered
|
return filtered
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,58 +1,46 @@
|
||||||
# cython: infer_types=True
|
|
||||||
"""
|
"""
|
||||||
MALT-style dependency parser
|
MALT-style dependency parser
|
||||||
"""
|
"""
|
||||||
|
# coding: utf-8
|
||||||
|
# cython: infer_types=True
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from collections import Counter
|
||||||
|
import ujson
|
||||||
|
|
||||||
cimport cython
|
cimport cython
|
||||||
cimport cython.parallel
|
cimport cython.parallel
|
||||||
|
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||||
from cpython.exc cimport PyErr_CheckSignals
|
from cpython.exc cimport PyErr_CheckSignals
|
||||||
|
|
||||||
from libc.stdint cimport uint32_t, uint64_t
|
from libc.stdint cimport uint32_t, uint64_t
|
||||||
from libc.string cimport memset, memcpy
|
from libc.string cimport memset, memcpy
|
||||||
from libc.stdlib cimport malloc, calloc, free
|
from libc.stdlib cimport malloc, calloc, free
|
||||||
|
|
||||||
import os.path
|
|
||||||
from collections import Counter
|
|
||||||
from os import path
|
|
||||||
import shutil
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
from .nonproj import PseudoProjectivity
|
|
||||||
|
|
||||||
from cymem.cymem cimport Pool, Address
|
|
||||||
from murmurhash.mrmr cimport hash64
|
|
||||||
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
||||||
from thinc.linear.avgtron cimport AveragedPerceptron
|
from thinc.linear.avgtron cimport AveragedPerceptron
|
||||||
from thinc.linalg cimport VecVec
|
from thinc.linalg cimport VecVec
|
||||||
from thinc.structs cimport SparseArrayC
|
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
|
||||||
|
from thinc.extra.eg cimport Example
|
||||||
|
from cymem.cymem cimport Pool, Address
|
||||||
|
from murmurhash.mrmr cimport hash64
|
||||||
from preshed.maps cimport MapStruct
|
from preshed.maps cimport MapStruct
|
||||||
from preshed.maps cimport map_get
|
from preshed.maps cimport map_get
|
||||||
|
|
||||||
from thinc.structs cimport FeatureC
|
|
||||||
from thinc.structs cimport ExampleC
|
|
||||||
from thinc.extra.eg cimport Example
|
|
||||||
|
|
||||||
from util import Config
|
|
||||||
|
|
||||||
from ..structs cimport TokenC
|
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
|
||||||
from ..strings cimport StringStore
|
|
||||||
|
|
||||||
from .transition_system import OracleError
|
|
||||||
from .transition_system cimport TransitionSystem, Transition
|
|
||||||
|
|
||||||
from ..gold cimport GoldParse
|
|
||||||
|
|
||||||
from . import _parse_features
|
from . import _parse_features
|
||||||
from ._parse_features cimport CONTEXT_SIZE
|
from ._parse_features cimport CONTEXT_SIZE
|
||||||
from ._parse_features cimport fill_context
|
from ._parse_features cimport fill_context
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
|
from .nonproj import PseudoProjectivity
|
||||||
|
from .transition_system import OracleError
|
||||||
|
from .transition_system cimport TransitionSystem, Transition
|
||||||
|
from ..structs cimport TokenC
|
||||||
|
from ..tokens.doc cimport Doc
|
||||||
|
from ..strings cimport StringStore
|
||||||
|
from ..gold cimport GoldParse
|
||||||
|
|
||||||
USE_FTRL = True
|
|
||||||
|
USE_FTRL = False
|
||||||
DEBUG = False
|
DEBUG = False
|
||||||
def set_debug(val):
|
def set_debug(val):
|
||||||
global DEBUG
|
global DEBUG
|
||||||
|
@ -80,7 +68,9 @@ cdef class ParserModel(AveragedPerceptron):
|
||||||
return nr_feat
|
return nr_feat
|
||||||
|
|
||||||
def update(self, Example eg, itn=0):
|
def update(self, Example eg, itn=0):
|
||||||
'''Does regression on negative cost. Sort of cute?'''
|
"""
|
||||||
|
Does regression on negative cost. Sort of cute?
|
||||||
|
"""
|
||||||
self.time += 1
|
self.time += 1
|
||||||
cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
|
cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
|
||||||
cdef int guess = eg.guess
|
cdef int guess = eg.guess
|
||||||
|
@ -132,10 +122,13 @@ cdef class ParserModel(AveragedPerceptron):
|
||||||
|
|
||||||
|
|
||||||
cdef class Parser:
|
cdef class Parser:
|
||||||
"""Base class of the DependencyParser and EntityRecognizer."""
|
"""
|
||||||
|
Base class of the DependencyParser and EntityRecognizer.
|
||||||
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
|
def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
|
||||||
"""Load the statistical model from the supplied path.
|
"""
|
||||||
|
Load the statistical model from the supplied path.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
path (Path):
|
path (Path):
|
||||||
|
@ -148,10 +141,16 @@ cdef class Parser:
|
||||||
The newly constructed object.
|
The newly constructed object.
|
||||||
"""
|
"""
|
||||||
with (path / 'config.json').open() as file_:
|
with (path / 'config.json').open() as file_:
|
||||||
cfg = json.load(file_)
|
cfg = ujson.load(file_)
|
||||||
# TODO: remove this shim when we don't have to support older data
|
# TODO: remove this shim when we don't have to support older data
|
||||||
if 'labels' in cfg and 'actions' not in cfg:
|
if 'labels' in cfg and 'actions' not in cfg:
|
||||||
cfg['actions'] = cfg.pop('labels')
|
cfg['actions'] = cfg.pop('labels')
|
||||||
|
# TODO: remove this shim when we don't have to support older data
|
||||||
|
for action_name, labels in dict(cfg['actions']).items():
|
||||||
|
# We need this to be sorted
|
||||||
|
if isinstance(labels, dict):
|
||||||
|
labels = list(sorted(labels.keys()))
|
||||||
|
cfg['actions'][action_name] = labels
|
||||||
self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg)
|
self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg)
|
||||||
if (path / 'model').exists():
|
if (path / 'model').exists():
|
||||||
self.model.load(str(path / 'model'))
|
self.model.load(str(path / 'model'))
|
||||||
|
@ -161,7 +160,8 @@ cdef class Parser:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
|
def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
|
||||||
"""Create a Parser.
|
"""
|
||||||
|
Create a Parser.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
vocab (Vocab):
|
vocab (Vocab):
|
||||||
|
@ -186,12 +186,18 @@ cdef class Parser:
|
||||||
self.model.learn_rate = cfg.get('learn_rate', 0.001)
|
self.model.learn_rate = cfg.get('learn_rate', 0.001)
|
||||||
|
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
|
# TODO: This is a pretty hacky fix to the problem of adding more
|
||||||
|
# labels. The issue is they come in out of order, if labels are
|
||||||
|
# added during training
|
||||||
|
for label in cfg.get('extra_labels', []):
|
||||||
|
self.add_label(label)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||||
|
|
||||||
def __call__(self, Doc tokens):
|
def __call__(self, Doc tokens):
|
||||||
"""Apply the entity recognizer, setting the annotations onto the Doc object.
|
"""
|
||||||
|
Apply the entity recognizer, setting the annotations onto the Doc object.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
doc (Doc): The document to be processed.
|
doc (Doc): The document to be processed.
|
||||||
|
@ -208,7 +214,8 @@ cdef class Parser:
|
||||||
self.moves.finalize_doc(tokens)
|
self.moves.finalize_doc(tokens)
|
||||||
|
|
||||||
def pipe(self, stream, int batch_size=1000, int n_threads=2):
|
def pipe(self, stream, int batch_size=1000, int n_threads=2):
|
||||||
"""Process a stream of documents.
|
"""
|
||||||
|
Process a stream of documents.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
stream: The sequence of documents to process.
|
stream: The sequence of documents to process.
|
||||||
|
@ -296,7 +303,8 @@ cdef class Parser:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def update(self, Doc tokens, GoldParse gold, itn=0):
|
def update(self, Doc tokens, GoldParse gold, itn=0):
|
||||||
"""Update the statistical model.
|
"""
|
||||||
|
Update the statistical model.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
doc (Doc):
|
doc (Doc):
|
||||||
|
@ -334,15 +342,17 @@ cdef class Parser:
|
||||||
self.moves.finalize_state(stcls.c)
|
self.moves.finalize_state(stcls.c)
|
||||||
return loss
|
return loss
|
||||||
|
|
||||||
def step_through(self, Doc doc):
|
def step_through(self, Doc doc, GoldParse gold=None):
|
||||||
"""Set up a stepwise state, to introspect and control the transition sequence.
|
"""
|
||||||
|
Set up a stepwise state, to introspect and control the transition sequence.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
doc (Doc): The document to step through.
|
doc (Doc): The document to step through.
|
||||||
|
gold (GoldParse): Optional gold parse
|
||||||
Returns (StepwiseState):
|
Returns (StepwiseState):
|
||||||
A state object, to step through the annotation process.
|
A state object, to step through the annotation process.
|
||||||
"""
|
"""
|
||||||
return StepwiseState(self, doc)
|
return StepwiseState(self, doc, gold=gold)
|
||||||
|
|
||||||
def from_transition_sequence(self, Doc doc, sequence):
|
def from_transition_sequence(self, Doc doc, sequence):
|
||||||
"""Control the annotations on a document by specifying a transition sequence
|
"""Control the annotations on a document by specifying a transition sequence
|
||||||
|
@ -360,18 +370,28 @@ cdef class Parser:
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
# Doesn't set label into serializer -- subclasses override it to do that.
|
# Doesn't set label into serializer -- subclasses override it to do that.
|
||||||
for action in self.moves.action_types:
|
for action in self.moves.action_types:
|
||||||
self.moves.add_action(action, label)
|
added = self.moves.add_action(action, label)
|
||||||
|
if added:
|
||||||
|
# Important that the labels be stored as a list! We need the
|
||||||
|
# order, or the model goes out of synch
|
||||||
|
self.cfg.setdefault('extra_labels', []).append(label)
|
||||||
|
|
||||||
|
|
||||||
cdef class StepwiseState:
|
cdef class StepwiseState:
|
||||||
cdef readonly StateClass stcls
|
cdef readonly StateClass stcls
|
||||||
cdef readonly Example eg
|
cdef readonly Example eg
|
||||||
cdef readonly Doc doc
|
cdef readonly Doc doc
|
||||||
|
cdef readonly GoldParse gold
|
||||||
cdef readonly Parser parser
|
cdef readonly Parser parser
|
||||||
|
|
||||||
def __init__(self, Parser parser, Doc doc):
|
def __init__(self, Parser parser, Doc doc, GoldParse gold=None):
|
||||||
self.parser = parser
|
self.parser = parser
|
||||||
self.doc = doc
|
self.doc = doc
|
||||||
|
if gold is not None:
|
||||||
|
self.gold = gold
|
||||||
|
self.parser.moves.preprocess_gold(self.gold)
|
||||||
|
else:
|
||||||
|
self.gold = GoldParse(doc)
|
||||||
self.stcls = StateClass.init(doc.c, doc.length)
|
self.stcls = StateClass.init(doc.c, doc.length)
|
||||||
self.parser.moves.initialize_state(self.stcls.c)
|
self.parser.moves.initialize_state(self.stcls.c)
|
||||||
self.eg = Example(
|
self.eg = Example(
|
||||||
|
@ -406,6 +426,24 @@ cdef class StepwiseState:
|
||||||
return [self.doc.vocab.strings[self.stcls.c._sent[i].dep]
|
return [self.doc.vocab.strings[self.stcls.c._sent[i].dep]
|
||||||
for i in range(self.stcls.c.length)]
|
for i in range(self.stcls.c.length)]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def costs(self):
|
||||||
|
"""
|
||||||
|
Find the action-costs for the current state.
|
||||||
|
"""
|
||||||
|
if not self.gold:
|
||||||
|
raise ValueError("Can't set costs: No GoldParse provided")
|
||||||
|
self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs,
|
||||||
|
self.stcls, self.gold)
|
||||||
|
costs = {}
|
||||||
|
for i in range(self.parser.moves.n_moves):
|
||||||
|
if not self.eg.c.is_valid[i]:
|
||||||
|
continue
|
||||||
|
transition = self.parser.moves.c[i]
|
||||||
|
name = self.parser.moves.move_name(transition.move, transition.label)
|
||||||
|
costs[name] = self.eg.c.costs[i]
|
||||||
|
return costs
|
||||||
|
|
||||||
def predict(self):
|
def predict(self):
|
||||||
self.eg.reset()
|
self.eg.reset()
|
||||||
self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features,
|
self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features,
|
||||||
|
|
|
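
step_through() above now accepts an optional GoldParse, and the new costs property reports the oracle cost of each currently valid move. A hedged usage sketch; the loaded pipeline and the example sentence are assumptions, and a real GoldParse would carry NER annotations:

import spacy
from spacy.gold import GoldParse

nlp = spacy.load('en')                          # assumption: English model installed
doc = nlp.tokenizer(u'London is a big city.')
gold = GoldParse(doc)                           # bare GoldParse, as in StepwiseState.__init__
state = nlp.entity.step_through(doc, gold)      # gold is optional; omit it to explore freely
print(state.costs)                              # {move_name: cost} for the valid moves
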
@ -1,5 +1,9 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from libc.string cimport memcpy, memset
|
from libc.string cimport memcpy, memset
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
|
|
||||||
from ..vocab cimport EMPTY_LEXEME
|
from ..vocab cimport EMPTY_LEXEME
|
||||||
from ..structs cimport Entity
|
from ..structs cimport Entity
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
|
@ -28,6 +32,6 @@ cdef class StateClass:
|
||||||
top = words[self.S(0)] + '_%d' % self.S_(0).head
|
top = words[self.S(0)] + '_%d' % self.S_(0).head
|
||||||
second = words[self.S(1)] + '_%d' % self.S_(1).head
|
second = words[self.S(1)] + '_%d' % self.S_(1).head
|
||||||
third = words[self.S(2)] + '_%d' % self.S_(2).head
|
third = words[self.S(2)] + '_%d' % self.S_(2).head
|
||||||
n0 = words[self.B(0)]
|
n0 = words[self.B(0)]
|
||||||
n1 = words[self.B(1)]
|
n1 = words[self.B(1)]
|
||||||
return ' '.join((third, second, top, '|', n0, n1))
|
return ' '.join((third, second, top, '|', n0, n1))
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
@ -6,7 +10,6 @@ from collections import defaultdict
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
|
||||||
|
|
||||||
|
|
||||||
cdef weight_t MIN_SCORE = -90000
|
cdef weight_t MIN_SCORE = -90000
|
||||||
|
@ -32,7 +35,7 @@ cdef class TransitionSystem:
|
||||||
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
|
self.c = <Transition*>self.mem.alloc(self._size, sizeof(Transition))
|
||||||
|
|
||||||
for action, label_strs in sorted(labels_by_action.items()):
|
for action, label_strs in sorted(labels_by_action.items()):
|
||||||
for label_str in sorted(label_strs):
|
for label_str in label_strs:
|
||||||
self.add_action(int(action), label_str)
|
self.add_action(int(action), label_str)
|
||||||
self.root_label = self.strings['ROOT']
|
self.root_label = self.strings['ROOT']
|
||||||
self.freqs = {} if _freqs is None else _freqs
|
self.freqs = {} if _freqs is None else _freqs
|
||||||
|
|
|
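
Dropping sorted() here means labels are taken in the order the config provides, which is why Parser.load (earlier in this commit) sorts old dict-style 'actions' entries exactly once when converting them to lists. A plain-Python sketch of that conversion shim:

def normalize_actions(cfg):
    # Old configs stored actions as {label: True}; convert to a list with a
    # deterministic (sorted) order so class indices stay stable across loads.
    for action_name, labels in dict(cfg['actions']).items():
        if isinstance(labels, dict):
            labels = list(sorted(labels.keys()))
        cfg['actions'][action_name] = labels
    return cfg

cfg = {'actions': {'0': [''], '1': {'nsubj': True, 'dobj': True}}}
print(normalize_actions(cfg)['actions']['1'])   # ['dobj', 'nsubj']
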
@ -1,18 +0,0 @@
|
||||||
from os import path
|
|
||||||
import json
|
|
||||||
|
|
||||||
class Config(object):
|
|
||||||
def __init__(self, **kwargs):
|
|
||||||
for key, value in kwargs.items():
|
|
||||||
setattr(self, key, value)
|
|
||||||
|
|
||||||
def get(self, attr, default=None):
|
|
||||||
return self.__dict__.get(attr, default)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def write(cls, model_dir, name, **kwargs):
|
|
||||||
open(path.join(model_dir, '%s.json' % name), 'w').write(json.dumps(kwargs))
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def read(cls, model_dir, name):
|
|
||||||
return cls(**json.load(open(path.join(model_dir, '%s.json' % name))))
|
|
|
@ -1,5 +1,7 @@
|
||||||
import json
|
# coding: utf8
|
||||||
import pathlib
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import ujson
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
@ -12,8 +14,8 @@ from thinc.linalg cimport VecVec
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
from .attrs cimport TAG
|
from .attrs cimport TAG
|
||||||
from .gold cimport GoldParse
|
from .gold cimport GoldParse
|
||||||
|
|
||||||
from .attrs cimport *
|
from .attrs cimport *
|
||||||
|
from . import util
|
||||||
|
|
||||||
|
|
||||||
cpdef enum:
|
cpdef enum:
|
||||||
|
@ -106,10 +108,13 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
||||||
|
|
||||||
|
|
||||||
cdef class Tagger:
|
cdef class Tagger:
|
||||||
"""Annotate part-of-speech tags on Doc objects."""
|
"""
|
||||||
|
Annotate part-of-speech tags on Doc objects.
|
||||||
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, path, vocab, require=False):
|
def load(cls, path, vocab, require=False):
|
||||||
"""Load the statistical model from the supplied path.
|
"""
|
||||||
|
Load the statistical model from the supplied path.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
path (Path):
|
path (Path):
|
||||||
|
@ -123,10 +128,10 @@ cdef class Tagger:
|
||||||
"""
|
"""
|
||||||
# TODO: Change this to expect config.json when we don't have to
|
# TODO: Change this to expect config.json when we don't have to
|
||||||
# support old data.
|
# support old data.
|
||||||
path = path if not isinstance(path, basestring) else pathlib.Path(path)
|
path = util.ensure_path(path)
|
||||||
if (path / 'templates.json').exists():
|
if (path / 'templates.json').exists():
|
||||||
with (path / 'templates.json').open('r', encoding='utf8') as file_:
|
with (path / 'templates.json').open('r', encoding='utf8') as file_:
|
||||||
templates = json.load(file_)
|
templates = ujson.load(file_)
|
||||||
elif require:
|
elif require:
|
||||||
raise IOError(
|
raise IOError(
|
||||||
"Required file %s/templates.json not found when loading Tagger" % str(path))
|
"Required file %s/templates.json not found when loading Tagger" % str(path))
|
||||||
|
@ -142,7 +147,8 @@ cdef class Tagger:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
|
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
|
||||||
"""Create a Tagger.
|
"""
|
||||||
|
Create a Tagger.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
vocab (Vocab):
|
vocab (Vocab):
|
||||||
|
@ -180,7 +186,8 @@ cdef class Tagger:
|
||||||
tokens._py_tokens = [None] * tokens.length
|
tokens._py_tokens = [None] * tokens.length
|
||||||
|
|
||||||
def __call__(self, Doc tokens):
|
def __call__(self, Doc tokens):
|
||||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
"""
|
||||||
|
Apply the tagger, setting the POS tags onto the Doc object.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
doc (Doc): The tokens to be tagged.
|
doc (Doc): The tokens to be tagged.
|
||||||
|
@ -208,7 +215,8 @@ cdef class Tagger:
|
||||||
tokens._py_tokens = [None] * tokens.length
|
tokens._py_tokens = [None] * tokens.length
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=1000, n_threads=2):
|
def pipe(self, stream, batch_size=1000, n_threads=2):
|
||||||
"""Tag a stream of documents.
|
"""
|
||||||
|
Tag a stream of documents.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
stream: The sequence of documents to tag.
|
stream: The sequence of documents to tag.
|
||||||
|
@ -225,7 +233,8 @@ cdef class Tagger:
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
def update(self, Doc tokens, GoldParse gold, itn=0):
|
def update(self, Doc tokens, GoldParse gold, itn=0):
|
||||||
"""Update the statistical model, with tags supplied for the given document.
|
"""
|
||||||
|
Update the statistical model, with tags supplied for the given document.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
doc (Doc):
|
doc (Doc):
|
||||||
|
|
|
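
The Tagger docstrings above cover __call__ and pipe(), which tags a stream of documents in batches. A hedged usage sketch; the loaded pipeline and the example texts are assumptions:

import spacy

nlp = spacy.load('en')                          # assumption: English model installed
texts = [u'I like green eggs.', u'They sell ham here.']
docs = (nlp.tokenizer(text) for text in texts)  # tokenize only, then tag as a stream
for doc in nlp.tagger.pipe(docs, batch_size=1000, n_threads=2):
    print([(w.text, w.tag_) for w in doc])
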
@@ -1,17 +1,11 @@
 # cython: embedsignature=True
+# coding: utf8
 from __future__ import unicode_literals

-import pathlib
+import ujson

 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc

-try:
-    import ujson as json
-except ImportError:
-    import json
-
-
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap

@@ -23,12 +17,15 @@ from .tokens.doc cimport Doc


 cdef class Tokenizer:
-    """Segment text, and create Doc objects with the discovered segment boundaries."""
+    """
+    Segment text, and create Doc objects with the discovered segment boundaries.
+    """
     @classmethod
     def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
              infix_finditer=None, token_match=None):
-        '''Load a Tokenizer, reading unsupplied components from the path.
+        """
+        Load a Tokenizer, reading unsupplied components from the path.

         Arguments:
             path (Path):
                 The path to load from.
@@ -45,13 +42,11 @@ cdef class Tokenizer:
             infix_finditer:
                 Signature of re.compile(string).finditer
         Returns Tokenizer
-        '''
-        if isinstance(path, basestring):
-            path = pathlib.Path(path)
+        """
+        path = util.ensure_path(path)

         if rules is None:
             with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
-                rules = json.load(file_)
+                rules = ujson.load(file_)
         if prefix_search in (None, True):
             with (path / 'tokenizer' / 'prefix.txt').open() as file_:
                 entries = file_.read().split('\n')
@@ -67,8 +62,9 @@ cdef class Tokenizer:
         return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)

     def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
-        '''Create a Tokenizer, to create Doc objects given unicode text.
+        """
+        Create a Tokenizer, to create Doc objects given unicode text.

         Arguments:
             vocab (Vocab):
                 A storage container for lexical types.
@@ -85,7 +81,7 @@ cdef class Tokenizer:
                 to find infixes.
             token_match:
                 A boolean function matching strings that becomes tokens.
-        '''
+        """
         self.mem = Pool()
         self._cache = PreshMap()
         self._specials = PreshMap()
@@ -107,7 +103,7 @@ cdef class Tokenizer:
                 self.token_match)

         return (self.__class__, args, None, None)

     cpdef Doc tokens_from_list(self, list strings):
         return Doc(self.vocab, words=strings)
         #raise NotImplementedError(
@@ -117,7 +113,8 @@ cdef class Tokenizer:

     @cython.boundscheck(False)
     def __call__(self, unicode string):
-        """Tokenize a string.
+        """
+        Tokenize a string.

         Arguments:
             string (unicode): The string to tokenize.
@@ -170,7 +167,8 @@ cdef class Tokenizer:
         return tokens

     def pipe(self, texts, batch_size=1000, n_threads=2):
-        """Tokenize a stream of texts.
+        """
+        Tokenize a stream of texts.

         Arguments:
             texts: A sequence of unicode texts.
@@ -270,7 +268,7 @@ cdef class Tokenizer:
         cache_hit = self._try_cache(hash_string(string), tokens)
         if cache_hit:
             pass
         elif self.token_match and self.token_match(string):
             # We're always saying 'no' to spaces here -- the caller will
             # fix up the outermost one, with reference to the original.
             # See Issue #859
@@ -324,7 +322,8 @@ cdef class Tokenizer:
             self._cache.set(key, cached)

     def find_infix(self, unicode string):
-        """Find internal split points of the string, such as hyphens.
+        """
+        Find internal split points of the string, such as hyphens.

         string (unicode): The string to segment.

@@ -337,7 +336,8 @@ cdef class Tokenizer:
         return list(self.infix_finditer(string))

     def find_prefix(self, unicode string):
-        """Find the length of a prefix that should be segmented from the string,
+        """
+        Find the length of a prefix that should be segmented from the string,
         or None if no prefix rules match.

         Arguments:
@@ -350,7 +350,8 @@ cdef class Tokenizer:
         return (match.end() - match.start()) if match is not None else 0

     def find_suffix(self, unicode string):
-        """Find the length of a suffix that should be segmented from the string,
+        """
+        Find the length of a suffix that should be segmented from the string,
         or None if no suffix rules match.

         Arguments:
@@ -363,13 +364,15 @@ cdef class Tokenizer:
         return (match.end() - match.start()) if match is not None else 0

     def _load_special_tokenization(self, special_cases):
-        '''Add special-case tokenization rules.
-        '''
+        """
+        Add special-case tokenization rules.
+        """
         for chunk, substrings in sorted(special_cases.items()):
             self.add_special_case(chunk, substrings)

     def add_special_case(self, unicode string, substrings):
-        '''Add a special-case tokenization rule.
+        """
+        Add a special-case tokenization rule.

         Arguments:
             string (unicode): The string to specially tokenize.
@@ -378,7 +381,7 @@ cdef class Tokenizer:
             attributes. The ORTH fields of the attributes must exactly match
             the string when they are concatenated.
         Returns None
-        '''
+        """
         substrings = list(substrings)
         cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
         cached.length = len(substrings)
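The add_special_case() signature above takes the exact string plus a list of attribute dicts whose ORTH values concatenate back to that string. A short illustrative sketch, assuming a Tokenizer instance named tokenizer (the example itself is not part of the commit):

    from spacy.attrs import ORTH, LEMMA

    # "don't" is split into two tokens whose ORTH values join back to "don't".
    tokenizer.add_special_case(u"don't",
                               [{ORTH: u"do"},
                                {ORTH: u"n't", LEMMA: u"not"}])
    doc = tokenizer(u"I don't care")
    assert [t.text for t in doc] == [u"I", u"do", u"n't", u"care"]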
@@ -1,15 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 cimport cython
+cimport numpy as np
+import numpy
+import numpy.linalg
+import struct

 from libc.string cimport memcpy, memset
 from libc.stdint cimport uint32_t
 from libc.math cimport sqrt

-import numpy
-import numpy.linalg
-import struct
-cimport numpy as np
-import six
-import warnings
+from .span cimport Span
+from .token cimport Token

 from ..lexeme cimport Lexeme
 from ..lexeme cimport EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
@@ -19,11 +22,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
-from .span cimport Span
-from .token cimport Token
 from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
+from ..compat import is_config


 DEF PADDING = 5
@@ -76,7 +78,7 @@ cdef class Doc:

     """
     def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
-        '''
+        """
         Create a Doc object.

         Aside: Implementation
@@ -97,7 +99,7 @@ cdef class Doc:
             A list of boolean values, of the same length as words. True
             means that the word is followed by a space, False means it is not.
             If None, defaults to [True]*len(words)
-        '''
+        """
         self.vocab = vocab
         size = 20
         self.mem = Pool()
@@ -158,7 +160,7 @@ cdef class Doc:
             self.is_parsed = True

     def __getitem__(self, object i):
-        '''
+        """
         doc[i]
         Get the Token object at position i, where i is an integer.
         Negative indexing is supported, and follows the usual Python
@@ -172,7 +174,7 @@ cdef class Doc:
         are not supported, as `Span` objects must be contiguous (cannot have gaps).
         You can use negative indices and open-ended ranges, which have their
         normal Python semantics.
-        '''
+        """
         if isinstance(i, slice):
             start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self, start, stop, label=0)
@@ -186,7 +188,7 @@ cdef class Doc:
             return Token.cinit(self.vocab, &self.c[i], i, self)

     def __iter__(self):
-        '''
+        """
         for token in doc
         Iterate over `Token` objects, from which the annotations can
         be easily accessed. This is the main way of accessing Token
@@ -194,7 +196,7 @@ cdef class Doc:
         Python. If faster-than-Python speeds are required, you can
         instead access the annotations as a numpy array, or access the
         underlying C data directly from Cython.
-        '''
+        """
         cdef int i
         for i in range(self.length):
             if self._py_tokens[i] is not None:
@@ -203,10 +205,10 @@ cdef class Doc:
                 yield Token.cinit(self.vocab, &self.c[i], i, self)

     def __len__(self):
-        '''
+        """
         len(doc)
         The number of tokens in the document.
-        '''
+        """
         return self.length

     def __unicode__(self):
@@ -216,7 +218,7 @@ cdef class Doc:
         return u''.join([t.text_with_ws for t in self]).encode('utf-8')

     def __str__(self):
-        if six.PY3:
+        if is_config(python3=True):
             return self.__unicode__()
         return self.__bytes__()

@@ -228,7 +230,8 @@ cdef class Doc:
         return self

     def similarity(self, other):
-        '''Make a semantic similarity estimate. The default estimate is cosine
+        """
+        Make a semantic similarity estimate. The default estimate is cosine
         similarity using an average of word vectors.

         Arguments:
@@ -237,7 +240,7 @@ cdef class Doc:

         Return:
             score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
         if 'similarity' in self.user_hooks:
             return self.user_hooks['similarity'](self, other)
         if self.vector_norm == 0 or other.vector_norm == 0:
@@ -245,9 +248,9 @@ cdef class Doc:
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

     property has_vector:
-        '''
+        """
         A boolean value indicating whether a word vector is associated with the object.
-        '''
+        """
         def __get__(self):
             if 'has_vector' in self.user_hooks:
                 return self.user_hooks['has_vector'](self)
@@ -255,11 +258,11 @@ cdef class Doc:
             return any(token.has_vector for token in self)

     property vector:
-        '''
+        """
         A real-valued meaning representation. Defaults to an average of the token vectors.

         Type: numpy.ndarray[ndim=1, dtype='float32']
-        '''
+        """
         def __get__(self):
             if 'vector' in self.user_hooks:
                 return self.user_hooks['vector'](self)
@@ -294,17 +297,21 @@ cdef class Doc:
             return self.text

     property text:
-        '''A unicode representation of the document text.'''
+        """
+        A unicode representation of the document text.
+        """
         def __get__(self):
             return u''.join(t.text_with_ws for t in self)

     property text_with_ws:
-        '''An alias of Doc.text, provided for duck-type compatibility with Span and Token.'''
+        """
+        An alias of Doc.text, provided for duck-type compatibility with Span and Token.
+        """
         def __get__(self):
             return self.text

     property ents:
-        '''
+        """
         Yields named-entity `Span` objects, if the entity recognizer
         has been applied to the document. Iterate over the span to get
         individual Token objects, or access the label:
@@ -318,7 +325,7 @@ cdef class Doc:
             assert ents[0].label_ == 'PERSON'
             assert ents[0].orth_ == 'Best'
             assert ents[0].text == 'Mr. Best'
-        '''
+        """
         def __get__(self):
             cdef int i
             cdef const TokenC* token
@@ -382,13 +389,13 @@ cdef class Doc:
         self.c[start].ent_iob = 3

     property noun_chunks:
-        '''
+        """
         Yields base noun-phrase #[code Span] objects, if the document
         has been syntactically parsed. A base noun phrase, or
         'NP chunk', is a noun phrase that does not permit other NPs to
         be nested within it – so no NP-level coordination, no prepositional
-        phrases, and no relative clauses. For example:
-        '''
+        phrases, and no relative clauses.
+        """
         def __get__(self):
             if not self.is_parsed:
                 raise ValueError(
@@ -496,7 +503,8 @@ cdef class Doc:
         return output

     def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
-        """Produce a dict of {attribute (int): count (ints)} frequencies, keyed
+        """
+        Produce a dict of {attribute (int): count (ints)} frequencies, keyed
         by the values of the given attribute ID.

         Example:
@@ -563,8 +571,9 @@ cdef class Doc:
             self.c[i] = parsed[i]

     def from_array(self, attrs, array):
-        '''Write to a `Doc` object, from an `(M, N)` array of attributes.
-        '''
+        """
+        Write to a `Doc` object, from an `(M, N)` array of attributes.
+        """
         cdef int i, col
         cdef attr_id_t attr_id
         cdef TokenC* tokens = self.c
@@ -603,19 +612,23 @@ cdef class Doc:
         return self

     def to_bytes(self):
-        '''Serialize, producing a byte string.'''
+        """
+        Serialize, producing a byte string.
+        """
         byte_string = self.vocab.serializer.pack(self)
         cdef uint32_t length = len(byte_string)
         return struct.pack('I', length) + byte_string

     def from_bytes(self, data):
-        '''Deserialize, loading from bytes.'''
+        """
+        Deserialize, loading from bytes.
+        """
         self.vocab.serializer.unpack_into(data[4:], self)
         return self

     @staticmethod
     def read_bytes(file_):
-        '''
+        """
         A static method, used to read serialized #[code Doc] objects from
         a file. For example:

@@ -630,7 +643,7 @@ cdef class Doc:
             for byte_string in Doc.read_bytes(file_):
                 docs.append(Doc(nlp.vocab).from_bytes(byte_string))
         assert len(docs) == 2
-        '''
+        """
         keep_reading = True
         while keep_reading:
             try:
@@ -644,7 +657,8 @@ cdef class Doc:
             yield n_bytes_str + data

     def merge(self, int start_idx, int end_idx, *args, **attributes):
-        """Retokenize the document, such that the span at doc.text[start_idx : end_idx]
+        """
+        Retokenize the document, such that the span at doc.text[start_idx : end_idx]
         is merged into a single token. If start_idx and end_idx do not mark start
         and end token boundaries, the document remains unchanged.

@@ -658,7 +672,6 @@ cdef class Doc:
             token (Token):
                 The newly merged token, or None if the start and end indices did
                 not fall at token boundaries.
-
         """
         cdef unicode tag, lemma, ent_type
         if len(args) == 3:
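Throughout these hunks the six.PY3 checks are replaced by is_config(python3=True), imported from the new spacy.compat module, which is not itself part of this diff. A rough sketch of what such a helper could look like; the name is taken from the import above, but the signature and body here are assumptions for illustration only:

    import sys

    def is_config(python2=None, python3=None):
        # Return True only if every explicitly requested flag matches the
        # interpreter we are running under.
        state = {
            'python2': sys.version_info[0] == 2,
            'python3': sys.version_info[0] == 3,
        }
        requested = {'python2': python2, 'python3': python3}
        return all(state[key] == value
                   for key, value in requested.items() if value is not None)

    # Mirrors the Doc.__str__ change above:
    #     if is_config(python3=True): return self.__unicode__()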
@@ -1,26 +1,31 @@
+# coding: utf8
 from __future__ import unicode_literals
 from collections import defaultdict

+cimport numpy as np
 import numpy
 import numpy.linalg
-cimport numpy as np
 from libc.math cimport sqrt
-import six

+from .doc cimport token_by_start, token_by_end
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..util import normalize_slice
-from .doc cimport token_by_start, token_by_end
 from ..attrs cimport IS_PUNCT, IS_SPACE
 from ..lexeme cimport Lexeme
+from ..compat import is_config


 cdef class Span:
-    """A slice from a Doc object."""
+    """
+    A slice from a Doc object.
+    """
     def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
                   vector_norm=None):
-        '''Create a Span object from the slice doc[start : end]
+        """
+        Create a Span object from the slice doc[start : end]

         Arguments:
             doc (Doc): The parent document.
@@ -30,7 +35,7 @@ cdef class Span:
             vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
         Returns:
             Span The newly constructed object.
-        '''
+        """
         if not (0 <= start <= end <= len(doc)):
             raise IndexError

@@ -68,7 +73,7 @@ cdef class Span:
         return self.end - self.start

     def __repr__(self):
-        if six.PY3:
+        if is_config(python3=True):
             return self.text
         return self.text.encode('utf-8')

@@ -89,7 +94,8 @@ cdef class Span:
             yield self.doc[i]

     def merge(self, *args, **attributes):
-        """Retokenize the document, such that the span is merged into a single token.
+        """
+        Retokenize the document, such that the span is merged into a single token.

         Arguments:
             **attributes:
@@ -102,7 +108,8 @@ cdef class Span:
         return self.doc.merge(self.start_char, self.end_char, *args, **attributes)

     def similarity(self, other):
-        '''Make a semantic similarity estimate. The default estimate is cosine
+        """
+        Make a semantic similarity estimate. The default estimate is cosine
         similarity using an average of word vectors.

         Arguments:
@@ -111,7 +118,7 @@ cdef class Span:

         Return:
             score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
         if 'similarity' in self.doc.user_span_hooks:
             self.doc.user_span_hooks['similarity'](self, other)
         if self.vector_norm == 0.0 or other.vector_norm == 0.0:
@@ -133,11 +140,12 @@ cdef class Span:
             self.end = end + 1

     property sent:
-        '''The sentence span that this span is a part of.
+        """
+        The sentence span that this span is a part of.

         Returns:
             Span The sentence this is part of.
-        '''
+        """
         def __get__(self):
             if 'sent' in self.doc.user_span_hooks:
                 return self.doc.user_span_hooks['sent'](self)
@@ -198,13 +206,13 @@ cdef class Span:
             return u''.join([t.text_with_ws for t in self])

     property noun_chunks:
-        '''
+        """
         Yields base noun-phrase #[code Span] objects, if the document
         has been syntactically parsed. A base noun phrase, or
         'NP chunk', is a noun phrase that does not permit other NPs to
         be nested within it – so no NP-level coordination, no prepositional
         phrases, and no relative clauses. For example:
-        '''
+        """
         def __get__(self):
             if not self.doc.is_parsed:
                 raise ValueError(
@@ -223,17 +231,16 @@ cdef class Span:
                 yield span

     property root:
-        """The token within the span that's highest in the parse tree. If there's a tie, the earlist is prefered.
+        """
+        The token within the span that's highest in the parse tree. If there's a
+        tie, the earlist is prefered.

         Returns:
             Token: The root token.

-        i.e. has the
-        shortest path to the root of the sentence (or is the root itself).
-
-        If multiple words are equally high in the tree, the first word is taken.
-
-        For example:
+        i.e. has the shortest path to the root of the sentence (or is the root
+        itself). If multiple words are equally high in the tree, the first word
+        is taken. For example:

         >>> toks = nlp(u'I like New York in Autumn.')

@@ -303,7 +310,8 @@ cdef class Span:
             return self.doc[root]

     property lefts:
-        """Tokens that are to the left of the span, whose head is within the Span.
+        """
+        Tokens that are to the left of the span, whose head is within the Span.

         Yields: Token A left-child of a token of the span.
         """
@@ -314,7 +322,8 @@ cdef class Span:
                     yield left

     property rights:
-        """Tokens that are to the right of the Span, whose head is within the Span.
+        """
+        Tokens that are to the right of the Span, whose head is within the Span.

         Yields: Token A right-child of a token of the span.
         """
@@ -325,7 +334,8 @@ cdef class Span:
                     yield right

     property subtree:
-        """Tokens that descend from tokens in the span, but fall outside it.
+        """
+        Tokens that descend from tokens in the span, but fall outside it.

         Yields: Token A descendant of a token within the span.
         """
@@ -337,7 +347,9 @@ cdef class Span:
             yield from word.subtree

     property ent_id:
-        '''An (integer) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        An (integer) entity ID. Usually assigned by patterns in the Matcher.
+        """
         def __get__(self):
             return self.root.ent_id

@@ -345,9 +357,11 @@ cdef class Span:
             # TODO
             raise NotImplementedError(
                 "Can't yet set ent_id from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/spacy-io/spaCy")
+                "tracker: http://github.com/explosion/spaCy/issues")

     property ent_id_:
-        '''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        A (string) entity ID. Usually assigned by patterns in the Matcher.
+        """
         def __get__(self):
             return self.root.ent_id_

@@ -355,7 +369,7 @@ cdef class Span:
             # TODO
             raise NotImplementedError(
                 "Can't yet set ent_id_ from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/spacy-io/spaCy")
+                "tracker: http://github.com/explosion/spaCy/issues")

     property orth_:
         def __get__(self):
@@ -397,5 +411,5 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
         raise RuntimeError(
             "Array bounds exceeded while searching for root word. This likely "
             "means the parse tree is in an invalid state. Please report this "
-            "issue here: http://github.com/honnibal/spaCy/")
+            "issue here: http://github.com/explosion/spaCy/issues")
     return n
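Both Doc.merge() and Span.merge() above collapse a multi-word span into a single token. A hypothetical usage sketch, assuming a loaded `nlp` pipeline; the three positional arguments correspond to the tag, lemma and entity type handled by the `len(args) == 3` branch shown in the Doc.merge hunk:

    doc = nlp(u'I like New York in Autumn.')
    start = doc.text.index(u'New York')
    end = start + len(u'New York')
    # Merge the characters spanning "New York" into one token.
    merged = doc.merge(start, end, u'NNP', u'new york', u'GPE')
    if merged is not None:
        assert merged.text == u'New York'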
@@ -1,5 +1,5 @@
-# coding: utf8
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals

 from libc.string cimport memcpy
@@ -8,20 +8,15 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free
 from cython.view cimport array as cvarray
 cimport numpy as np
 np.import_array()

 import numpy
-import six


 from ..typedefs cimport hash_t
 from ..lexeme cimport Lexeme
 from .. import parts_of_speech

 from ..attrs cimport LEMMA
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CCONJ, PUNCT

 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from ..attrs cimport IS_BRACKET
 from ..attrs cimport IS_QUOTE
@@ -29,12 +24,13 @@ from ..attrs cimport IS_LEFT_PUNCT
 from ..attrs cimport IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV

 from ..lexeme cimport Lexeme
+from ..compat import is_config


 cdef class Token:
-    """An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
+    """
+    An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
     """
     def __cinit__(self, Vocab vocab, Doc doc, int offset):
         self.vocab = vocab
@@ -46,7 +42,9 @@ cdef class Token:
         return hash((self.doc, self.i))

     def __len__(self):
-        '''Number of unicode characters in token.text'''
+        """
+        Number of unicode characters in token.text.
+        """
         return self.c.lex.length

     def __unicode__(self):
@@ -56,7 +54,7 @@ cdef class Token:
         return self.text.encode('utf8')

     def __str__(self):
-        if six.PY3:
+        if is_config(python3=True):
             return self.__unicode__()
         return self.__bytes__()

@@ -83,27 +81,30 @@ cdef class Token:
             raise ValueError(op)

     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
-        '''Check the value of a boolean flag.
+        """
+        Check the value of a boolean flag.

         Arguments:
             flag_id (int): The ID of the flag attribute.
         Returns:
             is_set (bool): Whether the flag is set.
-        '''
+        """
         return Lexeme.c_check_flag(self.c.lex, flag_id)

     def nbor(self, int i=1):
-        '''Get a neighboring token.
+        """
+        Get a neighboring token.

         Arguments:
             i (int): The relative position of the token to get. Defaults to 1.
         Returns:
             neighbor (Token): The token at position self.doc[self.i+i]
-        '''
+        """
         return self.doc[self.i+i]

     def similarity(self, other):
-        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+        """
+        Compute a semantic similarity estimate. Defaults to cosine over vectors.

         Arguments:
             other:
@@ -111,7 +112,7 @@ cdef class Token:
                 Token and Lexeme objects.
         Returns:
             score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
         if 'similarity' in self.doc.user_token_hooks:
             return self.doc.user_token_hooks['similarity'](self)
         if self.vector_norm == 0 or other.vector_norm == 0:
@@ -209,9 +210,9 @@ cdef class Token:
             self.c.dep = label

     property has_vector:
-        '''
+        """
         A boolean value indicating whether a word vector is associated with the object.
-        '''
+        """
         def __get__(self):
             if 'has_vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['has_vector'](self)
@@ -223,11 +224,11 @@ cdef class Token:
             return False

     property vector:
-        '''
+        """
         A real-valued meaning representation.

         Type: numpy.ndarray[ndim=1, dtype='float32']
-        '''
+        """
         def __get__(self):
             if 'vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['vector'](self)
@@ -245,6 +246,7 @@ cdef class Token:
     property repvec:
         def __get__(self):
             raise AttributeError("repvec was renamed to vector in v0.100")
+
     property has_repvec:
         def __get__(self):
             raise AttributeError("has_repvec was renamed to has_vector in v0.100")
@@ -265,7 +267,8 @@ cdef class Token:

     property lefts:
         def __get__(self):
-            """The leftward immediate children of the word, in the syntactic
+            """
+            The leftward immediate children of the word, in the syntactic
             dependency parse.
             """
             cdef int nr_iter = 0
@@ -282,8 +285,10 @@ cdef class Token:

     property rights:
         def __get__(self):
-            """The rightward immediate children of the word, in the syntactic
-            dependency parse."""
+            """
+            The rightward immediate children of the word, in the syntactic
+            dependency parse.
+            """
             cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
             tokens = []
             cdef int nr_iter = 0
@@ -300,19 +305,21 @@ cdef class Token:
                 yield t

     property children:
-        '''A sequence of the token's immediate syntactic children.
+        """
+        A sequence of the token's immediate syntactic children.

         Yields: Token A child token such that child.head==self
-        '''
+        """
         def __get__(self):
             yield from self.lefts
             yield from self.rights

     property subtree:
-        '''A sequence of all the token's syntactic descendents.
+        """
+        A sequence of all the token's syntactic descendents.

         Yields: Token A descendent token such that self.is_ancestor(descendent)
-        '''
+        """
         def __get__(self):
             for word in self.lefts:
                 yield from word.subtree
@@ -321,26 +328,29 @@ cdef class Token:
                 yield from word.subtree

     property left_edge:
-        '''The leftmost token of this token's syntactic descendents.
+        """
+        The leftmost token of this token's syntactic descendents.

         Returns: Token The first token such that self.is_ancestor(token)
-        '''
+        """
         def __get__(self):
             return self.doc[self.c.l_edge]

     property right_edge:
-        '''The rightmost token of this token's syntactic descendents.
+        """
+        The rightmost token of this token's syntactic descendents.

         Returns: Token The last token such that self.is_ancestor(token)
-        '''
+        """
         def __get__(self):
             return self.doc[self.c.r_edge]

     property ancestors:
-        '''A sequence of this token's syntactic ancestors.
+        """
+        A sequence of this token's syntactic ancestors.

         Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
-        '''
+        """
         def __get__(self):
             cdef const TokenC* head_ptr = self.c
             # guard against infinite loop, no token can have
@@ -356,25 +366,29 @@ cdef class Token:
         return self.is_ancestor(descendant)

     def is_ancestor(self, descendant):
-        '''Check whether this token is a parent, grandparent, etc. of another
+        """
+        Check whether this token is a parent, grandparent, etc. of another
         in the dependency tree.

         Arguments:
             descendant (Token): Another token.
         Returns:
             is_ancestor (bool): Whether this token is the ancestor of the descendant.
-        '''
+        """
         if self.doc is not descendant.doc:
             return False
         return any( ancestor.i == self.i for ancestor in descendant.ancestors )

     property head:
-        '''The syntactic parent, or "governor", of this token.
+        """
+        The syntactic parent, or "governor", of this token.

         Returns: Token
-        '''
+        """
         def __get__(self):
-            """The token predicted by the parser to be the head of the current token."""
+            """
+            The token predicted by the parser to be the head of the current token.
+            """
             return self.doc[self.i + self.c.head]
         def __set__(self, Token new_head):
             # this function sets the head of self to new_head
@@ -467,10 +481,11 @@ cdef class Token:
         self.c.head = rel_newhead_i

     property conjuncts:
-        '''A sequence of coordinated tokens, including the token itself.
+        """
+        A sequence of coordinated tokens, including the token itself.

         Yields: Token A coordinated token
-        '''
+        """
         def __get__(self):
             """Get a list of conjoined words."""
             cdef Token word
@@ -501,7 +516,9 @@ cdef class Token:
         return iob_strings[self.c.ent_iob]

     property ent_id:
-        '''An (integer) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        An (integer) entity ID. Usually assigned by patterns in the Matcher.
+        """
         def __get__(self):
             return self.c.ent_id

@@ -509,7 +526,9 @@ cdef class Token:
         self.c.ent_id = key

     property ent_id_:
-        '''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        A (string) entity ID. Usually assigned by patterns in the Matcher.
+        """
         def __get__(self):
             return self.vocab.strings[self.c.ent_id]
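Doc, Span and Token all share the same Python 2/3 string pattern: __unicode__ returns text, __bytes__ returns UTF-8, and __str__ dispatches on the interpreter version. A plain-Python illustration of that shape (not spaCy code, just a minimal sketch of the pattern):

    import sys

    class Text(object):
        def __init__(self, value):
            self.value = value

        def __unicode__(self):
            return self.value

        def __bytes__(self):
            return self.value.encode('utf8')

        def __str__(self):
            # Python 3: return unicode; Python 2: return UTF-8 bytes -- the
            # same decision the is_config(python3=True) branches above make.
            if sys.version_info[0] == 3:
                return self.__unicode__()
            return self.__bytes__()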
@@ -1,15 +1,16 @@
-from __future__ import absolute_import
-from __future__ import unicode_literals
+# coding: utf8
+from __future__ import absolute_import, unicode_literals

 import random
 import tqdm
-from .gold import GoldParse
+from .gold import GoldParse, merge_sents
 from .scorer import Scorer
-from .gold import merge_sents


 class Trainer(object):
-    '''Manage training of an NLP pipeline.'''
+    """
+    Manage training of an NLP pipeline.
+    """
     def __init__(self, nlp, gold_tuples):
         self.nlp = nlp
         self.gold_tuples = gold_tuples
@ -1,29 +1,18 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
import os
|
|
||||||
import io
|
import io
|
||||||
import json
|
import ujson
|
||||||
import re
|
import re
|
||||||
import os.path
|
from pathlib import Path
|
||||||
import pathlib
|
|
||||||
import sys
|
import sys
|
||||||
import textwrap
|
import textwrap
|
||||||
|
|
||||||
|
from .compat import basestring_, unicode_, input_
|
||||||
try:
|
|
||||||
basestring
|
|
||||||
except NameError:
|
|
||||||
basestring = str
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
|
||||||
raw_input
|
|
||||||
except NameError: # Python 3
|
|
||||||
raw_input = input
|
|
||||||
|
|
||||||
|
|
||||||
LANGUAGES = {}
|
LANGUAGES = {}
|
||||||
_data_path = pathlib.Path(__file__).parent / 'data'
|
_data_path = Path(__file__).parent / 'data'
|
||||||
|
|
||||||
|
|
||||||
def set_lang_class(name, cls):
|
def set_lang_class(name, cls):
|
||||||
|
@ -47,9 +36,14 @@ def get_data_path(require_exists=True):
|
||||||
|
|
||||||
def set_data_path(path):
|
def set_data_path(path):
|
||||||
global _data_path
|
global _data_path
|
||||||
if isinstance(path, basestring):
|
_data_path = ensure_path(path)
|
||||||
path = pathlib.Path(path)
|
|
||||||
_data_path = path
|
|
||||||
|
def ensure_path(path):
|
||||||
|
if isinstance(path, basestring_):
|
||||||
|
return Path(path)
|
||||||
|
else:
|
||||||
|
return path
|
||||||
|
|
||||||
|
|
||||||
def or_(val1, val2):
|
def or_(val1, val2):
|
||||||
|
@ -61,41 +55,8 @@ def or_(val1, val2):
|
||||||
return val2
|
return val2
|
||||||
|
|
||||||
|
|
||||||
def match_best_version(target_name, target_version, path):
|
|
||||||
path = path if not isinstance(path, basestring) else pathlib.Path(path)
|
|
||||||
if path is None or not path.exists():
|
|
||||||
return None
|
|
||||||
matches = []
|
|
||||||
for data_name in path.iterdir():
|
|
||||||
name, version = split_data_name(data_name.parts[-1])
|
|
||||||
if name == target_name and constraint_match(target_version, version):
|
|
||||||
matches.append((tuple(float(v) for v in version.split('.')), data_name))
|
|
||||||
if matches:
|
|
||||||
return pathlib.Path(max(matches)[1])
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def split_data_name(name):
|
|
||||||
return name.split('-', 1) if '-' in name else (name, '')
|
|
||||||
|
|
||||||
|
|
||||||
def constraint_match(constraint_string, version):
|
|
||||||
# From http://github.com/spacy-io/sputnik
|
|
||||||
if not constraint_string:
|
|
||||||
return True
|
|
||||||
|
|
||||||
constraints = [c.strip() for c in constraint_string.split(',') if c.strip()]
|
|
||||||
|
|
||||||
for c in constraints:
|
|
||||||
if not re.match(r'[><=][=]?\d+(\.\d+)*', c):
|
|
||||||
raise ValueError('invalid constraint: %s' % c)
|
|
||||||
|
|
||||||
return all(semver.match(version, c) for c in constraints)
|
|
||||||
|
|
||||||
|
|
||||||
def read_regex(path):
|
def read_regex(path):
|
||||||
path = path if not isinstance(path, basestring) else pathlib.Path(path)
|
path = ensure_path(path)
|
||||||
with path.open() as file_:
|
with path.open() as file_:
|
||||||
entries = file_.read().split('\n')
|
entries = file_.read().split('\n')
|
||||||
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
|
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
|
||||||
|
@ -152,21 +113,11 @@ def check_renamed_kwargs(renamed, kwargs):
|
||||||
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
|
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
|
||||||
|
|
||||||
|
|
||||||
def is_windows():
|
|
||||||
"""Check if user is on Windows."""
|
|
||||||
return sys.platform.startswith('win')
|
|
||||||
|
|
||||||
|
|
||||||
def is_python2():
|
|
||||||
"""Check if Python 2 is used."""
|
|
||||||
return sys.version.startswith('2.')
|
|
||||||
|
|
||||||
|
|
||||||
def parse_package_meta(package_path, package, require=True):
|
def parse_package_meta(package_path, package, require=True):
|
||||||
location = os.path.join(str(package_path), package, 'meta.json')
|
location = package_path / package / 'meta.json'
|
||||||
if os.path.isfile(location):
|
if location.is_file():
|
||||||
with io.open(location, encoding='utf8') as f:
|
with location.open('r', encoding='utf8') as f:
|
||||||
meta = json.load(f)
|
meta = ujson.load(f)
|
||||||
return meta
|
return meta
|
||||||
elif require:
|
elif require:
|
||||||
raise IOError("Could not read meta.json from %s" % location)
|
raise IOError("Could not read meta.json from %s" % location)
|
||||||
|
@ -181,7 +132,7 @@ def get_raw_input(description, default=False):
|
||||||
|
|
||||||
additional = ' (default: {d})'.format(d=default) if default else ''
|
additional = ' (default: {d})'.format(d=default) if default else ''
|
||||||
prompt = ' {d}{a}: '.format(d=description, a=additional)
|
prompt = ' {d}{a}: '.format(d=description, a=additional)
|
||||||
user_input = raw_input(prompt)
|
user_input = input_(prompt)
|
||||||
return user_input
|
return user_input
|
||||||
|
|
||||||
|
|
||||||
|
@ -209,10 +160,9 @@ def print_markdown(data, **kwargs):
|
||||||
which will be converted to a list of tuples."""
|
which will be converted to a list of tuples."""
|
||||||
|
|
||||||
def excl_value(value):
|
def excl_value(value):
|
||||||
# don't print value if it contains absolute path of directory
|
# don't print value if it contains absolute path of directory (i.e.
|
||||||
# (i.e. personal info that shouldn't need to be shared)
|
# personal info). Other conditions can be included here if necessary.
|
||||||
# other conditions can be included here if necessary
|
if unicode_(Path(__file__).parent) in value:
|
||||||
if str(pathlib.Path(__file__).parent) in value:
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
if type(data) == dict:
|
if type(data) == dict:
|
||||||
|
|
119
spacy/vocab.pyx
119
spacy/vocab.pyx
|
@@ -1,41 +1,29 @@
+# coding: utf8
 from __future__ import unicode_literals
 
+import bz2
+import ujson
+import re
+
 from libc.string cimport memset
 from libc.stdint cimport int32_t
 from libc.math cimport sqrt
+from cymem.cymem cimport Address
-
-from pathlib import Path
-import bz2
-import ujson as json
-import re
-
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
 
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
 from .typedefs cimport attr_t
 from .cfile cimport CFile, StringCFile
-from .lemmatizer import Lemmatizer
-from .attrs import intify_attrs
 from .tokens.token cimport Token
 
-from . import attrs
-from . import symbols
-
-from cymem.cymem cimport Address
 from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
 
+from .compat import copy_reg, pickle
+from .lemmatizer import Lemmatizer
+from .attrs import intify_attrs
 from . import util
+from . import attrs
+from . import symbols
-try:
-    import copy_reg
-except ImportError:
-    import copyreg as copy_reg
 
 
 DEF MAX_VEC_SIZE = 100000
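The new `from .compat import copy_reg, pickle` line suggests the try/except import fallbacks deleted in this hunk now live in one shared compatibility module. A hedged sketch of what that module might contain (contents assumed, not taken from this diff):

    # compat.py - hypothetical sketch of shared Python 2/3 shims
    import sys

    is_python2 = sys.version_info[0] == 2

    if is_python2:
        import cPickle as pickle
        import copy_reg
    else:
        import pickle
        import copyreg as copy_reg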
@@ -48,8 +36,9 @@ EMPTY_LEXEME.vector = EMPTY_VEC
 
 
 cdef class Vocab:
-    '''A map container for a language's LexemeC structs.
-    '''
+    """
+    A map container for a language's LexemeC structs.
+    """
     @classmethod
     def load(cls, path, lex_attr_getters=None, lemmatizer=True,
              tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
@@ -72,8 +61,7 @@ cdef class Vocab:
         Returns:
             Vocab: The newly constructed vocab object.
         """
-        if isinstance(path, basestring):
-            path = Path(path)
+        path = util.ensure_path(path)
         util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
         if 'vectors' in deprecated_kwargs:
             raise AttributeError(
@@ -81,7 +69,7 @@ cdef class Vocab:
                 "Install vectors after loading.")
         if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
             with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
-                tag_map = json.load(file_)
+                tag_map = ujson.load(file_)
         elif tag_map is True:
             tag_map = None
         if lex_attr_getters is not None \
@@ -94,12 +82,12 @@ cdef class Vocab:
             lemmatizer = Lemmatizer.load(path)
         if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
             with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_:
-                serializer_freqs = json.load(file_)
+                serializer_freqs = ujson.load(file_)
         else:
             serializer_freqs = None
 
         with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
-            strings_list = json.load(file_)
+            strings_list = ujson.load(file_)
         cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs,
                               strings=strings_list)
@@ -108,7 +96,8 @@ cdef class Vocab:
 
     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
                  serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
-        '''Create the vocabulary.
+        """
+        Create the vocabulary.
 
         lex_attr_getters (dict):
             A dictionary mapping attribute IDs to functions to compute them.
@@ -123,7 +112,7 @@ cdef class Vocab:
 
         Returns:
             Vocab: The newly constructed vocab object.
-        '''
+        """
         util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
 
         lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
@@ -172,17 +161,19 @@ cdef class Vocab:
         return langfunc('_') if langfunc else ''
 
     def __len__(self):
-        """The current number of lexemes stored."""
+        """
+        The current number of lexemes stored.
+        """
         return self.length
 
     def resize_vectors(self, int new_size):
-        '''
+        """
         Set vectors_length to a new size, and allocate more memory for the Lexeme
         vectors if necessary. The memory will be zeroed.
 
         Arguments:
             new_size (int): The new size of the vectors.
-        '''
+        """
         cdef hash_t key
         cdef size_t addr
         if new_size > self.vectors_length:
@@ -193,7 +184,8 @@ cdef class Vocab:
             self.vectors_length = new_size
 
     def add_flag(self, flag_getter, int flag_id=-1):
-        '''Set a new boolean flag to words in the vocabulary.
+        """
+        Set a new boolean flag to words in the vocabulary.
 
         The flag_setter function will be called over the words currently in the
         vocab, and then applied to new words as they occur. You'll then be able
@@ -213,7 +205,7 @@ cdef class Vocab:
 
         Returns:
             flag_id (int): The integer ID by which the flag value can be checked.
-        '''
+        """
         if flag_id == -1:
             for bit in range(1, 64):
                 if bit not in self.lex_attr_getters:
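add_flag, documented above, attaches a custom boolean attribute to every lexeme and returns the bit ID used to query it later. A hedged usage sketch (the model shortcut and flag function are invented for illustration):

    import spacy

    nlp = spacy.load('en')  # assumes an installed English model

    # Flag words written entirely in upper case.
    IS_SHOUTY = nlp.vocab.add_flag(lambda text: text.isupper())

    doc = nlp(u'WARNING this is fine')
    print([(w.text, w.check_flag(IS_SHOUTY)) for w in doc])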
@@ -234,9 +226,11 @@ cdef class Vocab:
         return flag_id
 
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
-        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
+        """
+        Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
         if necessary, using memory acquired from the given pool. If the pool
-        is the lexicon's own memory, the lexeme is saved in the lexicon.'''
+        is the lexicon's own memory, the lexeme is saved in the lexicon.
+        """
         if string == u'':
             return &EMPTY_LEXEME
         cdef LexemeC* lex
@@ -252,9 +246,11 @@ cdef class Vocab:
         return self._new_lexeme(mem, string)
 
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
-        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
+        """
+        Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
         if necessary, using memory acquired from the given pool. If the pool
-        is the lexicon's own memory, the lexeme is saved in the lexicon.'''
+        is the lexicon's own memory, the lexeme is saved in the lexicon.
+        """
         if orth == 0:
             return &EMPTY_LEXEME
         cdef LexemeC* lex
@@ -297,30 +293,33 @@ cdef class Vocab:
         self.length += 1
 
     def __contains__(self, unicode string):
-        '''Check whether the string has an entry in the vocabulary.
+        """
+        Check whether the string has an entry in the vocabulary.
 
         Arguments:
             string (unicode): The ID string.
 
         Returns:
             bool Whether the string has an entry in the vocabulary.
-        '''
+        """
         key = hash_string(string)
         lex = self._by_hash.get(key)
         return lex is not NULL
 
     def __iter__(self):
-        '''Iterate over the lexemes in the vocabulary.
+        """
+        Iterate over the lexemes in the vocabulary.
 
         Yields: Lexeme An entry in the vocabulary.
-        '''
+        """
         cdef attr_t orth
         cdef size_t addr
         for orth, addr in self._by_orth.items():
             yield Lexeme(self, orth)
 
     def __getitem__(self, id_or_string):
-        '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
+        """
+        Retrieve a lexeme, given an int ID or a unicode string. If a previously
         unseen unicode string is given, a new lexeme is created and stored.
 
         Arguments:
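Together, __len__, __contains__, __iter__ and __getitem__ give the vocabulary its container behaviour. A hedged sketch of how they are typically exercised (the model import is assumed, not shown in this diff):

    from spacy.en import English  # assumes the English model data is installed

    vocab = English().vocab

    print(len(vocab))             # __len__: number of lexemes stored
    print(u'apple' in vocab)      # __contains__: membership test by string
    lexeme = vocab[u'apple']      # __getitem__: fetch (or create) the lexeme
    print(lexeme.orth, lexeme.orth_)
    sample = [lex.orth_ for lex, _ in zip(vocab, range(5))]  # __iter__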
@@ -332,7 +331,7 @@ cdef class Vocab:
 
         Returns:
             lexeme (Lexeme): The lexeme indicated by the given ID.
-        '''
+        """
         cdef attr_t orth
         if type(id_or_string) == unicode:
             orth = self.strings[id_or_string]
@@ -355,7 +354,8 @@ cdef class Vocab:
         return tokens
 
     def dump(self, loc=None):
-        """Save the lexemes binary data to the given location, or
+        """
+        Save the lexemes binary data to the given location, or
         return a byte-string with the data if loc is None.
 
         Arguments:
@@ -392,14 +392,15 @@ cdef class Vocab:
             return fp.string_data()
 
     def load_lexemes(self, loc):
-        '''Load the binary vocabulary data from the given location.
+        """
+        Load the binary vocabulary data from the given location.
 
         Arguments:
             loc (Path): The path to load from.
 
         Returns:
             None
-        '''
+        """
         fp = CFile(loc, 'rb',
                    on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
         cdef LexemeC* lexeme = NULL
@@ -440,8 +441,9 @@ cdef class Vocab:
         fp.close()
 
     def _deserialize_lexemes(self, CFile fp):
-        '''Load the binary vocabulary data from the given CFile.
-        '''
+        """
+        Load the binary vocabulary data from the given CFile.
+        """
         cdef LexemeC* lexeme = NULL
         cdef hash_t key
         cdef unicode py_str
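dump and load_lexemes form the save/load pair for the binary lexeme table; the string store is persisted separately. A hedged round-trip sketch (paths invented, and str() used because the exact path type CFile accepts is not shown here):

    from pathlib import Path
    from spacy.en import English  # assumes the English model data is installed

    vocab = English().vocab
    out_dir = Path('/tmp/my_vocab')  # hypothetical output directory
    out_dir.mkdir(parents=True, exist_ok=True)

    vocab.dump(str(out_dir / 'lexemes.bin'))          # write the binary lexeme data
    vocab.load_lexemes(str(out_dir / 'lexemes.bin'))  # read it back in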
@@ -494,13 +496,14 @@ cdef class Vocab:
         fp.close()
 
     def dump_vectors(self, out_loc):
-        '''Save the word vectors to a binary file.
+        """
+        Save the word vectors to a binary file.
 
         Arguments:
             loc (Path): The path to save to.
         Returns:
             None
-        '''
+        """
         cdef int32_t vec_len = self.vectors_length
         cdef int32_t word_len
         cdef bytes word_str
@@ -522,7 +525,8 @@ cdef class Vocab:
         out_file.close()
 
     def load_vectors(self, file_):
-        """Load vectors from a text-based file.
+        """
+        Load vectors from a text-based file.
 
         Arguments:
             file_ (buffer): The file to read from. Entries should be separated by newlines,
@@ -561,7 +565,8 @@ cdef class Vocab:
         return vec_len
 
     def load_vectors_from_bin_loc(self, loc):
-        """Load vectors from the location of a binary file.
+        """
+        Load vectors from the location of a binary file.
 
         Arguments:
             loc (unicode): The path of the binary file to load from.