Merge branch 'develop' into feature/fix-matcher-operators

Matthew Honnibal 2017-10-16 13:38:36 +02:00
commit a928ae2f35
78 changed files with 2319 additions and 779 deletions

View File

@ -0,0 +1,52 @@
# coding: utf-8
"""This example contains several snippets of methods that can be set via custom
Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
they're "bound" to the object and are partially applied i.e. the object
they're called on is passed in as the first argument."""
from __future__ import unicode_literals
from spacy.lang.en import English
from spacy.tokens import Doc, Span
from spacy import displacy
from pathlib import Path
def to_html(doc, output='/tmp', style='dep'):
"""Doc method extension for saving the current state as a displaCy
visualization.
"""
# generate filename from first six non-punct tokens
file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
output_path = Path(output) / file_name
html = displacy.render(doc, style=style, page=True) # render markup
output_path.open('w', encoding='utf-8').write(html) # save to file
print('Saved HTML to {}'.format(output_path))
Doc.set_extension('to_html', method=to_html)
nlp = English()
doc = nlp(u"This is a sentence about Apple.")
# add entity manually for demo purposes, to make it work without a model
doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
doc._.to_html(style='ent')
def overlap_tokens(doc, other_doc):
"""Get the tokens from the original Doc that are also in the comparison Doc.
"""
overlap = []
other_tokens = [token.text for token in other_doc]
for token in doc:
if token.text in other_tokens:
overlap.append(token)
return overlap
Doc.set_extension('overlap', method=overlap_tokens)
nlp = English()
doc1 = nlp(u"Peach emoji is where it has always been.")
doc2 = nlp(u"Peach is the superior emoji.")
tokens = doc1._.overlap(doc2)
print(tokens)
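
Extensions can also be registered with a getter or a plain default value instead of a method; a minimal sketch (the attribute names below are only illustrative, not part of the example above):

from spacy.lang.en import English
from spacy.tokens import Doc

# A getter is computed every time the attribute is accessed; a default is a
# stored value that can be overwritten per Doc.
Doc.set_extension('n_words', getter=lambda doc: len([t for t in doc if not t.is_punct]))
Doc.set_extension('reviewed', default=False)

nlp = English()
doc = nlp(u"This is a sentence about Apple.")
print(doc._.n_words)    # computed via the getter
doc._.reviewed = True   # overwrites the stored default for this Doc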

View File

@ -0,0 +1,108 @@
# coding: utf-8
from __future__ import unicode_literals
import requests
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token
class RESTCountriesComponent(object):
"""Example of a spaCy v2.0 pipeline component that requests all countries
via the REST Countries API, merges country names into one token, assigns
entity labels and sets attributes on country tokens, e.g. the capital and
lat/lng coordinates. Can be extended with more details from the API.
REST Countries API: https://restcountries.eu
API License: Mozilla Public License MPL 2.0
"""
name = 'rest_countries' # component name, will show up in the pipeline
def __init__(self, nlp, label='GPE'):
"""Initialise the pipeline component. The shared nlp instance is used
to initialise the matcher with the shared vocab, get the label ID and
generate Doc objects as phrase match patterns.
"""
# Make request once on initialisation and store the data
r = requests.get('https://restcountries.eu/rest/v2/all')
r.raise_for_status() # make sure requests raises an error if it fails
countries = r.json()
# Convert API response to dict keyed by country name for easy lookup
# This could also be extended using the alternative and foreign language
# names provided by the API
self.countries = {c['name']: c for c in countries}
self.label = nlp.vocab.strings[label] # get entity label ID
# Set up the PhraseMatcher with Doc patterns for each country name
patterns = [nlp(c) for c in self.countries.keys()]
self.matcher = PhraseMatcher(nlp.vocab)
self.matcher.add('COUNTRIES', None, *patterns)
# Register attribute on the Token. We'll be overwriting this based on
# the matches, so we're only setting a default value, not a getter.
# If no default value is set, it defaults to None.
Token.set_extension('is_country', default=False)
Token.set_extension('country_capital')
Token.set_extension('country_latlng')
Token.set_extension('country_flag')
# Register attributes on Doc and Span via a getter that checks if one of
# the contained tokens is set to is_country == True.
Doc.set_extension('has_country', getter=self.has_country)
Span.set_extension('has_country', getter=self.has_country)
def __call__(self, doc):
"""Apply the pipeline component on a Doc object and modify it if matches
are found. Return the Doc, so it can be processed by the next component
in the pipeline, if available.
"""
matches = self.matcher(doc)
spans = [] # keep the spans for later so we can merge them afterwards
for _, start, end in matches:
# Generate Span representing the entity & set label
entity = Span(doc, start, end, label=self.label)
spans.append(entity)
# Set custom attribute on each token of the entity
# Can be extended with other data returned by the API, like
# currencies, country code, flag, calling code etc.
for token in entity:
token._.set('is_country', True)
token._.set('country_capital', self.countries[entity.text]['capital'])
token._.set('country_latlng', self.countries[entity.text]['latlng'])
token._.set('country_flag', self.countries[entity.text]['flag'])
# Overwrite doc.ents and add the entity; be careful to extend it, not replace it!
doc.ents = list(doc.ents) + [entity]
for span in spans:
# Iterate over all spans and merge them into one token. This is done
# after setting the entities; otherwise, it would cause mismatched
# indices!
span.merge()
return doc # don't forget to return the Doc!
def has_country(self, tokens):
"""Getter for Doc and Span attributes. Returns True if one of the tokens
is a country. Since the getter is only called when we access the
attribute, we can refer to the Token's 'is_country' attribute here,
which is already set in the processing step."""
return any([t._.get('is_country') for t in tokens])
# For simplicity, we start off with only the blank English Language class and
# no model or pre-defined pipeline loaded.
nlp = English()
rest_countries = RESTCountriesComponent(nlp) # initialise component
nlp.add_pipe(rest_countries) # add it to the pipeline
doc = nlp(u"Some text about Colombia and the Czech Republic")
print('Pipeline', nlp.pipe_names) # pipeline contains component name
print('Doc has countries', doc._.has_country) # Doc contains countries
for token in doc:
if token._.is_country:
print(token.text, token._.country_capital, token._.country_latlng,
token._.country_flag) # country data
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all countries are entities
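
Because has_country is registered on Span as well, arbitrary slices of the document can be queried with the same getter; a short illustrative follow-up to the example above:

span = doc[2:5]    # a slice covering "about Colombia and"
print('Span has countries', span._.has_country)   # True if the slice contains a country token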

View File

@ -0,0 +1,85 @@
# coding: utf-8
from __future__ import unicode_literals
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token
class TechCompanyRecognizer(object):
"""Example of a spaCy v2.0 pipeline component that sets entity annotations
based on a list of single- or multiple-word company names. Companies are
labelled as ORG and their spans are merged into one token. Additionally,
._.has_tech_org and ._.is_tech_org are set on the Doc/Span and Token
respectively."""
name = 'tech_companies' # component name, will show up in the pipeline
def __init__(self, nlp, companies=tuple(), label='ORG'):
"""Initialise the pipeline component. The shared nlp instance is used
to initialise the matcher with the shared vocab, get the label ID and
generate Doc objects as phrase match patterns.
"""
self.label = nlp.vocab.strings[label] # get entity label ID
# Set up the PhraseMatcher; it can now take Doc objects as patterns,
# so even if the list of companies is long, it's very efficient
patterns = [nlp(org) for org in companies]
self.matcher = PhraseMatcher(nlp.vocab)
self.matcher.add('TECH_ORGS', None, *patterns)
# Register attribute on the Token. We'll be overwriting this based on
# the matches, so we're only setting a default value, not a getter.
Token.set_extension('is_tech_org', default=False)
# Register attributes on Doc and Span via a getter that checks if one of
# the contained tokens is set to is_tech_org == True.
Doc.set_extension('has_tech_org', getter=self.has_tech_org)
Span.set_extension('has_tech_org', getter=self.has_tech_org)
def __call__(self, doc):
"""Apply the pipeline component on a Doc object and modify it if matches
are found. Return the Doc, so it can be processed by the next component
in the pipeline, if available.
"""
matches = self.matcher(doc)
spans = [] # keep the spans for later so we can merge them afterwards
for _, start, end in matches:
# Generate Span representing the entity & set label
entity = Span(doc, start, end, label=self.label)
spans.append(entity)
# Set custom attribute on each token of the entity
for token in entity:
token._.set('is_tech_org', True)
# Overwrite doc.ents and add the entity; be careful to extend it, not replace it!
doc.ents = list(doc.ents) + [entity]
for span in spans:
# Iterate over all spans and merge them into one token. This is done
# after setting the entities; otherwise, it would cause mismatched
# indices!
span.merge()
return doc # don't forget to return the Doc!
def has_tech_org(self, tokens):
"""Getter for Doc and Span attributes. Returns True if one of the tokens
is a tech org. Since the getter is only called when we access the
attribute, we can refer to the Token's 'is_tech_org' attribute here,
which is already set in the processing step."""
return any([t._.get('is_tech_org') for t in tokens])
# For simplicity, we start off with only the blank English Language class and
# no model or pre-defined pipeline loaded.
nlp = English()
companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc.
component = TechCompanyRecognizer(nlp, companies) # initialise component
nlp.add_pipe(component, last=True) # add it to the pipeline as the last element
doc = nlp(u"Alphabet Inc. is the company behind Google.")
print('Pipeline', nlp.pipe_names) # pipeline contains component name
print('Tokens', [t.text for t in doc]) # company names from the list are merged
print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs
print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org
print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities
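
Since components are now added with nlp.add_pipe, their position in the pipeline can also be set explicitly. A minimal sketch of the placement keywords, using a trivial stand-in component (only one placement argument may be used per call):

def print_length(doc):
    # trivial stand-in component: report the Doc length and pass it on
    print('Doc length:', len(doc))
    return doc

nlp.add_pipe(print_length, name='print_length', first=True)
# other placement options:
# nlp.add_pipe(print_length, before='tech_companies')
# nlp.add_pipe(print_length, after='tech_companies')
print('Pipeline', nlp.pipe_names)   # ['print_length', 'tech_companies']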

View File

@ -6,7 +6,7 @@ To achieve that, it duplicates some of spaCy's internal functionality.
Specifically, in this example, we don't use spaCy's built-in Language class to
wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write
our own simle Pipeline class, so that it's easier to see how the pieces
our own simple Pipeline class, so that it's easier to see how the pieces
interact.
Input data:
@ -142,16 +142,15 @@ def train(nlp, train_examples, dev_examples, nr_epoch=5):
inputs, annots = zip(*batch)
nlp.update(list(inputs), list(annots), sgd, losses=losses)
scores = nlp.evaluate(dev_examples)
report_scores(i, losses['ner'], scores)
scores = nlp.evaluate(dev_examples)
report_scores(channels, i+1, loss, scores)
report_scores(i+1, losses['ner'], scores)
def report_scores(i, loss, scores):
precision = '%.2f' % scores['ents_p']
recall = '%.2f' % scores['ents_r']
f_measure = '%.2f' % scores['ents_f']
print('%d %s %s %s' % (int(loss), precision, recall, f_measure))
print('Epoch %d: %d %s %s %s' % (
i, int(loss), precision, recall, f_measure))
def read_examples(path):

View File

@ -7,7 +7,7 @@ if __name__ == '__main__':
import plac
import sys
from spacy.cli import download, link, info, package, train, convert, model
from spacy.cli import profile, evaluate
from spacy.cli import profile, evaluate, validate
from spacy.util import prints
commands = {
@ -20,6 +20,7 @@ if __name__ == '__main__':
'package': package,
'model': model,
'profile': profile,
'validate': validate
}
if len(sys.argv) == 1:
prints(', '.join(commands), title="Available commands", exits=1)

View File

@ -311,7 +311,7 @@ def link_vectors_to_models(vocab):
def Tok2Vec(width, embed_size, **kwargs):
pretrained_dims = kwargs.get('pretrained_dims', 0)
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
'*': reapply}):

View File

@ -7,3 +7,4 @@ from .train import train
from .evaluate import evaluate
from .convert import convert
from .model import model
from .validate import validate

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import plac
from pathlib import Path
from .converters import conllu2json, iob2json
from .converters import conllu2json, iob2json, conll_ner2json
from ..util import prints
# Converters are matched by file extension. To add a converter, add a new entry
@ -12,9 +12,10 @@ from ..util import prints
# from /converters.
CONVERTERS = {
'.conllu': conllu2json,
'.conll': conllu2json,
'.iob': iob2json,
'conllu': conllu2json,
'conll': conllu2json,
'ner': conll_ner2json,
'iob': iob2json,
}
@ -22,9 +23,11 @@ CONVERTERS = {
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", int),
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
converter='auto'):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
@ -35,9 +38,11 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
prints(input_path, title="Input file not found", exits=1)
if not output_path.exists():
prints(output_path, title="Output directory not found", exits=1)
file_ext = input_path.suffix
if not file_ext in CONVERTERS:
prints("Can't find converter for %s" % input_path.parts[-1],
title="Unknown format", exits=1)
CONVERTERS[file_ext](input_path, output_path,
n_sents=n_sents, use_morphology=morphology)
if converter == 'auto':
converter = input_path.suffix[1:]
if not converter in CONVERTERS:
prints("Can't find converter for %s" % converter,
title="Unknown format", exits=1)
func = CONVERTERS[converter]
func(input_path, output_path,
n_sents=n_sents, use_morphology=morphology)
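
With the new --converter option the input format can be forced instead of being inferred from the file extension; hypothetical invocations (paths are placeholders):

python -m spacy convert /data/eng.train /output --converter ner
python -m spacy convert /data/train.conllu /output        # 'auto' falls back to the file suffix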

View File

@ -1,2 +1,3 @@
from .conllu2json import conllu2json
from .iob2json import iob2json
from .conll_ner2json import conll_ner2json

View File

@ -0,0 +1,50 @@
# coding: utf8
from __future__ import unicode_literals
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
"""
Convert files in the CoNLL-2003 NER format into JSON format for use with the train CLI.
"""
docs = read_conll_ner(input_path)
output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints("Created %d documents" % len(docs),
title="Generated output file %s" % path2str(output_file))
def read_conll_ner(input_path):
text = input_path.open('r', encoding='utf-8').read()
i = 0
delimit_docs = '-DOCSTART- -X- O O'
output_docs = []
for doc in text.strip().split(delimit_docs):
doc = doc.strip()
if not doc:
continue
output_doc = []
for sent in doc.split('\n\n'):
sent = sent.strip()
if not sent:
continue
lines = [line.strip() for line in sent.split('\n') if line.strip()]
words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
biluo_ents = iob_to_biluo(iob_ents)
output_doc.append({'tokens': [
{'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
zip(words, tags, biluo_ents)
]})
output_docs.append({
'id': len(output_docs),
'paragraphs': [{'sentences': output_doc}]
})
output_doc = []
return output_docs
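
For reference, iob_to_biluo (imported above from spacy.gold) rewrites IOB entity tags into the BILUO scheme used for training; a quick illustrative check, with the expected result shown as a comment:

from spacy.gold import iob_to_biluo

tags = iob_to_biluo(['I-ORG', 'I-ORG', 'O', 'I-PER', 'O'])
# expected: ['B-ORG', 'L-ORG', 'O', 'U-PER', 'O']
print(tags)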

View File

@ -44,7 +44,7 @@ numpy.random.seed(0)
version=("Model version", "option", "V", str),
meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
)
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
gold_preproc=False, version="0.0.0", meta_path=None):
"""
@ -68,6 +68,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
if not isinstance(meta, dict):
prints("Expected dict but got: {}".format(type(meta)),
title="Not a valid meta.json format", exits=1)
meta.setdefault('lang', lang)
meta.setdefault('name', 'unnamed')
pipeline = ['tagger', 'parser', 'ner']
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
@ -88,9 +90,13 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
n_train_words = corpus.count_train()
lang_class = util.get_lang_class(lang)
nlp = lang_class(pipeline=pipeline)
nlp = lang_class()
meta['pipeline'] = pipeline
nlp.meta.update(meta)
if vectors:
util.load_model(vectors, vocab=nlp.vocab)
for name in pipeline:
nlp.add_pipe(nlp.create_pipe(name), name=name)
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
nlp._optimizer = None
@ -112,17 +118,33 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
util.set_env_log(False)
epoch_model_path = output_path / ('model%d' % i)
nlp.to_disk(epoch_model_path)
nlp_loaded = lang_class(pipeline=pipeline)
nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
scorer = nlp_loaded.evaluate(
list(corpus.dev_docs(
nlp_loaded = util.load_model_from_path(epoch_model_path)
dev_docs = list(corpus.dev_docs(
nlp_loaded,
gold_preproc=gold_preproc)))
gold_preproc=gold_preproc))
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs)
end_time = timer()
if use_gpu < 0:
gpu_wps = None
cpu_wps = nwords/(end_time-start_time)
else:
gpu_wps = nwords/(end_time-start_time)
with Model.use_device('cpu'):
nlp_loaded = util.load_model_from_path(epoch_model_path)
dev_docs = list(corpus.dev_docs(
nlp_loaded, gold_preproc=gold_preproc))
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs)
end_time = timer()
cpu_wps = nwords/(end_time-start_time)
acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_:
file_.write(json_dumps(scorer.scores))
meta_loc = output_path / ('model%d' % i) / 'meta.json'
meta['accuracy'] = scorer.scores
meta['speed'] = {'nwords': nwords, 'cpu':cpu_wps, 'gpu': gpu_wps}
meta['lang'] = nlp.lang
meta['pipeline'] = pipeline
meta['spacy_version'] = '>=%s' % about.__version__
@ -132,7 +154,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
with meta_loc.open('w') as file_:
file_.write(json_dumps(meta))
util.set_env_log(True)
print_progress(i, losses, scorer.scores)
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
finally:
print("Saving model...")
try:
@ -153,16 +175,17 @@ def _render_parses(i, to_render):
file_.write(html)
def print_progress(itn, losses, dev_scores, wps=0.0):
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
scores = {}
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
'ents_p', 'ents_r', 'ents_f', 'wps']:
'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
scores[col] = 0.0
scores['dep_loss'] = losses.get('parser', 0.0)
scores['ner_loss'] = losses.get('ner', 0.0)
scores['tag_loss'] = losses.get('tagger', 0.0)
scores.update(dev_scores)
scores['wps'] = wps
scores['cpu_wps'] = cpu_wps
scores['gpu_wps'] = gpu_wps or 0.0
tpl = '\t'.join((
'{:d}',
'{dep_loss:.3f}',
@ -173,7 +196,9 @@ def print_progress(itn, losses, dev_scores, wps=0.0):
'{ents_f:.3f}',
'{tags_acc:.3f}',
'{token_acc:.3f}',
'{wps:.1f}'))
'{cpu_wps:.1f}',
'{gpu_wps:.1f}',
))
print(tpl.format(itn, **scores))

spacy/cli/validate.py (new file, 123 lines)
View File

@ -0,0 +1,123 @@
# coding: utf8
from __future__ import unicode_literals
import requests
import pkg_resources
from pathlib import Path
from ..compat import path2str, locale_escape
from ..util import prints, get_data_path, read_json
from .. import about
def validate(cmd):
"""Validate that the currently installed version of spaCy is compatible
with the installed models. Should be run after `pip install -U spacy`.
"""
r = requests.get(about.__compatibility__)
if r.status_code != 200:
prints("Couldn't fetch compatibility table.",
title="Server error (%d)" % r.status_code, exits=1)
compat = r.json()['spacy']
all_models = set()
for spacy_v, models in dict(compat).items():
all_models.update(models.keys())
for model, model_vs in models.items():
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
current_compat = compat[about.__version__]
model_links = get_model_links(current_compat)
model_pkgs = get_model_pkgs(current_compat, all_models)
incompat_links = {l for l, d in model_links.items() if not d['compat']}
incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']}
incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']])
na_models = [m for m in incompat_models if m not in current_compat]
update_models = [m for m in incompat_models if m in current_compat]
prints(path2str(Path(__file__).parent.parent),
title="Installed models (spaCy v{})".format(about.__version__))
if model_links or model_pkgs:
print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
for name, data in model_pkgs.items():
print(get_model_row(current_compat, name, data, 'package'))
for name, data in model_links.items():
print(get_model_row(current_compat, name, data, 'link'))
else:
prints("No models found in your current environment.", exits=0)
if update_models:
cmd = ' python -m spacy download {}'
print("\n Use the following commands to update the model packages:")
print('\n'.join([cmd.format(pkg) for pkg in update_models]))
if na_models:
prints("The following models are not available for spaCy v{}: {}"
.format(about.__version__, ', '.join(na_models)))
if incompat_links:
prints("You may also want to overwrite the incompatible links using "
"the `spacy link` command with `--force`, or remove them from "
"the data directory. Data path: {}"
.format(path2str(get_data_path())))
def get_model_links(compat):
links = {}
data_path = get_data_path()
if data_path:
models = [p for p in data_path.iterdir() if is_model_path(p)]
for model in models:
meta_path = Path(model) / 'meta.json'
if not meta_path.exists():
continue
meta = read_json(meta_path)
link = model.parts[-1]
name = meta['lang'] + '_' + meta['name']
links[link] = {'name': name, 'version': meta['version'],
'compat': is_compat(compat, name, meta['version'])}
return links
def get_model_pkgs(compat, all_models):
pkgs = {}
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
package = pkg_name.replace('-', '_')
if package in all_models:
version = pkg_data.version
pkgs[pkg_name] = {'name': package, 'version': version,
'compat': is_compat(compat, package, version)}
return pkgs
def get_model_row(compat, name, data, type='package'):
tpl_row = ' {:<10}' + (' {:<20}' * 4)
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
if data['compat']:
comp = tpl_green.format(locale_escape('✔', errors='ignore'))
version = tpl_green.format(data['version'])
else:
comp = '--> {}'.format(compat.get(data['name'], ['n/a'])[0])
version = tpl_red.format(data['version'])
return get_row(type, name, data['name'], version, comp)
def get_row(*args):
tpl_row = ' {:<10}' + (' {:<20}' * 4)
return tpl_row.format(*args)
def is_model_path(model_path):
exclude = ['cache', 'pycache', '__pycache__']
name = model_path.parts[-1]
return model_path.is_dir() and name not in exclude and not name.startswith('.')
def is_compat(compat, name, version):
return name in compat and version in compat[name]
def reformat_version(version):
if version.endswith('-alpha'):
return version.replace('-alpha', 'a0')
return version.replace('-alpha', 'a')
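
As the docstring notes, the check is meant to be run right after upgrading spaCy; the intended workflow is roughly:

pip install -U spacy
python -m spacy validate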

View File

@ -6,6 +6,7 @@ import ftfy
import sys
import ujson
import itertools
import locale
from thinc.neural.util import copy_array
@ -113,3 +114,12 @@ def import_file(name, loc):
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module
def locale_escape(string, errors='replace'):
'''
Mangle non-supported characters, for savages with ascii terminals.
'''
encoding = locale.getpreferredencoding()
string = string.encode(encoding, errors).decode('utf8')
return string

View File

@ -213,7 +213,7 @@ class GoldCorpus(object):
train_tuples = self.train_tuples
if projectivize:
train_tuples = nonproj.preprocess_training_data(
self.train_tuples)
self.train_tuples, label_freq_cutoff=100)
random.shuffle(train_tuples)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
max_length=max_length,

View File

@ -16,15 +16,13 @@ from ...util import update_exc
class BengaliDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'bn'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP
stop_words = STOP_WORDS
lemma_rules = LEMMA_RULES
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
class Bengali(Language):

View File

@ -15,9 +15,8 @@ class DanishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'da'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
stop_words = STOP_WORDS
class Danish(Language):

View File

@ -12,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
@ -22,16 +21,12 @@ class GermanDefaults(Language.Defaults):
lex_attr_getters[LANG] = lambda text: 'de'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
NORM_EXCEPTIONS, BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = tuple(TOKENIZER_INFIXES)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
infixes = TOKENIZER_INFIXES
tag_map = TAG_MAP
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
lemma_lookup = LOOKUP
class German(Language):

View File

@ -7,7 +7,7 @@ from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
@ -23,15 +23,15 @@ class EnglishDefaults(Language.Defaults):
lex_attr_getters[LANG] = lambda text: 'en'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
BASE_NORMS, NORM_EXCEPTIONS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
morph_rules = dict(MORPH_RULES)
lemma_rules = dict(LEMMA_RULES)
lemma_index = dict(LEMMA_INDEX)
lemma_exc = dict(LEMMA_EXC)
syntax_iterators = dict(SYNTAX_ITERATORS)
tag_map = TAG_MAP
stop_words = STOP_WORDS
morph_rules = MORPH_RULES
lemma_rules = LEMMA_RULES
lemma_index = LEMMA_INDEX
lemma_exc = LEMMA_EXC
lemma_lookup = LOOKUP
syntax_iterators = SYNTAX_ITERATORS
class English(Language):

View File

@ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
@ -19,15 +18,11 @@ class SpanishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'es'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
tag_map = TAG_MAP
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS
lemma_lookup = LOOKUP
class Spanish(Language):

View File

@ -15,9 +15,8 @@ class FinnishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fi'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
stop_words = STOP_WORDS
class Finnish(Language):

View File

@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
@ -21,17 +20,13 @@ class FrenchDefaults(Language.Defaults):
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: 'fr'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
infixes = tuple(TOKENIZER_INFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
stop_words = STOP_WORDS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
token_match = TOKEN_MATCH
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
syntax_iterators = SYNTAX_ITERATORS
lemma_lookup = LOOKUP
class French(Language):

View File

@ -12,9 +12,8 @@ from ...util import update_exc
class HebrewDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'he'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
stop_words = STOP_WORDS
class Hebrew(Language):

View File

@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
@ -18,17 +17,13 @@ class HungarianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'hu'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
token_match = TOKEN_MATCH
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
lemma_lookup = LOOKUP
class Hungarian(Language):

View File

@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc
@ -19,19 +18,14 @@ from ...util import update_exc
class IndonesianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'id'
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
stop_words = STOP_WORDS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
syntax_iterators = SYNTAX_ITERATORS
lemma_lookup = LOOKUP
class Indonesian(Language):

View File

@ -16,8 +16,7 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
'noniliun', 'desiliun',
]
'noniliun', 'desiliun']
def like_num(text):

View File

@ -7,7 +7,6 @@ from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
@ -16,13 +15,9 @@ class ItalianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'it'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
stop_words = STOP_WORDS
lemma_lookup = LOOKUP
class Italian(Language):

View File

@ -16,9 +16,8 @@ class NorwegianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nb'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
stop_words = STOP_WORDS
class Norwegian(Language):

View File

@ -16,9 +16,8 @@ class DutchDefaults(Language.Defaults):
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: 'nl'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
stop_words = STOP_WORDS
class Dutch(Language):

View File

@ -15,9 +15,8 @@ class PolishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pl'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
stop_words = STOP_WORDS
class Polish(Language):

View File

@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
@ -19,13 +18,9 @@ class PortugueseDefaults(Language.Defaults):
lex_attr_getters[LANG] = lambda text: 'pt'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
stop_words = STOP_WORDS
lemma_lookup = LOOKUP
class Portuguese(Language):

View File

@ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
@ -18,13 +17,10 @@ class SwedishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'sv'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
stop_words = STOP_WORDS
lemma_rules = LEMMA_RULES
lemma_lookup = LOOKUP
class Swedish(Language):

View File

@ -12,24 +12,27 @@ from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class ThaiDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'th'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP
stop_words = STOP_WORDS
class Thai(Language):
lang = 'th'
Defaults = ThaiDefaults
def make_doc(self, text):
try:
from pythainlp.tokenize import word_tokenize
except ImportError:
raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
"https://github.com/wannaphongcom/pythainlp/")
words = [x for x in list(word_tokenize(text,"newmm"))]
return Doc(self.vocab, words=words, spaces=[False]*len(words))
lang = 'th'
Defaults = ThaiDefaults
def make_doc(self, text):
try:
from pythainlp.tokenize import word_tokenize
except ImportError:
raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
"https://github.com/wannaphongcom/pythainlp/")
words = [x for x in list(word_tokenize(text,"newmm"))]
return Doc(self.vocab, words=words, spaces=[False]*len(words))
__all__ = ['Thai']

View File

@ -13,7 +13,6 @@ class MultiLanguageDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'xx'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)

View File

@ -1,12 +1,9 @@
# coding: utf8
from __future__ import absolute_import, unicode_literals
from contextlib import contextmanager
import dill
import numpy
from thinc.neural import Model
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.optimizers import Adam, SGD
from thinc.neural.optimizers import Adam
import random
import ujson
from collections import OrderedDict
@ -17,30 +14,27 @@ from .vocab import Vocab
from .tagger import Tagger
from .lemmatizer import Lemmatizer
from .syntax.parser import get_templates
from .syntax import nonproj
from .pipeline import NeuralDependencyParser, EntityRecognizer
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
from .pipeline import NeuralLabeller
from .pipeline import SimilarityHook
from .pipeline import TextCategorizer
from . import about
from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger
from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer
from .compat import json_dumps, izip
from .scorer import Scorer
from ._ml import link_vectors_to_models
from .attrs import IS_STOP
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
from .lang.lex_attrs import LEX_ATTRS
from . import util
from .scorer import Scorer
from ._ml import link_vectors_to_models
from . import about
class BaseDefaults(object):
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules)
return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules,
cls.lemma_lookup)
@classmethod
def create_vocab(cls, nlp=None):
@ -70,59 +64,7 @@ class BaseDefaults(object):
prefix_search=prefix_search, suffix_search=suffix_search,
infix_finditer=infix_finditer, token_match=token_match)
@classmethod
def create_tagger(cls, nlp=None, **cfg):
if nlp is None:
return NeuralTagger(cls.create_vocab(nlp), **cfg)
else:
return NeuralTagger(nlp.vocab, **cfg)
@classmethod
def create_parser(cls, nlp=None, **cfg):
if nlp is None:
return NeuralDependencyParser(cls.create_vocab(nlp), **cfg)
else:
return NeuralDependencyParser(nlp.vocab, **cfg)
@classmethod
def create_entity(cls, nlp=None, **cfg):
if nlp is None:
return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg)
else:
return NeuralEntityRecognizer(nlp.vocab, **cfg)
@classmethod
def create_pipeline(cls, nlp=None, disable=tuple()):
meta = nlp.meta if nlp is not None else {}
# Resolve strings, like "cnn", "lstm", etc
pipeline = []
for entry in meta.get('pipeline', []):
if entry in disable or getattr(entry, 'name', entry) in disable:
continue
factory = cls.Defaults.factories[entry]
pipeline.append(factory(nlp, **meta.get(entry, {})))
return pipeline
factories = {
'make_doc': create_tokenizer,
'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
'parser': lambda nlp, **cfg: [
NeuralDependencyParser(nlp.vocab, **cfg),
nonproj.deprojectivize],
'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)],
'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)],
# Temporary compatibility -- delete after pivot
'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
'dependencies': lambda nlp, **cfg: [
NeuralDependencyParser(nlp.vocab, **cfg),
nonproj.deprojectivize,
],
'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
}
pipe_names = ['tensorizer', 'tagger', 'parser', 'ner']
token_match = TOKEN_MATCH
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
@ -136,6 +78,7 @@ class BaseDefaults(object):
lemma_rules = {}
lemma_exc = {}
lemma_index = {}
lemma_lookup = {}
morph_rules = {}
lex_attr_getters = LEX_ATTRS
syntax_iterators = {}
@ -152,8 +95,17 @@ class Language(object):
Defaults = BaseDefaults
lang = None
def __init__(self, vocab=True, make_doc=True, pipeline=None,
meta={}, disable=tuple(), **kwargs):
factories = {
'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),
'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg)
}
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
"""Initialise a Language object.
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
@ -179,28 +131,7 @@ class Language(object):
factory = self.Defaults.create_tokenizer
make_doc = factory(self, **meta.get('tokenizer', {}))
self.tokenizer = make_doc
if pipeline is True:
self.pipeline = self.Defaults.create_pipeline(self, disable)
elif pipeline:
# Careful not to do getattr(p, 'name', None) here
# If we had disable=[None], we'd disable everything!
self.pipeline = [p for p in pipeline
if p not in disable
and getattr(p, 'name', p) not in disable]
# Resolve strings, like "cnn", "lstm", etc
for i, entry in enumerate(self.pipeline):
if entry in self.Defaults.factories:
factory = self.Defaults.factories[entry]
self.pipeline[i] = factory(self, **meta.get(entry, {}))
else:
self.pipeline = []
flat_list = []
for pipe in self.pipeline:
if isinstance(pipe, list):
flat_list.extend(pipe)
else:
flat_list.append(pipe)
self.pipeline = flat_list
self.pipeline = []
self._optimizer = None
@property
@ -214,11 +145,7 @@ class Language(object):
self._meta.setdefault('email', '')
self._meta.setdefault('url', '')
self._meta.setdefault('license', '')
pipeline = []
for component in self.pipeline:
if hasattr(component, 'name'):
pipeline.append(component.name)
self._meta['pipeline'] = pipeline
self._meta['pipeline'] = self.pipe_names
return self._meta
@meta.setter
@ -228,34 +155,144 @@ class Language(object):
# Conveniences to access pipeline components
@property
def tensorizer(self):
return self.get_component('tensorizer')
return self.get_pipe('tensorizer')
@property
def tagger(self):
return self.get_component('tagger')
return self.get_pipe('tagger')
@property
def parser(self):
return self.get_component('parser')
return self.get_pipe('parser')
@property
def entity(self):
return self.get_component('ner')
return self.get_pipe('ner')
@property
def matcher(self):
return self.get_component('matcher')
return self.get_pipe('matcher')
def get_component(self, name):
if self.pipeline in (True, None):
return None
for proc in self.pipeline:
if hasattr(proc, 'name') and proc.name.endswith(name):
return proc
return None
@property
def pipe_names(self):
"""Get names of available pipeline components.
RETURNS (list): List of component name strings, in order.
"""
return [pipe_name for pipe_name, _ in self.pipeline]
def get_pipe(self, name):
"""Get a pipeline component for a given component name.
name (unicode): Name of pipeline component to get.
RETURNS (callable): The pipeline component.
"""
for pipe_name, component in self.pipeline:
if pipe_name == name:
return component
msg = "No component '{}' found in pipeline. Available names: {}"
raise KeyError(msg.format(name, self.pipe_names))
def create_pipe(self, name, config=dict()):
"""Create a pipeline component from a factory.
name (unicode): Factory name to look up in `Language.factories`.
config (dict): Configuration parameters to initialise component.
RETURNS (callable): Pipeline component.
"""
if name not in self.factories:
raise KeyError("Can't find factory for '{}'.".format(name))
factory = self.factories[name]
return factory(self, **config)
def add_pipe(self, component, name=None, before=None, after=None,
first=None, last=None):
"""Add a component to the processing pipeline. Valid components are
callables that take a `Doc` object, modify it and return it. Only one of
before, after, first or last can be set. Default behaviour is "last".
component (callable): The pipeline component.
name (unicode): Name of pipeline component. Overwrites existing
component.name attribute if available. If no name is set and
the component exposes no name attribute, component.__name__ is
used. An error is raised if the name already exists in the pipeline.
before (unicode): Component name to insert component directly before.
after (unicode): Component name to insert component directly after.
first (bool): Insert component first / not first in the pipeline.
last (bool): Insert component last / not last in the pipeline.
EXAMPLE:
>>> nlp.add_pipe(component, before='ner')
>>> nlp.add_pipe(component, name='custom_name', last=True)
"""
if name is None:
if hasattr(component, 'name'):
name = component.name
elif hasattr(component, '__name__'):
name = component.__name__
elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'):
name = component.__class__.__name__
else:
name = repr(component)
if name in self.pipe_names:
raise ValueError("'{}' already exists in pipeline.".format(name))
if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
msg = ("Invalid constraints. You can only set one of the "
"following: before, after, first, last.")
raise ValueError(msg)
pipe = (name, component)
if last or not any([first, before, after]):
self.pipeline.append(pipe)
elif first:
self.pipeline.insert(0, pipe)
elif before and before in self.pipe_names:
self.pipeline.insert(self.pipe_names.index(before), pipe)
elif after and after in self.pipe_names:
self.pipeline.insert(self.pipe_names.index(after), pipe)
else:
msg = "Can't find '{}' in pipeline. Available names: {}"
unfound = before or after
raise ValueError(msg.format(unfound, self.pipe_names))
def replace_pipe(self, name, component):
"""Replace a component in the pipeline.
name (unicode): Name of the component to replace.
component (callable): Pipeline component.
"""
if name not in self.pipe_names:
msg = "Can't find '{}' in pipeline. Available names: {}"
raise ValueError(msg.format(name, self.pipe_names))
self.pipeline[self.pipe_names.index(name)] = (name, component)
def rename_pipe(self, old_name, new_name):
"""Rename a pipeline component.
old_name (unicode): Name of the component to rename.
new_name (unicode): New name of the component.
"""
if old_name not in self.pipe_names:
msg = "Can't find '{}' in pipeline. Available names: {}"
raise ValueError(msg.format(old_name, self.pipe_names))
if new_name in self.pipe_names:
msg = "'{}' already exists in pipeline. Existing names: {}"
raise ValueError(msg.format(new_name, self.pipe_names))
i = self.pipe_names.index(old_name)
self.pipeline[i] = (new_name, self.pipeline[i][1])
def remove_pipe(self, name):
"""Remove a component from the pipeline.
name (unicode): Name of the component to remove.
RETURNS (tuple): A `(name, component)` tuple of the removed component.
"""
if name not in self.pipe_names:
msg = "Can't find '{}' in pipeline. Available names: {}"
raise ValueError(msg.format(name, self.pipe_names))
return self.pipeline.pop(self.pipe_names.index(name))
def __call__(self, text, disable=[]):
"""'Apply the pipeline to some text. The text can span multiple sentences,
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
@ -269,8 +306,7 @@ class Language(object):
('An', 'NN')
"""
doc = self.make_doc(text)
for proc in self.pipeline:
name = getattr(proc, 'name', None)
for name, proc in self.pipeline:
if name in disable:
continue
doc = proc(doc)
@ -308,7 +344,7 @@ class Language(object):
grads[key] = (W, dW)
pipes = list(self.pipeline)
random.shuffle(pipes)
for proc in pipes:
for name, proc in pipes:
if not hasattr(proc, 'update'):
continue
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
@ -322,7 +358,7 @@ class Language(object):
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
"""
for proc in self.pipeline:
for name, proc in self.pipeline:
if hasattr(proc, 'preprocess_gold'):
docs_golds = proc.preprocess_gold(docs_golds)
for doc, gold in docs_golds:
@ -354,7 +390,7 @@ class Language(object):
get_gold_tuples (function): Function returning gold data
**cfg: Config parameters.
returns: An optimizer
RETURNS: An optimizer
"""
# Populate vocab
if get_gold_tuples is not None:
@ -371,7 +407,7 @@ class Language(object):
else:
device = None
link_vectors_to_models(self.vocab)
for proc in self.pipeline:
for name, proc in self.pipeline:
if hasattr(proc, 'begin_training'):
context = proc.begin_training(get_gold_tuples(),
pipeline=self.pipeline)
@ -393,7 +429,7 @@ class Language(object):
docs, golds = zip(*docs_golds)
docs = list(docs)
golds = list(golds)
for pipe in self.pipeline:
for name, pipe in self.pipeline:
if not hasattr(pipe, 'pipe'):
for doc in docs:
pipe(doc)
@ -419,7 +455,7 @@ class Language(object):
>>> with nlp.use_params(optimizer.averages):
>>> nlp.to_disk('/tmp/checkpoint')
"""
contexts = [pipe.use_params(params) for pipe
contexts = [pipe.use_params(params) for name, pipe
in self.pipeline if hasattr(pipe, 'use_params')]
# TODO: Having trouble with contextlib
# Workaround: these aren't actually context managers atm.
@ -466,8 +502,7 @@ class Language(object):
yield (doc, context)
return
docs = (self.make_doc(text) for text in texts)
for proc in self.pipeline:
name = getattr(proc, 'name', None)
for name, proc in self.pipeline:
if name in disable:
continue
if hasattr(proc, 'pipe'):
@ -495,14 +530,14 @@ class Language(object):
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
))
for proc in self.pipeline:
for name, proc in self.pipeline:
if not hasattr(proc, 'name'):
continue
if proc.name in disable:
if name in disable:
continue
if not hasattr(proc, 'to_disk'):
continue
serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
serializers['vocab'] = lambda p: self.vocab.to_disk(p)
util.to_disk(path, serializers, {p: False for p in disable})
@ -526,14 +561,12 @@ class Language(object):
('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
))
for proc in self.pipeline:
if not hasattr(proc, 'name'):
continue
if proc.name in disable:
for name, proc in self.pipeline:
if name in disable:
continue
if not hasattr(proc, 'to_disk'):
continue
deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
exclude = {p: False for p in disable}
if not (path / 'vocab').exists():
exclude['vocab'] = True
@ -552,8 +585,8 @@ class Language(object):
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
('meta', lambda: ujson.dumps(self.meta))
))
for i, proc in enumerate(self.pipeline):
if getattr(proc, 'name', None) in disable:
for i, (name, proc) in enumerate(self.pipeline):
if name in disable:
continue
if not hasattr(proc, 'to_bytes'):
continue
@ -572,8 +605,8 @@ class Language(object):
('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
('meta', lambda b: self.meta.update(ujson.loads(b)))
))
for i, proc in enumerate(self.pipeline):
if getattr(proc, 'name', None) in disable:
for i, (name, proc) in enumerate(self.pipeline):
if name in disable:
continue
if not hasattr(proc, 'from_bytes'):
continue
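
Taken together, the pipeline is now a list of (name, component) tuples managed through the methods above; a minimal usage sketch of the new API, assuming a blank English pipeline and the built-in 'tagger' and 'ner' factories listed in Language.factories:

from spacy.lang.en import English

nlp = English()
nlp.add_pipe(nlp.create_pipe('tagger'), name='tagger')      # look up a factory and append the component
nlp.add_pipe(nlp.create_pipe('ner'), name='ner', last=True)
print(nlp.pipe_names)                                        # ['tagger', 'ner']
tagger = nlp.get_pipe('tagger')                              # fetch a component by name
nlp.rename_pipe('ner', 'entity_recognizer')
removed = nlp.remove_pipe('tagger')                          # returns the (name, component) tuple
print(nlp.pipe_names)                                        # ['entity_recognizer']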

View File

@ -10,20 +10,23 @@ class Lemmatizer(object):
def load(cls, path, index=None, exc=None, rules=None):
return cls(index or {}, exc or {}, rules or {})
def __init__(self, index, exceptions, rules):
self.index = index
self.exc = exceptions
self.rules = rules
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
self.index = index if index is not None else {}
self.exc = exceptions if exceptions is not None else {}
self.rules = rules if rules is not None else {}
self.lookup_table = lookup if lookup is not None else {}
def __call__(self, string, univ_pos, morphology=None):
if univ_pos == NOUN:
if univ_pos in (NOUN, 'NOUN', 'noun'):
univ_pos = 'noun'
elif univ_pos == VERB:
elif univ_pos in (VERB, 'VERB', 'verb'):
univ_pos = 'verb'
elif univ_pos == ADJ:
elif univ_pos in (ADJ, 'ADJ', 'adj'):
univ_pos = 'adj'
elif univ_pos == PUNCT:
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
univ_pos = 'punct'
else:
return set([string.lower()])
# See Issue #435 for an example of where this logic is required.
if self.is_base_form(univ_pos, morphology):
return set([string.lower()])
@ -77,6 +80,11 @@ class Lemmatizer(object):
def punct(self, string, morphology=None):
return self(string, 'punct', morphology)
def lookup(self, string):
if string in self.lookup_table:
return self.lookup_table[string]
return string
def lemmatize(string, index, exceptions, rules):
string = string.lower()
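
The new lookup table gives the lemmatizer a fallback that needs no POS information; a small sketch with an invented table (the words below are illustrative, not from spaCy's data):

from spacy.lemmatizer import Lemmatizer

lemmatizer = Lemmatizer(lookup={'feet': 'foot', 'geese': 'goose'})
print(lemmatizer.lookup('feet'))    # 'foot' -- found in the table
print(lemmatizer.lookup('dogs'))    # 'dogs' -- unknown strings are returned unchanged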

View File

@ -1,19 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from .lemmatizer import Lemmatizer
class Lemmatizer(Lemmatizer):
@classmethod
def load(cls, path, lookup):
return cls(lookup or {})
def __init__(self, lookup):
self.lookup = lookup
def __call__(self, string, univ_pos, morphology=None):
try:
return set([self.lookup[string]])
except:
return set([string])

View File

@ -35,6 +35,8 @@ cdef class Morphology:
cdef RichTagC* rich_tags
cdef PreshMapArray _cache
cdef int assign_untagged(self, TokenC* token) except -1
cdef int assign_tag(self, TokenC* token, tag) except -1
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1

View File

@ -42,7 +42,7 @@ cdef class Morphology:
self.tag_names = tuple(sorted(tag_map.keys()))
self.reverse_index = {}
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
self.tag_map[tag_str] = dict(attrs)
attrs = _normalize_props(attrs)
@ -52,6 +52,10 @@ cdef class Morphology:
self.rich_tags[i].morph = 0
self.rich_tags[i].pos = attrs[POS]
self.reverse_index[self.rich_tags[i].name] = i
# Add a 'null' tag, which we can reference when assigning morphology to
# untagged tokens.
self.rich_tags[self.n_tags].id = self.n_tags
self._cache = PreshMapArray(self.n_tags)
self.exc = {}
if exc is not None:
@ -62,6 +66,15 @@ cdef class Morphology:
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
self.exc), None, None)
cdef int assign_untagged(self, TokenC* token) except -1:
"""Set morphological attributes on a token without a POS tag. Uses
the lemmatizer's lookup() method, which looks up the string in the
table provided by the language data as lemma_lookup (if available)."""
if token.lemma == 0:
orth_str = self.strings[token.lex.orth]
lemma = self.lemmatizer.lookup(orth_str)
token.lemma = self.strings.add(lemma)
cdef int assign_tag(self, TokenC* token, tag) except -1:
if isinstance(tag, basestring):
tag = self.strings.add(tag)
@ -72,7 +85,7 @@ cdef class Morphology:
token.tag = tag
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id >= self.n_tags:
if tag_id > self.n_tags:
raise ValueError("Unknown tag ID: %s" % tag_id)
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
# is that this is where the specific word and the tag interact. Still,
@ -151,8 +164,6 @@ cdef class Morphology:
cdef unicode py_string = self.strings[orth]
if self.lemmatizer is None:
return self.strings.add(py_string.lower())
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
return self.strings.add(py_string.lower())
cdef set lemma_strings
cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)

View File

@ -28,6 +28,7 @@ from thinc.neural._classes.difference import Siamese, CauchySimilarity
from .tokens.doc cimport Doc
from .syntax.parser cimport Parser as LinearParser
from .syntax.nn_parser cimport Parser as NeuralParser
from .syntax import nonproj
from .syntax.parser import get_templates as get_feature_templates
from .syntax.beam_parser cimport BeamParser
from .syntax.ner cimport BiluoPushDown
@ -157,11 +158,13 @@ class BaseThincComponent(object):
def to_bytes(self, **exclude):
"""Serialize the pipe to a bytestring."""
serialize = OrderedDict((
('cfg', lambda: json_dumps(self.cfg)),
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes())
))
serialize = OrderedDict()
serialize['cfg'] = lambda: json_dumps(self.cfg)
if self.model in (True, False, None):
serialize['model'] = lambda: self.model
else:
serialize['model'] = self.model.to_bytes
serialize['vocab'] = self.vocab.to_bytes
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
@ -182,11 +185,11 @@ class BaseThincComponent(object):
def to_disk(self, path, **exclude):
"""Serialize the pipe to disk."""
serialize = OrderedDict((
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
('vocab', lambda p: self.vocab.to_disk(p)),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
))
serialize = OrderedDict()
serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg))
serialize['vocab'] = lambda p: self.vocab.to_disk(p)
if self.model not in (None, True, False):
serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
@ -437,13 +440,16 @@ class NeuralTagger(BaseThincComponent):
yield
def to_bytes(self, **exclude):
serialize = OrderedDict((
('model', lambda: self.model.to_bytes()),
('vocab', lambda: self.vocab.to_bytes()),
('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map,
use_bin_type=True,
encoding='utf8'))
))
serialize = OrderedDict()
if self.model in (None, True, False):
serialize['model'] = lambda: self.model
else:
serialize['model'] = self.model.to_bytes
serialize['vocab'] = self.vocab.to_bytes
serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map,
use_bin_type=True,
encoding='utf8')
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude):
@ -778,11 +784,19 @@ cdef class DependencyParser(LinearParser):
if isinstance(label, basestring):
label = self.vocab.strings[label]
@property
def postprocesses(self):
return [nonproj.deprojectivize]
cdef class NeuralDependencyParser(NeuralParser):
name = 'parser'
TransitionSystem = ArcEager
@property
def postprocesses(self):
return [nonproj.deprojectivize]
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
for target in []:
labeller = NeuralLabeller(self.vocab, target=target)
@ -823,6 +837,11 @@ cdef class BeamDependencyParser(BeamParser):
if isinstance(label, basestring):
label = self.vocab.strings[label]
@property
def postprocesses(self):
return [nonproj.deprojectivize]
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser',
'BeamEntityRecognizer', 'TokenVectorEnoder']

View File

@ -241,8 +241,8 @@ cdef class Parser:
def Model(cls, nr_class, **cfg):
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 1))
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
@ -779,6 +779,14 @@ cdef class Parser:
for i in range(doc.length):
doc.c[i] = state.c._sent[i]
self.moves.finalize_doc(doc)
for hook in self.postprocesses:
for doc in docs:
hook(doc)
@property
def postprocesses(self):
# Available for subclasses, e.g. to deprojectivize
return []
def add_label(self, label):
resized = False
@ -792,16 +800,25 @@ cdef class Parser:
if self.model not in (True, False, None) and resized:
# Weights are stored in (nr_out, nr_in) format, so we're basically
# just adding rows here.
smaller = self.model[-1]._layers[-1]
larger = Affine(self.moves.n_moves, smaller.nI)
copy_array(larger.W[:smaller.nO], smaller.W)
copy_array(larger.b[:smaller.nO], smaller.b)
self.model[-1]._layers[-1] = larger
if self.model[-1].is_noop:
smaller = self.model[1]
dims = dict(self.model[1]._dims)
dims['nO'] = self.moves.n_moves
larger = self.model[1].__class__(**dims)
copy_array(larger.W[:, :smaller.nO], smaller.W)
copy_array(larger.b[:smaller.nO], smaller.b)
self.model = (self.model[0], larger, self.model[2])
else:
smaller = self.model[-1]._layers[-1]
larger = Affine(self.moves.n_moves, smaller.nI)
copy_array(larger.W[:smaller.nO], smaller.W)
copy_array(larger.b[:smaller.nO], smaller.b)
self.model[-1]._layers[-1] = larger
def begin_training(self, gold_tuples, pipeline=None, **cfg):
if 'model' in cfg:
self.model = cfg['model']
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
actions = self.moves.get_actions(gold_parses=gold_tuples)
for action, labels in actions.items():
for label in labels:

View File

@ -58,8 +58,9 @@ def en_vocab():
@pytest.fixture
def en_parser():
return util.get_lang_class('en').Defaults.create_parser()
def en_parser(en_vocab):
nlp = util.get_lang_class('en')(en_vocab)
return nlp.create_pipe('parser')
@pytest.fixture

View File

@ -0,0 +1,37 @@
'''Test Doc sets up tokens correctly.'''
from __future__ import unicode_literals
import pytest
from ...vocab import Vocab
from ...tokens.doc import Doc
from ...lemmatizer import Lemmatizer
@pytest.fixture
def lemmatizer():
return Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})
@pytest.fixture
def vocab(lemmatizer):
return Vocab(lemmatizer=lemmatizer)
def test_empty_doc(vocab):
doc = Doc(vocab)
assert len(doc) == 0
def test_single_word(vocab):
doc = Doc(vocab, words=['a'])
assert doc.text == 'a '
doc = Doc(vocab, words=['a'], spaces=[False])
assert doc.text == 'a'
def test_lookup_lemmatization(vocab):
doc = Doc(vocab, words=['dogs', 'dogses'])
assert doc[0].text == 'dogs'
assert doc[0].lemma_ == 'dog'
assert doc[1].text == 'dogses'
assert doc[1].lemma_ == 'dogses'

View File

@ -0,0 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('string,lemma', [('Abgehängten', 'Abgehängte'),
('engagierte', 'engagieren'),
('schließt', 'schließen'),
('vorgebenden', 'vorgebend')])
def test_lemmatizer_lookup_assigns(de_tokenizer, string, lemma):
tokens = de_tokenizer(string)
assert tokens[0].lemma_ == lemma

View File

@ -57,6 +57,5 @@ def test_en_lemmatizer_punct(en_lemmatizer):
def test_en_lemmatizer_lemma_assignment(EN):
text = "Bananas in pyjamas are geese."
doc = EN.make_doc(text)
assert all(t.lemma_ == '' for t in doc)
EN.tagger(doc)
assert all(t.lemma_ != '' for t in doc)

View File

@ -22,14 +22,14 @@ def vocab():
@pytest.fixture
def parser(vocab):
parser = NeuralDependencyParser(vocab)
parser.cfg['token_vector_width'] = 4
parser.cfg['hidden_width'] = 6
parser.cfg['token_vector_width'] = 8
parser.cfg['hidden_width'] = 30
parser.cfg['hist_size'] = 0
parser.add_label('left')
parser.begin_training([], **parser.cfg)
sgd = Adam(NumpyOps(), 0.001)
for i in range(30):
for i in range(10):
losses = {}
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
gold = GoldParse(doc, heads=[1, 1, 3, 3],
@ -37,6 +37,8 @@ def parser(vocab):
parser.update([doc], [gold], sgd=sgd, losses=losses)
return parser
def test_init_parser(parser):
pass
def test_add_label(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])

View File

@ -1,10 +1,11 @@
import spacy
# coding: utf8
from __future__ import unicode_literals
import pytest
@pytest.mark.models
def test_beam_parse():
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Australia is a country', disable=['ner'])
ents = nlp.entity(doc, beam_width=2)
print(ents)
@pytest.mark.models('en')
def test_beam_parse(EN):
doc = EN(u'Australia is a country', disable=['ner'])
ents = EN.entity(doc, beam_width=2)
print(ents)

View File

@ -35,7 +35,7 @@ def parser(vocab):
def test_no_sentences(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = parser(doc)
assert len(list(doc.sents)) == 2
assert len(list(doc.sents)) >= 1
def test_sents_1(parser):
@ -64,7 +64,7 @@ def test_sents_1_3(parser):
doc[1].sent_start = True
doc[3].sent_start = True
doc = parser(doc)
assert len(list(doc.sents)) == 4
assert len(list(doc.sents)) >= 3
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc[1].sent_start = True
doc[2].sent_start = False

View File

View File

@ -0,0 +1,84 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from ...language import Language
@pytest.fixture
def nlp():
return Language()
def new_pipe(doc):
return doc
def test_add_pipe_no_name(nlp):
nlp.add_pipe(new_pipe)
assert 'new_pipe' in nlp.pipe_names
def test_add_pipe_duplicate_name(nlp):
nlp.add_pipe(new_pipe, name='duplicate_name')
with pytest.raises(ValueError):
nlp.add_pipe(new_pipe, name='duplicate_name')
@pytest.mark.parametrize('name', ['parser'])
def test_add_pipe_first(nlp, name):
nlp.add_pipe(new_pipe, name=name, first=True)
assert nlp.pipeline[0][0] == name
@pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')])
def test_add_pipe_last(nlp, name1, name2):
nlp.add_pipe(lambda doc: doc, name=name2)
nlp.add_pipe(new_pipe, name=name1, last=True)
assert nlp.pipeline[0][0] != name1
assert nlp.pipeline[-1][0] == name1
def test_cant_add_pipe_first_and_last(nlp):
with pytest.raises(ValueError):
nlp.add_pipe(new_pipe, first=True, last=True)
@pytest.mark.parametrize('name', ['my_component'])
def test_get_pipe(nlp, name):
with pytest.raises(KeyError):
nlp.get_pipe(name)
nlp.add_pipe(new_pipe, name=name)
assert nlp.get_pipe(name) == new_pipe
@pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)])
def test_replace_pipe(nlp, name, replacement):
with pytest.raises(ValueError):
nlp.replace_pipe(name, new_pipe)
nlp.add_pipe(new_pipe, name=name)
nlp.replace_pipe(name, replacement)
assert nlp.get_pipe(name) != new_pipe
assert nlp.get_pipe(name) == replacement
@pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')])
def test_rename_pipe(nlp, old_name, new_name):
with pytest.raises(ValueError):
nlp.rename_pipe(old_name, new_name)
nlp.add_pipe(new_pipe, name=old_name)
nlp.rename_pipe(old_name, new_name)
assert nlp.pipeline[0][0] == new_name
@pytest.mark.parametrize('name', ['my_component'])
def test_remove_pipe(nlp, name):
with pytest.raises(ValueError):
nlp.remove_pipe(name)
nlp.add_pipe(new_pipe, name=name)
assert len(nlp.pipeline) == 1
removed_name, removed_component = nlp.remove_pipe(name)
assert not len(nlp.pipeline)
assert removed_name == name
assert removed_component == new_pipe

View File

@ -7,6 +7,7 @@ from ..util import get_doc
import pytest
@pytest.mark.xfail
def test_issue589():
vocab = Vocab()
vocab.strings.set_frozen(True)

View File

@ -0,0 +1,9 @@
import spacy
import spacy.lang.en
from spacy.pipeline import TextCategorizer
def test_bytes_serialize_issue_1105():
nlp = spacy.lang.en.English()
tokenizer = nlp.tokenizer
textcat = TextCategorizer(tokenizer.vocab, labels=['ENTITY', 'ACTION', 'MODIFIER'])
textcat_bytes = textcat.to_bytes()

View File

@ -0,0 +1,53 @@
from mock import Mock
from ..tokens.underscore import Underscore
def test_create_doc_underscore():
doc = Mock()
doc.doc = doc
uscore = Underscore(Underscore.doc_extensions, doc)
assert uscore._doc is doc
assert uscore._start is None
assert uscore._end is None
def test_doc_underscore_getattr_setattr():
doc = Mock()
doc.doc = doc
doc.user_data = {}
Underscore.doc_extensions['hello'] = (False, None, None, None)
doc._ = Underscore(Underscore.doc_extensions, doc)
assert doc._.hello == False
doc._.hello = True
assert doc._.hello == True
def test_create_span_underscore():
span = Mock(doc=Mock(), start=0, end=2)
uscore = Underscore(Underscore.span_extensions, span,
start=span.start, end=span.end)
assert uscore._doc is span.doc
assert uscore._start is span.start
assert uscore._end is span.end
def test_span_underscore_getter_setter():
span = Mock(doc=Mock(), start=0, end=2)
Underscore.span_extensions['hello'] = (None, None,
lambda s: (s.start, 'hi'),
lambda s, value: setattr(s, 'start',
value))
span._ = Underscore(Underscore.span_extensions, span,
start=span.start, end=span.end)
assert span._.hello == (0, 'hi')
span._.hello = 1
assert span._.hello == (1, 'hi')
def test_token_underscore_method():
token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: 'cheese')
Underscore.token_extensions['hello'] = (None, token.say_cheese,
None, None)
token._ = Underscore(Underscore.token_extensions, token, start=token.idx)
assert token._.hello() == 'cheese'

View File

@ -30,7 +30,7 @@ from ..util import normalize_slice
from ..compat import is_config
from .. import about
from .. import util
from .underscore import Underscore
DEF PADDING = 5
@ -64,6 +64,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
else:
return Lexeme.get_struct_attr(token.lex, feat_name)
def _get_chunker(lang):
try:
cls = util.get_lang_class(lang)
@ -73,6 +74,7 @@ def _get_chunker(lang):
return None
return cls.Defaults.syntax_iterators.get(u'noun_chunks')
cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary strings.
@ -87,6 +89,21 @@ cdef class Doc:
>>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
"""
@classmethod
def set_extension(cls, name, default=None, method=None,
getter=None, setter=None):
nr_defined = sum(t is not None for t in (default, getter, setter, method))
assert nr_defined == 1
Underscore.doc_extensions[name] = (default, method, getter, setter)
@classmethod
def get_extension(cls, name):
return Underscore.doc_extensions.get(name)
@classmethod
def has_extension(cls, name):
return name in Underscore.doc_extensions
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
"""Create a Doc object.
@ -159,6 +176,10 @@ cdef class Doc:
self.is_tagged = True
self.is_parsed = True
@property
def _(self):
return Underscore(Underscore.doc_extensions, self)
def __getitem__(self, object i):
"""Get a `Token` or `Span` object.
@ -512,6 +533,8 @@ cdef class Doc:
assert t.lex.orth != 0
t.spacy = has_space
self.length += 1
# Set morphological attributes, e.g. by lemma, if possible
self.vocab.morphology.assign_untagged(t)
self._py_tokens.append(None)
return t.idx + t.lex.length + t.spacy

View File

@ -17,10 +17,24 @@ from ..attrs cimport IS_PUNCT, IS_SPACE
from ..lexeme cimport Lexeme
from ..compat import is_config
from .. import about
from .underscore import Underscore
cdef class Span:
"""A slice from a Doc object."""
@classmethod
def set_extension(cls, name, default=None, method=None,
getter=None, setter=None):
Underscore.span_extensions[name] = (default, method, getter, setter)
@classmethod
def get_extension(cls, name):
return Underscore.span_extensions.get(name)
@classmethod
def has_extension(cls, name):
return name in Underscore.span_extensions
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
vector_norm=None):
"""Create a `Span` object from the slice `doc[start : end]`.
@ -111,10 +125,14 @@ cdef class Span:
for i in range(self.start, self.end):
yield self.doc[i]
@property
def _(self):
return Underscore(Underscore.span_extensions, self,
start=self.start_char, end=self.end_char)
def as_doc(self):
'''Create a Doc object view of the Span's data.
This is mostly useful for C-typed interfaces.
This is mostly useful for C-typed interfaces.
'''
cdef Doc doc = Doc(self.doc.vocab)
doc.length = self.end-self.start

View File

@ -20,10 +20,24 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
from ..attrs cimport LEMMA, POS, TAG, DEP
from ..compat import is_config
from .. import about
from .underscore import Underscore
cdef class Token:
"""An individual token i.e. a word, punctuation symbol, whitespace, etc."""
@classmethod
def set_extension(cls, name, default=None, method=None,
getter=None, setter=None):
Underscore.token_extensions[name] = (default, method, getter, setter)
@classmethod
def get_extension(cls, name):
return Underscore.token_extensions.get(name)
@classmethod
def has_extension(cls, name):
return name in Underscore.token_extensions
def __cinit__(self, Vocab vocab, Doc doc, int offset):
"""Construct a `Token` object.
@ -87,6 +101,11 @@ cdef class Token:
else:
raise ValueError(op)
@property
def _(self):
return Underscore(Underscore.token_extensions, self,
start=self.idx, end=None)
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
"""Check the value of a boolean flag.
@ -266,7 +285,7 @@ cdef class Token:
def __get__(self):
if 'vector_norm' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['vector_norm'](self)
vector = self.vector
vector = self.vector
return numpy.sqrt((vector ** 2).sum())
property n_lefts:

View File

@ -0,0 +1,50 @@
import functools
class Underscore(object):
doc_extensions = {}
span_extensions = {}
token_extensions = {}
def __init__(self, extensions, obj, start=None, end=None):
object.__setattr__(self, '_extensions', extensions)
object.__setattr__(self, '_obj', obj)
# Assumption is that for doc values, _start and _end will both be None
# Span will set non-None values for _start and _end
# Token will have _start be non-None, _end be None
# This lets us key everything into the doc.user_data dictionary,
# (see _get_key), and lets us use a single Underscore class.
object.__setattr__(self, '_doc', obj.doc)
object.__setattr__(self, '_start', start)
object.__setattr__(self, '_end', end)
def __getattr__(self, name):
if name not in self._extensions:
raise AttributeError(name)
default, method, getter, setter = self._extensions[name]
if getter is not None:
return getter(self._obj)
elif method is not None:
return functools.partial(method, self._obj)
else:
return self._doc.user_data.get(self._get_key(name), default)
def __setattr__(self, name, value):
if name not in self._extensions:
raise AttributeError(name)
default, method, getter, setter = self._extensions[name]
if setter is not None:
return setter(self._obj, value)
else:
self._doc.user_data[self._get_key(name)] = value
def set(self, name, value):
return self.__setattr__(name, value)
def get(self, name):
return self.__getattr__(name)
def has(self, name):
return name in self._extensions
def _get_key(self, name):
return ('._.', name, self._start, self._end)
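To make the lookup order above concrete, a standalone sketch using a minimal stand-in object (spaCy's Doc, Span and Token provide .doc and .user_data themselves; the stand-in is only for illustration):

from spacy.tokens.underscore import Underscore

class FakeDoc(object):
    def __init__(self):
        self.doc = self          # the Underscore proxy resolves obj.doc
        self.user_data = {}      # attribute values are stored here

extensions = {
    'greeting': ('hello', None, None, None),         # default only
    'n_tokens': (None, None, lambda obj: 3, None),   # getter only
}
obj = FakeDoc()
underscore = Underscore(extensions, obj)
assert underscore.greeting == 'hello'   # falls back to the default
underscore.greeting = 'hi'              # stored under ('._.', 'greeting', None, None)
assert underscore.greeting == 'hi'
assert underscore.n_tokens == 3         # getter takes precedence over user_data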

View File

@ -135,7 +135,18 @@ def load_model_from_path(model_path, meta=False, **overrides):
if not meta:
meta = get_model_meta(model_path)
cls = get_lang_class(meta['lang'])
nlp = cls(pipeline=meta.get('pipeline', True), meta=meta, **overrides)
nlp = cls(meta=meta, **overrides)
pipeline = meta.get('pipeline', [])
disable = overrides.get('disable', [])
if pipeline is True:
pipeline = nlp.Defaults.pipe_names
elif pipeline in (False, None):
pipeline = []
for name in pipeline:
if name not in disable:
config = meta.get('pipeline_args', {}).get(name, {})
component = nlp.create_pipe(name, config=config)
nlp.add_pipe(component, name=name)
return nlp.from_disk(model_path)
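To illustrate what the loader now expects, a hypothetical meta.json expressed as a Python dict (the component names are placeholders; 'pipeline_args' is the optional per-component config read above):

meta = {
    'lang': 'en',
    'pipeline': ['tagger', 'parser', 'ner'],
    'pipeline_args': {'parser': {}},   # optional config passed to create_pipe()
}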

View File

@ -149,6 +149,10 @@ mixin code(label, language, prompt, height, icon, wrap)
//- Code blocks to display old/new versions
mixin code-wrapper()
span.u-inline-block.u-padding-top.u-width-full
block
mixin code-old()
+code(false, false, false, false, "reject").o-block-small
block

View File

@ -113,6 +113,22 @@ p
+cell flag
+cell Show help message and available arguments.
+h(3, "validate") Validate
+tag-new(2)
p
| Find all models installed in the current environment (both packages and
| shortcut links) and check whether they are compatible with the currently
| installed version of spaCy. Should be run after upgrading spaCy via
| #[code pip install -U spacy] to ensure that all installed models are
| can be used with the new version. The command is also useful to detect
| out-of-sync model links resulting from links created in different virtual
| environments. Prints a list of models, the installed versions, the latest
| compatible version (if out of date) and the commands for updating.
+code(false, "bash", "$").
spacy validate
+h(3, "convert") Convert
p

View File

@ -43,6 +43,20 @@ p
+cell #[code Language]
+cell A #[code Language] object with the loaded model.
p
| Essentially, #[code spacy.load()] is a convenience wrapper that reads
| the language ID and pipeline components from a model's #[code meta.json],
| initialises the #[code Language] class, loads in the model data and
| returns it.
+code("Abstract example").
cls = util.get_lang_class(lang) # get language for ID, e.g. 'en'
nlp = cls() # initialise the language
for name in pipeline:
component = nlp.create_pipe(name) # create each pipeline component
nlp.add_pipe(component) # add component to pipeline
nlp.from_disk(model_data_path) # load in model data
+infobox("Deprecation note", "⚠️")
.o-block
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
@ -141,37 +155,3 @@ p
+cell returns
+cell unicode
+cell The explanation, or #[code None] if not found in the glossary.
+h(3, "spacy.set_factory") spacy.set_factory
+tag function
+tag-new(2)
p
| Set a factory that returns a custom
| #[+a("/usage/processing-pipelines") processing pipeline]
| component. Factories are useful for creating stateful components, especially ones which depend on shared data.
+aside-code("Example").
def my_factory(vocab):
def my_component(doc):
return doc
return my_component
spacy.set_factory('my_factory', my_factory)
nlp = Language(pipeline=['my_factory'])
+table(["Name", "Type", "Description"])
+row
+cell #[code factory_id]
+cell unicode
+cell
| Unique name of factory. If added to a new pipeline, spaCy will
| look up the factory for this ID and use it to create the
| component.
+row
+cell #[code factory]
+cell callable
+cell
| Callable that takes a #[code Vocab] object and returns a pipeline
| component.

View File

@ -138,6 +138,109 @@ p Get the number of tokens in the document.
+cell int
+cell The number of tokens in the document.
+h(2, "set_extension") Doc.set_extension
+tag classmethod
+tag-new(2)
p
| Define a custom attribute on the #[code Doc] which becomes available via
| #[code Doc._]. For details, see the documentation on
| #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
+aside-code("Example").
from spacy.tokens import Doc
city_getter = lambda doc: doc.text in ('New York', 'Paris', 'Berlin')
Doc.set_extension('has_city', getter=city_getter)
doc = nlp(u'I like New York')
assert doc._.has_city
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell
| Name of the attribute to set by the extension. For example,
| #[code 'my_attr'] will be available as #[code doc._.my_attr].
+row
+cell #[code default]
+cell -
+cell
| Optional default value of the attribute if no getter or method
| is defined.
+row
+cell #[code method]
+cell callable
+cell
| Set a custom method on the object, for example
| #[code doc._.compare(other_doc)].
+row
+cell #[code getter]
+cell callable
+cell
| Getter function that takes the object and returns an attribute
| value. Is called when the user accesses the #[code ._] attribute.
+row
+cell #[code setter]
+cell callable
+cell
| Setter function that takes the #[code Doc] and a value, and
| modifies the object. Is called when the user writes to the
| #[code Doc._] attribute.
+h(2, "get_extension") Doc.get_extension
+tag classmethod
+tag-new(2)
p
| Look up a previously registered extension by name. Returns a 4-tuple
| #[code.u-break (default, method, getter, setter)] if the extension is
| registered. Raises a #[code KeyError] otherwise.
+aside-code("Example").
from spacy.tokens import Doc
Doc.set_extension('is_city', default=False)
extension = Doc.get_extension('is_city')
assert extension == (False, None, None, None)
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the extension.
+row("foot")
+cell returns
+cell tuple
+cell
| A #[code.u-break (default, method, getter, setter)] tuple of the
| extension.
+h(2, "has_extension") Doc.has_extension
+tag classmethod
+tag-new(2)
p Check whether an extension has been registered on the #[code Doc] class.
+aside-code("Example").
from spacy.tokens import Doc
Doc.set_extension('is_city', default=False)
assert Doc.has_extension('is_city')
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the extension to check.
+row("foot")
+cell returns
+cell bool
+cell Whether the extension has been registered.
+h(2, "char_span") Doc.char_span
+tag method
+tag-new(2)

View File

@ -4,7 +4,14 @@ include ../_includes/_mixins
p
| Usually you'll load this once per process as #[code nlp] and pass the
| instance around your application.
| instance around your application. The #[code Language] class is created
| when you call #[+api("spacy#load") #[code spacy.load()]] and contains
| the shared vocabulary and #[+a("/usage/adding-languages") language data],
| optional model data loaded from a #[+a("/models") model package] or
| a path, and a #[+a("/usage/processing-pipelines") processing pipeline]
| containing components like the tagger or parser that are called on a
| document in order. You can also add your own processing pipeline
| components that take a #[code Doc] object, modify it and return it.
+h(2, "init") Language.__init__
+tag method
@ -12,9 +19,9 @@ p
p Initialise a #[code Language] object.
+aside-code("Example").
from spacy.vocab import Vocab
from spacy.language import Language
nlp = Language(pipeline=['token_vectors', 'tags',
'dependencies'])
nlp = Language(Vocab())
from spacy.lang.en import English
nlp = English()
@ -34,14 +41,6 @@ p Initialise a #[code Language] object.
| A function that takes text and returns a #[code Doc] object.
| Usually a #[code Tokenizer].
+row
+cell #[code pipeline]
+cell list
+cell
| A list of annotation processes or IDs of annotation processes,
| e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked
| up in #[code Language.Defaults.factories].
+row
+cell #[code meta]
+cell dict
@ -235,7 +234,6 @@ p
| Can be called before training to pre-process gold data. By default, it
| handles nonprojectivity and adds missing tags to the tag map.
+table(["Name", "Type", "Description"])
+row
+cell #[code docs_golds]
@ -247,6 +245,177 @@ p
+cell tuple
+cell Tuples of #[code Doc] and #[code GoldParse] objects.
+h(2, "create_pipe") Language.create_pipe
+tag method
+tag-new(2)
p Create a pipeline component from a factory.
+aside-code("Example").
parser = nlp.create_pipe('parser')
nlp.add_pipe(parser)
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell
| Factory name to look up in
| #[+api("language#class-attributes") #[code Language.factories]].
+row
+cell #[code config]
+cell dict
+cell Configuration parameters to initialise component.
+row("foot")
+cell returns
+cell callable
+cell The pipeline component.
+h(2, "add_pipe") Language.add_pipe
+tag method
+tag-new(2)
p
| Add a component to the processing pipeline. Valid components are
| callables that take a #[code Doc] object, modify it and return it. Only
| one of #[code before], #[code after], #[code first] or #[code last] can
| be set. Default behaviour is #[code last=True].
+aside-code("Example").
def component(doc):
# modify Doc and return it
return doc
nlp.add_pipe(component, before='ner')
nlp.add_pipe(component, name='custom_name', last=True)
+table(["Name", "Type", "Description"])
+row
+cell #[code component]
+cell callable
+cell The pipeline component.
+row
+cell #[code name]
+cell unicode
+cell
| Name of pipeline component. Overwrites existing
| #[code component.name] attribute if available. If no #[code name]
| is set and the component exposes no name attribute,
| #[code component.__name__] is used. An error is raised if the
| name already exists in the pipeline.
+row
+cell #[code before]
+cell unicode
+cell Component name to insert component directly before.
+row
+cell #[code after]
+cell unicode
+cell Component name to insert component directly after.
+row
+cell #[code first]
+cell bool
+cell Insert component first / not first in the pipeline.
+row
+cell #[code last]
+cell bool
+cell Insert component last / not last in the pipeline.
+h(2, "get_pipe") Language.get_pipe
+tag method
+tag-new(2)
p Get a pipeline component for a given component name.
+aside-code("Example").
parser = nlp.get_pipe('parser')
custom_component = nlp.get_pipe('custom_component')
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the pipeline component to get.
+row("foot")
+cell returns
+cell callable
+cell The pipeline component.
+h(2, "replace_pipe") Language.replace_pipe
+tag method
+tag-new(2)
p Replace a component in the pipeline.
+aside-code("Example").
nlp.replace_pipe('parser', my_custom_parser)
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the component to replace.
+row
+cell #[code component]
+cell callable
+cell The pipeline component to insert.
+h(2, "rename_pipe") Language.rename_pipe
+tag method
+tag-new(2)
p
| Rename a component in the pipeline. Useful to create custom names for
| pre-defined and pre-loaded components. To change the default name of
| a component added to the pipeline, you can also use the #[code name]
| argument on #[+api("language#add_pipe") #[code add_pipe]].
+aside-code("Example").
nlp.rename_pipe('parser', 'spacy_parser')
+table(["Name", "Type", "Description"])
+row
+cell #[code old_name]
+cell unicode
+cell Name of the component to rename.
+row
+cell #[code new_name]
+cell unicode
+cell New name of the component.
+h(2, "remove_pipe") Language.remove_pipe
+tag method
+tag-new(2)
p
| Remove a component from the pipeline. Returns the removed component name
| and component function.
+aside-code("Example").
name, component = nlp.remove_pipe('parser')
assert name == 'parser'
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the component to remove.
+row("foot")
+cell returns
+cell tuple
+cell A #[code (name, component)] tuple of the removed component.
+h(2, "to_disk") Language.to_disk
+tag method
+tag-new(2)
@ -399,7 +568,15 @@ p Load state from a binary string.
+row
+cell #[code pipeline]
+cell list
+cell Sequence of annotation functions.
+cell
| List of #[code (name, component)] tuples describing the current
| processing pipeline, in order.
+row
+cell #[code pipe_names]
+tag-new(2)
+cell list
+cell List of pipeline component names, in order.
+row
+cell #[code meta]
@ -424,3 +601,12 @@ p Load state from a binary string.
+cell
| Two-letter language ID, i.e.
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code].
+row
+cell #[code factories]
+tag-new(2)
+cell dict
+cell
| Factories that create pre-defined pipeline components, e.g. the
| tagger, parser or entity recognizer, keyed by their component
| name.
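p
| As a rough sketch of how this dict is used (the component name and factory
| below are hypothetical, not part of spaCy), registering a factory makes the
| name available to #[code create_pipe]:
+code.
    from spacy.language import Language

    def my_component_factory(nlp, **cfg):
        # hypothetical factory: receives the shared nlp object plus config
        def my_component(doc):
            return doc
        return my_component

    Language.factories['my_component'] = my_component_factory
    nlp = Language()
    component = nlp.create_pipe('my_component')  # looked up in Language.factories
    nlp.add_pipe(component)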

View File

@ -116,6 +116,109 @@ p Get the number of tokens in the span.
+cell int
+cell The number of tokens in the span.
+h(2, "set_extension") Span.set_extension
+tag classmethod
+tag-new(2)
p
| Define a custom attribute on the #[code Span] which becomes available via
| #[code Span._]. For details, see the documentation on
| #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
+aside-code("Example").
from spacy.tokens import Span
city_getter = lambda span: span.text in ('New York', 'Paris', 'Berlin')
Span.set_extension('has_city', getter=city_getter)
doc = nlp(u'I like New York in Autumn')
assert doc[1:4]._.has_city
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell
| Name of the attribute to set by the extension. For example,
| #[code 'my_attr'] will be available as #[code span._.my_attr].
+row
+cell #[code default]
+cell -
+cell
| Optional default value of the attribute if no getter or method
| is defined.
+row
+cell #[code method]
+cell callable
+cell
| Set a custom method on the object, for example
| #[code span._.compare(other_span)].
+row
+cell #[code getter]
+cell callable
+cell
| Getter function that takes the object and returns an attribute
| value. Is called when the user accesses the #[code ._] attribute.
+row
+cell #[code setter]
+cell callable
+cell
| Setter function that takes the #[code Span] and a value, and
| modifies the object. Is called when the user writes to the
| #[code Span._] attribute.
+h(2, "get_extension") Span.get_extension
+tag classmethod
+tag-new(2)
p
| Look up a previously registered extension by name. Returns a 4-tuple
| #[code.u-break (default, method, getter, setter)] if the extension is
| registered. Raises a #[code KeyError] otherwise.
+aside-code("Example").
from spacy.tokens import Span
Span.set_extension('is_city', default=False)
extension = Span.get_extension('is_city')
assert extension == (False, None, None, None)
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the extension.
+row("foot")
+cell returns
+cell tuple
+cell
| A #[code.u-break (default, method, getter, setter)] tuple of the
| extension.
+h(2, "has_extension") Span.has_extension
+tag classmethod
+tag-new(2)
p Check whether an extension has been registered on the #[code Span] class.
+aside-code("Example").
from spacy.tokens import Span
Span.set_extension('is_city', default=False)
assert Span.has_extension('is_city')
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the extension to check.
+row("foot")
+cell returns
+cell bool
+cell Whether the extension has been registered.
+h(2, "similarity") Span.similarity
+tag method
+tag-model("vectors")

View File

@ -51,6 +51,109 @@ p The number of unicode characters in the token, i.e. #[code token.text].
+cell int
+cell The number of unicode characters in the token.
+h(2, "set_extension") Token.set_extension
+tag classmethod
+tag-new(2)
p
| Define a custom attribute on the #[code Token] which becomes available
| via #[code Token._]. For details, see the documentation on
| #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
+aside-code("Example").
from spacy.tokens import Token
fruit_getter = lambda token: token.text in ('apple', 'pear', 'banana')
Token.set_extension('is_fruit', getter=fruit_getter)
doc = nlp(u'I have an apple')
assert doc[3]._.is_fruit
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell
| Name of the attribute to set by the extension. For example,
| #[code 'my_attr'] will be available as #[code token._.my_attr].
+row
+cell #[code default]
+cell -
+cell
| Optional default value of the attribute if no getter or method
| is defined.
+row
+cell #[code method]
+cell callable
+cell
| Set a custom method on the object, for example
| #[code token._.compare(other_token)].
+row
+cell #[code getter]
+cell callable
+cell
| Getter function that takes the object and returns an attribute
| value. Is called when the user accesses the #[code ._] attribute.
+row
+cell #[code setter]
+cell callable
+cell
| Setter function that takes the #[code Token] and a value, and
| modifies the object. Is called when the user writes to the
| #[code Token._] attribute.
+h(2, "get_extension") Token.get_extension
+tag classmethod
+tag-new(2)
p
| Look up a previously registered extension by name. Returns a 4-tuple
| #[code.u-break (default, method, getter, setter)] if the extension is
| registered. Raises a #[code KeyError] otherwise.
+aside-code("Example").
from spacy.tokens import Token
Token.set_extension('is_fruit', default=False)
extension = Token.get_extension('is_fruit')
assert extension == (False, None, None, None)
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the extension.
+row("foot")
+cell returns
+cell tuple
+cell
| A #[code.u-break (default, method, getter, setter)] tuple of the
| extension.
+h(2, "has_extension") Token.has_extension
+tag classmethod
+tag-new(2)
p Check whether an extension has been registered on the #[code Token] class.
+aside-code("Example").
from spacy.tokens import Token
Token.set_extension('is_fruit', default=False)
assert Token.has_extension('is_fruit')
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the extension to check.
+row("foot")
+cell returns
+cell bool
+cell Whether the extension has been registered.
+h(2, "check_flag") Token.check_flag
+tag method

View File

@ -143,6 +143,9 @@
//- Layout
.u-width-full
width: 100%
.u-float-left
float: left
margin-right: 1rem
@ -166,6 +169,9 @@
.u-padding-medium
padding: 1.8rem
.u-padding-top
padding-top: 2rem
.u-inline-block
display: inline-block

View File

@ -25,7 +25,7 @@
display: inline-block
font-size: 0.6em
font-weight: bold
padding-right: 1.25rem
padding-right: 1em
margin-left: -3.75rem
text-align: right
width: 2.5rem

View File

@ -456,24 +456,11 @@ p
}
p
| To add a lookup lemmatizer to your language, import the #[code LOOKUP]
| table and #[code Lemmatizer], and create a new classmethod:
| To provide a lookup lemmatizer for your language, import the lookup table
| and add it to the #[code Language] class as #[code lemma_lookup]:
+code("__init__py (excerpt)").
# other imports here, plus lookup table and lookup lemmatizer
from .lemmatizer import LOOKUP
from ...lemmatizerlookup import Lemmatizer
class Xxxxx(Language):
lang = 'xx'
class Defaults(Language.Defaults):
# other language defaults here
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
+code.
lemma_lookup = dict(LOOKUP)
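p
| For context, a minimal sketch of where this line sits in a hypothetical
| language's #[code __init__.py] (class name and language ID are placeholders,
| mirroring the removed example above):
+code.
    from .lemmatizer import LOOKUP

    class Xxxxx(Language):
        lang = 'xx'

        class Defaults(Language.Defaults):
            # other language defaults here
            lemma_lookup = dict(LOOKUP)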
+h(3, "tag-map") Tag map

View File

@ -103,10 +103,10 @@
"title": "Language Processing Pipelines",
"next": "vectors-similarity",
"menu": {
"How pipelines work": "pipelines",
"Examples": "examples",
"How Pipelines Work": "pipelines",
"Custom Components": "custom-components",
"Developing Extensions": "extensions",
"Multi-threading": "multithreading",
"User Hooks": "user-hooks",
"Serialization": "serialization"
}
},
@ -195,6 +195,7 @@
"teaser": "Full code examples you can modify and run.",
"next": "resources",
"menu": {
"Pipeline": "pipeline",
"Matching": "matching",
"Training": "training",
"Deep Learning": "deep-learning"

View File

@ -0,0 +1,369 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS
p
| A component receives a #[code Doc] object and can modify it, for example
| by using the current weights to make a prediction and set some annotation
| on the document. By adding a component to the pipeline, you'll get access
| to the #[code Doc] at any point #[strong during processing] instead of
| only being able to modify it afterwards.
+aside-code("Example").
def my_component(doc):
# do something to the doc here
return doc
+table(["Argument", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The #[code Doc] object processed by the previous component.
+row("foot")
+cell returns
+cell #[code Doc]
+cell The #[code Doc] object processed by this pipeline component.
p
| Custom components can be added to the pipeline using the
| #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you
| can either specify a component to add it #[strong before or after], tell
| spaCy to add it #[strong first or last] in the pipeline, or define a
| #[strong custom name]. If no name is set and no #[code name] attribute
| is present on your component, the function name is used.
+code("Adding pipeline components").
def my_component(doc):
print("After tokenization, this doc has %s tokens." % len(doc))
if len(doc) &lt; 10:
print("This is a pretty short document.")
return doc
nlp = spacy.load('en')
nlp.add_pipe(my_component, name='print_info', first=True)
print(nlp.pipe_names) # ['print_info', 'tagger', 'parser', 'ner']
doc = nlp(u"This is a sentence.")
p
| Of course, you can also wrap your component as a class to allow
| initialising it with custom settings and hold state within the component.
| This is useful for #[strong stateful components], especially ones which
| #[strong depend on shared data].
+code.
class MyComponent(object):
name = 'print_info'
def __init__(self, vocab, short_limit=10):
self.vocab = vocab
self.short_limit = short_limit
def __call__(self, doc):
if len(doc) &lt; self.short_limit:
print("This is a pretty short document.")
return doc
my_component = MyComponent(nlp.vocab, short_limit=25)
nlp.add_pipe(my_component, first=True)
+h(3, "custom-components-attributes")
| Extension attributes on #[code Doc], #[code Span] and #[code Token]
+tag-new(2)
p
| As of v2.0, spaCy allows you to set any custom attributes and methods
| on the #[code Doc], #[code Span] and #[code Token], which become
| available as #[code Doc._], #[code Span._] and #[code Token._] for
| example, #[code Token._.my_attr]. This lets you store additional
| information relevant to your application, add new features and
| functionality to spaCy, and implement your own models trained with other
| machine learning libraries. It also lets you take advantage of spaCy's
| data structures and the #[code Doc] object as the "single source of
| truth".
+aside("Why ._?")
| Writing to a #[code ._] attribute instead of to the #[code Doc] directly
| keeps a clearer separation and makes it easier to ensure backwards
| compatibility. For example, if you've implemented your own #[code .coref]
| property and spaCy claims it one day, it'll break your code. Similarly,
| just by looking at the code, you'll immediately know what's built-in and
| what's custom: for example, #[code doc.sentiment] is spaCy, while
| #[code doc._.sent_score] isn't.
p
| There are three main types of extensions, which can be defined using the
| #[+api("doc#set_extension") #[code Doc.set_extension]],
| #[+api("span#set_extension") #[code Span.set_extension]] and
| #[+api("token#set_extension") #[code Token.set_extension]] methods.
+list("numbers")
+item #[strong Attribute extensions].
| Set a default value for an attribute, which can be overwritten
| manually at any time. Attribute extensions work like "normal"
| variables and are the quickest way to store arbitrary information
| on a #[code Doc], #[code Span] or #[code Token].
+code-wrapper
+code.
Doc.set_extension('hello', default=True)
assert doc._.hello
doc._.hello = False
+item #[strong Property extensions].
| Define a getter and an optional setter function. If no setter is
| provided, the extension is immutable. Since the getter and setter
| functions are only called when you #[em retrieve] the attribute,
| you can also access values of previously added attribute extensions.
| For example, a #[code Doc] getter can average over #[code Token]
| attributes. For #[code Span] extensions, you'll almost always want
| to use a property; otherwise, you'd have to write to
| #[em every possible] #[code Span] in the #[code Doc] to set up the
| values correctly.
+code-wrapper
+code.
Doc.set_extension('hello', getter=get_hello_value, setter=set_hello_value)
assert doc._.hello
doc._.hello = 'Hi!'
+item #[strong Method extensions].
| Assign a function that becomes available as an object method. Method
| extensions are always immutable. For more details and implementation
| ideas, see
| #[+a("/usage/examples#custom-components-attr-methods") these examples].
+code-wrapper
+code.o-no-block.
Doc.set_extension('hello', method=lambda doc, name: 'Hi {}!'.format(name))
assert doc._.hello('Bob') == 'Hi Bob!'
p
| Before you can access a custom extension, you need to register it using
| the #[code set_extension] method on the object you want
| to add it to, e.g. the #[code Doc]. Keep in mind that extensions are
| always #[strong added globally] and not just on a particular instance.
| If an attribute of the same name
| already exists, or if you're trying to access an attribute that hasn't
| been registered, spaCy will raise an #[code AttributeError].
+code("Example").
from spacy.tokens import Doc, Span, Token
fruits = ['apple', 'pear', 'banana', 'orange', 'strawberry']
is_fruit_getter = lambda token: token.text in fruits
has_fruit_getter = lambda obj: any([t.text in fruits for t in obj])
Token.set_extension('is_fruit', getter=is_fruit_getter)
Doc.set_extension('has_fruit', getter=has_fruit_getter)
Span.set_extension('has_fruit', getter=has_fruit_getter)
+aside-code("Usage example").
doc = nlp(u"I have an apple and a melon")
assert doc[3]._.is_fruit # get Token attributes
assert not doc[0]._.is_fruit
assert doc._.has_fruit # get Doc attributes
assert doc[1:4]._.has_fruit # get Span attributes
p
| Once you've registered your custom attribute, you can also use the
| built-in #[code set], #[code get] and #[code has] methods to modify and
| retrieve the attributes. This is especially useful if you want to pass in
| a string instead of calling #[code doc._.my_attr] (see the sketch below the table).
+table(["Method", "Description", "Valid for", "Example"])
+row
+cell #[code ._.set()]
+cell Set a value for an attribute.
+cell Attributes, mutable properties.
+cell #[code.u-break token._.set('my_attr', True)]
+row
+cell #[code ._.get()]
+cell Get the value of an attribute.
+cell Attributes, mutable properties, immutable properties, methods.
+cell #[code.u-break my_attr = span._.get('my_attr')]
+row
+cell #[code ._.has()]
+cell Check if an attribute exists.
+cell Attributes, mutable properties, immutable properties, methods.
+cell #[code.u-break doc._.has('my_attr')]
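p
| As a quick sketch (assuming an #[code nlp] object as in the examples above),
| the three methods map directly onto the underscore attribute access:
+code.
    from spacy.tokens import Token

    Token.set_extension('my_attr', default=False)
    doc = nlp(u"This is a sentence.")
    token = doc[0]
    token._.set('my_attr', True)            # same as token._.my_attr = True
    assert token._.get('my_attr') is True   # same as token._.my_attr
    assert token._.has('my_attr')           # True once the extension is registered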
+infobox("How the ._ is implemented")
| Extension definitions (the defaults, methods, getters and setters you
| pass in to #[code set_extension]) are stored in class attributes on the
| #[code Underscore] class. If you write to an extension attribute, e.g.
| #[code doc._.hello = True], the data is stored within the
| #[+api("doc#attributes") #[code Doc.user_data]] dictionary. To keep the
| underscore data separate from your other dictionary entries, the string
| #[code "._."] is placed before the name, in a tuple.
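p
| A small sketch of what this looks like in practice (assuming an #[code nlp]
| object as above):
+code.
    from spacy.tokens import Doc

    Doc.set_extension('hello', default=True)
    doc = nlp(u"This is a sentence.")
    doc._.hello = False
    # the value lands in Doc.user_data, keyed by ('._.', name, start, end);
    # for a Doc, start and end are both None
    assert doc.user_data[('._.', 'hello', None, None)] is False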
+h(4, "component-example1") Example: Custom sentence segmentation logic
p
| Let's say you want to implement custom logic to improve spaCy's sentence
| boundary detection. Currently, sentence segmentation is based on the
| dependency parse, which doesn't always produce ideal results. The custom
| logic should therefore be applied #[strong after] tokenization, but
| #[strong before] the dependency parsing; this way, the parser can also
| take advantage of the sentence boundaries.
+code.
def sbd_component(doc):
for i, token in enumerate(doc[:-2]):
# define sentence start if period + titlecase token
if token.text == '.' and doc[i+1].is_title:
doc[i+1].sent_start = True
return doc
nlp = spacy.load('en')
nlp.add_pipe(sbd_component, before='parser') # insert before the parser
+h(4, "component-example2")
| Example: Pipeline component for entity matching and tagging with
| custom attributes
p
| This example shows how to create a spaCy extension that takes a
| terminology list (in this case, single- and multi-word company names),
| matches the occurrences in a document, labels them as #[code ORG] entities,
| merges the tokens and sets custom #[code is_tech_org] and
| #[code has_tech_org] attributes. For efficient matching, the example uses
| the #[+api("phrasematcher") #[code PhraseMatcher]] which accepts
| #[code Doc] objects as match patterns and works well for large
| terminology lists. It also ensures your patterns will always match, even
| when you customise spaCy's tokenization rules. When you call #[code nlp]
| on a text, the custom pipeline component is applied to the #[code Doc].
+github("spacy", "examples/pipeline/custom_component_entities.py", false, 500)
p
| Wrapping this functionality in a
| pipeline component allows you to reuse the module with different
| settings, and have all pre-processing taken care of when you call
| #[code nlp] on your text and receive a #[code Doc] object.
+h(4, "component-example3")
| Example: Pipeline component for GPE entities and country meta data via a
| REST API
p
| This example shows the implementation of a pipeline component
| that fetches country meta data via the
| #[+a("https://restcountries.eu") REST Countries API], sets entity
| annotations for countries, merges entities into one token and
| sets custom attributes on the #[code Doc], #[code Span] and
| #[code Token], for example the capital, latitude/longitude coordinates
| and even the country flag.
+github("spacy", "examples/pipeline/custom_component_countries_api.py", false, 500)
p
| In this case, all data can be fetched on initialisation in one request.
| However, if you're working with text that contains incomplete country
| names, spelling mistakes or foreign-language versions, you could also
| implement a #[code like_country]-style getter function that makes a
| request to the search API endpoint and returns the best-matching
| result.
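p
| For instance, a rough sketch of such a getter (the helper name and the exact
| search endpoint are assumptions, not part of the example above):
+code.
    import requests
    from spacy.tokens import Token

    def like_country(token):
        # hypothetical: query the REST Countries search endpoint for the token text
        r = requests.get('https://restcountries.eu/rest/v2/name/{}'.format(token.text))
        return r.status_code == 200 and len(r.json()) > 0

    Token.set_extension('like_country', getter=like_country)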
+h(4, "custom-components-usage-ideas") Other usage ideas
+list
+item
| #[strong Adding new features and hooking in models]. For example,
| a sentiment analysis model, or your preferred solution for
| lemmatization or sentiment analysis. spaCy's built-in tagger,
| parser and entity recognizer respect annotations that were already
| set on the #[code Doc] in a previous step of the pipeline.
+item
| #[strong Integrating other libraries and APIs]. For example, your
| pipeline component can write additional information and data
| directly to the #[code Doc] or #[code Token] as custom attributes,
| while making sure no information is lost in the process. This can
| be output generated by other libraries and models, or an external
| service with a REST API.
+item
| #[strong Debugging and logging]. For example, a component which
| stores and/or exports relevant information about the current state
| of the processed document, and can be inserted at any point of your
| pipeline.
+infobox("Developing third-party extensions")
| The new pipeline management and custom attributes finally make it easy
| to develop your own spaCy extensions and plugins and share them with
| others. Extensions can claim their own #[code ._] namespace and exist as
| standalone packages. If you're developing a tool or library and want to
| make it easy for others to use it with spaCy and add it to their
| pipeline, all you have to do is expose a function that takes a
| #[code Doc], modifies it and returns it. For more details and
| #[strong best practices], see the section on
| #[+a("#extensions") developing spaCy extensions].
+h(3, "custom-components-user-hooks") User hooks
p
| While it's generally recommended to use the #[code Doc._], #[code Span._]
| and #[code Token._] proxies to add your own custom attributes, spaCy
| offers a few exceptions to allow #[strong customising the built-in methods]
| like #[+api("doc#similarity") #[code Doc.similarity]] or
| #[+api("doc#vector") #[code Doc.vector]] with your own hooks, which can
| rely on statistical models you train yourself. For instance, you can
| provide your own on-the-fly sentence segmentation algorithm or document
| similarity method.
p
| Hooks let you customize some of the behaviours of the #[code Doc],
| #[code Span] or #[code Token] objects by adding a component to the
| pipeline. For instance, to customize the
| #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a
| component that sets a custom function to
| #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity]
| method will check the #[code user_hooks] dict, and delegate to your
| function if you've set one. Similar results can be achieved by setting
| functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks].
+aside("Implementation note")
| The hooks live on the #[code Doc] object because the #[code Span] and
| #[code Token] objects are created lazily, and don't own any data. They
| just proxy to their parent #[code Doc]. This turns out to be convenient
| here — we only have to worry about installing hooks in one place.
+table(["Name", "Customises"])
+row
+cell #[code user_hooks]
+cell
+api("doc#vector") #[code Doc.vector]
+api("doc#has_vector") #[code Doc.has_vector]
+api("doc#vector_norm") #[code Doc.vector_norm]
+api("doc#sents") #[code Doc.sents]
+row
+cell #[code user_token_hooks]
+cell
+api("token#similarity") #[code Token.similarity]
+api("token#vector") #[code Token.vector]
+api("token#has_vector") #[code Token.has_vector]
+api("token#vector_norm") #[code Token.vector_norm]
+api("token#conjuncts") #[code Token.conjuncts]
+row
+cell #[code user_span_hooks]
+cell
+api("span#similarity") #[code Span.similarity]
+api("span#vector") #[code Span.vector]
+api("span#has_vector") #[code Span.has_vector]
+api("span#vector_norm") #[code Span.vector_norm]
+api("span#root") #[code Span.root]
+code("Add custom similarity hooks").
class SimilarityModel(object):
def __init__(self, model):
self._model = model
def __call__(self, doc):
doc.user_hooks['similarity'] = self.similarity
doc.user_span_hooks['similarity'] = self.similarity
doc.user_token_hooks['similarity'] = self.similarity
return doc
def similarity(self, obj1, obj2):
y = self._model([obj1.vector, obj2.vector])
return float(y[0])
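p
| As a usage sketch (the toy model below is a stand-in for a trained similarity
| model, and #[code nlp] is assumed to be available), the hook component is
| simply added to the pipeline so the hooks are set on every document:
+code.
    import numpy

    toy_model = lambda vectors: [numpy.dot(vectors[0], vectors[1])]  # stand-in model
    nlp.add_pipe(SimilarityModel(toy_model), name='similarity_hooks')
    doc1 = nlp(u'I like pizza')
    doc2 = nlp(u'I love pasta')
    print(doc1.similarity(doc2))  # delegates to doc.user_hooks['similarity']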

View File

@ -1,126 +0,0 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > EXAMPLES
p
| To see real-world examples of pipeline factories and components in action,
| you can have a look at the source of spaCy's built-in components, e.g.
| the #[+api("tagger") #[code Tagger]], #[+api("parser") #[code Parser]] or
| #[+api("entityrecognizer") #[code EntityRecongnizer]].
+h(3, "example1") Example: Custom sentence segmentation logic
p
| Let's say you want to implement custom logic to improve spaCy's sentence
| boundary detection. Currently, sentence segmentation is based on the
| dependency parse, which doesn't always produce ideal results. The custom
| logic should therefore be applied #[strong after] tokenization, but
| #[strong before] the dependency parsing; this way, the parser can also
| take advantage of the sentence boundaries.
+code.
def sbd_component(doc):
for i, token in enumerate(doc[:-2]):
# define sentence start if period + titlecase token
if token.text == '.' and doc[i+1].is_title:
doc[i+1].sent_start = True
return doc
p
| In this case, we simply want to add the component to the existing
| pipeline of the English model. We can do this by inserting it at index 0
| of #[code nlp.pipeline]:
+code.
nlp = spacy.load('en')
nlp.pipeline.insert(0, sbd_component)
p
| When you call #[code nlp] on some text, spaCy will tokenize it to create
| a #[code Doc] object, and first call #[code sbd_component] on it, followed
| by the model's default pipeline.
+h(3, "example2") Example: Sentiment model
p
| Let's say you have trained your own document sentiment model on English
| text. After tokenization, you want spaCy to first execute the
| #[strong default tensorizer], followed by a custom
| #[strong sentiment component] that adds a #[code .sentiment]
| property to the #[code Doc], containing your model's sentiment prediction.
p
| Your component class will have a #[code from_disk()] method that spaCy
| calls to load the model data. When called, the component will compute
| the sentiment score, add it to the #[code Doc] and return the modified
| document. Optionally, the component can include an #[code update()] method
| to allow training the model.
+code.
import pickle
from pathlib import Path
class SentimentComponent(object):
    def __init__(self, vocab):
        self.weights = None
    def __call__(self, doc):
        doc.sentiment = sum(self.weights*doc.vector)  # set sentiment property
        return doc
    def from_disk(self, path):  # path = model path + factory ID ('sentiment')
        with (Path(path) / 'weights.bin').open('rb') as file_:
            self.weights = pickle.load(file_)  # load the pickled weights
        return self
    def update(self, doc, gold):  # update weights, allows training!
        prediction = sum(self.weights*doc.vector)
        self.weights -= 0.001*doc.vector*(prediction-gold.sentiment)
p
| The factory will initialise the component with the #[code Vocab] object.
| To be able to add it to your model's pipeline as #[code 'sentiment'],
| it also needs to be registered via
| #[+api("spacy#set_factory") #[code set_factory()]].
+code.
def sentiment_factory(vocab):
    component = SentimentComponent(vocab)  # initialise component
    return component
spacy.set_factory('sentiment', sentiment_factory)
p
| The above code should be #[strong shipped with your model]. You can use
| the #[+api("cli#package") #[code package]] command to create all required
| files and directories. The model package will include an
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]]
| with a #[code load()] method that will initialise the language class with
| the model's pipeline and call the #[code from_disk()] method to load
| the model data.
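p
| As a rough sketch, and assuming the file layout generated by the
| #[code package] command, the #[code load()] method in the package's
| #[code __init__.py] could look something like this. The language ID,
| pipeline IDs and data directory name are placeholders for your model's
| actual values.
+code("__init__.py (sketch)").
from pathlib import Path
import spacy
def load():
    cls = spacy.util.get_lang_class('en')            # get the language class
    nlp = cls(pipeline=['tensorizer', 'sentiment'])  # initialise it with the pipeline
    model_data = Path(__file__).parent / 'en_sentiment_model-1.0.0'
    return nlp.from_disk(model_data)                 # load the binary model data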
p
| In the model package's meta.json, specify the language class and pipeline
| IDs:
+code("meta.json (excerpt)", "json").
{
    "name": "sentiment_model",
    "lang": "en",
    "version": "1.0.0",
    "spacy_version": "&gt;=2.0.0,&lt;3.0.0",
    "pipeline": ["tensorizer", "sentiment"]
}
p
| When you load your new model, spaCy will call the model's #[code load()]
| method. This will return a #[code Language] object with a pipeline
| containing the default tensorizer, and the sentiment component returned
| by your custom #[code "sentiment"] factory.
+code.
nlp = spacy.load('en_sentiment_model')
doc = nlp(u'I love pizza')
assert doc.sentiment
+infobox("Saving and loading models")
| For more information and a detailed guide on how to package your model,
| see the documentation on
| #[+a("/usage/training#saving-loading") saving and loading models].

View File

@ -0,0 +1,110 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > DEVELOPING EXTENSIONS
p
| We're very excited about all the new possibilities for community
| extensions and plugins in spaCy v2.0, and we can't wait to see what
| you build with it! To get you started, here are a few tips, tricks and
| best practices:
+list
+item
| Make sure to choose a #[strong descriptive and specific name] for
| your pipeline component class, and set it as its #[code name]
| attribute. Avoid names that are too common or likely to clash with
| built-in or a user's other custom components. While it's fine to call
| your package "spacy_my_extension", avoid component names including
| "spacy", since this can easily lead to confusion.
+code-wrapper
+code-new name = 'myapp_lemmatizer'
+code-old name = 'lemmatizer'
+item
| When writing to #[code Doc], #[code Token] or #[code Span] objects,
| #[strong use getter functions] wherever possible, and avoid setting
| values explicitly. Tokens and spans don't own any data themselves,
| so you should provide a function that allows them to compute the
| values instead of writing static properties to individual objects.
+code-wrapper
+code-new.
is_fruit = lambda token: token.text in ('apple', 'orange')
Token.set_extension('is_fruit', getter=is_fruit)
+code-old.
token._.set_extension('is_fruit', default=False)
if token.text in ('apple', 'orange'):
    token._.set('is_fruit', True)
+item
| Always add your custom attributes to the #[strong global] #[code Doc],
| #[code Token] or #[code Span] objects, not a particular instance of
| them. Add the attributes #[strong as early as possible], e.g. in
| your extension's #[code __init__] method or in the global scope of
| your module. This means that in the case of namespace collisions,
| the user will see an error immediately, not just when they run their
| pipeline.
+code-wrapper
+code-new.
from spacy.tokens import Doc
def __init__(attr='my_attr'):
    Doc.set_extension(attr, getter=self.get_doc_attr)
+code-old.
def __call__(doc):
    doc.set_extension('my_attr', getter=self.get_doc_attr)
+item
| If your extension is setting properties on the #[code Doc],
| #[code Token] or #[code Span], include an option to
| #[strong let the user change those attribute names]. This makes
| it easier to avoid namespace collisions and accommodate users with
| different naming preferences. We recommend adding an #[code attrs]
| argument to the #[code __init__] method of your class so you can
| write the names to class attributes and reuse them across your
| component. A short sketch of this pattern follows after this list.
+code-wrapper
+code-new Doc.set_extension(self.doc_attr, default='some value')
+code-old Doc.set_extension('my_doc_attr', default='some value')
+item
| Ideally, extensions should be #[strong standalone packages] with
| spaCy and, optionally, other packages specified as a dependency. They
| can freely assign to their own #[code ._] namespace, but should stick
| to that. If your extension's only job is to provide a better
| #[code .similarity] implementation, and your docs state this
| explicitly, there's no problem with writing to the
| #[+a("#custom-components-user-hooks") #[code user_hooks]], and
| overwriting spaCy's built-in method. However, a third-party
| extension should #[strong never silently overwrite built-ins], or
| attributes set by other extensions.
+item
| If you're looking to publish a model that depends on a custom
| pipeline component, you can either #[strong require it] in the model
| package's dependencies, or, if the component is specific and
| lightweight, choose to #[strong ship it with your model package]
| and add it to the #[code Language] instance returned by the
| model's #[code load()] method. For examples of this, check out the
| implementations of spaCy's
| #[+api("util#load_model_from_init_py") #[code load_model_from_init_py()]]
| and #[+api("util#load_model_from_path") #[code load_model_from_path()]]
| utility functions.
+code-wrapper
+code-new.
nlp.add_pipe(my_custom_component)
return nlp.from_disk(model_path)
+item
| Once you're ready to share your extension with others, make sure to
| #[strong add docs and installation instructions] (you can
| always link to this page for more info). Make it easy for others to
| install and use your extension, for example by uploading it to
| #[+a("https://pypi.python.org") PyPi]. If you're sharing your code on
| GitHub, don't forget to tag it
| with #[+a("https://github.com/search?q=topic%3Aspacy") #[code spacy]]
| and #[+a("https://github.com/search?q=topic%3Aspacy-pipeline") #[code spacy-pipeline]]
| to help people find it. If you post it on Twitter, feel free to tag
| #[+a("https://twitter.com/" + SOCIAL.twitter) @#{SOCIAL.twitter}]
| so we can check it out.
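p
| As a short sketch of the #[code attrs] pattern mentioned above, a
| component can accept the attribute names on initialisation and register
| them under whatever names the user chooses. The class and attribute
| names here are made up for the example.
+code("Configurable attribute names (sketch)").
from spacy.tokens import Doc
class MyComponent(object):
    name = 'my_component'
    def __init__(self, attrs=('my_doc_attr',)):
        self.doc_attr = attrs[0]
        # register the extension under the user-defined name
        Doc.set_extension(self.doc_attr, default=None)
    def __call__(self, doc):
        doc._.set(self.doc_attr, 'some value')
        return doc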

View File

@ -11,7 +11,7 @@ p
p
| When you load a model, spaCy first consults the model's
| #[+a("/usage/saving-loading#models-generating") meta.json]. The
| #[+a("/usage/saving-loading#models-generating") #[code meta.json]]. The
| meta typically includes the model details, the ID of a language class,
| and an optional list of pipeline components. spaCy then does the
| following:
@ -21,24 +21,26 @@ p
"name": "example_model",
"lang": "en"
"description": "Example model for spaCy",
"pipeline": ["tensorizer", "tagger"]
"pipeline": ["tagger", "parser"]
}
+list("numbers")
+item
| Look up #[strong pipeline IDs] in the available
| #[strong pipeline factories].
+item
| Initialise the #[strong pipeline components] by calling their
| factories with the #[code Vocab] as an argument. This gives each
| factory and component access to the pipeline's shared data, like
| strings, morphology and annotation scheme.
+item
| Load the #[strong language class and data] for the given ID via
| #[+api("util.get_lang_class") #[code get_lang_class]].
| #[+api("util.get_lang_class") #[code get_lang_class]] and initialise
| it. The #[code Language] class contains the shared vocabulary,
| tokenization rules and the language-specific annotation scheme.
+item
| Pass the path to the #[strong model data] to the #[code Language]
| class and return it.
| Iterate over the #[strong pipeline names] and create each component
| using #[+api("language#create_pipe") #[code create_pipe]], which
| looks them up in #[code Language.factories].
+item
| Add each pipeline component to the pipeline in order, using
| #[+api("language#add_pipe") #[code add_pipe]].
+item
| Make the #[strong model data] available to the #[code Language] class
| by calling #[+api("language#from_disk") #[code from_disk]] with the
| path to the model data directory.
p
| So when you call this...
@ -47,12 +49,12 @@ p
nlp = spacy.load('en')
p
| ... the model tells spaCy to use the pipeline
| ... the model tells spaCy to use the language #[code "en"] and the pipeline
| #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will
| then look up each string in its internal factories registry and
| initialise the individual components. It'll then load
| #[code spacy.lang.en.English], pass it the path to the model's data
| directory, and return it for you to use as the #[code nlp] object.
| then initialise #[code spacy.lang.en.English], and create each pipeline
| component and add it to the processing pipeline. It'll then load in the
| model's data from its data directory and return the modified
| #[code Language] class for you to use as the #[code nlp] object.
p
| Fundamentally, a #[+a("/models") spaCy model] consists of three
@ -73,9 +75,12 @@ p
pipeline = ['tensorizer', 'tagger', 'parser', 'ner']
data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0'
cls = spacy.util.get_lang_class(lang) # 1. get Language instance, e.g. English()
nlp = cls(pipeline=pipeline) # 2. initialise it with the pipeline
nlp.from_disk(model_data_path) # 3. load in the binary data
cls = spacy.util.get_lang_class(lang) # 1. get Language instance, e.g. English()
nlp = cls() # 2. initialise it
for name in pipeline:
    component = nlp.create_pipe(name)    # 3. create the pipeline components
    nlp.add_pipe(component)              # 4. add the component to the pipeline
nlp.from_disk(model_data_path) # 5. load in the binary data
p
| When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and
@ -87,124 +92,23 @@ p
| document, which is then processed by the component next in the pipeline.
+code("The pipeline under the hood").
doc = nlp.make_doc(u'This is a sentence')
for proc in nlp.pipeline:
    doc = proc(doc)
+h(3, "creating") Creating pipeline components and factories
doc = nlp.make_doc(u'This is a sentence') # create a Doc from raw text
for name, proc in nlp.pipeline: # iterate over components in order
    doc = proc(doc)                      # apply each component
p
| spaCy lets you customise the pipeline with your own components. Components
| are functions that receive a #[code Doc] object, modify and return it.
| If your component is stateful, you'll want to create a new one for each
| pipeline. You can do that by defining and registering a factory which
| receives the shared #[code Vocab] object and returns a component.
+h(4, "creating-component") Creating a component
p
| A component receives a #[code Doc] object and
| #[strong performs the actual processing], for example using the current
| weights to make a prediction and set some annotation on the document. By
| adding a component to the pipeline, you'll get access to the #[code Doc]
| at any point #[strong during] processing instead of only being able to
| modify it afterwards.
+aside-code("Example").
def my_component(doc):
    # do something to the doc here
    return doc
+table(["Argument", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The #[code Doc] object processed by the previous component.
+row("foot")
+cell returns
+cell #[code Doc]
+cell The #[code Doc] object processed by this pipeline component.
p
| When creating a new #[code Language] class, you can pass it a list of
| pipeline component functions to execute in that order. You can also
| add it to an existing pipeline by modifying #[code nlp.pipeline], just
| be careful not to overwrite a pipeline or its components by accident!
| The current processing pipeline is available as #[code nlp.pipeline],
| which returns a list of #[code (name, component)] tuples, or
| #[code nlp.pipe_names], which only returns a list of human-readable
| component names.
+code.
# Create a new Language object with a pipeline
from spacy.language import Language
nlp = Language(pipeline=[my_component])
nlp.pipeline
# [('tagger', &lt;spacy.pipeline.Tagger&gt;), ('parser', &lt;spacy.pipeline.DependencyParser&gt;), ('ner', &lt;spacy.pipeline.EntityRecognizer&gt;)]
nlp.pipe_names
# ['tagger', 'parser', 'ner']
# Modify an existing pipeline
nlp = spacy.load('en')
nlp.pipeline.append(my_component)
+h(4, "creating-factory") Creating a factory
p
| A factory is a #[strong function that returns a pipeline component].
| It's called with the #[code Vocab] object, to give it access to the
| shared data between components, for example the strings, morphology,
| vectors or annotation scheme. Factories are useful for creating
| #[strong stateful components], especially ones which
| #[strong depend on shared data].
+aside-code("Example").
def my_factory(vocab):
    # load some state
    def my_component(doc):
        # process the doc
        return doc
    return my_component
+table(["Argument", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell
| Shared data between components, including strings, morphology,
| vectors etc.
+row("foot")
+cell returns
+cell callable
+cell The pipeline component.
p
| By creating a factory, you're essentially telling spaCy how to get the
| pipeline component #[strong once the vocab is available]. Factories need to
| be registered via #[+api("spacy#set_factory") #[code set_factory()]] and
| assigned a unique ID. This ID can be added to the pipeline as a
| string. When creating a pipeline, you're free to mix strings and
| callable components:
+code.
spacy.set_factory('my_factory', my_factory)
nlp = Language(pipeline=['my_factory', my_other_component])
p
| If spaCy comes across a string in the pipeline, it will try to resolve it
| by looking it up in the available factories. The factory will then be
| initialised with the #[code Vocab]. Providing factory names instead of
| callables also makes it easy to specify them in the model's
| #[+a("/usage/saving-loading#models-generating") meta.json]. If you're
| training your own model and want to use one of spaCy's default components,
| you won't have to worry about finding and implementing it either: to use
| the default tagger, simply add #[code "tagger"] to the pipeline, and
| #[strong spaCy will know what to do].
+infobox("Important note")
| Because factories are #[strong resolved on initialisation] of the
| #[code Language] class, it's #[strong not possible] to add them to the
| pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only
| works with individual component functions. To use factories, you need to
| create a new #[code Language] object, or generate a
| #[+a("/usage/training#models-generating") model package] with
| a custom pipeline.
+h(3, "disabling") Disabling pipeline components
+h(3, "disabling") Disabling and modifying pipeline components
p
| If you don't need a particular component of the pipeline for
@ -217,16 +121,19 @@ p
+code.
nlp = spacy.load('en', disable=['parser', 'tagger'])
nlp = English().from_disk('/model', disable=['tensorizer', 'ner'])
doc = nlp(u"I don't want parsed", disable=['parser'])
p
| Note that you can't write directly to #[code nlp.pipeline], as this list
| holds the #[em actual components], not the IDs. However, if you know the
| order of the components, you can still slice the list:
| You can also use the #[+api("language#remove_pipe") #[code remove_pipe]]
| method to remove pipeline components from an existing pipeline, the
| #[+api("language#rename_pipe") #[code rename_pipe]] method to rename them,
| or the #[+api("language#replace_pipe") #[code replace_pipe]] method
| to replace them with a custom component entirely (more details on this
| in the section on #[+a("#custom-components") custom components]).
+code.
nlp = spacy.load('en')
nlp.pipeline = nlp.pipeline[:2] # only use the first two components
nlp.remove_pipe('parser')
nlp.rename_pipe('ner', 'entityrecognizer')
nlp.replace_pipe('tagger', my_custom_tagger)
+infobox("Important note: disabling pipeline components")
.o-block
@ -234,12 +141,14 @@ p
| processing pipeline components, the #[code parser], #[code tagger]
| and #[code entity] keyword arguments have been replaced with
| #[code disable], which takes a list of pipeline component names.
| This lets you disable both default and custom components when loading
| This lets you disable pre-defined components when loading
| a model, or initialising a Language class via
| #[+api("language-from_disk") #[code from_disk]].
+code-new.
nlp = spacy.load('en', disable=['tagger', 'ner'])
doc = nlp(u"I don't want parsed", disable=['parser'])
nlp = spacy.load('en', disable=['ner'])
nlp.remove_pipe('parser')
doc = nlp(u"I don't want parsed")
+code-old.
nlp = spacy.load('en', tagger=False, entity=False)
doc = nlp(u"I don't want parsed", parse=False)

View File

@ -1,61 +0,0 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS
p
| Hooks let you customize some of the behaviours of the #[code Doc],
| #[code Span] or #[code Token] objects by adding a component to the
| pipeline. For instance, to customize the
| #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a
| component that sets a custom function to
| #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity]
| method will check the #[code user_hooks] dict, and delegate to your
| function if you've set one. Similar results can be achieved by setting
| functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks].
+code("Polymorphic similarity example").
span.similarity(doc)
token.similarity(span)
doc1.similarity(doc2)
p
| By default, this just averages the vectors for each document, and
| computes their cosine. Obviously, spaCy should make it easy for you to
| install your own similarity model. This introduces a tricky design
| challenge. The current solution is to add three more dicts to the
| #[code Doc] object:
+aside("Implementation note")
| The hooks live on the #[code Doc] object because the #[code Span] and
| #[code Token] objects are created lazily, and don't own any data. They
| just proxy to their parent #[code Doc]. This turns out to be convenient
| here — we only have to worry about installing hooks in one place.
+table(["Name", "Description"])
+row
+cell #[code user_hooks]
+cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents]
+row
+cell #[code user_token_hooks]
+cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts]
+row
+cell #[code user_span_hooks]
+cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root]
p
| To sum up, here's an example of hooking in custom #[code .similarity()]
| methods:
+code("Add custom similarity hooks").
class SimilarityModel(object):
    def __init__(self, model):
        self._model = model
    def __call__(self, doc):
        doc.user_hooks['similarity'] = self.similarity
        doc.user_span_hooks['similarity'] = self.similarity
        doc.user_token_hooks['similarity'] = self.similarity
    def similarity(self, obj1, obj2):
        y = self._model([obj1.vector, obj2.vector])
        return float(y[0])

View File

@ -175,7 +175,7 @@ p
+code.
import spacy
from spacy.tokens.doc import Doc
from spacy.tokens import Doc
from spacy.vocab import Vocab
nlp = spacy.load('en')

View File

@ -61,7 +61,7 @@ p
output_path.open('w', encoding='utf-8').write(svg)
p
| The above code will generate the dependency visualizations and them to
| The above code will generate the dependency visualizations and save them to
| two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].

View File

@ -2,6 +2,44 @@
include ../_includes/_mixins
+section("pipeline")
+h(3, "custom-components-entities") Custom pipeline components and attribute extensions
+tag-new(2)
p
| This example shows the implementation of a pipeline component
| that sets entity annotations based on a list of single or
| multiple-word company names, merges entities into one token and
| sets custom attributes on the #[code Doc], #[code Span] and
| #[code Token].
+github("spacy", "examples/pipeline/custom_component_entities.py")
+h(3, "custom-components-api")
| Custom pipeline components and attribute extensions via a REST API
+tag-new(2)
p
| This example shows the implementation of a pipeline component
| that fetches country meta data via the
| #[+a("https://restcountries.eu") REST Countries API] sets entity
| annotations for countries, merges entities into one token and
| sets custom attributes on the #[code Doc], #[code Span] and
| #[code Token], for example the capital, latitude/longitude
| coordinates and the country flag.
+github("spacy", "examples/pipeline/custom_component_countries_api.py")
+h(3, "custom-components-attr-methods") Custom method extensions
+tag-new(2)
p
| A collection of snippets showing examples of extensions adding
| custom methods to the #[code Doc], #[code Token] and
| #[code Span].
+github("spacy", "examples/pipeline/custom_attr_methods.py")
+section("matching")
+h(3, "matcher") Using spaCy's rule-based matcher

View File

@ -8,18 +8,18 @@ include _spacy-101/_pipelines
+h(2, "pipelines") How pipelines work
include _processing-pipelines/_pipelines
+section("examples")
+h(2, "examples") Examples
include _processing-pipelines/_examples
+section("custom-components")
+h(2, "custom-components") Creating custom pipeline components
include _processing-pipelines/_custom-components
+section("extensions")
+h(2, "extensions") Developing spaCy extensions
include _processing-pipelines/_extensions
+section("multithreading")
+h(2, "multithreading") Multi-threading
include _processing-pipelines/_multithreading
+section("user-hooks")
+h(2, "user-hooks") User hooks
include _processing-pipelines/_user-hooks
+section("serialization")
+h(2, "serialization") Serialization
include _processing-pipelines/_serialization

View File

@ -102,30 +102,36 @@ p
+h(3, "features-pipelines") Improved processing pipelines
+aside-code("Example").
# Modify an existing pipeline
nlp = spacy.load('en')
nlp.pipeline.append(my_component)
# Set custom attributes
Doc.set_extension('my_attr', default=False)
Token.set_extension('my_attr', getter=my_token_getter)
assert doc._.my_attr, token._.my_attr
# Register a factory to create a component
spacy.set_factory('my_factory', my_factory)
nlp = Language(pipeline=['my_factory', mycomponent])
# Add components to the pipeline
my_component = lambda doc: doc
nlp.add_pipe(my_component)
p
| It's now much easier to #[strong customise the pipeline] with your own
| components, functions that receive a #[code Doc] object, modify and
| return it. If your component is stateful, you can define and register a
| factory which receives the shared #[code Vocab] object and returns a
|  component. spaCy's default components can be added to your pipeline by
| using their string IDs. This way, you won't have to worry about finding
| and implementing them: simply add #[code "tagger"] to the pipeline,
| and spaCy will know what to do.
| components: functions that receive a #[code Doc] object, modify and
| return it. Extensions let you write any
| #[strong attributes, properties and methods] to the #[code Doc],
| #[code Token] and #[code Span]. You can add data, implement new
| features, integrate other libraries with spaCy or plug in your own
| machine learning models.
+image
include ../assets/img/pipeline.svg
+infobox
| #[+label-inline API:] #[+api("language") #[code Language]]
| #[+label-inline Usage:] #[+a("/usage/language-processing-pipeline") Processing text]
| #[+label-inline API:] #[+api("language") #[code Language]],
| #[+api("doc#set_extension") #[code Doc.set_extension]],
| #[+api("span#set_extension") #[code Span.set_extension]],
| #[+api("token#set_extension") #[code Token.set_extension]]
| #[+label-inline Usage:]
| #[+a("/usage/processing-pipelines") Processing pipelines]
| #[+label-inline Code:]
| #[+src("/usage/examples#section-pipeline") Pipeline examples]
+h(3, "features-text-classification") Text classification
@ -478,15 +484,16 @@ p
p
| If you've been using custom pipeline components, check out the new
| guide on #[+a("/usage/language-processing-pipelines") processing pipelines].
| Appending functions to the pipeline still works but you might be able
| to make this more convenient by registering "component factories".
| Components of the processing pipeline can now be disabled by passing a
| list of their names to the #[code disable] keyword argument on loading
| or processing.
| Appending functions to the pipeline still works but the
| #[+api("language#add_pipe") #[code add_pipe]] method now makes this
| much more convenient. Components of the processing pipeline can now
| be disabled by passing a list of their names to the #[code disable]
| keyword argument on load, or by simply removing them from the
| pipeline altogether.
+code-new.
nlp = spacy.load('en', disable=['tagger', 'ner'])
doc = nlp(u"I don't want parsed", disable=['parser'])
nlp.remove_pipe('parser')
+code-old.
nlp = spacy.load('en', tagger=False, entity=False)
doc = nlp(u"I don't want parsed", parse=False)