mirror of https://github.com/explosion/spaCy.git
synced 2025-02-03 13:14:11 +03:00

Merge branch 'develop' into feature/fix-matcher-operators
commit a928ae2f35

52  examples/pipeline/custom_attr_methods.py  Normal file

@@ -0,0 +1,52 @@
# coding: utf-8
"""This example contains several snippets of methods that can be set via custom
Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
they're "bound" to the object and are partially applied – i.e. the object
they're called on is passed in as the first argument."""
from __future__ import unicode_literals

from spacy.lang.en import English
from spacy.tokens import Doc, Span
from spacy import displacy
from pathlib import Path


def to_html(doc, output='/tmp', style='dep'):
    """Doc method extension for saving the current state as a displaCy
    visualization.
    """
    # generate filename from first six non-punct tokens
    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
    output_path = Path(output) / file_name
    html = displacy.render(doc, style=style, page=True)  # render markup
    output_path.open('w', encoding='utf-8').write(html)  # save to file
    print('Saved HTML to {}'.format(output_path))


Doc.set_extension('to_html', method=to_html)

nlp = English()
doc = nlp(u"This is a sentence about Apple.")
# add entity manually for demo purposes, to make it work without a model
doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
doc._.to_html(style='ent')


def overlap_tokens(doc, other_doc):
    """Get the tokens from the original Doc that are also in the comparison Doc.
    """
    overlap = []
    other_tokens = [token.text for token in other_doc]
    for token in doc:
        if token.text in other_tokens:
            overlap.append(token)
    return overlap


Doc.set_extension('overlap', method=overlap_tokens)

nlp = English()
doc1 = nlp(u"Peach emoji is where it has always been.")
doc2 = nlp(u"Peach is the superior emoji.")
tokens = doc1._.overlap(doc2)
print(tokens)
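
For contrast with the method extensions registered above (this note is not part of the committed file), the same set_extension API also accepts a getter, which is evaluated lazily on attribute access instead of being called explicitly. A minimal sketch using only the spaCy v2.0 API shown in this commit:

from spacy.lang.en import English
from spacy.tokens import Doc

# getter-based extension: computed on access, no call needed
Doc.set_extension('token_count', getter=lambda doc: len(doc))

nlp = English()
doc = nlp(u"A short example sentence.")
print(doc._.token_count)  # 5 tokens, computed lazily on access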

108  examples/pipeline/custom_component_countries_api.py  Normal file

@@ -0,0 +1,108 @@
# coding: utf-8
from __future__ import unicode_literals

import requests

from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token


class RESTCountriesComponent(object):
    """Example of a spaCy v2.0 pipeline component that requests all countries
    via the REST Countries API, merges country names into one token, assigns
    entity labels and sets attributes on country tokens, e.g. the capital and
    lat/lng coordinates. Can be extended with more details from the API.

    REST Countries API: https://restcountries.eu
    API License: Mozilla Public License MPL 2.0
    """
    name = 'rest_countries'  # component name, will show up in the pipeline

    def __init__(self, nlp, label='GPE'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all')
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        # If no default value is set, it defaults to None.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital')
        Token.set_extension('country_latlng')
        Token.set_extension('country_flag')

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)


    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            # Can be extended with other data returned by the API, like
            # currencies, country code, flag, calling code etc.
            for token in entity:
                token._.set('is_country', True)
                token._.set('country_capital', self.countries[entity.text]['capital'])
                token._.set('country_latlng', self.countries[entity.text]['latlng'])
                token._.set('country_flag', self.countries[entity.text]['flag'])
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_country(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a country. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_country' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_country') for t in tokens])


# For simplicity, we start off with only the blank English Language class and
# no model or pre-defined pipeline loaded.

nlp = English()
rest_countries = RESTCountriesComponent(nlp)  # initialise component
nlp.add_pipe(rest_countries)  # add it to the pipeline

doc = nlp(u"Some text about Colombia and the Czech Republic")

print('Pipeline', nlp.pipe_names)  # pipeline contains component name
print('Doc has countries', doc._.has_country)  # Doc contains countries
for token in doc:
    if token._.is_country:
        print(token.text, token._.country_capital, token._.country_latlng,
              token._.country_flag)  # country data
print('Entities', [(e.text, e.label_) for e in doc.ents])  # all countries are entities
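
Because has_country is registered on both Doc and Span through the same getter, the check also works on arbitrary slices of the document. A short, illustrative continuation of the example above:

span = doc[2:4]  # slice covering "about Colombia"
print('Span has country', span._.has_country)  # True, a country token is inside the span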

85  examples/pipeline/custom_component_entities.py  Normal file

@@ -0,0 +1,85 @@
# coding: utf-8
from __future__ import unicode_literals

from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token


class TechCompanyRecognizer(object):
    """Example of a spaCy v2.0 pipeline component that sets entity annotations
    based on list of single or multiple-word company names. Companies are
    labelled as ORG and their spans are merged into one token. Additionally,
    ._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token
    respectively."""
    name = 'tech_companies'  # component name, will show up in the pipeline

    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of companies is long, it's very efficient
        patterns = [nlp(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_tech_org', default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_tech_org == True.
        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        Span.set_extension('has_tech_org', getter=self.has_tech_org)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set('is_tech_org', True)
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_tech_org(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a tech org. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_tech_org' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_tech_org') for t in tokens])


# For simplicity, we start off with only the blank English Language class and
# no model or pre-defined pipeline loaded.

nlp = English()
companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
component = TechCompanyRecognizer(nlp, companies)  # initialise component
nlp.add_pipe(component, last=True)  # add it to the pipeline as the last element

doc = nlp(u"Alphabet Inc. is the company behind Google.")

print('Pipeline', nlp.pipe_names)  # pipeline contains component name
print('Tokens', [t.text for t in doc])  # company names from the list are merged
print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
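
The call above adds the component with last=True; the reworked pipeline API later in this diff also supports before, after and first. A hedged sketch, assuming a hypothetical pre-trained model whose pipeline already contains an 'ner' component:

# import spacy
# nlp_model = spacy.load('en_core_web_sm')  # hypothetical model package
# component = TechCompanyRecognizer(nlp_model, companies)
# nlp_model.add_pipe(component, before='ner')  # run ahead of the statistical NER
# print(nlp_model.pipe_names)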

@@ -6,7 +6,7 @@ To achieve that, it duplicates some of spaCy's internal functionality.

Specifically, in this example, we don't use spaCy's built-in Language class to
wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write
our own simle Pipeline class, so that it's easier to see how the pieces
our own simple Pipeline class, so that it's easier to see how the pieces
interact.

Input data:

@@ -142,16 +142,15 @@ def train(nlp, train_examples, dev_examples, nr_epoch=5):
            inputs, annots = zip(*batch)
            nlp.update(list(inputs), list(annots), sgd, losses=losses)
        scores = nlp.evaluate(dev_examples)
        report_scores(i, losses['ner'], scores)
        scores = nlp.evaluate(dev_examples)
        report_scores(channels, i+1, loss, scores)
        report_scores(i+1, losses['ner'], scores)


def report_scores(i, loss, scores):
    precision = '%.2f' % scores['ents_p']
    recall = '%.2f' % scores['ents_r']
    f_measure = '%.2f' % scores['ents_f']
    print('%d %s %s %s' % (int(loss), precision, recall, f_measure))
    print('Epoch %d: %d %s %s %s' % (
        i, int(loss), precision, recall, f_measure))


def read_examples(path):

@@ -7,7 +7,7 @@ if __name__ == '__main__':
    import plac
    import sys
    from spacy.cli import download, link, info, package, train, convert, model
    from spacy.cli import profile, evaluate
    from spacy.cli import profile, evaluate, validate
    from spacy.util import prints

    commands = {

@@ -20,6 +20,7 @@ if __name__ == '__main__':
        'package': package,
        'model': model,
        'profile': profile,
        'validate': validate
    }
    if len(sys.argv) == 1:
        prints(', '.join(commands), title="Available commands", exits=1)

@@ -311,7 +311,7 @@ def link_vectors_to_models(vocab):

def Tok2Vec(width, embed_size, **kwargs):
    pretrained_dims = kwargs.get('pretrained_dims', 0)
    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
                                 '*': reapply}):

@@ -7,3 +7,4 @@ from .train import train
from .evaluate import evaluate
from .convert import convert
from .model import model
from .validate import validate

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import plac
from pathlib import Path

from .converters import conllu2json, iob2json
from .converters import conllu2json, iob2json, conll_ner2json
from ..util import prints

# Converters are matched by file extension. To add a converter, add a new entry

@@ -12,9 +12,10 @@ from ..util import prints
# from /converters.

CONVERTERS = {
    '.conllu': conllu2json,
    '.conll': conllu2json,
    '.iob': iob2json,
    'conllu': conllu2json,
    'conll': conllu2json,
    'ner': conll_ner2json,
    'iob': iob2json,
}


@@ -22,9 +23,11 @@ CONVERTERS = {
    input_file=("input file", "positional", None, str),
    output_dir=("output directory for converted file", "positional", None, str),
    n_sents=("Number of sentences per doc", "option", "n", int),
    converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
    morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
def convert(cmd, input_file, output_dir, n_sents=1, morphology=False,
            converter='auto'):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions.

@@ -35,9 +38,11 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False):
        prints(input_path, title="Input file not found", exits=1)
    if not output_path.exists():
        prints(output_path, title="Output directory not found", exits=1)
    file_ext = input_path.suffix
    if not file_ext in CONVERTERS:
        prints("Can't find converter for %s" % input_path.parts[-1],
               title="Unknown format", exits=1)
    CONVERTERS[file_ext](input_path, output_path,
                         n_sents=n_sents, use_morphology=morphology)
    if converter == 'auto':
        converter = input_path.suffix[1:]
    if not converter in CONVERTERS:
        prints("Can't find converter for %s" % converter,
               title="Unknown format", exits=1)
    func = CONVERTERS[converter]
    func(input_path, output_path,
         n_sents=n_sents, use_morphology=morphology)
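
For reference, the updated signature can also be driven from Python rather than the command line; a minimal sketch with hypothetical paths (the leading cmd argument is supplied by plac when run as a CLI and is unused here):

# from spacy.cli import convert
# convert(None, 'train.iob', 'output_dir', converter='iob')   # pick the converter explicitly
# convert(None, 'train.conllu', 'output_dir')                 # converter='auto' falls back to the file suffix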

@@ -1,2 +1,3 @@
from .conllu2json import conllu2json
from .iob2json import iob2json
from .conll_ner2json import conll_ner2json

50  spacy/cli/converters/conll_ner2json.py  Normal file

@@ -0,0 +1,50 @@
# coding: utf8
from __future__ import unicode_literals

from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo


def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
    """
    Convert files in the CoNLL-2003 NER format into JSON format for use with train cli.
    """
    docs = read_conll_ner(input_path)

    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
    output_file = output_path / output_filename
    with output_file.open('w', encoding='utf-8') as f:
        f.write(json_dumps(docs))
    prints("Created %d documents" % len(docs),
           title="Generated output file %s" % path2str(output_file))


def read_conll_ner(input_path):
    text = input_path.open('r', encoding='utf-8').read()
    i = 0
    delimit_docs = '-DOCSTART- -X- O O'
    output_docs = []
    for doc in text.strip().split(delimit_docs):
        doc = doc.strip()
        if not doc:
            continue
        output_doc = []
        for sent in doc.split('\n\n'):
            sent = sent.strip()
            if not sent:
                continue
            lines = [line.strip() for line in sent.split('\n') if line.strip()]
            words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
            biluo_ents = iob_to_biluo(iob_ents)
            output_doc.append({'tokens': [
                {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
                zip(words, tags, biluo_ents)
            ]})
        output_docs.append({
            'id': len(output_docs),
            'paragraphs': [{'sentences': output_doc}]
        })
        output_doc = []
    return output_docs
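
For orientation, read_conll_ner above expects whitespace-separated four-column lines (word, POS tag, chunk tag, IOB entity tag), blank lines between sentences and a '-DOCSTART- -X- O O' line between documents. An illustrative fragment in that CoNLL-2003 style (not shipped with the commit):

-DOCSTART- -X- O O

U.N. NNP B-NP B-ORG
official NN I-NP O
Ekeus NNP I-NP B-PER
heads VBZ B-VP O
for IN B-PP O
Baghdad NNP B-NP B-LOC
. . O O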

@@ -44,7 +44,7 @@ numpy.random.seed(0)
    version=("Model version", "option", "V", str),
    meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
)
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
          use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
          gold_preproc=False, version="0.0.0", meta_path=None):
    """

@@ -68,6 +68,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
    if not isinstance(meta, dict):
        prints("Expected dict but got: {}".format(type(meta)),
               title="Not a valid meta.json format", exits=1)
    meta.setdefault('lang', lang)
    meta.setdefault('name', 'unnamed')

    pipeline = ['tagger', 'parser', 'ner']
    if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')

@@ -88,9 +90,13 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
    n_train_words = corpus.count_train()

    lang_class = util.get_lang_class(lang)
    nlp = lang_class(pipeline=pipeline)
    nlp = lang_class()
    meta['pipeline'] = pipeline
    nlp.meta.update(meta)
    if vectors:
        util.load_model(vectors, vocab=nlp.vocab)
    for name in pipeline:
        nlp.add_pipe(nlp.create_pipe(name), name=name)
    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    nlp._optimizer = None


@@ -112,17 +118,33 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
                util.set_env_log(False)
                epoch_model_path = output_path / ('model%d' % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = lang_class(pipeline=pipeline)
                nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
                scorer = nlp_loaded.evaluate(
                    list(corpus.dev_docs(
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                dev_docs = list(corpus.dev_docs(
                        nlp_loaded,
                        gold_preproc=gold_preproc)))
                        gold_preproc=gold_preproc))
                nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                start_time = timer()
                scorer = nlp_loaded.evaluate(dev_docs)
                end_time = timer()
                if use_gpu < 0:
                    gpu_wps = None
                    cpu_wps = nwords/(end_time-start_time)
                else:
                    gpu_wps = nwords/(end_time-start_time)
                    with Model.use_device('cpu'):
                        nlp_loaded = util.load_model_from_path(epoch_model_path)
                        dev_docs = list(corpus.dev_docs(
                            nlp_loaded, gold_preproc=gold_preproc))
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs)
                        end_time = timer()
                        cpu_wps = nwords/(end_time-start_time)
                acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
                with acc_loc.open('w') as file_:
                    file_.write(json_dumps(scorer.scores))
                meta_loc = output_path / ('model%d' % i) / 'meta.json'
                meta['accuracy'] = scorer.scores
                meta['speed'] = {'nwords': nwords, 'cpu':cpu_wps, 'gpu': gpu_wps}
                meta['lang'] = nlp.lang
                meta['pipeline'] = pipeline
                meta['spacy_version'] = '>=%s' % about.__version__

@@ -132,7 +154,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
                with meta_loc.open('w') as file_:
                    file_.write(json_dumps(meta))
                util.set_env_log(True)
            print_progress(i, losses, scorer.scores)
            print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
    finally:
        print("Saving model...")
        try:

@@ -153,16 +175,17 @@ def _render_parses(i, to_render):
        file_.write(html)


def print_progress(itn, losses, dev_scores, wps=0.0):
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
    scores = {}
    for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
                'ents_p', 'ents_r', 'ents_f', 'wps']:
                'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
        scores[col] = 0.0
    scores['dep_loss'] = losses.get('parser', 0.0)
    scores['ner_loss'] = losses.get('ner', 0.0)
    scores['tag_loss'] = losses.get('tagger', 0.0)
    scores.update(dev_scores)
    scores['wps'] = wps
    scores['cpu_wps'] = cpu_wps
    scores['gpu_wps'] = gpu_wps or 0.0
    tpl = '\t'.join((
        '{:d}',
        '{dep_loss:.3f}',

@@ -173,7 +196,9 @@ def print_progress(itn, losses, dev_scores, wps=0.0):
        '{ents_f:.3f}',
        '{tags_acc:.3f}',
        '{token_acc:.3f}',
        '{wps:.1f}'))
        '{cpu_wps:.1f}',
        '{gpu_wps:.1f}',
    ))
    print(tpl.format(itn, **scores))

123  spacy/cli/validate.py  Normal file

@@ -0,0 +1,123 @@
# coding: utf8
from __future__ import unicode_literals

import requests
import pkg_resources
from pathlib import Path

from ..compat import path2str, locale_escape
from ..util import prints, get_data_path, read_json
from .. import about


def validate(cmd):
    """Validate that the currently installed version of spaCy is compatible
    with the installed models. Should be run after `pip install -U spacy`.
    """
    r = requests.get(about.__compatibility__)
    if r.status_code != 200:
        prints("Couldn't fetch compatibility table.",
               title="Server error (%d)" % r.status_code, exits=1)
    compat = r.json()['spacy']
    all_models = set()
    for spacy_v, models in dict(compat).items():
        all_models.update(models.keys())
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]

    current_compat = compat[about.__version__]
    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)
    incompat_links = {l for l, d in model_links.items() if not d['compat']}
    incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']}
    incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']])
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]

    prints(path2str(Path(__file__).parent.parent),
           title="Installed models (spaCy v{})".format(about.__version__))
    if model_links or model_pkgs:
        print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
        for name, data in model_pkgs.items():
            print(get_model_row(current_compat, name, data, 'package'))
        for name, data in model_links.items():
            print(get_model_row(current_compat, name, data, 'link'))
    else:
        prints("No models found in your current environment.", exits=0)

    if update_models:
        cmd = ' python -m spacy download {}'
        print("\n Use the following commands to update the model packages:")
        print('\n'.join([cmd.format(pkg) for pkg in update_models]))

    if na_models:
        prints("The following models are not available for spaCy v{}: {}"
               .format(about.__version__, ', '.join(na_models)))

    if incompat_links:
        prints("You may also want to overwrite the incompatible links using "
               "the `spacy link` command with `--force`, or remove them from "
               "the data directory. Data path: {}"
               .format(path2str(get_data_path())))


def get_model_links(compat):
    links = {}
    data_path = get_data_path()
    if data_path:
        models = [p for p in data_path.iterdir() if is_model_path(p)]
        for model in models:
            meta_path = Path(model) / 'meta.json'
            if not meta_path.exists():
                continue
            meta = read_json(meta_path)
            link = model.parts[-1]
            name = meta['lang'] + '_' + meta['name']
            links[link] = {'name': name, 'version': meta['version'],
                           'compat': is_compat(compat, name, meta['version'])}
    return links


def get_model_pkgs(compat, all_models):
    pkgs = {}
    for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
        package = pkg_name.replace('-', '_')
        if package in all_models:
            version = pkg_data.version
            pkgs[pkg_name] = {'name': package, 'version': version,
                              'compat': is_compat(compat, package, version)}
    return pkgs


def get_model_row(compat, name, data, type='package'):
    tpl_row = ' {:<10}' + (' {:<20}' * 4)
    tpl_red = '\x1b[38;5;1m{}\x1b[0m'
    tpl_green = '\x1b[38;5;2m{}\x1b[0m'
    if data['compat']:
        comp = tpl_green.format(locale_escape('✔', errors='ignore'))
        version = tpl_green.format(data['version'])
    else:
        comp = '--> {}'.format(compat.get(data['name'], ['n/a'])[0])
        version = tpl_red.format(data['version'])
    return get_row(type, name, data['name'], version, comp)


def get_row(*args):
    tpl_row = ' {:<10}' + (' {:<20}' * 4)
    return tpl_row.format(*args)


def is_model_path(model_path):
    exclude = ['cache', 'pycache', '__pycache__']
    name = model_path.parts[-1]
    return model_path.is_dir() and name not in exclude and not name.startswith('.')


def is_compat(compat, name, version):
    return name in compat and version in compat[name]


def reformat_version(version):
    if version.endswith('-alpha'):
        return version.replace('-alpha', 'a0')
    return version.replace('-alpha', 'a')
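
To make the small helpers at the bottom concrete, a doctest-style sketch of what they compute (the compatibility dict here is illustrative, not taken from the live table):

# >>> reformat_version('2.0.0-alpha')
# '2.0.0a0'
# >>> reformat_version('1.2.0')
# '1.2.0'
# >>> is_compat({'en_core_web_sm': ['2.0.0a0']}, 'en_core_web_sm', '2.0.0a0')
# True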

@@ -6,6 +6,7 @@ import ftfy
import sys
import ujson
import itertools
import locale

from thinc.neural.util import copy_array


@@ -113,3 +114,12 @@ def import_file(name, loc):
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def locale_escape(string, errors='replace'):
    '''
    Mangle non-supported characters, for savages with ascii terminals.
    '''
    encoding = locale.getpreferredencoding()
    string = string.encode(encoding, errors).decode('utf8')
    return string

@@ -213,7 +213,7 @@ class GoldCorpus(object):
        train_tuples = self.train_tuples
        if projectivize:
            train_tuples = nonproj.preprocess_training_data(
                self.train_tuples)
                self.train_tuples, label_freq_cutoff=100)
        random.shuffle(train_tuples)
        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                        max_length=max_length,

@@ -16,15 +16,13 @@ from ...util import update_exc
class BengaliDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'bn'

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    lemma_rules = LEMMA_RULES

    prefixes = tuple(TOKENIZER_PREFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    infixes = tuple(TOKENIZER_INFIXES)
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES


class Bengali(Language):

@@ -15,9 +15,8 @@ class DanishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'da'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    stop_words = STOP_WORDS


class Danish(Language):

@@ -12,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


@@ -22,16 +21,12 @@ class GermanDefaults(Language.Defaults):
    lex_attr_getters[LANG] = lambda text: 'de'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                         NORM_EXCEPTIONS, BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    infixes = tuple(TOKENIZER_INFIXES)
    tag_map = dict(TAG_MAP)
    stop_words = set(STOP_WORDS)
    syntax_iterators = dict(SYNTAX_ITERATORS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    infixes = TOKENIZER_INFIXES
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    syntax_iterators = SYNTAX_ITERATORS
    lemma_lookup = LOOKUP


class German(Language):

@@ -7,7 +7,7 @@ from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC
from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP
from .syntax_iterators import SYNTAX_ITERATORS

from ..tokenizer_exceptions import BASE_EXCEPTIONS

@@ -23,15 +23,15 @@ class EnglishDefaults(Language.Defaults):
    lex_attr_getters[LANG] = lambda text: 'en'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                         BASE_NORMS, NORM_EXCEPTIONS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = dict(TAG_MAP)
    stop_words = set(STOP_WORDS)
    morph_rules = dict(MORPH_RULES)
    lemma_rules = dict(LEMMA_RULES)
    lemma_index = dict(LEMMA_INDEX)
    lemma_exc = dict(LEMMA_EXC)
    syntax_iterators = dict(SYNTAX_ITERATORS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    morph_rules = MORPH_RULES
    lemma_rules = LEMMA_RULES
    lemma_index = LEMMA_INDEX
    lemma_exc = LEMMA_EXC
    lemma_lookup = LOOKUP
    syntax_iterators = SYNTAX_ITERATORS


class English(Language):

@@ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


@@ -19,15 +18,11 @@ class SpanishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'es'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    tag_map = dict(TAG_MAP)
    stop_words = set(STOP_WORDS)
    sytax_iterators = dict(SYNTAX_ITERATORS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    sytax_iterators = SYNTAX_ITERATORS
    lemma_lookup = LOOKUP


class Spanish(Language):

@@ -15,9 +15,8 @@ class FinnishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'fi'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    stop_words = STOP_WORDS


class Finnish(Language):

@@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


@@ -21,17 +20,13 @@ class FrenchDefaults(Language.Defaults):
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: 'fr'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    infixes = tuple(TOKENIZER_INFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    stop_words = STOP_WORDS
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    token_match = TOKEN_MATCH
    syntax_iterators = dict(SYNTAX_ITERATORS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    syntax_iterators = SYNTAX_ITERATORS
    lemma_lookup = LOOKUP


class French(Language):

@@ -12,9 +12,8 @@ from ...util import update_exc
class HebrewDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'he'

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    stop_words = STOP_WORDS


class Hebrew(Language):

@@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


@@ -18,17 +17,13 @@ class HungarianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'hu'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    prefixes = tuple(TOKENIZER_PREFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    infixes = tuple(TOKENIZER_INFIXES)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    token_match = TOKEN_MATCH

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    lemma_lookup = LOOKUP


class Hungarian(Language):

@@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG
from ...util import update_exc


@@ -19,19 +18,14 @@ from ...util import update_exc
class IndonesianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'id'

    lex_attr_getters.update(LEX_ATTRS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    prefixes = tuple(TOKENIZER_PREFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    infixes = tuple(TOKENIZER_INFIXES)
    syntax_iterators = dict(SYNTAX_ITERATORS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    stop_words = STOP_WORDS
    prefixes = TOKENIZER_PREFIXES
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    syntax_iterators = SYNTAX_ITERATORS
    lemma_lookup = LOOKUP


class Indonesian(Language):

@@ -16,8 +16,7 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
              'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta',
              'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun',
              'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun',
              'noniliun', 'desiliun',
              ]
              'noniliun', 'desiliun']


def like_num(text):

@@ -7,7 +7,6 @@ from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


@@ -16,13 +15,9 @@ class ItalianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'it'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = set(STOP_WORDS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    stop_words = STOP_WORDS
    lemma_lookup = LOOKUP


class Italian(Language):

@@ -16,9 +16,8 @@ class NorwegianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'nb'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    stop_words = STOP_WORDS


class Norwegian(Language):

@@ -16,9 +16,8 @@ class DutchDefaults(Language.Defaults):
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: 'nl'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    stop_words = STOP_WORDS


class Dutch(Language):

@@ -15,9 +15,8 @@ class PolishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'pl'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
    stop_words = STOP_WORDS


class Polish(Language):

@@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


@@ -19,13 +18,9 @@ class PortugueseDefaults(Language.Defaults):
    lex_attr_getters[LANG] = lambda text: 'pt'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
    lex_attr_getters.update(LEX_ATTRS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    stop_words = STOP_WORDS
    lemma_lookup = LOOKUP


class Portuguese(Language):

@@ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...lemmatizerlookup import Lemmatizer
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


@@ -18,13 +17,10 @@ class SwedishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'sv'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)
    stop_words = STOP_WORDS
    lemma_rules = LEMMA_RULES
    lemma_lookup = LOOKUP


class Swedish(Language):

@@ -12,24 +12,27 @@ from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups


class ThaiDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'th'
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    tag_map = dict(TAG_MAP)
    stop_words = set(STOP_WORDS)
    tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS)
    tag_map = TAG_MAP
    stop_words = STOP_WORDS


class Thai(Language):
    lang = 'th'
    Defaults = ThaiDefaults
    def make_doc(self, text):
        try:
            from pythainlp.tokenize import word_tokenize
        except ImportError:
            raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
                              "https://github.com/wannaphongcom/pythainlp/")
        words = [x for x in list(word_tokenize(text,"newmm"))]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))
    lang = 'th'
    Defaults = ThaiDefaults

    def make_doc(self, text):
        try:
            from pythainlp.tokenize import word_tokenize
        except ImportError:
            raise ImportError("The Thai tokenizer requires the PyThaiNLP library: "
                              "https://github.com/wannaphongcom/pythainlp/")
        words = [x for x in list(word_tokenize(text,"newmm"))]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))


__all__ = ['Thai']

@@ -13,7 +13,6 @@ class MultiLanguageDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'xx'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)

@ -1,12 +1,9 @@
|
|||
# coding: utf8
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
from contextlib import contextmanager
|
||||
import dill
|
||||
|
||||
import numpy
|
||||
from thinc.neural import Model
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.optimizers import Adam, SGD
|
||||
from thinc.neural.optimizers import Adam
|
||||
import random
|
||||
import ujson
|
||||
from collections import OrderedDict
|
||||
|
@ -17,30 +14,27 @@ from .vocab import Vocab
|
|||
from .tagger import Tagger
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .syntax.parser import get_templates
|
||||
from .syntax import nonproj
|
||||
|
||||
from .pipeline import NeuralDependencyParser, EntityRecognizer
|
||||
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
|
||||
from .pipeline import NeuralLabeller
|
||||
from .pipeline import SimilarityHook
|
||||
from .pipeline import TextCategorizer
|
||||
from . import about
|
||||
from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger
|
||||
from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer
|
||||
|
||||
from .compat import json_dumps, izip
|
||||
from .scorer import Scorer
|
||||
from ._ml import link_vectors_to_models
|
||||
from .attrs import IS_STOP
|
||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
||||
from .lang.tag_map import TAG_MAP
|
||||
from .lang.lex_attrs import LEX_ATTRS
|
||||
from . import util
|
||||
from .scorer import Scorer
|
||||
from ._ml import link_vectors_to_models
|
||||
from . import about
|
||||
|
||||
|
||||
class BaseDefaults(object):
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules)
|
||||
return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules,
|
||||
cls.lemma_lookup)
|
||||
|
||||
@classmethod
|
||||
def create_vocab(cls, nlp=None):
|
||||
|
@ -70,59 +64,7 @@ class BaseDefaults(object):
|
|||
prefix_search=prefix_search, suffix_search=suffix_search,
|
||||
infix_finditer=infix_finditer, token_match=token_match)
|
||||
|
||||
@classmethod
|
||||
def create_tagger(cls, nlp=None, **cfg):
|
||||
if nlp is None:
|
||||
return NeuralTagger(cls.create_vocab(nlp), **cfg)
|
||||
else:
|
||||
return NeuralTagger(nlp.vocab, **cfg)
|
||||
|
||||
@classmethod
|
||||
def create_parser(cls, nlp=None, **cfg):
|
||||
if nlp is None:
|
||||
return NeuralDependencyParser(cls.create_vocab(nlp), **cfg)
|
||||
else:
|
||||
return NeuralDependencyParser(nlp.vocab, **cfg)
|
||||
|
||||
@classmethod
|
||||
def create_entity(cls, nlp=None, **cfg):
|
||||
if nlp is None:
|
||||
return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg)
|
||||
else:
|
||||
return NeuralEntityRecognizer(nlp.vocab, **cfg)
|
||||
|
||||
@classmethod
|
||||
def create_pipeline(cls, nlp=None, disable=tuple()):
|
||||
meta = nlp.meta if nlp is not None else {}
|
||||
# Resolve strings, like "cnn", "lstm", etc
|
||||
pipeline = []
|
||||
for entry in meta.get('pipeline', []):
|
||||
if entry in disable or getattr(entry, 'name', entry) in disable:
|
||||
continue
|
||||
factory = cls.Defaults.factories[entry]
|
||||
pipeline.append(factory(nlp, **meta.get(entry, {})))
|
||||
return pipeline
|
||||
|
||||
factories = {
|
||||
'make_doc': create_tokenizer,
|
||||
'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
|
||||
'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
|
||||
'parser': lambda nlp, **cfg: [
|
||||
NeuralDependencyParser(nlp.vocab, **cfg),
|
||||
nonproj.deprojectivize],
|
||||
'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
|
||||
'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)],
|
||||
'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)],
|
||||
# Temporary compatibility -- delete after pivot
|
||||
'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
|
||||
'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
|
||||
'dependencies': lambda nlp, **cfg: [
|
||||
NeuralDependencyParser(nlp.vocab, **cfg),
|
||||
nonproj.deprojectivize,
|
||||
],
|
||||
'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
|
||||
}
|
||||
|
||||
pipe_names = ['tensorizer', 'tagger', 'parser', 'ner']
|
||||
token_match = TOKEN_MATCH
|
||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
|
@ -136,6 +78,7 @@ class BaseDefaults(object):
|
|||
lemma_rules = {}
|
||||
lemma_exc = {}
|
||||
lemma_index = {}
|
||||
lemma_lookup = {}
|
||||
morph_rules = {}
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
syntax_iterators = {}
|
||||
|
@ -152,8 +95,17 @@ class Language(object):
|
|||
Defaults = BaseDefaults
|
||||
lang = None
|
||||
|
||||
def __init__(self, vocab=True, make_doc=True, pipeline=None,
|
||||
meta={}, disable=tuple(), **kwargs):
|
||||
factories = {
|
||||
'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
|
||||
'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
|
||||
'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
|
||||
'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),
|
||||
'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
|
||||
'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
|
||||
'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg)
|
||||
}
|
||||
|
||||
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
|
||||
"""Initialise a Language object.
|
||||
|
||||
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
|
||||
|
@ -179,28 +131,7 @@ class Language(object):
|
|||
factory = self.Defaults.create_tokenizer
|
||||
make_doc = factory(self, **meta.get('tokenizer', {}))
|
||||
self.tokenizer = make_doc
|
||||
if pipeline is True:
|
||||
self.pipeline = self.Defaults.create_pipeline(self, disable)
|
||||
elif pipeline:
|
||||
# Careful not to do getattr(p, 'name', None) here
|
||||
# If we had disable=[None], we'd disable everything!
|
||||
self.pipeline = [p for p in pipeline
|
||||
if p not in disable
|
||||
and getattr(p, 'name', p) not in disable]
|
||||
# Resolve strings, like "cnn", "lstm", etc
|
||||
for i, entry in enumerate(self.pipeline):
|
||||
if entry in self.Defaults.factories:
|
||||
factory = self.Defaults.factories[entry]
|
||||
self.pipeline[i] = factory(self, **meta.get(entry, {}))
|
||||
else:
|
||||
self.pipeline = []
|
||||
flat_list = []
|
||||
for pipe in self.pipeline:
|
||||
if isinstance(pipe, list):
|
||||
flat_list.extend(pipe)
|
||||
else:
|
||||
flat_list.append(pipe)
|
||||
self.pipeline = flat_list
|
||||
self.pipeline = []
|
||||
self._optimizer = None
|
||||
|
||||
@property
|
||||
|
@ -214,11 +145,7 @@ class Language(object):
|
|||
self._meta.setdefault('email', '')
|
||||
self._meta.setdefault('url', '')
|
||||
self._meta.setdefault('license', '')
|
||||
pipeline = []
|
||||
for component in self.pipeline:
|
||||
if hasattr(component, 'name'):
|
||||
pipeline.append(component.name)
|
||||
self._meta['pipeline'] = pipeline
|
||||
self._meta['pipeline'] = self.pipe_names
|
||||
return self._meta
|
||||
|
||||
@meta.setter
|
||||
|
@ -228,34 +155,144 @@ class Language(object):
|
|||
# Conveniences to access pipeline components
|
||||
@property
|
||||
def tensorizer(self):
|
||||
return self.get_component('tensorizer')
|
||||
return self.get_pipe('tensorizer')
|
||||
|
||||
@property
|
||||
def tagger(self):
|
||||
return self.get_component('tagger')
|
||||
return self.get_pipe('tagger')
|
||||
|
||||
@property
|
||||
def parser(self):
|
||||
return self.get_component('parser')
|
||||
return self.get_pipe('parser')
|
||||
|
||||
@property
|
||||
def entity(self):
|
||||
return self.get_component('ner')
|
||||
return self.get_pipe('ner')
|
||||
|
||||
@property
|
||||
def matcher(self):
|
||||
return self.get_component('matcher')
|
||||
return self.get_pipe('matcher')
|
||||
|
||||
def get_component(self, name):
|
||||
if self.pipeline in (True, None):
|
||||
return None
|
||||
for proc in self.pipeline:
|
||||
if hasattr(proc, 'name') and proc.name.endswith(name):
|
||||
return proc
|
||||
return None
|
||||
@property
|
||||
def pipe_names(self):
|
||||
"""Get names of available pipeline components.
|
||||
|
||||
RETURNS (list): List of component name strings, in order.
|
||||
"""
|
||||
return [pipe_name for pipe_name, _ in self.pipeline]
|
||||
|
||||
def get_pipe(self, name):
|
||||
"""Get a pipeline component for a given component name.
|
||||
|
||||
name (unicode): Name of pipeline component to get.
|
||||
RETURNS (callable): The pipeline component.
|
||||
"""
|
||||
for pipe_name, component in self.pipeline:
|
||||
if pipe_name == name:
|
||||
return component
|
||||
msg = "No component '{}' found in pipeline. Available names: {}"
|
||||
raise KeyError(msg.format(name, self.pipe_names))
|
||||
|
||||
def create_pipe(self, name, config=dict()):
|
||||
"""Create a pipeline component from a factory.
|
||||
|
||||
name (unicode): Factory name to look up in `Language.factories`.
|
||||
config (dict): Configuration parameters to initialise component.
|
||||
RETURNS (callable): Pipeline component.
|
||||
"""
|
||||
if name not in self.factories:
|
||||
raise KeyError("Can't find factory for '{}'.".format(name))
|
||||
factory = self.factories[name]
|
||||
return factory(self, **config)
|
||||
|
||||
def add_pipe(self, component, name=None, before=None, after=None,
|
||||
first=None, last=None):
|
||||
"""Add a component to the processing pipeline. Valid components are
|
||||
callables that take a `Doc` object, modify it and return it. Only one of
|
||||
before, after, first or last can be set. Default behaviour is "last".
|
||||
|
||||
component (callable): The pipeline component.
|
||||
name (unicode): Name of pipeline component. Overwrites existing
|
||||
component.name attribute if available. If no name is set and
|
||||
the component exposes no name attribute, component.__name__ is
|
||||
used. An error is raised if the name already exists in the pipeline.
|
||||
before (unicode): Component name to insert component directly before.
|
||||
after (unicode): Component name to insert component directly after.
|
||||
first (bool): Insert component first / not first in the pipeline.
|
||||
last (bool): Insert component last / not last in the pipeline.
|
||||
|
||||
EXAMPLE:
|
||||
>>> nlp.add_pipe(component, before='ner')
|
||||
>>> nlp.add_pipe(component, name='custom_name', last=True)
|
||||
"""
|
||||
if name is None:
|
||||
if hasattr(component, 'name'):
|
||||
name = component.name
|
||||
elif hasattr(component, '__name__'):
|
||||
name = component.__name__
|
||||
elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'):
|
||||
name = component.__class__.__name__
|
||||
else:
|
||||
name = repr(component)
|
||||
if name in self.pipe_names:
|
||||
raise ValueError("'{}' already exists in pipeline.".format(name))
|
||||
if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
|
||||
msg = ("Invalid constraints. You can only set one of the "
|
||||
"following: before, after, first, last.")
|
||||
raise ValueError(msg)
|
||||
pipe = (name, component)
|
||||
if last or not any([first, before, after]):
|
||||
self.pipeline.append(pipe)
|
||||
elif first:
|
||||
self.pipeline.insert(0, pipe)
|
||||
elif before and before in self.pipe_names:
|
||||
self.pipeline.insert(self.pipe_names.index(before), pipe)
|
||||
elif after and after in self.pipe_names:
|
||||
self.pipeline.insert(self.pipe_names.index(after), pipe)
|
||||
else:
|
||||
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||
unfound = before or after
|
||||
raise ValueError(msg.format(unfound, self.pipe_names))
|
||||
|
||||
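add_pipe() resolves the component name from component.name, __name__ or the class name, and accepts at most one of before, after, first or last. A minimal usage sketch against a blank Language, mirroring the new pipe-method tests further down (merge_entities and 'noop' are illustrative names):

from spacy.language import Language

def merge_entities(doc):   # hypothetical no-op component: takes a Doc, returns it
    return doc

nlp = Language()
nlp.add_pipe(merge_entities)                            # appended last by default
nlp.add_pipe(lambda doc: doc, name='noop', first=True)  # explicit name, insert first
assert nlp.pipe_names == ['noop', 'merge_entities']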
def replace_pipe(self, name, component):
|
||||
"""Replace a component in the pipeline.
|
||||
|
||||
name (unicode): Name of the component to replace.
|
||||
component (callable): Pipeline component.
|
||||
"""
|
||||
if name not in self.pipe_names:
|
||||
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||
raise ValueError(msg.format(name, self.pipe_names))
|
||||
self.pipeline[self.pipe_names.index(name)] = (name, component)
|
||||
|
||||
def rename_pipe(self, old_name, new_name):
|
||||
"""Rename a pipeline component.
|
||||
|
||||
old_name (unicode): Name of the component to rename.
|
||||
new_name (unicode): New name of the component.
|
||||
"""
|
||||
if old_name not in self.pipe_names:
|
||||
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||
raise ValueError(msg.format(old_name, self.pipe_names))
|
||||
if new_name in self.pipe_names:
|
||||
msg = "'{}' already exists in pipeline. Existing names: {}"
|
||||
raise ValueError(msg.format(new_name, self.pipe_names))
|
||||
i = self.pipe_names.index(old_name)
|
||||
self.pipeline[i] = (new_name, self.pipeline[i][1])
|
||||
|
||||
def remove_pipe(self, name):
|
||||
"""Remove a component from the pipeline.
|
||||
|
||||
name (unicode): Name of the component to remove.
|
||||
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
||||
"""
|
||||
if name not in self.pipe_names:
|
||||
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||
raise ValueError(msg.format(name, self.pipe_names))
|
||||
return self.pipeline.pop(self.pipe_names.index(name))
|
||||
|
||||
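replace_pipe(), rename_pipe() and remove_pipe() all operate on the (name, component) tuples by name. A small sketch of the round trip on a blank Language, with 'my_component' and 'custom_name' as illustrative names:

from spacy.language import Language

nlp = Language()
nlp.add_pipe(lambda doc: doc, name='my_component')
nlp.replace_pipe('my_component', lambda doc: doc)   # swap in a new callable under the same name
nlp.rename_pipe('my_component', 'custom_name')      # keep the component, change its key
name, component = nlp.remove_pipe('custom_name')    # returns the removed (name, component) tuple
assert name == 'custom_name' and not nlp.pipe_names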
def __call__(self, text, disable=[]):
|
||||
"""'Apply the pipeline to some text. The text can span multiple sentences,
|
||||
"""Apply the pipeline to some text. The text can span multiple sentences,
|
||||
and can contain arbitrary whitespace. Alignment into the original string
|
||||
is preserved.
|
||||
|
||||
|
@ -269,8 +306,7 @@ class Language(object):
|
|||
('An', 'NN')
|
||||
"""
|
||||
doc = self.make_doc(text)
|
||||
for proc in self.pipeline:
|
||||
name = getattr(proc, 'name', None)
|
||||
for name, proc in self.pipeline:
|
||||
if name in disable:
|
||||
continue
|
||||
doc = proc(doc)
|
||||
|
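Note that disable now matches against the registered pipeline names, i.e. the first element of each (name, proc) tuple, rather than a component's own .name attribute. For example, assuming an nlp object whose pipeline actually contains 'tagger' and 'parser':

doc = nlp(u'This is a sentence.', disable=['tagger', 'parser'])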
@ -308,7 +344,7 @@ class Language(object):
|
|||
grads[key] = (W, dW)
|
||||
pipes = list(self.pipeline)
|
||||
random.shuffle(pipes)
|
||||
for proc in pipes:
|
||||
for name, proc in pipes:
|
||||
if not hasattr(proc, 'update'):
|
||||
continue
|
||||
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
|
||||
|
@ -322,7 +358,7 @@ class Language(object):
|
|||
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
|
||||
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
|
||||
"""
|
||||
for proc in self.pipeline:
|
||||
for name, proc in self.pipeline:
|
||||
if hasattr(proc, 'preprocess_gold'):
|
||||
docs_golds = proc.preprocess_gold(docs_golds)
|
||||
for doc, gold in docs_golds:
|
||||
|
@ -354,7 +390,7 @@ class Language(object):
|
|||
|
||||
get_gold_tuples (function): Function returning gold data
|
||||
**cfg: Config parameters.
|
||||
returns: An optimizer
|
||||
RETURNS: An optimizer
|
||||
"""
|
||||
# Populate vocab
|
||||
if get_gold_tuples is not None:
|
||||
|
@ -371,7 +407,7 @@ class Language(object):
|
|||
else:
|
||||
device = None
|
||||
link_vectors_to_models(self.vocab)
|
||||
for proc in self.pipeline:
|
||||
for name, proc in self.pipeline:
|
||||
if hasattr(proc, 'begin_training'):
|
||||
context = proc.begin_training(get_gold_tuples(),
|
||||
pipeline=self.pipeline)
|
||||
|
@ -393,7 +429,7 @@ class Language(object):
|
|||
docs, golds = zip(*docs_golds)
|
||||
docs = list(docs)
|
||||
golds = list(golds)
|
||||
for pipe in self.pipeline:
|
||||
for name, pipe in self.pipeline:
|
||||
if not hasattr(pipe, 'pipe'):
|
||||
for doc in docs:
|
||||
pipe(doc)
|
||||
|
@ -419,7 +455,7 @@ class Language(object):
|
|||
>>> with nlp.use_params(optimizer.averages):
|
||||
>>> nlp.to_disk('/tmp/checkpoint')
|
||||
"""
|
||||
contexts = [pipe.use_params(params) for pipe
|
||||
contexts = [pipe.use_params(params) for name, pipe
|
||||
in self.pipeline if hasattr(pipe, 'use_params')]
|
||||
# TODO: Having trouble with contextlib
|
||||
# Workaround: these aren't actually context managers atm.
|
||||
|
@ -466,8 +502,7 @@ class Language(object):
|
|||
yield (doc, context)
|
||||
return
|
||||
docs = (self.make_doc(text) for text in texts)
|
||||
for proc in self.pipeline:
|
||||
name = getattr(proc, 'name', None)
|
||||
for name, proc in self.pipeline:
|
||||
if name in disable:
|
||||
continue
|
||||
if hasattr(proc, 'pipe'):
|
||||
|
@ -495,14 +530,14 @@ class Language(object):
|
|||
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
|
||||
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
|
||||
))
|
||||
for proc in self.pipeline:
|
||||
for name, proc in self.pipeline:
|
||||
if not hasattr(proc, 'name'):
|
||||
continue
|
||||
if proc.name in disable:
|
||||
if name in disable:
|
||||
continue
|
||||
if not hasattr(proc, 'to_disk'):
|
||||
continue
|
||||
serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
|
||||
serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
|
||||
serializers['vocab'] = lambda p: self.vocab.to_disk(p)
|
||||
util.to_disk(path, serializers, {p: False for p in disable})
|
||||
|
||||
|
@ -526,14 +561,12 @@ class Language(object):
|
|||
('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
|
||||
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
|
||||
))
|
||||
for proc in self.pipeline:
|
||||
if not hasattr(proc, 'name'):
|
||||
continue
|
||||
if proc.name in disable:
|
||||
for name, proc in self.pipeline:
|
||||
if name in disable:
|
||||
continue
|
||||
if not hasattr(proc, 'to_disk'):
|
||||
continue
|
||||
deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
|
||||
deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
|
||||
exclude = {p: False for p in disable}
|
||||
if not (path / 'vocab').exists():
|
||||
exclude['vocab'] = True
|
||||
|
@ -552,8 +585,8 @@ class Language(object):
|
|||
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
|
||||
('meta', lambda: ujson.dumps(self.meta))
|
||||
))
|
||||
for i, proc in enumerate(self.pipeline):
|
||||
if getattr(proc, 'name', None) in disable:
|
||||
for i, (name, proc) in enumerate(self.pipeline):
|
||||
if name in disable:
|
||||
continue
|
||||
if not hasattr(proc, 'to_bytes'):
|
||||
continue
|
||||
|
@ -572,8 +605,8 @@ class Language(object):
|
|||
('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
|
||||
('meta', lambda b: self.meta.update(ujson.loads(b)))
|
||||
))
|
||||
for i, proc in enumerate(self.pipeline):
|
||||
if getattr(proc, 'name', None) in disable:
|
||||
for i, (name, proc) in enumerate(self.pipeline):
|
||||
if name in disable:
|
||||
continue
|
||||
if not hasattr(proc, 'from_bytes'):
|
||||
continue
|
||||
|
|
|
@ -10,20 +10,23 @@ class Lemmatizer(object):
|
|||
def load(cls, path, index=None, exc=None, rules=None):
|
||||
return cls(index or {}, exc or {}, rules or {})
|
||||
|
||||
def __init__(self, index, exceptions, rules):
|
||||
self.index = index
|
||||
self.exc = exceptions
|
||||
self.rules = rules
|
||||
def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
|
||||
self.index = index if index is not None else {}
|
||||
self.exc = exceptions if exceptions is not None else {}
|
||||
self.rules = rules if rules is not None else {}
|
||||
self.lookup_table = lookup if lookup is not None else {}
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
if univ_pos == NOUN:
|
||||
if univ_pos in (NOUN, 'NOUN', 'noun'):
|
||||
univ_pos = 'noun'
|
||||
elif univ_pos == VERB:
|
||||
elif univ_pos in (VERB, 'VERB', 'verb'):
|
||||
univ_pos = 'verb'
|
||||
elif univ_pos == ADJ:
|
||||
elif univ_pos in (ADJ, 'ADJ', 'adj'):
|
||||
univ_pos = 'adj'
|
||||
elif univ_pos == PUNCT:
|
||||
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
|
||||
univ_pos = 'punct'
|
||||
else:
|
||||
return set([string.lower()])
|
||||
# See Issue #435 for example of where this logic is required.
|
||||
if self.is_base_form(univ_pos, morphology):
|
||||
return set([string.lower()])
|
||||
|
@ -77,6 +80,11 @@ class Lemmatizer(object):
|
|||
def punct(self, string, morphology=None):
|
||||
return self(string, 'punct', morphology)
|
||||
|
||||
def lookup(self, string):
|
||||
if string in self.lookup_table:
|
||||
return self.lookup_table[string]
|
||||
return string
|
||||
|
||||
|
||||
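lookup() returns the table entry if present and otherwise the string unchanged. A minimal sketch, mirroring the lookup fixture used in the new Doc-creation tests below:

from spacy.lemmatizer import Lemmatizer

lemmatizer = Lemmatizer(lookup={'dogs': 'dog', 'mice': 'mouse'})
assert lemmatizer.lookup('dogs') == 'dog'
assert lemmatizer.lookup('dogses') == 'dogses'   # unknown strings come back unchanged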
def lemmatize(string, index, exceptions, rules):
|
||||
string = string.lower()
|
||||
|
|
|
@ -1,19 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .lemmatizer import Lemmatizer
|
||||
|
||||
|
||||
class Lemmatizer(Lemmatizer):
|
||||
@classmethod
|
||||
def load(cls, path, lookup):
|
||||
return cls(lookup or {})
|
||||
|
||||
def __init__(self, lookup):
|
||||
self.lookup = lookup
|
||||
|
||||
def __call__(self, string, univ_pos, morphology=None):
|
||||
try:
|
||||
return set([self.lookup[string]])
|
||||
except:
|
||||
return set([string])
|
|
@ -35,6 +35,8 @@ cdef class Morphology:
|
|||
cdef RichTagC* rich_tags
|
||||
cdef PreshMapArray _cache
|
||||
|
||||
cdef int assign_untagged(self, TokenC* token) except -1
|
||||
|
||||
cdef int assign_tag(self, TokenC* token, tag) except -1
|
||||
|
||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
|
||||
|
|
|
@ -42,7 +42,7 @@ cdef class Morphology:
|
|||
self.tag_names = tuple(sorted(tag_map.keys()))
|
||||
self.reverse_index = {}
|
||||
|
||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
|
||||
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
|
||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||
self.tag_map[tag_str] = dict(attrs)
|
||||
attrs = _normalize_props(attrs)
|
||||
|
@ -52,6 +52,10 @@ cdef class Morphology:
|
|||
self.rich_tags[i].morph = 0
|
||||
self.rich_tags[i].pos = attrs[POS]
|
||||
self.reverse_index[self.rich_tags[i].name] = i
|
||||
# Add a 'null' tag, which we can reference when assign morphology to
|
||||
# untagged tokens.
|
||||
self.rich_tags[self.n_tags].id = self.n_tags
|
||||
|
||||
self._cache = PreshMapArray(self.n_tags)
|
||||
self.exc = {}
|
||||
if exc is not None:
|
||||
|
@ -62,6 +66,15 @@ cdef class Morphology:
|
|||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
||||
self.exc), None, None)
|
||||
|
||||
cdef int assign_untagged(self, TokenC* token) except -1:
|
||||
"""Set morphological attributes on a token without a POS tag. Uses
|
||||
the lemmatizer's lookup() method, which looks up the string in the
|
||||
table provided by the language data as lemma_lookup (if available)."""
|
||||
if token.lemma == 0:
|
||||
orth_str = self.strings[token.lex.orth]
|
||||
lemma = self.lemmatizer.lookup(orth_str)
|
||||
token.lemma = self.strings.add(lemma)
|
||||
|
||||
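assign_untagged() is called from Doc.push_back (see the doc.pyx hunk further down), so lookup lemmas are available as soon as a Doc is created, before any tagger has run. A sketch of the observable behaviour, assuming a Vocab built with a lookup lemmatizer:

from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.lemmatizer import Lemmatizer

vocab = Vocab(lemmatizer=Lemmatizer(lookup={'dogs': 'dog'}))
doc = Doc(vocab, words=['dogs', 'dogses'])
assert doc[0].lemma_ == 'dog'      # found in the lookup table
assert doc[1].lemma_ == 'dogses'   # not in the table: falls back to the string itself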
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||
if isinstance(tag, basestring):
|
||||
tag = self.strings.add(tag)
|
||||
|
@ -72,7 +85,7 @@ cdef class Morphology:
|
|||
token.tag = tag
|
||||
|
||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||
if tag_id >= self.n_tags:
|
||||
if tag_id > self.n_tags:
|
||||
raise ValueError("Unknown tag ID: %s" % tag_id)
|
||||
# TODO: It's pretty arbitrary to put this logic here. I guess the justification
|
||||
# is that this is where the specific word and the tag interact. Still,
|
||||
|
@ -151,8 +164,6 @@ cdef class Morphology:
|
|||
cdef unicode py_string = self.strings[orth]
|
||||
if self.lemmatizer is None:
|
||||
return self.strings.add(py_string.lower())
|
||||
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
|
||||
return self.strings.add(py_string.lower())
|
||||
cdef set lemma_strings
|
||||
cdef unicode lemma_string
|
||||
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
||||
|
|
|
@ -28,6 +28,7 @@ from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
|||
from .tokens.doc cimport Doc
|
||||
from .syntax.parser cimport Parser as LinearParser
|
||||
from .syntax.nn_parser cimport Parser as NeuralParser
|
||||
from .syntax import nonproj
|
||||
from .syntax.parser import get_templates as get_feature_templates
|
||||
from .syntax.beam_parser cimport BeamParser
|
||||
from .syntax.ner cimport BiluoPushDown
|
||||
|
@ -157,11 +158,13 @@ class BaseThincComponent(object):
|
|||
|
||||
def to_bytes(self, **exclude):
|
||||
"""Serialize the pipe to a bytestring."""
|
||||
serialize = OrderedDict((
|
||||
('cfg', lambda: json_dumps(self.cfg)),
|
||||
('model', lambda: self.model.to_bytes()),
|
||||
('vocab', lambda: self.vocab.to_bytes())
|
||||
))
|
||||
serialize = OrderedDict()
|
||||
serialize['cfg'] = lambda: json_dumps(self.cfg)
|
||||
if self.model in (True, False, None):
|
||||
serialize['model'] = lambda: self.model
|
||||
else:
|
||||
serialize['model'] = self.model.to_bytes
|
||||
serialize['vocab'] = self.vocab.to_bytes
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
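Skipping model.to_bytes() while the model is still the True/False/None placeholder is what lets an untrained component be serialized; the new test_serialize_empty_model.py below exercises exactly this. A sketch, assuming TextCategorizer relies on this base implementation:

from spacy.lang.en import English
from spacy.pipeline import TextCategorizer

nlp = English()
textcat = TextCategorizer(nlp.vocab, labels=['ENTITY', 'ACTION', 'MODIFIER'])
data = textcat.to_bytes()   # works even though no model has been created or trained yet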
def from_bytes(self, bytes_data, **exclude):
|
||||
|
@ -182,11 +185,11 @@ class BaseThincComponent(object):
|
|||
|
||||
def to_disk(self, path, **exclude):
|
||||
"""Serialize the pipe to disk."""
|
||||
serialize = OrderedDict((
|
||||
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
|
||||
('vocab', lambda p: self.vocab.to_disk(p)),
|
||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
||||
))
|
||||
serialize = OrderedDict()
|
||||
serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg))
|
||||
serialize['vocab'] = lambda p: self.vocab.to_disk(p)
|
||||
if self.model not in (None, True, False):
|
||||
serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
|
@ -437,13 +440,16 @@ class NeuralTagger(BaseThincComponent):
|
|||
yield
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
serialize = OrderedDict((
|
||||
('model', lambda: self.model.to_bytes()),
|
||||
('vocab', lambda: self.vocab.to_bytes()),
|
||||
('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map,
|
||||
use_bin_type=True,
|
||||
encoding='utf8'))
|
||||
))
|
||||
serialize = OrderedDict()
|
||||
if self.model in (None, True, False):
|
||||
serialize['model'] = lambda: self.model
|
||||
else:
|
||||
serialize['model'] = self.model.to_bytes
|
||||
serialize['vocab'] = self.vocab.to_bytes
|
||||
|
||||
serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map,
|
||||
use_bin_type=True,
|
||||
encoding='utf8')
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
|
@ -778,11 +784,19 @@ cdef class DependencyParser(LinearParser):
|
|||
if isinstance(label, basestring):
|
||||
label = self.vocab.strings[label]
|
||||
|
||||
@property
|
||||
def postprocesses(self):
|
||||
return [nonproj.deprojectivize]
|
||||
|
||||
|
||||
cdef class NeuralDependencyParser(NeuralParser):
|
||||
name = 'parser'
|
||||
TransitionSystem = ArcEager
|
||||
|
||||
@property
|
||||
def postprocesses(self):
|
||||
return [nonproj.deprojectivize]
|
||||
|
||||
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
|
||||
for target in []:
|
||||
labeller = NeuralLabeller(self.vocab, target=target)
|
||||
|
@ -823,6 +837,11 @@ cdef class BeamDependencyParser(BeamParser):
|
|||
if isinstance(label, basestring):
|
||||
label = self.vocab.strings[label]
|
||||
|
||||
@property
|
||||
def postprocesses(self):
|
||||
return [nonproj.deprojectivize]
|
||||
|
||||
|
||||
|
||||
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser',
|
||||
'BeamEntityRecognizer', 'TokenVectorEnoder']
|
||||
|
|
|
@ -241,8 +241,8 @@ cdef class Parser:
|
|||
def Model(cls, nr_class, **cfg):
|
||||
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
|
||||
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
|
||||
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
|
||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 1))
|
||||
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
|
||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
|
||||
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
|
||||
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
|
||||
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
|
||||
|
@ -779,6 +779,14 @@ cdef class Parser:
|
|||
for i in range(doc.length):
|
||||
doc.c[i] = state.c._sent[i]
|
||||
self.moves.finalize_doc(doc)
|
||||
for hook in self.postprocesses:
|
||||
for doc in docs:
|
||||
hook(doc)
|
||||
|
||||
@property
|
||||
def postprocesses(self):
|
||||
# Available for subclasses, e.g. to deprojectivize
|
||||
return []
|
||||
|
||||
def add_label(self, label):
|
||||
resized = False
|
||||
|
@ -792,16 +800,25 @@ cdef class Parser:
|
|||
if self.model not in (True, False, None) and resized:
|
||||
# Weights are stored in (nr_out, nr_in) format, so we're basically
|
||||
# just adding rows here.
|
||||
smaller = self.model[-1]._layers[-1]
|
||||
larger = Affine(self.moves.n_moves, smaller.nI)
|
||||
copy_array(larger.W[:smaller.nO], smaller.W)
|
||||
copy_array(larger.b[:smaller.nO], smaller.b)
|
||||
self.model[-1]._layers[-1] = larger
|
||||
if self.model[-1].is_noop:
|
||||
smaller = self.model[1]
|
||||
dims = dict(self.model[1]._dims)
|
||||
dims['nO'] = self.moves.n_moves
|
||||
larger = self.model[1].__class__(**dims)
|
||||
copy_array(larger.W[:, :smaller.nO], smaller.W)
|
||||
copy_array(larger.b[:smaller.nO], smaller.b)
|
||||
self.model = (self.model[0], larger, self.model[2])
|
||||
else:
|
||||
smaller = self.model[-1]._layers[-1]
|
||||
larger = Affine(self.moves.n_moves, smaller.nI)
|
||||
copy_array(larger.W[:smaller.nO], smaller.W)
|
||||
copy_array(larger.b[:smaller.nO], smaller.b)
|
||||
self.model[-1]._layers[-1] = larger
|
||||
|
||||
def begin_training(self, gold_tuples, pipeline=None, **cfg):
|
||||
if 'model' in cfg:
|
||||
self.model = cfg['model']
|
||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples)
|
||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
|
||||
actions = self.moves.get_actions(gold_parses=gold_tuples)
|
||||
for action, labels in actions.items():
|
||||
for label in labels:
|
||||
|
|
|
@ -58,8 +58,9 @@ def en_vocab():
|
|||
|
||||
|
||||
@pytest.fixture
|
||||
def en_parser():
|
||||
return util.get_lang_class('en').Defaults.create_parser()
|
||||
def en_parser(en_vocab):
|
||||
nlp = util.get_lang_class('en')(en_vocab)
|
||||
return nlp.create_pipe('parser')
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
|
37
spacy/tests/doc/test_creation.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
'''Test Doc sets up tokens correctly.'''
|
||||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
|
||||
from ...vocab import Vocab
|
||||
from ...tokens.doc import Doc
|
||||
from ...lemmatizer import Lemmatizer
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lemmatizer():
|
||||
return Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def vocab(lemmatizer):
|
||||
return Vocab(lemmatizer=lemmatizer)
|
||||
|
||||
|
||||
def test_empty_doc(vocab):
|
||||
doc = Doc(vocab)
|
||||
assert len(doc) == 0
|
||||
|
||||
|
||||
def test_single_word(vocab):
|
||||
doc = Doc(vocab, words=['a'])
|
||||
assert doc.text == 'a '
|
||||
doc = Doc(vocab, words=['a'], spaces=[False])
|
||||
assert doc.text == 'a'
|
||||
|
||||
|
||||
def test_lookup_lemmatization(vocab):
|
||||
doc = Doc(vocab, words=['dogs', 'dogses'])
|
||||
assert doc[0].text == 'dogs'
|
||||
assert doc[0].lemma_ == 'dog'
|
||||
assert doc[1].text == 'dogses'
|
||||
assert doc[1].lemma_ == 'dogses'
|
13
spacy/tests/lang/de/test_lemma.py
Normal file
|
@ -0,0 +1,13 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('string,lemma', [('Abgehängten', 'Abgehängte'),
|
||||
('engagierte', 'engagieren'),
|
||||
('schließt', 'schließen'),
|
||||
('vorgebenden', 'vorgebend')])
|
||||
def test_lemmatizer_lookup_assigns(de_tokenizer, string, lemma):
|
||||
tokens = de_tokenizer(string)
|
||||
assert tokens[0].lemma_ == lemma
|
|
@ -57,6 +57,5 @@ def test_en_lemmatizer_punct(en_lemmatizer):
|
|||
def test_en_lemmatizer_lemma_assignment(EN):
|
||||
text = "Bananas in pyjamas are geese."
|
||||
doc = EN.make_doc(text)
|
||||
assert all(t.lemma_ == '' for t in doc)
|
||||
EN.tagger(doc)
|
||||
assert all(t.lemma_ != '' for t in doc)
|
||||
|
|
|
@ -22,14 +22,14 @@ def vocab():
|
|||
@pytest.fixture
|
||||
def parser(vocab):
|
||||
parser = NeuralDependencyParser(vocab)
|
||||
parser.cfg['token_vector_width'] = 4
|
||||
parser.cfg['hidden_width'] = 6
|
||||
parser.cfg['token_vector_width'] = 8
|
||||
parser.cfg['hidden_width'] = 30
|
||||
parser.cfg['hist_size'] = 0
|
||||
parser.add_label('left')
|
||||
parser.begin_training([], **parser.cfg)
|
||||
sgd = Adam(NumpyOps(), 0.001)
|
||||
|
||||
for i in range(30):
|
||||
for i in range(10):
|
||||
losses = {}
|
||||
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
|
||||
gold = GoldParse(doc, heads=[1, 1, 3, 3],
|
||||
|
@ -37,6 +37,8 @@ def parser(vocab):
|
|||
parser.update([doc], [gold], sgd=sgd, losses=losses)
|
||||
return parser
|
||||
|
||||
def test_init_parser(parser):
|
||||
pass
|
||||
|
||||
def test_add_label(parser):
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
import spacy
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
@pytest.mark.models
|
||||
def test_beam_parse():
|
||||
nlp = spacy.load('en_core_web_sm')
|
||||
doc = nlp(u'Australia is a country', disable=['ner'])
|
||||
ents = nlp.entity(doc, beam_width=2)
|
||||
print(ents)
|
||||
|
||||
@pytest.mark.models('en')
|
||||
def test_beam_parse(EN):
|
||||
doc = EN(u'Australia is a country', disable=['ner'])
|
||||
ents = EN.entity(doc, beam_width=2)
|
||||
print(ents)
|
||||
|
|
|
@ -35,7 +35,7 @@ def parser(vocab):
|
|||
def test_no_sentences(parser):
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
doc = parser(doc)
|
||||
assert len(list(doc.sents)) == 2
|
||||
assert len(list(doc.sents)) >= 1
|
||||
|
||||
|
||||
def test_sents_1(parser):
|
||||
|
@ -64,7 +64,7 @@ def test_sents_1_3(parser):
|
|||
doc[1].sent_start = True
|
||||
doc[3].sent_start = True
|
||||
doc = parser(doc)
|
||||
assert len(list(doc.sents)) == 4
|
||||
assert len(list(doc.sents)) >= 3
|
||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
||||
doc[1].sent_start = True
|
||||
doc[2].sent_start = False
|
||||
|
|
0
spacy/tests/pipeline/__init__.py
Normal file
84
spacy/tests/pipeline/test_pipe_methods.py
Normal file
|
@ -0,0 +1,84 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
from ...language import Language
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def nlp():
|
||||
return Language()
|
||||
|
||||
|
||||
def new_pipe(doc):
|
||||
return doc
|
||||
|
||||
|
||||
def test_add_pipe_no_name(nlp):
|
||||
nlp.add_pipe(new_pipe)
|
||||
assert 'new_pipe' in nlp.pipe_names
|
||||
|
||||
|
||||
def test_add_pipe_duplicate_name(nlp):
|
||||
nlp.add_pipe(new_pipe, name='duplicate_name')
|
||||
with pytest.raises(ValueError):
|
||||
nlp.add_pipe(new_pipe, name='duplicate_name')
|
||||
|
||||
|
||||
@pytest.mark.parametrize('name', ['parser'])
|
||||
def test_add_pipe_first(nlp, name):
|
||||
nlp.add_pipe(new_pipe, name=name, first=True)
|
||||
assert nlp.pipeline[0][0] == name
|
||||
|
||||
|
||||
@pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')])
|
||||
def test_add_pipe_last(nlp, name1, name2):
|
||||
nlp.add_pipe(lambda doc: doc, name=name2)
|
||||
nlp.add_pipe(new_pipe, name=name1, last=True)
|
||||
assert nlp.pipeline[0][0] != name1
|
||||
assert nlp.pipeline[-1][0] == name1
|
||||
|
||||
|
||||
def test_cant_add_pipe_first_and_last(nlp):
|
||||
with pytest.raises(ValueError):
|
||||
nlp.add_pipe(new_pipe, first=True, last=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('name', ['my_component'])
|
||||
def test_get_pipe(nlp, name):
|
||||
with pytest.raises(KeyError):
|
||||
nlp.get_pipe(name)
|
||||
nlp.add_pipe(new_pipe, name=name)
|
||||
assert nlp.get_pipe(name) == new_pipe
|
||||
|
||||
|
||||
@pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)])
|
||||
def test_replace_pipe(nlp, name, replacement):
|
||||
with pytest.raises(ValueError):
|
||||
nlp.replace_pipe(name, new_pipe)
|
||||
nlp.add_pipe(new_pipe, name=name)
|
||||
nlp.replace_pipe(name, replacement)
|
||||
assert nlp.get_pipe(name) != new_pipe
|
||||
assert nlp.get_pipe(name) == replacement
|
||||
|
||||
|
||||
@pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')])
|
||||
def test_rename_pipe(nlp, old_name, new_name):
|
||||
with pytest.raises(ValueError):
|
||||
nlp.rename_pipe(old_name, new_name)
|
||||
nlp.add_pipe(new_pipe, name=old_name)
|
||||
nlp.rename_pipe(old_name, new_name)
|
||||
assert nlp.pipeline[0][0] == new_name
|
||||
|
||||
|
||||
@pytest.mark.parametrize('name', ['my_component'])
|
||||
def test_remove_pipe(nlp, name):
|
||||
with pytest.raises(ValueError):
|
||||
nlp.remove_pipe(name)
|
||||
nlp.add_pipe(new_pipe, name=name)
|
||||
assert len(nlp.pipeline) == 1
|
||||
removed_name, removed_component = nlp.remove_pipe(name)
|
||||
assert not len(nlp.pipeline)
|
||||
assert removed_name == name
|
||||
assert removed_component == new_pipe
|
|
@ -7,6 +7,7 @@ from ..util import get_doc
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_issue589():
|
||||
vocab = Vocab()
|
||||
vocab.strings.set_frozen(True)
|
||||
|
|
9
spacy/tests/serialize/test_serialize_empty_model.py
Normal file
|
@ -0,0 +1,9 @@
|
|||
import spacy
|
||||
import spacy.lang.en
|
||||
from spacy.pipeline import TextCategorizer
|
||||
|
||||
def test_bytes_serialize_issue_1105():
|
||||
nlp = spacy.lang.en.English()
|
||||
tokenizer = nlp.tokenizer
|
||||
textcat = TextCategorizer(tokenizer.vocab, labels=['ENTITY', 'ACTION', 'MODIFIER'])
|
||||
textcat_bytes = textcat.to_bytes()
|
53
spacy/tests/test_underscore.py
Normal file
|
@ -0,0 +1,53 @@
|
|||
from mock import Mock
|
||||
from ..tokens.underscore import Underscore
|
||||
|
||||
|
||||
def test_create_doc_underscore():
|
||||
doc = Mock()
|
||||
doc.doc = doc
|
||||
uscore = Underscore(Underscore.doc_extensions, doc)
|
||||
assert uscore._doc is doc
|
||||
assert uscore._start is None
|
||||
assert uscore._end is None
|
||||
|
||||
|
||||
def test_doc_underscore_getattr_setattr():
|
||||
doc = Mock()
|
||||
doc.doc = doc
|
||||
doc.user_data = {}
|
||||
Underscore.doc_extensions['hello'] = (False, None, None, None)
|
||||
doc._ = Underscore(Underscore.doc_extensions, doc)
|
||||
assert doc._.hello == False
|
||||
doc._.hello = True
|
||||
assert doc._.hello == True
|
||||
|
||||
|
||||
def test_create_span_underscore():
|
||||
span = Mock(doc=Mock(), start=0, end=2)
|
||||
uscore = Underscore(Underscore.span_extensions, span,
|
||||
start=span.start, end=span.end)
|
||||
assert uscore._doc is span.doc
|
||||
assert uscore._start is span.start
|
||||
assert uscore._end is span.end
|
||||
|
||||
|
||||
def test_span_underscore_getter_setter():
|
||||
span = Mock(doc=Mock(), start=0, end=2)
|
||||
Underscore.span_extensions['hello'] = (None, None,
|
||||
lambda s: (s.start, 'hi'),
|
||||
lambda s, value: setattr(s, 'start',
|
||||
value))
|
||||
span._ = Underscore(Underscore.span_extensions, span,
|
||||
start=span.start, end=span.end)
|
||||
|
||||
assert span._.hello == (0, 'hi')
|
||||
span._.hello = 1
|
||||
assert span._.hello == (1, 'hi')
|
||||
|
||||
|
||||
def test_token_underscore_method():
|
||||
token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: 'cheese')
|
||||
Underscore.token_extensions['hello'] = (None, token.say_cheese,
|
||||
None, None)
|
||||
token._ = Underscore(Underscore.token_extensions, token, start=token.idx)
|
||||
assert token._.hello() == 'cheese'
|
|
@ -30,7 +30,7 @@ from ..util import normalize_slice
|
|||
from ..compat import is_config
|
||||
from .. import about
|
||||
from .. import util
|
||||
|
||||
from .underscore import Underscore
|
||||
|
||||
DEF PADDING = 5
|
||||
|
||||
|
@ -64,6 +64,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
|
|||
else:
|
||||
return Lexeme.get_struct_attr(token.lex, feat_name)
|
||||
|
||||
|
||||
def _get_chunker(lang):
|
||||
try:
|
||||
cls = util.get_lang_class(lang)
|
||||
|
@ -73,6 +74,7 @@ def _get_chunker(lang):
|
|||
return None
|
||||
return cls.Defaults.syntax_iterators.get(u'noun_chunks')
|
||||
|
||||
|
||||
cdef class Doc:
|
||||
"""A sequence of Token objects. Access sentences and named entities, export
|
||||
annotations to numpy arrays, losslessly serialize to compressed binary strings.
|
||||
|
@ -87,6 +89,21 @@ cdef class Doc:
|
|||
>>> from spacy.tokens import Doc
|
||||
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
|
||||
"""
|
||||
@classmethod
|
||||
def set_extension(cls, name, default=None, method=None,
|
||||
getter=None, setter=None):
|
||||
nr_defined = sum(t is not None for t in (default, getter, setter, method))
|
||||
assert nr_defined == 1
|
||||
Underscore.doc_extensions[name] = (default, method, getter, setter)
|
||||
|
||||
@classmethod
|
||||
def get_extension(cls, name):
|
||||
return Underscore.doc_extensions.get(name)
|
||||
|
||||
@classmethod
|
||||
def has_extension(cls, name):
|
||||
return name in Underscore.doc_extensions
|
||||
|
||||
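set_extension() registers exactly one of default, method, getter or setter, and the values then become readable and writable through Doc._. A minimal sketch with an illustrative extension name:

from spacy.vocab import Vocab
from spacy.tokens import Doc

Doc.set_extension('is_greeting', default=False)   # 'is_greeting' is just an illustrative name
doc = Doc(Vocab(), words=['hello'])
assert doc._.is_greeting is False                 # falls back to the registered default
doc._.is_greeting = True                          # stored in doc.user_data
assert doc._.is_greeting is True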
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
|
||||
"""Create a Doc object.
|
||||
|
||||
|
@ -159,6 +176,10 @@ cdef class Doc:
|
|||
self.is_tagged = True
|
||||
self.is_parsed = True
|
||||
|
||||
@property
|
||||
def _(self):
|
||||
return Underscore(Underscore.doc_extensions, self)
|
||||
|
||||
def __getitem__(self, object i):
|
||||
"""Get a `Token` or `Span` object.
|
||||
|
||||
|
@ -512,6 +533,8 @@ cdef class Doc:
|
|||
assert t.lex.orth != 0
|
||||
t.spacy = has_space
|
||||
self.length += 1
|
||||
# Set morphological attributes, e.g. by lemma, if possible
|
||||
self.vocab.morphology.assign_untagged(t)
|
||||
self._py_tokens.append(None)
|
||||
return t.idx + t.lex.length + t.spacy
|
||||
|
||||
|
|
|
@ -17,10 +17,24 @@ from ..attrs cimport IS_PUNCT, IS_SPACE
|
|||
from ..lexeme cimport Lexeme
|
||||
from ..compat import is_config
|
||||
from .. import about
|
||||
from .underscore import Underscore
|
||||
|
||||
|
||||
cdef class Span:
|
||||
"""A slice from a Doc object."""
|
||||
@classmethod
|
||||
def set_extension(cls, name, default=None, method=None,
|
||||
getter=None, setter=None):
|
||||
Underscore.span_extensions[name] = (default, method, getter, setter)
|
||||
|
||||
@classmethod
|
||||
def get_extension(cls, name):
|
||||
return Underscore.span_extensions.get(name)
|
||||
|
||||
@classmethod
|
||||
def has_extension(cls, name):
|
||||
return name in Underscore.span_extensions
|
||||
|
||||
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
|
||||
vector_norm=None):
|
||||
"""Create a `Span` object from the slice `doc[start : end]`.
|
||||
|
@ -111,6 +125,10 @@ cdef class Span:
|
|||
for i in range(self.start, self.end):
|
||||
yield self.doc[i]
|
||||
|
||||
@property
|
||||
def _(self):
|
||||
return Underscore(Underscore.span_extensions, self,
|
||||
start=self.start_char, end=self.end_char)
|
||||
def as_doc(self):
|
||||
'''Create a Doc object view of the Span's data.
|
||||
|
||||
|
|
|
@ -20,10 +20,24 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
|
|||
from ..attrs cimport LEMMA, POS, TAG, DEP
|
||||
from ..compat import is_config
|
||||
from .. import about
|
||||
from .underscore import Underscore
|
||||
|
||||
|
||||
cdef class Token:
|
||||
"""An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
|
||||
@classmethod
|
||||
def set_extension(cls, name, default=None, method=None,
|
||||
getter=None, setter=None):
|
||||
Underscore.token_extensions[name] = (default, method, getter, setter)
|
||||
|
||||
@classmethod
|
||||
def get_extension(cls, name):
|
||||
return Underscore.token_extensions.get(name)
|
||||
|
||||
@classmethod
|
||||
def has_extension(cls, name):
|
||||
return name in Underscore.token_extensions
|
||||
|
||||
def __cinit__(self, Vocab vocab, Doc doc, int offset):
|
||||
"""Construct a `Token` object.
|
||||
|
||||
|
@ -87,6 +101,11 @@ cdef class Token:
|
|||
else:
|
||||
raise ValueError(op)
|
||||
|
||||
@property
|
||||
def _(self):
|
||||
return Underscore(Underscore.token_extensions, self,
|
||||
start=self.idx, end=None)
|
||||
|
||||
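Token extensions follow the same pattern as Doc and Span; the getter receives the Token itself. A sketch with an illustrative attribute name, assuming Token is importable from spacy.tokens:

from spacy.vocab import Vocab
from spacy.tokens import Doc, Token

Token.set_extension('reversed', getter=lambda token: token.text[::-1])  # illustrative name
doc = Doc(Vocab(), words=['spacy'])
assert doc[0]._.reversed == 'ycaps'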
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
|
||||
"""Check the value of a boolean flag.
|
||||
|
||||
|
|
50
spacy/tokens/underscore.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
import functools
|
||||
|
||||
class Underscore(object):
|
||||
doc_extensions = {}
|
||||
span_extensions = {}
|
||||
token_extensions = {}
|
||||
|
||||
def __init__(self, extensions, obj, start=None, end=None):
|
||||
object.__setattr__(self, '_extensions', extensions)
|
||||
object.__setattr__(self, '_obj', obj)
|
||||
# Assumption is that for doc values, _start and _end will both be None
|
||||
# Span will set non-None values for _start and _end
|
||||
# Token will have _start be non-None, _end be None
|
||||
# This lets us key everything into the doc.user_data dictionary,
|
||||
# (see _get_key), and lets us use a single Underscore class.
|
||||
object.__setattr__(self, '_doc', obj.doc)
|
||||
object.__setattr__(self, '_start', start)
|
||||
object.__setattr__(self, '_end', end)
|
||||
|
||||
def __getattr__(self, name):
|
||||
if name not in self._extensions:
|
||||
raise AttributeError(name)
|
||||
default, method, getter, setter = self._extensions[name]
|
||||
if getter is not None:
|
||||
return getter(self._obj)
|
||||
elif method is not None:
|
||||
return functools.partial(method, self._obj)
|
||||
else:
|
||||
return self._doc.user_data.get(self._get_key(name), default)
|
||||
|
||||
def __setattr__(self, name, value):
|
||||
if name not in self._extensions:
|
||||
raise AttributeError(name)
|
||||
default, method, getter, setter = self._extensions[name]
|
||||
if setter is not None:
|
||||
return setter(self._obj, value)
|
||||
else:
|
||||
self._doc.user_data[self._get_key(name)] = value
|
||||
|
||||
def set(self, name, value):
|
||||
return self.__setattr__(name, value)
|
||||
|
||||
def get(self, name):
|
||||
return self.__getattr__(name)
|
||||
|
||||
def has(self, name):
|
||||
return name in self._extensions
|
||||
|
||||
def _get_key(self, name):
|
||||
return ('._.', name, self._start, self._end)
|
|
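Because Underscore keys everything into doc.user_data via _get_key(), extension values travel with the Doc rather than with the Underscore instance. A sketch of where a written value actually ends up ('note' is an illustrative name):

from spacy.vocab import Vocab
from spacy.tokens import Doc

Doc.set_extension('note', default=None)            # illustrative extension name
doc = Doc(Vocab(), words=['a'])
doc._.note = 'checked'
assert doc.user_data[('._.', 'note', None, None)] == 'checked'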
@ -135,7 +135,18 @@ def load_model_from_path(model_path, meta=False, **overrides):
|
|||
if not meta:
|
||||
meta = get_model_meta(model_path)
|
||||
cls = get_lang_class(meta['lang'])
|
||||
nlp = cls(pipeline=meta.get('pipeline', True), meta=meta, **overrides)
|
||||
nlp = cls(meta=meta, **overrides)
|
||||
pipeline = meta.get('pipeline', [])
|
||||
disable = overrides.get('disable', [])
|
||||
if pipeline is True:
|
||||
pipeline = nlp.Defaults.pipe_names
|
||||
elif pipeline in (False, None):
|
||||
pipeline = []
|
||||
for name in pipeline:
|
||||
if name not in disable:
|
||||
config = meta.get('pipeline_args', {}).get(name, {})
|
||||
component = nlp.create_pipe(name, config=config)
|
||||
nlp.add_pipe(component, name=name)
|
||||
return nlp.from_disk(model_path)
|
||||
|
||||
|
||||
|
|
|
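The loader now expects meta.json to carry a list of factory names (plus optional per-component settings) instead of a pre-built pipeline flag. Roughly the shape it reads, with illustrative values:

# illustrative values only; the loader reads these fields from the model's meta.json
meta = {
    'lang': 'en',
    'pipeline': ['tagger', 'parser', 'ner'],   # factory names, instantiated in order via create_pipe()
    'pipeline_args': {'tagger': {}},           # optional per-component config passed to the factory
}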
@ -149,6 +149,10 @@ mixin code(label, language, prompt, height, icon, wrap)
|
|||
|
||||
//- Code blocks to display old/new versions
|
||||
|
||||
mixin code-wrapper()
|
||||
span.u-inline-block.u-padding-top.u-width-full
|
||||
block
|
||||
|
||||
mixin code-old()
|
||||
+code(false, false, false, false, "reject").o-block-small
|
||||
block
|
||||
|
|
|
@ -113,6 +113,22 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+h(3, "validate") Validate
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Find all models installed in the current environment (both packages and
|
||||
| shortcut links) and check whether they are compatible with the currently
|
||||
| installed version of spaCy. Should be run after upgrading spaCy via
|
||||
| #[code pip install -U spacy] to ensure that all installed models
|
||||
| can be used with the new version. The command is also useful to detect
|
||||
| out-of-sync model links resulting from links created in different virtual
|
||||
| environments. Prints a list of models, the installed versions, the latest
|
||||
| compatible version (if out of date) and the commands for updating.
|
||||
|
||||
+code(false, "bash", "$").
|
||||
spacy validate
|
||||
|
||||
+h(3, "convert") Convert
|
||||
|
||||
p
|
||||
|
|
|
@ -43,6 +43,20 @@ p
|
|||
+cell #[code Language]
|
||||
+cell A #[code Language] object with the loaded model.
|
||||
|
||||
p
|
||||
| Essentially, #[code spacy.load()] is a convenience wrapper that reads
|
||||
| the language ID and pipeline components from a model's #[code meta.json],
|
||||
| initialises the #[code Language] class, loads in the model data and
|
||||
| returns it.
|
||||
|
||||
+code("Abstract example").
|
||||
cls = util.get_lang_class(lang) # get language for ID, e.g. 'en'
|
||||
nlp = cls() # initialise the language
|
||||
for name in pipeline:
|
||||
component = nlp.create_pipe(name) # create each pipeline component
|
||||
nlp.add_pipe(component) # add component to pipeline
|
||||
nlp.from_disk(model_data_path) # load in model data
|
||||
|
||||
+infobox("Deprecation note", "⚠️")
|
||||
.o-block
|
||||
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
||||
|
@ -141,37 +155,3 @@ p
|
|||
+cell returns
|
||||
+cell unicode
|
||||
+cell The explanation, or #[code None] if not found in the glossary.
|
||||
|
||||
+h(3, "spacy.set_factory") spacy.set_factory
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Set a factory that returns a custom
|
||||
| #[+a("/usage/processing-pipelines") processing pipeline]
|
||||
| component. Factories are useful for creating stateful components, especially ones which depend on shared data.
|
||||
|
||||
+aside-code("Example").
|
||||
def my_factory(vocab):
|
||||
def my_component(doc):
|
||||
return doc
|
||||
return my_component
|
||||
|
||||
spacy.set_factory('my_factory', my_factory)
|
||||
nlp = Language(pipeline=['my_factory'])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code factory_id]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Unique name of factory. If added to a new pipeline, spaCy will
|
||||
| look up the factory for this ID and use it to create the
|
||||
| component.
|
||||
|
||||
+row
|
||||
+cell #[code factory]
|
||||
+cell callable
|
||||
+cell
|
||||
| Callable that takes a #[code Vocab] object and returns a pipeline
|
||||
| component.
|
||||
|
|
|
@ -138,6 +138,109 @@ p Get the number of tokens in the document.
|
|||
+cell int
|
||||
+cell The number of tokens in the document.
|
||||
|
||||
+h(2, "set_extension") Doc.set_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Define a custom attribute on the #[code Doc] which becomes available via
|
||||
| #[code Doc._]. For details, see the documentation on
|
||||
| #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Doc
|
||||
city_getter = lambda doc: doc.text in ('New York', 'Paris', 'Berlin')
|
||||
Doc.set_extension('has_city', getter=city_getter)
|
||||
doc = nlp(u'I like New York')
|
||||
assert doc._.has_city
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Name of the attribute to set by the extension. For example,
|
||||
| #[code 'my_attr'] will be available as #[code doc._.my_attr].
|
||||
|
||||
+row
|
||||
+cell #[code default]
|
||||
+cell -
|
||||
+cell
|
||||
| Optional default value of the attribute if no getter or method
|
||||
| is defined.
|
||||
|
||||
+row
|
||||
+cell #[code method]
|
||||
+cell callable
|
||||
+cell
|
||||
| Set a custom method on the object, for example
|
||||
| #[code doc._.compare(other_doc)].
|
||||
|
||||
+row
|
||||
+cell #[code getter]
|
||||
+cell callable
|
||||
+cell
|
||||
| Getter function that takes the object and returns an attribute
|
||||
| value. Is called when the user accesses the #[code ._] attribute.
|
||||
|
||||
+row
|
||||
+cell #[code setter]
|
||||
+cell callable
|
||||
+cell
|
||||
| Setter function that takes the #[code Doc] and a value, and
|
||||
| modifies the object. Is called when the user writes to the
|
||||
| #[code Doc._] attribute.
|
||||
|
||||
+h(2, "get_extension") Doc.get_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Look up a previously registered extension by name. Returns a 4-tuple
|
||||
| #[code.u-break (default, method, getter, setter)] if the extension is
|
||||
| registered. Raises a #[code KeyError] otherwise.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Doc
|
||||
Doc.set_extension('is_city', default=False)
|
||||
extension = Doc.get_extension('is_city')
|
||||
assert extension == (False, None, None, None)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the extension.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell tuple
|
||||
+cell
|
||||
| A #[code.u-break (default, method, getter, setter)] tuple of the
|
||||
| extension.
|
||||
|
||||
+h(2, "has_extension") Doc.has_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p Check whether an extension has been registered on the #[code Doc] class.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Doc
|
||||
Doc.set_extension('is_city', default=False)
|
||||
assert Doc.has_extension('is_city')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the extension to check.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the extension has been registered.
|
||||
|
||||
+h(2, "char_span") Doc.char_span
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
|
|
@ -4,7 +4,14 @@ include ../_includes/_mixins
|
|||
|
||||
p
|
||||
| Usually you'll load this once per process as #[code nlp] and pass the
|
||||
| instance around your application.
|
||||
| instance around your application. The #[code Language] class is created
|
||||
| when you call #[+api("spacy#load") #[code spacy.load()]] and contains
|
||||
| the shared vocabulary and #[+a("/usage/adding-languages") language data],
|
||||
| optional model data loaded from a #[+a("/models") model package] or
|
||||
| a path, and a #[+a("/usage/processing-pipelines") processing pipeline]
|
||||
| containing components like the tagger or parser that are called on a
|
||||
| document in order. You can also add your own processing pipeline
|
||||
| components that take a #[code Doc] object, modify it and return it.
|
||||
|
||||
+h(2, "init") Language.__init__
|
||||
+tag method
|
||||
|
@ -12,9 +19,9 @@ p
|
|||
p Initialise a #[code Language] object.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.language import Language
|
||||
nlp = Language(pipeline=['token_vectors', 'tags',
|
||||
'dependencies'])
|
||||
nlp = Language(Vocab())
|
||||
|
||||
from spacy.lang.en import English
|
||||
nlp = English()
|
||||
|
@ -34,14 +41,6 @@ p Initialise a #[code Language] object.
|
|||
| A function that takes text and returns a #[code Doc] object.
|
||||
| Usually a #[code Tokenizer].
|
||||
|
||||
+row
|
||||
+cell #[code pipeline]
|
||||
+cell list
|
||||
+cell
|
||||
| A list of annotation processes or IDs of annotation, processes,
|
||||
| e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked
|
||||
| up in #[code Language.Defaults.factories].
|
||||
|
||||
+row
|
||||
+cell #[code meta]
|
||||
+cell dict
|
||||
|
@ -235,7 +234,6 @@ p
|
|||
| Can be called before training to pre-process gold data. By default, it
|
||||
| handles nonprojectivity and adds missing tags to the tag map.
|
||||
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code docs_golds]
|
||||
|
@ -247,6 +245,177 @@ p
|
|||
+cell tuple
|
||||
+cell Tuples of #[code Doc] and #[code GoldParse] objects.
|
||||
|
||||
+h(2, "create_pipe") Language.create_pipe
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Create a pipeline component from a factory.
|
||||
|
||||
+aside-code("Example").
|
||||
parser = nlp.create_pipe('parser')
|
||||
nlp.add_pipe(parser)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Factory name to look up in
|
||||
| #[+api("language#class-attributes") #[code Language.factories]].
|
||||
|
||||
+row
|
||||
+cell #[code config]
|
||||
+cell dict
|
||||
+cell Configuration parameters to initialise component.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell callable
|
||||
+cell The pipeline component.
|
||||
|
||||
+h(2, "add_pipe") Language.add_pipe
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Add a component to the processing pipeline. Valid components are
|
||||
| callables that take a #[code Doc] object, modify it and return it. Only
|
||||
| one of #[code before], #[code after], #[code first] or #[code last] can
|
||||
| be set. Default behaviour is #[code last=True].
|
||||
|
||||
+aside-code("Example").
|
||||
def component(doc):
|
||||
# modify Doc and return it
|
||||
return doc
|
||||
|
||||
nlp.add_pipe(component, before='ner')
|
||||
nlp.add_pipe(component, name='custom_name', last=True)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code component]
|
||||
+cell callable
|
||||
+cell The pipeline component.
|
||||
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Name of pipeline component. Overwrites existing
|
||||
| #[code component.name] attribute if available. If no #[code name]
|
||||
| is set and the component exposes no name attribute,
|
||||
| #[code component.__name__] is used. An error is raised if the
|
||||
| name already exists in the pipeline.
|
||||
|
||||
+row
|
||||
+cell #[code before]
|
||||
+cell unicode
|
||||
+cell Component name to insert component directly before.
|
||||
|
||||
+row
|
||||
+cell #[code after]
|
||||
+cell unicode
|
||||
+cell Component name to insert component directly after:
|
||||
|
||||
+row
|
||||
+cell #[code first]
|
||||
+cell bool
|
||||
+cell Insert component first / not first in the pipeline.
|
||||
|
||||
+row
|
||||
+cell #[code last]
|
||||
+cell bool
|
||||
+cell Insert component last / not last in the pipeline.
|
||||
|
||||
+h(2, "get_pipe") Language.get_pipe
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Get a pipeline component for a given component name.
|
||||
|
||||
+aside-code("Example").
|
||||
parser = nlp.get_pipe('parser')
|
||||
custom_component = nlp.get_pipe('custom_component')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the pipeline component to get.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell callable
|
||||
+cell The pipeline component.
|
||||
|
||||
+h(2, "replace_pipe") Language.replace_pipe
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Replace a component in the pipeline.
|
||||
|
||||
+aside-code("Example").
|
||||
nlp.replace_pipe('parser', my_custom_parser)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the component to replace.
|
||||
|
||||
+row
|
||||
+cell #[code component]
|
||||
+cell callable
|
||||
+cell The pipeline component to insert.
|
||||
|
||||
|
||||
+h(2, "rename_pipe") Language.rename_pipe
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Rename a component in the pipeline. Useful to create custom names for
|
||||
| pre-defined and pre-loaded components. To change the default name of
|
||||
| a component added to the pipeline, you can also use the #[code name]
|
||||
| argument on #[+api("language#add_pipe") #[code add_pipe]].
|
||||
|
||||
+aside-code("Example").
|
||||
nlp.rename_pipe('parser', 'spacy_parser')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code old_name]
|
||||
+cell unicode
|
||||
+cell Name of the component to rename.
|
||||
|
||||
+row
|
||||
+cell #[code new_name]
|
||||
+cell unicode
|
||||
+cell New name of the component.
|
||||
|
||||
+h(2, "remove_pipe") Language.remove_pipe
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Remove a component from the pipeline. Returns the removed component name
|
||||
| and component function.
|
||||
|
||||
+aside-code("Example").
|
||||
name, component = nlp.remove_pipe('parser')
|
||||
assert name == 'parser'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the component to remove.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell tuple
|
||||
+cell A #[code (name, component)] tuple of the removed component.
|
||||
|
||||
+h(2, "to_disk") Language.to_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
@ -399,7 +568,15 @@ p Load state from a binary string.
|
|||
+row
|
||||
+cell #[code pipeline]
|
||||
+cell list
|
||||
+cell Sequence of annotation functions.
|
||||
+cell
|
||||
| List of #[code (name, component)] tuples describing the current
|
||||
| processing pipeline, in order.
|
||||
|
||||
+row
|
||||
+cell #[code pipe_names]
|
||||
+tag-new(2)
|
||||
+cell list
|
||||
+cell List of pipeline component names, in order.
|
||||
|
||||
+row
|
||||
+cell #[code meta]
|
||||
|
@ -424,3 +601,12 @@ p Load state from a binary string.
|
|||
+cell
|
||||
| Two-letter language ID, i.e.
|
||||
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code].
|
||||
|
||||
+row
|
||||
+cell #[code factories]
|
||||
+tag-new(2)
|
||||
+cell dict
|
||||
+cell
|
||||
| Factories that create pre-defined pipeline components, e.g. the
|
||||
| tagger, parser or entity recognizer, keyed by their component
|
||||
| name.
|
||||
|
|
|
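The factories registry documented above is what create_pipe() consults, so the pre-defined component names double as factory IDs. A sketch of the relationship, assuming 'tagger' is among the registered factories:

from spacy.language import Language

nlp = Language()
assert 'tagger' in Language.factories      # pre-defined factories are keyed by component name
tagger = nlp.create_pipe('tagger')         # look up the factory and instantiate the component
nlp.add_pipe(tagger)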
@ -116,6 +116,109 @@ p Get the number of tokens in the span.
|
|||
+cell int
|
||||
+cell The number of tokens in the span.
|
||||
|
||||
+h(2, "set_extension") Span.set_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Define a custom attribute on the #[code Span] which becomes available via
|
||||
| #[code Span._]. For details, see the documentation on
|
||||
| #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Span
|
||||
city_getter = lambda span: span.text in ('New York', 'Paris', 'Berlin')
|
||||
Span.set_extension('has_city', getter=city_getter)
|
||||
doc = nlp(u'I like New York in Autumn')
|
||||
assert doc[1:4]._.has_city
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Name of the attribute to set by the extension. For example,
|
||||
| #[code 'my_attr'] will be available as #[code span._.my_attr].
|
||||
|
||||
+row
|
||||
+cell #[code default]
|
||||
+cell -
|
||||
+cell
|
||||
| Optional default value of the attribute if no getter or method
|
||||
| is defined.
|
||||
|
||||
+row
|
||||
+cell #[code method]
|
||||
+cell callable
|
||||
+cell
|
||||
| Set a custom method on the object, for example
|
||||
| #[code span._.compare(other_span)].
|
||||
|
||||
+row
|
||||
+cell #[code getter]
|
||||
+cell callable
|
||||
+cell
|
||||
| Getter function that takes the object and returns an attribute
|
||||
| value. Is called when the user accesses the #[code ._] attribute.
|
||||
|
||||
+row
|
||||
+cell #[code setter]
|
||||
+cell callable
|
||||
+cell
|
||||
| Setter function that takes the #[code Span] and a value, and
|
||||
| modifies the object. Is called when the user writes to the
|
||||
| #[code Span._] attribute.
|
||||
|
||||
+h(2, "get_extension") Span.get_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Look up a previously registered extension by name. Returns a 4-tuple
|
||||
| #[code.u-break (default, method, getter, setter)] if the extension is
|
||||
| registered. Raises a #[code KeyError] otherwise.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Span
|
||||
Span.set_extension('is_city', default=False)
|
||||
extension = Span.get_extension('is_city')
|
||||
assert extension == (False, None, None, None)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the extension.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell tuple
|
||||
+cell
|
||||
| A #[code.u-break (default, method, getter, setter)] tuple of the
|
||||
| extension.
|
||||
|
||||
+h(2, "has_extension") Span.has_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p Check whether an extension has been registered on the #[code Span] class.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Span
|
||||
Span.set_extension('is_city', default=False)
|
||||
assert Span.has_extension('is_city')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the extension to check.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the extension has been registered.
|
||||
|
||||
+h(2, "similarity") Span.similarity
|
||||
+tag method
|
||||
+tag-model("vectors")
|
||||
|
|
|
@ -51,6 +51,109 @@ p The number of unicode characters in the token, i.e. #[code token.text].
|
|||
+cell int
|
||||
+cell The number of unicode characters in the token.
|
||||
|
||||
+h(2, "set_extension") Token.set_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Define a custom attribute on the #[code Token] which becomes available
|
||||
| via #[code Token._]. For details, see the documentation on
|
||||
| #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Token
|
||||
fruit_getter = lambda token: token.text in ('apple', 'pear', 'banana')
|
||||
Token.set_extension('is_fruit', getter=fruit_getter)
|
||||
doc = nlp(u'I have an apple')
|
||||
assert doc[3]._.is_fruit
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Name of the attribute to set by the extension. For example,
|
||||
| #[code 'my_attr'] will be available as #[code token._.my_attr].
|
||||
|
||||
+row
|
||||
+cell #[code default]
|
||||
+cell -
|
||||
+cell
|
||||
| Optional default value of the attribute if no getter or method
|
||||
| is defined.
|
||||
|
||||
+row
|
||||
+cell #[code method]
|
||||
+cell callable
|
||||
+cell
|
||||
| Set a custom method on the object, for example
|
||||
| #[code token._.compare(other_token)].
|
||||
|
||||
+row
|
||||
+cell #[code getter]
|
||||
+cell callable
|
||||
+cell
|
||||
| Getter function that takes the object and returns an attribute
|
||||
| value. Is called when the user accesses the #[code ._] attribute.
|
||||
|
||||
+row
|
||||
+cell #[code setter]
|
||||
+cell callable
|
||||
+cell
|
||||
| Setter function that takes the #[code Token] and a value, and
|
||||
| modifies the object. Is called when the user writes to the
|
||||
| #[code Token._] attribute.
|
||||
|
||||
+h(2, "get_extension") Token.get_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Look up a previously registered extension by name. Returns a 4-tuple
|
||||
| #[code.u-break (default, method, getter, setter)] if the extension is
|
||||
| registered. Raises a #[code KeyError] otherwise.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Token
|
||||
Token.set_extension('is_fruit', default=False)
|
||||
extension = Token.get_extension('is_fruit')
|
||||
assert extension == (False, None, None, None)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the extension.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell tuple
|
||||
+cell
|
||||
| A #[code.u-break (default, method, getter, setter)] tuple of the
|
||||
| extension.
|
||||
|
||||
+h(2, "has_extension") Token.has_extension
|
||||
+tag classmethod
|
||||
+tag-new(2)
|
||||
|
||||
p Check whether an extension has been registered on the #[code Token] class.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Token
|
||||
Token.set_extension('is_fruit', default=False)
|
||||
assert Token.has_extension('is_fruit')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode
|
||||
+cell Name of the extension to check.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the extension has been registered.
|
||||
|
||||
+h(2, "check_flag") Token.check_flag
|
||||
+tag method
|
||||
|
||||
|
|
|
@ -143,6 +143,9 @@
|
|||
|
||||
//- Layout
|
||||
|
||||
.u-width-full
|
||||
width: 100%
|
||||
|
||||
.u-float-left
|
||||
float: left
|
||||
margin-right: 1rem
|
||||
|
@ -166,6 +169,9 @@
|
|||
.u-padding-medium
|
||||
padding: 1.8rem
|
||||
|
||||
.u-padding-top
|
||||
padding-top: 2rem
|
||||
|
||||
.u-inline-block
|
||||
display: inline-block
|
||||
|
||||
|
|
|
@ -25,7 +25,7 @@
|
|||
display: inline-block
|
||||
font-size: 0.6em
|
||||
font-weight: bold
|
||||
padding-right: 1.25rem
|
||||
padding-right: 1em
|
||||
margin-left: -3.75rem
|
||||
text-align: right
|
||||
width: 2.5rem
|
||||
|
|
|
@ -456,24 +456,11 @@ p
|
|||
}
|
||||
|
||||
p
|
||||
| To add a lookup lemmatizer to your language, import the #[code LOOKUP]
|
||||
| table and #[code Lemmatizer], and create a new classmethod:
|
||||
| To provide a lookup lemmatizer for your language, import the lookup table
|
||||
| and add it to the #[code Language] class as #[code lemma_lookup]:
|
||||
|
||||
|
||||
+code("__init__py (excerpt)").
|
||||
# other imports here, plus lookup table and lookup lemmatizer
|
||||
from .lemmatizer import LOOKUP
|
||||
from ...lemmatizerlookup import Lemmatizer
|
||||
|
||||
class Xxxxx(Language):
|
||||
lang = 'xx'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
# other language defaults here
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
+code.
|
||||
lemma_lookup = dict(LOOKUP)
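p
    |  For context, here's a minimal sketch of where this line usually lives in a
    |  language's #[code __init__.py] – #[code Xxxxx] and #[code 'xx'] are
    |  placeholders for your language, not a real language class:

+code("__init__.py (sketch)").
    # other imports here, plus the lookup table
    from .lemmatizer import LOOKUP

    class Xxxxx(Language):
        lang = 'xx'

        class Defaults(Language.Defaults):
            # other language defaults here
            lemma_lookup = dict(LOOKUP)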
|
||||
|
||||
+h(3, "tag-map") Tag map
|
||||
|
||||
|
|
|
@ -103,10 +103,10 @@
|
|||
"title": "Language Processing Pipelines",
|
||||
"next": "vectors-similarity",
|
||||
"menu": {
|
||||
"How pipelines work": "pipelines",
|
||||
"Examples": "examples",
|
||||
"How Pipelines Work": "pipelines",
|
||||
"Custom Components": "custom-components",
|
||||
"Developing Extensions": "extensions",
|
||||
"Multi-threading": "multithreading",
|
||||
"User Hooks": "user-hooks",
|
||||
"Serialization": "serialization"
|
||||
}
|
||||
},
|
||||
|
@ -195,6 +195,7 @@
|
|||
"teaser": "Full code examples you can modify and run.",
|
||||
"next": "resources",
|
||||
"menu": {
|
||||
"Pipeline": "pipeline",
|
||||
"Matching": "matching",
|
||||
"Training": "training",
|
||||
"Deep Learning": "deep-learning"
|
||||
|
|
369
website/usage/_processing-pipelines/_custom-components.jade
Normal file
|
@ -0,0 +1,369 @@
|
|||
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS
|
||||
|
||||
p
|
||||
| A component receives a #[code Doc] object and can modify it – for example,
|
||||
| by using the current weights to make a prediction and set some annotation
|
||||
| on the document. By adding a component to the pipeline, you'll get access
|
||||
| to the #[code Doc] at any point #[strong during processing] – instead of
|
||||
| only being able to modify it afterwards.
|
||||
|
||||
+aside-code("Example").
|
||||
def my_component(doc):
|
||||
# do something to the doc here
|
||||
return doc
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The #[code Doc] object processed by the previous component.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell The #[code Doc] object processed by this pipeline component.
|
||||
|
||||
p
|
||||
| Custom components can be added to the pipeline using the
|
||||
| #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you
|
||||
| can specify a component to add it #[strong before or after], tell
|
||||
| spaCy to add it #[strong first or last] in the pipeline, or define a
|
||||
| #[strong custom name]. If no name is set and no #[code name] attribute
|
||||
| is present on your component, the function name is used.
|
||||
|
||||
+code("Adding pipeline components").
|
||||
def my_component(doc):
|
||||
print("After tokenization, this doc has %s tokens." % len(doc))
|
||||
if len(doc) < 10:
|
||||
print("This is a pretty short document.")
|
||||
return doc
|
||||
|
||||
nlp = spacy.load('en')
|
||||
nlp.add_pipe(my_component, name='print_info', first=True)
|
||||
print(nlp.pipe_names) # ['print_info', 'tagger', 'parser', 'ner']
|
||||
doc = nlp(u"This is a sentence.")
|
||||
|
||||
p
|
||||
| Of course, you can also wrap your component as a class to allow
|
||||
| initialising it with custom settings and holding state within the component.
|
||||
| This is useful for #[strong stateful components], especially ones which
|
||||
| #[strong depend on shared data].
|
||||
|
||||
+code.
|
||||
class MyComponent(object):
|
||||
name = 'print_info'
|
||||
|
||||
def __init__(self, vocab, short_limit=10):
|
||||
self.vocab = vocab
|
||||
self.short_limit = short_limit
|
||||
|
||||
def __call__(self, doc):
|
||||
if len(doc) < self.short_limit:
|
||||
print("This is a pretty short document.")
|
||||
return doc
|
||||
|
||||
my_component = MyComponent(nlp.vocab, short_limit=25)
|
||||
nlp.add_pipe(my_component, first=True)
|
||||
|
||||
+h(3, "custom-components-attributes")
|
||||
| Extension attributes on #[code Doc], #[code Span] and #[code Token]
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| As of v2.0, spaCy allows you to set any custom attributes and methods
|
||||
| on the #[code Doc], #[code Span] and #[code Token], which become
|
||||
| available as #[code Doc._], #[code Span._] and #[code Token._] – for
|
||||
| example, #[code Token._.my_attr]. This lets you store additional
|
||||
| information relevant to your application, add new features and
|
||||
| functionality to spaCy, and implement your own models trained with other
|
||||
| machine learning libraries. It also lets you take advantage of spaCy's
|
||||
| data structures and the #[code Doc] object as the "single source of
|
||||
| truth".
|
||||
|
||||
+aside("Why ._?")
|
||||
| Writing to a #[code ._] attribute instead of to the #[code Doc] directly
|
||||
| keeps a clearer separation and makes it easier to ensure backwards
|
||||
| compatibility. For example, if you've implemented your own #[code .coref]
|
||||
| property and spaCy claims it one day, it'll break your code. Similarly,
|
||||
| just by looking at the code, you'll immediately know what's built-in and
|
||||
| what's custom – for example, #[code doc.sentiment] is spaCy, while
|
||||
| #[code doc._.sent_score] isn't.
|
||||
|
||||
p
|
||||
| There are three main types of extensions, which can be defined using the
|
||||
| #[+api("doc#set_extension") #[code Doc.set_extension]],
|
||||
| #[+api("span#set_extension") #[code Span.set_extension]] and
|
||||
| #[+api("token#set_extension") #[code Token.set_extension]] methods.
|
||||
|
||||
+list("numbers")
|
||||
+item #[strong Attribute extensions].
|
||||
| Set a default value for an attribute, which can be overwritten
|
||||
| manually at any time. Attribute extensions work like "normal"
|
||||
| variables and are the quickest way to store arbitrary information
|
||||
| on a #[code Doc], #[code Span] or #[code Token].
|
||||
|
||||
+code-wrapper
|
||||
+code.
|
||||
Doc.set_extension('hello', default=True)
|
||||
assert doc._.hello
|
||||
doc._.hello = False
|
||||
|
||||
+item #[strong Property extensions].
|
||||
| Define a getter and an optional setter function. If no setter is
|
||||
| provided, the extension is immutable. Since the getter and setter
|
||||
| functions are only called when you #[em retrieve] the attribute,
|
||||
| you can also access values of previously added attribute extensions.
|
||||
| For example, a #[code Doc] getter can average over #[code Token]
|
||||
| attributes. For #[code Span] extensions, you'll almost always want
|
||||
| to use a property – otherwise, you'd have to write to
|
||||
| #[em every possible] #[code Span] in the #[code Doc] to set up the
|
||||
| values correctly.
|
||||
|
||||
+code-wrapper
|
||||
+code.
|
||||
Doc.set_extension('hello', getter=get_hello_value, setter=set_hello_value)
|
||||
assert doc._.hello
|
||||
doc._.hello = 'Hi!'
|
||||
|
||||
+item #[strong Method extensions].
|
||||
| Assign a function that becomes available as an object method. Method
|
||||
| extensions are always immutable. For more details and implementation
|
||||
| ideas, see
|
||||
| #[+a("/usage/examples#custom-components-attr-methods") these examples].
|
||||
|
||||
+code-wrapper
|
||||
+code.o-no-block.
|
||||
Doc.set_extension('hello', method=lambda doc, name: 'Hi {}!'.format(name))
|
||||
assert doc._.hello('Bob') == 'Hi Bob!'
|
||||
|
||||
p
|
||||
| Before you can access a custom extension, you need to register it using
|
||||
| the #[code set_extension] method on the object you want
|
||||
| to add it to, e.g. the #[code Doc]. Keep in mind that extensions are
|
||||
| always #[strong added globally] and not just on a particular instance.
|
||||
| If an attribute of the same name
|
||||
| already exists, or if you're trying to access an attribute that hasn't
|
||||
| been registered, spaCy will raise an #[code AttributeError].
|
||||
|
||||
+code("Example").
|
||||
from spacy.tokens import Doc, Span, Token
|
||||
|
||||
fruits = ['apple', 'pear', 'banana', 'orange', 'strawberry']
|
||||
is_fruit_getter = lambda token: token.text in fruits
|
||||
has_fruit_getter = lambda obj: any([t.text in fruits for t in obj])
|
||||
|
||||
Token.set_extension('is_fruit', getter=is_fruit_getter)
|
||||
Doc.set_extension('has_fruit', getter=has_fruit_getter)
|
||||
Span.set_extension('has_fruit', getter=has_fruit_getter)
|
||||
|
||||
+aside-code("Usage example").
|
||||
doc = nlp(u"I have an apple and a melon")
|
||||
assert doc[3]._.is_fruit # get Token attributes
|
||||
assert not doc[0]._.is_fruit
|
||||
assert doc._.has_fruit # get Doc attributes
|
||||
assert doc[1:4]._.has_fruit # get Span attributes
|
||||
|
||||
p
|
||||
| Once you've registered your custom attribute, you can also use the
|
||||
| built-in #[code set], #[code get] and #[code has] methods to modify and
|
||||
| retrieve the attributes. This is especially useful if you want to pass in
|
||||
| a string instead of calling #[code doc._.my_attr].
|
||||
|
||||
+table(["Method", "Description", "Valid for", "Example"])
|
||||
+row
|
||||
+cell #[code ._.set()]
|
||||
+cell Set a value for an attribute.
|
||||
+cell Attributes, mutable properties.
|
||||
+cell #[code.u-break token._.set('my_attr', True)]
|
||||
|
||||
+row
|
||||
+cell #[code ._.get()]
|
||||
+cell Get the value of an attribute.
|
||||
+cell Attributes, mutable properties, immutable properties, methods.
|
||||
+cell #[code.u-break my_attr = span._.get('my_attr')]
|
||||
|
||||
+row
|
||||
+cell #[code ._.has()]
|
||||
+cell Check if an attribute exists.
|
||||
+cell Attributes, mutable properties, immutable properties, methods.
|
||||
+cell #[code.u-break doc._.has('my_attr')]
|
||||
|
||||
+infobox("How the ._ is implemented")
|
||||
| Extension definitions – the defaults, methods, getters and setters you
|
||||
| pass in to #[code set_extension] – are stored in class attributes on the
|
||||
| #[code Underscore] class. If you write to an extension attribute, e.g.
|
||||
| #[code doc._.hello = True], the data is stored within the
|
||||
| #[+api("doc#attributes") #[code Doc.user_data]] dictionary. To keep the
|
||||
| underscore data separate from your other dictionary entries, the string
|
||||
| #[code "._."] is placed before the name, in a tuple.
|
||||
|
||||
+h(4, "component-example1") Example: Custom sentence segmentation logic
|
||||
|
||||
p
|
||||
| Let's say you want to implement custom logic to improve spaCy's sentence
|
||||
| boundary detection. Currently, sentence segmentation is based on the
|
||||
| dependency parse, which doesn't always produce ideal results. The custom
|
||||
| logic should therefore be applied #[strong after] tokenization, but
|
||||
| #[strong before] the dependency parsing – this way, the parser can also
|
||||
| take advantage of the sentence boundaries.
|
||||
|
||||
+code.
|
||||
def sbd_component(doc):
|
||||
for i, token in enumerate(doc[:-2]):
|
||||
# define sentence start if period + titlecase token
|
||||
if token.text == '.' and doc[i+1].is_title:
|
||||
doc[i+1].sent_start = True
|
||||
return doc
|
||||
|
||||
nlp = spacy.load('en')
|
||||
nlp.add_pipe(sbd_component, before='parser') # insert before the parser
|
||||
|
||||
+h(4, "component-example2")
|
||||
| Example: Pipeline component for entity matching and tagging with
|
||||
| custom attributes
|
||||
|
||||
p
|
||||
| This example shows how to create a spaCy extension that takes a
|
||||
| terminology list (in this case, single- and multi-word company names),
|
||||
| matches the occurrences in a document, labels them as #[code ORG] entities,
|
||||
| merges the tokens and sets custom #[code is_tech_org] and
|
||||
| #[code has_tech_org] attributes. For efficient matching, the example uses
|
||||
| the #[+api("phrasematcher") #[code PhraseMatcher]] which accepts
|
||||
| #[code Doc] objects as match patterns and works well for large
|
||||
| terminology lists. It also ensures your patterns will always match, even
|
||||
| when you customise spaCy's tokenization rules. When you call #[code nlp]
|
||||
| on a text, the custom pipeline component is applied to the #[code Doc].
|
||||
|
||||
+github("spacy", "examples/pipeline/custom_component_entities.py", false, 500)
|
||||
|
||||
p
|
||||
| Wrapping this functionality in a
|
||||
| pipeline component allows you to reuse the module with different
|
||||
| settings, and have all pre-processing taken care of when you call
|
||||
| #[code nlp] on your text and receive a #[code Doc] object.
|
||||
|
||||
+h(4, "component-example3")
|
||||
| Example: Pipeline component for GPE entities and country meta data via a
|
||||
| REST API
|
||||
|
||||
p
|
||||
| This example shows the implementation of a pipeline component
|
||||
| that fetches country meta data via the
|
||||
| #[+a("https://restcountries.eu") REST Countries API] sets entity
|
||||
| annotations for countries, merges entities into one token and
|
||||
| sets custom attributes on the #[code Doc], #[code Span] and
|
||||
| #[code Token] – for example, the capital, latitude/longitude coordinates
|
||||
| and even the country flag.
|
||||
|
||||
+github("spacy", "examples/pipeline/custom_component_countries_api.py", false, 500)
|
||||
|
||||
p
|
||||
| In this case, all data can be fetched on initialisation in one request.
|
||||
| However, if you're working with text that contains incomplete country
|
||||
| names, spelling mistakes or foreign-language versions, you could also
|
||||
| implement a #[code like_country]-style getter function that makes a
|
||||
| request to the search API endpoint and returns the best-matching
|
||||
| result.
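p
    |  A hedged sketch of what such a getter could look like – the endpoint URL
    |  and the #[code like_country] attribute name are illustrative only, not
    |  part of the example above:

+code.
    import requests
    from spacy.tokens import Token

    def like_country_getter(token):
        # query the country search endpoint on demand (one request per access!)
        r = requests.get('https://restcountries.eu/rest/v2/name/{}'.format(token.text))
        return r.status_code == 200 and len(r.json()) > 0

    Token.set_extension('like_country', getter=like_country_getter)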
|
||||
|
||||
+h(4, "custom-components-usage-ideas") Other usage ideas
|
||||
|
||||
+list
|
||||
+item
|
||||
| #[strong Adding new features and hooking in models]. For example,
|
||||
| a sentiment analysis model, or your preferred solution for
|
||||
| lemmatization or sentiment analysis. spaCy's built-in tagger,
|
||||
| parser and entity recognizer respect annotations that were already
|
||||
| set on the #[code Doc] in a previous step of the pipeline.
|
||||
+item
|
||||
| #[strong Integrating other libraries and APIs]. For example, your
|
||||
| pipeline component can write additional information and data
|
||||
| directly to the #[code Doc] or #[code Token] as custom attributes,
|
||||
| while making sure no information is lost in the process. This can
|
||||
| be output generated by other libraries and models, or an external
|
||||
| service with a REST API.
|
||||
+item
|
||||
| #[strong Debugging and logging]. For example, a component which
|
||||
| stores and/or exports relevant information about the current state
|
||||
| of the processed document, and can be inserted at any point of your
|
||||
| pipeline – see the minimal sketch below.
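p
    |  A minimal, illustrative sketch of such a debugging component – the
    |  component name and the logged details are arbitrary choices:

+code("Debug logging component (sketch)").
    def debug_component(doc):
        # log some basic facts about the current state of the Doc
        print('Tokens: {}, entities: {}'.format(len(doc), len(doc.ents)))
        return doc

    nlp.add_pipe(debug_component, name='debug_logger', last=True)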
|
||||
|
||||
+infobox("Developing third-party extensions")
|
||||
| The new pipeline management and custom attributes finally make it easy
|
||||
| to develop your own spaCy extensions and plugins and share them with
|
||||
| others. Extensions can claim their own #[code ._] namespace and exist as
|
||||
| standalone packages. If you're developing a tool or library and want to
|
||||
| make it easy for others to use it with spaCy and add it to their
|
||||
| pipeline, all you have to do is expose a function that takes a
|
||||
| #[code Doc], modifies it and returns it. For more details and
|
||||
| #[strong best practices], see the section on
|
||||
| #[+a("#extensions") developing spaCy extensions].
|
||||
|
||||
+h(3, "custom-components-user-hooks") User hooks
|
||||
|
||||
p
|
||||
| While it's generally recommended to use the #[code Doc._], #[code Span._]
|
||||
| and #[code Token._] proxies to add your own custom attributes, spaCy
|
||||
| offers a few exceptions to allow #[strong customising the built-in methods]
|
||||
| like #[+api("doc#similarity") #[code Doc.similarity]] or
|
||||
| #[+api("doc#vector") #[code Doc.vector]]. with your own hooks, which can
|
||||
| rely on statistical models you train yourself. For instance, you can
|
||||
| provide your own on-the-fly sentence segmentation algorithm or document
|
||||
| similarity method.
|
||||
|
||||
p
|
||||
| Hooks let you customize some of the behaviours of the #[code Doc],
|
||||
| #[code Span] or #[code Token] objects by adding a component to the
|
||||
| pipeline. For instance, to customize the
|
||||
| #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a
|
||||
| component that sets a custom function to
|
||||
| #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity]
|
||||
| method will check the #[code user_hooks] dict, and delegate to your
|
||||
| function if you've set one. Similar results can be achieved by setting
|
||||
| functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks].
|
||||
|
||||
+aside("Implementation note")
|
||||
| The hooks live on the #[code Doc] object because the #[code Span] and
|
||||
| #[code Token] objects are created lazily, and don't own any data. They
|
||||
| just proxy to their parent #[code Doc]. This turns out to be convenient
|
||||
| here — we only have to worry about installing hooks in one place.
|
||||
|
||||
+table(["Name", "Customises"])
|
||||
+row
|
||||
+cell #[code user_hooks]
|
||||
+cell
|
||||
+api("doc#vector") #[code Doc.vector]
|
||||
+api("doc#has_vector") #[code Doc.has_vector]
|
||||
+api("doc#vector_norm") #[code Doc.vector_norm]
|
||||
+api("doc#sents") #[code Doc.sents]
|
||||
|
||||
+row
|
||||
+cell #[code user_token_hooks]
|
||||
+cell
|
||||
+api("token#similarity") #[code Token.similarity]
|
||||
+api("token#vector") #[code Token.vector]
|
||||
+api("token#has_vector") #[code Token.has_vector]
|
||||
+api("token#vector_norm") #[code Token.vector_norm]
|
||||
+api("token#conjuncts") #[code Token.conjuncts]
|
||||
|
||||
+row
|
||||
+cell #[code user_span_hooks]
|
||||
+cell
|
||||
+api("span#similarity") #[code Span.similarity]
|
||||
+api("span#vector") #[code Span.vector]
|
||||
+api("span#has_vector") #[code Span.has_vector]
|
||||
+api("span#vector_norm") #[code Span.vector_norm]
|
||||
+api("span#root") #[code Span.root]
|
||||
|
||||
+code("Add custom similarity hooks").
|
||||
class SimilarityModel(object):
|
||||
def __init__(self, model):
|
||||
self._model = model
|
||||
|
||||
def __call__(self, doc):
|
||||
doc.user_hooks['similarity'] = self.similarity
|
||||
doc.user_span_hooks['similarity'] = self.similarity
|
||||
doc.user_token_hooks['similarity'] = self.similarity
        return doc
|
||||
|
||||
def similarity(self, obj1, obj2):
|
||||
y = self._model([obj1.vector, obj2.vector])
|
||||
return float(y[0])
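p
    |  To use the hooks, the component only needs to be added to the pipeline –
    |  the #[code model] below is a stand-in for whatever similarity model you
    |  bring yourself:

+code.
    similarity = SimilarityModel(model)  # model is assumed to exist
    nlp.add_pipe(similarity)
    doc1 = nlp(u'The weather is nice today.')
    doc2 = nlp(u'It is quite sunny outside.')
    print(doc1.similarity(doc2))  # delegates to the custom hook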
|
|
@ -1,126 +0,0 @@
|
|||
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > EXAMPLES
|
||||
|
||||
p
|
||||
| To see real-world examples of pipeline factories and components in action,
|
||||
| you can have a look at the source of spaCy's built-in components, e.g.
|
||||
| the #[+api("tagger") #[code Tagger]], #[+api("parser") #[code Parser]] or
|
||||
| #[+api("entityrecognizer") #[code EntityRecongnizer]].
|
||||
|
||||
+h(3, "example1") Example: Custom sentence segmentation logic
|
||||
|
||||
p
|
||||
| Let's say you want to implement custom logic to improve spaCy's sentence
|
||||
| boundary detection. Currently, sentence segmentation is based on the
|
||||
| dependency parse, which doesn't always produce ideal results. The custom
|
||||
| logic should therefore be applied #[strong after] tokenization, but
|
||||
| #[strong before] the dependency parsing – this way, the parser can also
|
||||
| take advantage of the sentence boundaries.
|
||||
|
||||
+code.
|
||||
def sbd_component(doc):
|
||||
for i, token in enumerate(doc[:-2]):
|
||||
# define sentence start if period + titlecase token
|
||||
if token.text == '.' and doc[i+1].is_title:
|
||||
doc[i+1].sent_start = True
|
||||
return doc
|
||||
|
||||
p
|
||||
| In this case, we simply want to add the component to the existing
|
||||
| pipeline of the English model. We can do this by inserting it at index 0
|
||||
| of #[code nlp.pipeline]:
|
||||
|
||||
+code.
|
||||
nlp = spacy.load('en')
|
||||
nlp.pipeline.insert(0, sbd_component)
|
||||
|
||||
p
|
||||
| When you call #[code nlp] on some text, spaCy will tokenize it to create
|
||||
| a #[code Doc] object, and first call #[code sbd_component] on it, followed
|
||||
| by the model's default pipeline.
|
||||
|
||||
+h(3, "example2") Example: Sentiment model
|
||||
|
||||
p
|
||||
| Let's say you have trained your own document sentiment model on English
|
||||
| text. After tokenization, you want spaCy to first execute the
|
||||
| #[strong default tensorizer], followed by a custom
|
||||
| #[strong sentiment component] that adds a #[code .sentiment]
|
||||
| property to the #[code Doc], containing your model's sentiment prediction.
|
||||
|
||||
p
|
||||
| Your component class will have a #[code from_disk()] method that spaCy
|
||||
| calls to load the model data. When called, the component will compute
|
||||
| the sentiment score, add it to the #[code Doc] and return the modified
|
||||
| document. Optionally, the component can include an #[code update()] method
|
||||
| to allow training the model.
|
||||
|
||||
+code.
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
class SentimentComponent(object):
|
||||
def __init__(self, vocab):
|
||||
self.weights = None
|
||||
|
||||
def __call__(self, doc):
|
||||
doc.sentiment = sum(self.weights*doc.vector) # set sentiment property
|
||||
return doc
|
||||
|
||||
def from_disk(self, path): # path = model path + factory ID ('sentiment')
|
||||
self.weights = pickle.load(Path(path) / 'weights.bin') # load weights
|
||||
return self
|
||||
|
||||
def update(self, doc, gold): # update weights – allows training!
|
||||
prediction = sum(self.weights*doc.vector)
|
||||
self.weights -= 0.001*doc.vector*(prediction-gold.sentiment)
|
||||
|
||||
p
|
||||
| The factory will initialise the component with the #[code Vocab] object.
|
||||
| To be able to add it to your model's pipeline as #[code 'sentiment'],
|
||||
| it also needs to be registered via
|
||||
| #[+api("spacy#set_factory") #[code set_factory()]].
|
||||
|
||||
+code.
|
||||
def sentiment_factory(vocab):
|
||||
component = SentimentComponent(vocab) # initialise component
|
||||
return component
|
||||
|
||||
spacy.set_factory('sentiment', sentiment_factory)
|
||||
|
||||
p
|
||||
| The above code should be #[strong shipped with your model]. You can use
|
||||
| the #[+api("cli#package") #[code package]] command to create all required
|
||||
| files and directories. The model package will include an
|
||||
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]]
|
||||
| with a #[code load()] method, that will initialise the language class with
|
||||
| the model's pipeline and call the #[code from_disk()] method to load
|
||||
| the model data.
|
||||
|
||||
p
|
||||
| In the model package's meta.json, specify the language class and pipeline
|
||||
| IDs:
|
||||
|
||||
+code("meta.json (excerpt)", "json").
|
||||
{
|
||||
"name": "sentiment_model",
|
||||
"lang": "en",
|
||||
"version": "1.0.0",
|
||||
"spacy_version": ">=2.0.0,<3.0.0",
|
||||
"pipeline": ["tensorizer", "sentiment"]
|
||||
}
|
||||
|
||||
p
|
||||
| When you load your new model, spaCy will call the model's #[code load()]
|
||||
| method. This will return a #[code Language] object with a pipeline
|
||||
| containing the default tensorizer, and the sentiment component returned
|
||||
| by your custom #[code "sentiment"] factory.
|
||||
|
||||
+code.
|
||||
nlp = spacy.load('en_sentiment_model')
|
||||
doc = nlp(u'I love pizza')
|
||||
assert doc.sentiment
|
||||
|
||||
+infobox("Saving and loading models")
|
||||
| For more information and a detailed guide on how to package your model,
|
||||
| see the documentation on
|
||||
| #[+a("/usage/training#saving-loading") saving and loading models].
|
110
website/usage/_processing-pipelines/_extensions.jade
Normal file
|
@ -0,0 +1,110 @@
|
|||
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > DEVELOPING EXTENSIONS
|
||||
|
||||
p
|
||||
| We're very excited about all the new possibilities for community
|
||||
| extensions and plugins in spaCy v2.0, and we can't wait to see what
|
||||
| you build with it! To get you started, here are a few tips, tricks and
|
||||
| best practices:
|
||||
|
||||
+list
|
||||
+item
|
||||
| Make sure to choose a #[strong descriptive and specific name] for
|
||||
| your pipeline component class, and set it as its #[code name]
|
||||
| attribute. Avoid names that are too common or likely to clash with
|
||||
| built-in or a user's other custom components. While it's fine to call
|
||||
| your package "spacy_my_extension", avoid component names including
|
||||
| "spacy", since this can easily lead to confusion.
|
||||
|
||||
+code-wrapper
|
||||
+code-new name = 'myapp_lemmatizer'
|
||||
+code-old name = 'lemmatizer'
|
||||
|
||||
+item
|
||||
| When writing to #[code Doc], #[code Token] or #[code Span] objects,
|
||||
| #[strong use getter functions] wherever possible, and avoid setting
|
||||
| values explicitly. Tokens and spans don't own any data themselves,
|
||||
| so you should provide a function that allows them to compute the
|
||||
| values instead of writing static properties to individual objects.
|
||||
|
||||
+code-wrapper
|
||||
+code-new.
|
||||
is_fruit = lambda token: token.text in ('apple', 'orange')
|
||||
Token.set_extension('is_fruit', getter=is_fruit)
|
||||
+code-old.
|
||||
token._.set_extension('is_fruit', default=False)
|
||||
if token.text in ('apple', 'orange'):
|
||||
token._.set('is_fruit', True)
|
||||
|
||||
+item
|
||||
| Always add your custom attributes to the #[strong global] #[code Doc],
|
||||
| #[code Token] or #[code Span] objects, not a particular instance of
|
||||
| them. Add the attributes #[strong as early as possible], e.g. in
|
||||
| your extension's #[code __init__] method or in the global scope of
|
||||
| your module. This means that in the case of namespace collisions,
|
||||
| the user will see an error immediately, not just when they run their
|
||||
| pipeline.
|
||||
|
||||
+code-wrapper
|
||||
+code-new.
|
||||
from spacy.tokens import Doc
|
||||
def __init__(self, attr='my_attr'):
|
||||
Doc.set_extension(attr, getter=self.get_doc_attr)
|
||||
+code-old.
|
||||
def __call__(doc):
|
||||
doc.set_extension('my_attr', getter=self.get_doc_attr)
|
||||
|
||||
+item
|
||||
| If your extension is setting properties on the #[code Doc],
|
||||
| #[code Token] or #[code Span], include an option to
|
||||
| #[strong let the user change those attribute names]. This makes
|
||||
| it easier to avoid namespace collisions and accommodate users with
|
||||
| different naming preferences. We recommend adding an #[code attrs]
|
||||
| argument to the #[code __init__] method of your class so you can
|
||||
| write the names to class attributes and reuse them across your
|
||||
| component.
|
||||
|
||||
+code-wrapper
|
||||
+code-new Doc.set_extension(self.doc_attr, default='some value')
|
||||
+code-old Doc.set_extension('my_doc_attr', default='some value')
|
||||
|
||||
+item
|
||||
| Ideally, extensions should be #[strong standalone packages] with
|
||||
| spaCy and optionally, other packages specified as a dependency. They
|
||||
| can freely assign to their own #[code ._] namespace, but should stick
|
||||
| to that. If your extension's only job is to provide a better
|
||||
| #[code .similarity] implementation, and your docs state this
|
||||
| explicitly, there's no problem with writing to the
|
||||
| #[+a("#custom-components-user-hooks") #[code user_hooks]], and
|
||||
| overwriting spaCy's built-in method. However, a third-party
|
||||
| extension should #[strong never silently overwrite built-ins], or
|
||||
| attributes set by other extensions.
|
||||
|
||||
+item
|
||||
| If you're looking to publish a model that depends on a custom
|
||||
| pipeline component, you can either #[strong require it] in the model
|
||||
| package's dependencies, or – if the component is specific and
|
||||
| lightweight – choose to #[strong ship it with your model package]
|
||||
| and add it to the #[code Language] instance returned by the
|
||||
| model's #[code load()] method. For examples of this, check out the
|
||||
| implementations of spaCy's
|
||||
| #[+api("util#load_model_from_init_py") #[code load_model_from_init_py()]]
|
||||
| and #[+api("util#load_model_from_path") #[code load_model_from_path()]]
|
||||
| utility functions.
|
||||
|
||||
+code-wrapper
|
||||
+code-new.
|
||||
nlp.add_pipe(my_custom_component)
|
||||
return nlp.from_disk(model_path)
|
||||
|
||||
+item
|
||||
| Once you're ready to share your extension with others, make sure to
|
||||
| #[strong add docs and installation instructions] (you can
|
||||
| always link to this page for more info). Make it easy for others to
|
||||
| install and use your extension, for example by uploading it to
|
||||
| #[+a("https://pypi.python.org") PyPi]. If you're sharing your code on
|
||||
| GitHub, don't forget to tag it
|
||||
| with #[+a("https://github.com/search?q=topic%3Aspacy") #[code spacy]]
|
||||
| and #[+a("https://github.com/search?q=topic%3Aspacy-pipeline") #[code spacy-pipeline]]
|
||||
| to help people find it. If you post it on Twitter, feel free to tag
|
||||
| #[+a("https://twitter.com/" + SOCIAL.twitter) @#{SOCIAL.twitter}]
|
||||
| so we can check it out.
|
|
@ -11,7 +11,7 @@ p
|
|||
|
||||
p
|
||||
| When you load a model, spaCy first consults the model's
|
||||
| #[+a("/usage/saving-loading#models-generating") meta.json]. The
|
||||
| #[+a("/usage/saving-loading#models-generating") #[code meta.json]]. The
|
||||
| meta typically includes the model details, the ID of a language class,
|
||||
| and an optional list of pipeline components. spaCy then does the
|
||||
| following:
|
||||
|
@ -21,24 +21,26 @@ p
|
|||
"name": "example_model",
|
||||
"lang": "en"
|
||||
"description": "Example model for spaCy",
|
||||
"pipeline": ["tensorizer", "tagger"]
|
||||
"pipeline": ["tagger", "parser"]
|
||||
}
|
||||
|
||||
+list("numbers")
|
||||
+item
|
||||
| Look up #[strong pipeline IDs] in the available
|
||||
| #[strong pipeline factories].
|
||||
+item
|
||||
| Initialise the #[strong pipeline components] by calling their
|
||||
| factories with the #[code Vocab] as an argument. This gives each
|
||||
| factory and component access to the pipeline's shared data, like
|
||||
| strings, morphology and annotation scheme.
|
||||
+item
|
||||
| Load the #[strong language class and data] for the given ID via
|
||||
| #[+api("util.get_lang_class") #[code get_lang_class]].
|
||||
| #[+api("util.get_lang_class") #[code get_lang_class]] and initialise
|
||||
| it. The #[code Language] class contains the shared vocabulary,
|
||||
| tokenization rules and the language-specific annotation scheme.
|
||||
+item
|
||||
| Pass the path to the #[strong model data] to the #[code Language]
|
||||
| class and return it.
|
||||
| Iterate over the #[strong pipeline names] and create each component
|
||||
| using #[+api("language#create_pipe") #[code create_pipe]], which
|
||||
| looks them up in #[code Language.factories].
|
||||
+item
|
||||
| Add each pipeline component to the pipeline in order, using
|
||||
| #[+api("language#add_pipe") #[code add_pipe]].
|
||||
+item
|
||||
| Make the #[strong model data] available to the #[code Language] class
|
||||
| by calling #[+api("language#from_disk") #[code from_disk]] with the
|
||||
| path to the model data directory.
|
||||
|
||||
p
|
||||
| So when you call this...
|
||||
|
@ -47,12 +49,12 @@ p
|
|||
nlp = spacy.load('en')
|
||||
|
||||
p
|
||||
| ... the model tells spaCy to use the pipeline
|
||||
| ... the model tells spaCy to use the language #[code "en"] and the pipeline
|
||||
| #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will
|
||||
| then look up each string in its internal factories registry and
|
||||
| initialise the individual components. It'll then load
|
||||
| #[code spacy.lang.en.English], pass it the path to the model's data
|
||||
| directory, and return it for you to use as the #[code nlp] object.
|
||||
| then initialise #[code spacy.lang.en.English], and create each pipeline
|
||||
| component and add it to the processing pipeline. It'll then load in the
|
||||
| model's data from its data directory and return the modified
|
||||
| #[code Language] class for you to use as the #[code nlp] object.
|
||||
|
||||
p
|
||||
| Fundamentally, a #[+a("/models") spaCy model] consists of three
|
||||
|
@ -73,9 +75,12 @@ p
|
|||
pipeline = ['tensorizer', 'tagger', 'parser', 'ner']
|
||||
data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0'
|
||||
|
||||
cls = spacy.util.get_lang_class(lang) # 1. get Language instance, e.g. English()
|
||||
nlp = cls(pipeline=pipeline) # 2. initialise it with the pipeline
|
||||
nlp.from_disk(model_data_path) # 3. load in the binary data
|
||||
cls = spacy.util.get_lang_class(lang) # 1. get the Language class, e.g. English
|
||||
nlp = cls() # 2. initialise it
|
||||
for name in pipeline:
|
||||
component = nlp.create_pipe(name) # 3. create the pipeline components
|
||||
nlp.add_pipe(component) # 4. add the component to the pipeline
|
||||
nlp.from_disk(model_data_path) # 5. load in the binary data
|
||||
|
||||
p
|
||||
| When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and
|
||||
|
@ -87,124 +92,23 @@ p
|
|||
| document, which is then processed by the component next in the pipeline.
|
||||
|
||||
+code("The pipeline under the hood").
|
||||
doc = nlp.make_doc(u'This is a sentence')
|
||||
for proc in nlp.pipeline:
|
||||
doc = proc(doc)
|
||||
|
||||
+h(3, "creating") Creating pipeline components and factories
|
||||
doc = nlp.make_doc(u'This is a sentence') # create a Doc from raw text
|
||||
for name, proc in nlp.pipeline: # iterate over components in order
|
||||
doc = proc(doc) # apply each component
|
||||
|
||||
p
|
||||
| spaCy lets you customise the pipeline with your own components. Components
|
||||
| are functions that receive a #[code Doc] object, modify and return it.
|
||||
| If your component is stateful, you'll want to create a new one for each
|
||||
| pipeline. You can do that by defining and registering a factory which
|
||||
| receives the shared #[code Vocab] object and returns a component.
|
||||
|
||||
+h(4, "creating-component") Creating a component
|
||||
|
||||
p
|
||||
| A component receives a #[code Doc] object and
|
||||
| #[strong performs the actual processing] – for example, using the current
|
||||
| weights to make a prediction and set some annotation on the document. By
|
||||
| adding a component to the pipeline, you'll get access to the #[code Doc]
|
||||
| at any point #[strong during] processing – instead of only being able to
|
||||
| modify it afterwards.
|
||||
|
||||
+aside-code("Example").
|
||||
def my_component(doc):
|
||||
# do something to the doc here
|
||||
return doc
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The #[code Doc] object processed by the previous component.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell The #[code Doc] object processed by this pipeline component.
|
||||
|
||||
p
|
||||
| When creating a new #[code Language] class, you can pass it a list of
|
||||
| pipeline component functions to execute in that order. You can also
|
||||
| add it to an existing pipeline by modifying #[code nlp.pipeline] – just
|
||||
| be careful not to overwrite a pipeline or its components by accident!
|
||||
| The current processing pipeline is available as #[code nlp.pipeline],
|
||||
| which returns a list of #[code (name, component)] tuples, or
|
||||
| #[code nlp.pipe_names], which only returns a list of human-readable
|
||||
| component names.
|
||||
|
||||
+code.
|
||||
# Create a new Language object with a pipeline
|
||||
from spacy.language import Language
|
||||
nlp = Language(pipeline=[my_component])
|
||||
nlp.pipeline
|
||||
# [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
|
||||
nlp.pipe_names
|
||||
# ['tagger', 'parser', 'ner']
|
||||
|
||||
# Modify an existing pipeline
|
||||
nlp = spacy.load('en')
|
||||
nlp.pipeline.append(my_component)
|
||||
|
||||
+h(4, "creating-factory") Creating a factory
|
||||
|
||||
p
|
||||
| A factory is a #[strong function that returns a pipeline component].
|
||||
| It's called with the #[code Vocab] object, to give it access to the
|
||||
| shared data between components – for example, the strings, morphology,
|
||||
| vectors or annotation scheme. Factories are useful for creating
|
||||
| #[strong stateful components], especially ones which
|
||||
| #[strong depend on shared data].
|
||||
|
||||
+aside-code("Example").
|
||||
def my_factory(vocab):
|
||||
# load some state
|
||||
def my_component(doc):
|
||||
# process the doc
|
||||
return doc
|
||||
return my_component
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell
|
||||
| Shared data between components, including strings, morphology,
|
||||
| vectors etc.
|
||||
|
||||
+row("foot")
|
||||
+cell returns
|
||||
+cell callable
|
||||
+cell The pipeline component.
|
||||
|
||||
p
|
||||
| By creating a factory, you're essentially telling spaCy how to get the
|
||||
| pipeline component #[strong once the vocab is available]. Factories need to
|
||||
| be registered via #[+api("spacy#set_factory") #[code set_factory()]] and
|
||||
| by assigning them a unique ID. This ID can be added to the pipeline as a
|
||||
| string. When creating a pipeline, you're free to mix strings and
|
||||
| callable components:
|
||||
|
||||
+code.
|
||||
spacy.set_factory('my_factory', my_factory)
|
||||
nlp = Language(pipeline=['my_factory', my_other_component])
|
||||
|
||||
p
|
||||
| If spaCy comes across a string in the pipeline, it will try to resolve it
|
||||
| by looking it up in the available factories. The factory will then be
|
||||
| initialised with the #[code Vocab]. Providing factory names instead of
|
||||
| callables also makes it easy to specify them in the model's
|
||||
| #[+a("/usage/saving-loading#models-generating") meta.json]. If you're
|
||||
| training your own model and want to use one of spaCy's default components,
|
||||
| you won't have to worry about finding and implementing it either – to use
|
||||
| the default tagger, simply add #[code "tagger"] to the pipeline, and
|
||||
| #[strong spaCy will know what to do].
|
||||
|
||||
+infobox("Important note")
|
||||
| Because factories are #[strong resolved on initialisation] of the
|
||||
| #[code Language] class, it's #[strong not possible] to add them to the
|
||||
| pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only
|
||||
| works with individual component functions. To use factories, you need to
|
||||
| create a new #[code Language] object, or generate a
|
||||
| #[+a("/usage/training#models-generating") model package] with
|
||||
| a custom pipeline.
|
||||
|
||||
+h(3, "disabling") Disabling pipeline components
|
||||
+h(3, "disabling") Disabling and modifying pipeline components
|
||||
|
||||
p
|
||||
| If you don't need a particular component of the pipeline – for
|
||||
|
@ -217,16 +121,19 @@ p
|
|||
+code.
|
||||
nlp = spacy.load('en', disable=['parser', 'tagger'])
|
||||
nlp = English().from_disk('/model', disable=['tensorizer', 'ner'])
|
||||
doc = nlp(u"I don't want parsed", disable=['parser'])
|
||||
|
||||
p
|
||||
| Note that you can't write directly to #[code nlp.pipeline], as this list
|
||||
| holds the #[em actual components], not the IDs. However, if you know the
|
||||
| order of the components, you can still slice the list:
|
||||
| You can also use the #[+api("language#remove_pipe") #[code remove_pipe]]
|
||||
| method to remove pipeline components from an existing pipeline, the
|
||||
| #[+api("language#rename_pipe") #[code rename_pipe]] method to rename them,
|
||||
| or the #[+api("language#replace_pipe") #[code replace_pipe]] method
|
||||
| to replace them with a custom component entirely (more details on this
|
||||
| in the section on #[+a("#custom-components") custom components]).
|
||||
|
||||
+code.
|
||||
nlp = spacy.load('en')
|
||||
nlp.pipeline = nlp.pipeline[:2] # only use the first two components
|
||||
nlp.remove_pipe('parser')
|
||||
nlp.rename_pipe('ner', 'entityrecognizer')
|
||||
nlp.replace_pipe('tagger', my_custom_tagger)
|
||||
|
||||
+infobox("Important note: disabling pipeline components")
|
||||
.o-block
|
||||
|
@ -234,12 +141,14 @@ p
|
|||
| processing pipeline components, the #[code parser], #[code tagger]
|
||||
| and #[code entity] keyword arguments have been replaced with
|
||||
| #[code disable], which takes a list of pipeline component names.
|
||||
| This lets you disable both default and custom components when loading
|
||||
| This lets you disable pre-defined components when loading
|
||||
| a model, or initialising a Language class via
|
||||
| #[+api("language-from_disk") #[code from_disk]].
|
||||
|
||||
+code-new.
|
||||
nlp = spacy.load('en', disable=['tagger', 'ner'])
|
||||
doc = nlp(u"I don't want parsed", disable=['parser'])
|
||||
nlp = spacy.load('en', disable=['ner'])
|
||||
nlp.remove_pipe('parser')
|
||||
doc = nlp(u"I don't want parsed")
|
||||
+code-old.
|
||||
nlp = spacy.load('en', tagger=False, entity=False)
|
||||
doc = nlp(u"I don't want parsed", parse=False)
|
||||
|
|
|
@ -1,61 +0,0 @@
|
|||
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS
|
||||
|
||||
p
|
||||
| Hooks let you customize some of the behaviours of the #[code Doc],
|
||||
| #[code Span] or #[code Token] objects by adding a component to the
|
||||
| pipeline. For instance, to customize the
|
||||
| #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a
|
||||
| component that sets a custom function to
|
||||
| #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity]
|
||||
| method will check the #[code user_hooks] dict, and delegate to your
|
||||
| function if you've set one. Similar results can be achieved by setting
|
||||
| functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks].
|
||||
|
||||
+code("Polymorphic similarity example").
|
||||
span.similarity(doc)
|
||||
token.similarity(span)
|
||||
doc1.similarity(doc2)
|
||||
|
||||
p
|
||||
| By default, this just averages the vectors for each document, and
|
||||
| computes their cosine. Obviously, spaCy should make it easy for you to
|
||||
| install your own similarity model. This introduces a tricky design
|
||||
| challenge. The current solution is to add three more dicts to the
|
||||
| #[code Doc] object:
|
||||
|
||||
+aside("Implementation note")
|
||||
| The hooks live on the #[code Doc] object because the #[code Span] and
|
||||
| #[code Token] objects are created lazily, and don't own any data. They
|
||||
| just proxy to their parent #[code Doc]. This turns out to be convenient
|
||||
| here — we only have to worry about installing hooks in one place.
|
||||
|
||||
+table(["Name", "Description"])
|
||||
+row
|
||||
+cell #[code user_hooks]
|
||||
+cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents]
|
||||
|
||||
+row
|
||||
+cell #[code user_token_hooks]
|
||||
+cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts]
|
||||
|
||||
+row
|
||||
+cell #[code user_span_hooks]
|
||||
+cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root]
|
||||
|
||||
p
|
||||
| To sum up, here's an example of hooking in custom #[code .similarity()]
|
||||
| methods:
|
||||
|
||||
+code("Add custom similarity hooks").
|
||||
class SimilarityModel(object):
|
||||
def __init__(self, model):
|
||||
self._model = model
|
||||
|
||||
def __call__(self, doc):
|
||||
doc.user_hooks['similarity'] = self.similarity
|
||||
doc.user_span_hooks['similarity'] = self.similarity
|
||||
doc.user_token_hooks['similarity'] = self.similarity
|
||||
|
||||
def similarity(self, obj1, obj2):
|
||||
y = self._model([obj1.vector, obj2.vector])
|
||||
return float(y[0])
|
|
@ -175,7 +175,7 @@ p
|
|||
|
||||
+code.
|
||||
import spacy
|
||||
from spacy.tokens.doc import Doc
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
nlp = spacy.load('en')
|
||||
|
|
|
@ -61,7 +61,7 @@ p
|
|||
output_path.open('w', encoding='utf-8').write(svg)
|
||||
|
||||
p
|
||||
| The above code will generate the dependency visualizations and them to
|
||||
| The above code will generate the dependency visualizations as
|
||||
| two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
|
||||
|
||||
|
||||
|
|
|
@ -2,6 +2,44 @@
|
|||
|
||||
include ../_includes/_mixins
|
||||
|
||||
+section("pipeline")
|
||||
+h(3, "custom-components-entities") Custom pipeline components and attribute extensions
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| This example shows the implementation of a pipeline component
|
||||
| that sets entity annotations based on a list of single or
|
||||
| multiple-word company names, merges entities into one token and
|
||||
| sets custom attributes on the #[code Doc], #[code Span] and
|
||||
| #[code Token].
|
||||
|
||||
+github("spacy", "examples/pipeline/custom_component_entities.py")
|
||||
|
||||
+h(3, "custom-components-api")
|
||||
| Custom pipeline components and attribute extensions via a REST API
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| This example shows the implementation of a pipeline component
|
||||
| that fetches country meta data via the
|
||||
| #[+a("https://restcountries.eu") REST Countries API] sets entity
|
||||
| annotations for countries, merges entities into one token and
|
||||
| sets custom attributes on the #[code Doc], #[code Span] and
|
||||
| #[code Token] – for example, the capital, latitude/longitude
|
||||
| coordinates and the country flag.
|
||||
|
||||
+github("spacy", "examples/pipeline/custom_component_countries_api.py")
|
||||
|
||||
+h(3, "custom-components-attr-methods") Custom method extensions
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| A collection of snippets showing examples of extensions adding
|
||||
| custom methods to the #[code Doc], #[code Token] and
|
||||
| #[code Span].
|
||||
|
||||
+github("spacy", "examples/pipeline/custom_attr_methods.py")
|
||||
|
||||
+section("matching")
|
||||
+h(3, "matcher") Using spaCy's rule-based matcher
|
||||
|
||||
|
|
|
@ -8,18 +8,18 @@ include _spacy-101/_pipelines
|
|||
+h(2, "pipelines") How pipelines work
|
||||
include _processing-pipelines/_pipelines
|
||||
|
||||
+section("examples")
|
||||
+h(2, "examples") Examples
|
||||
include _processing-pipelines/_examples
|
||||
+section("custom-components")
|
||||
+h(2, "custom-components") Creating custom pipeline components
|
||||
include _processing-pipelines/_custom-components
|
||||
|
||||
+section("extensions")
|
||||
+h(2, "extensions") Developing spaCy extensions
|
||||
include _processing-pipelines/_extensions
|
||||
|
||||
+section("multithreading")
|
||||
+h(2, "multithreading") Multi-threading
|
||||
include _processing-pipelines/_multithreading
|
||||
|
||||
+section("user-hooks")
|
||||
+h(2, "user-hooks") User hooks
|
||||
include _processing-pipelines/_user-hooks
|
||||
|
||||
+section("serialization")
|
||||
+h(2, "serialization") Serialization
|
||||
include _processing-pipelines/_serialization
|
||||
|
|
|
@ -102,30 +102,36 @@ p
|
|||
+h(3, "features-pipelines") Improved processing pipelines
|
||||
|
||||
+aside-code("Example").
|
||||
# Modify an existing pipeline
|
||||
nlp = spacy.load('en')
|
||||
nlp.pipeline.append(my_component)
|
||||
# Set custom attributes
|
||||
Doc.set_extension('my_attr', default=False)
|
||||
Token.set_extension('my_attr', getter=my_token_getter)
|
||||
assert doc._.my_attr, token._.my_attr
|
||||
|
||||
# Register a factory to create a component
|
||||
spacy.set_factory('my_factory', my_factory)
|
||||
nlp = Language(pipeline=['my_factory', mycomponent])
|
||||
# Add components to the pipeline
|
||||
my_component = lambda doc: doc
|
||||
nlp.add_pipe(my_component)
|
||||
|
||||
p
|
||||
| It's now much easier to #[strong customise the pipeline] with your own
|
||||
| components, functions that receive a #[code Doc] object, modify and
|
||||
| return it. If your component is stateful, you can define and register a
|
||||
| factory which receives the shared #[code Vocab] object and returns a
|
||||
| component. spaCy's default components can be added to your pipeline by
|
||||
| using their string IDs. This way, you won't have to worry about finding
|
||||
| and implementing them – simply add #[code "tagger"] to the pipeline,
|
||||
| and spaCy will know what to do.
|
||||
| components: functions that receive a #[code Doc] object, modify and
|
||||
| return it. Extensions let you write any
|
||||
| #[strong attributes, properties and methods] to the #[code Doc],
|
||||
| #[code Token] and #[code Span]. You can add data, implement new
|
||||
| features, integrate other libraries with spaCy or plug in your own
|
||||
| machine learning models.
|
||||
|
||||
+image
|
||||
include ../assets/img/pipeline.svg
|
||||
|
||||
+infobox
|
||||
| #[+label-inline API:] #[+api("language") #[code Language]]
|
||||
| #[+label-inline Usage:] #[+a("/usage/language-processing-pipeline") Processing text]
|
||||
| #[+label-inline API:] #[+api("language") #[code Language]],
|
||||
| #[+api("doc#set_extension") #[code Doc.set_extension]],
|
||||
| #[+api("span#set_extension") #[code Span.set_extension]],
|
||||
| #[+api("token#set_extension") #[code Token.set_extension]]
|
||||
| #[+label-inline Usage:]
|
||||
| #[+a("/usage/processing-pipelines") Processing pipelines]
|
||||
| #[+label-inline Code:]
|
||||
| #[+src("/usage/examples#section-pipeline") Pipeline examples]
|
||||
|
||||
+h(3, "features-text-classification") Text classification
|
||||
|
||||
|
@ -478,15 +484,16 @@ p
|
|||
p
|
||||
| If you've been using custom pipeline components, check out the new
|
||||
| guide on #[+a("/usage/language-processing-pipelines") processing pipelines].
|
||||
| Appending functions to the pipeline still works – but you might be able
|
||||
| to make this more convenient by registering "component factories".
|
||||
| Components of the processing pipeline can now be disabled by passing a
|
||||
| list of their names to the #[code disable] keyword argument on loading
|
||||
| or processing.
|
||||
| Appending functions to the pipeline still works – but the
|
||||
| #[+api("language#add_pipe") #[code add_pipe]] methods now makes this
|
||||
| much more convenient. Components of the processing pipeline can now
|
||||
| be disabled by passing a list of their names to the #[code disable]
|
||||
| keyword argument on load, or by simply removing them from the
|
||||
| pipeline altogether.
|
||||
|
||||
+code-new.
|
||||
nlp = spacy.load('en', disable=['tagger', 'ner'])
|
||||
doc = nlp(u"I don't want parsed", disable=['parser'])
|
||||
nlp.remove_pipe('parser')
|
||||
+code-old.
|
||||
nlp = spacy.load('en', tagger=False, entity=False)
|
||||
doc = nlp(u"I don't want parsed", parse=False)
|
||||
|
|