diff --git a/examples/pipeline/custom_attr_methods.py b/examples/pipeline/custom_attr_methods.py new file mode 100644 index 000000000..9b1a8325d --- /dev/null +++ b/examples/pipeline/custom_attr_methods.py @@ -0,0 +1,52 @@ +# coding: utf-8 +"""This example contains several snippets of methods that can be set via custom +Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like +they're "bound" to the object and are partially applied – i.e. the object +they're called on is passed in as the first argument.""" +from __future__ import unicode_literals + +from spacy.lang.en import English +from spacy.tokens import Doc, Span +from spacy import displacy +from pathlib import Path + + +def to_html(doc, output='/tmp', style='dep'): + """Doc method extension for saving the current state as a displaCy + visualization. + """ + # generate filename from first six non-punct tokens + file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html' + output_path = Path(output) / file_name + html = displacy.render(doc, style=style, page=True) # render markup + output_path.open('w', encoding='utf-8').write(html) # save to file + print('Saved HTML to {}'.format(output_path)) + + +Doc.set_extension('to_html', method=to_html) + +nlp = English() +doc = nlp(u"This is a sentence about Apple.") +# add entity manually for demo purposes, to make it work without a model +doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])] +doc._.to_html(style='ent') + + +def overlap_tokens(doc, other_doc): + """Get the tokens from the original Doc that are also in the comparison Doc. + """ + overlap = [] + other_tokens = [token.text for token in other_doc] + for token in doc: + if token.text in other_tokens: + overlap.append(token) + return overlap + + +Doc.set_extension('overlap', method=overlap_tokens) + +nlp = English() +doc1 = nlp(u"Peach emoji is where it has always been.") +doc2 = nlp(u"Peach is the superior emoji.") +tokens = doc1._.overlap(doc2) +print(tokens) diff --git a/examples/pipeline/custom_component_countries_api.py b/examples/pipeline/custom_component_countries_api.py new file mode 100644 index 000000000..2554af967 --- /dev/null +++ b/examples/pipeline/custom_component_countries_api.py @@ -0,0 +1,108 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import requests + +from spacy.lang.en import English +from spacy.matcher import PhraseMatcher +from spacy.tokens import Doc, Span, Token + + +class RESTCountriesComponent(object): + """Example of a spaCy v2.0 pipeline component that requests all countries + via the REST Countries API, merges country names into one token, assigns + entity labels and sets attributes on country tokens, e.g. the capital and + lat/lng coordinates. Can be extended with more details from the API. + + REST Countries API: https://restcountries.eu + API License: Mozilla Public License MPL 2.0 + """ + name = 'rest_countries' # component name, will show up in the pipeline + + def __init__(self, nlp, label='GPE'): + """Initialise the pipeline component. The shared nlp instance is used + to initialise the matcher with the shared vocab, get the label ID and + generate Doc objects as phrase match patterns. + """ + # Make request once on initialisation and store the data + r = requests.get('https://restcountries.eu/rest/v2/all') + r.raise_for_status() # make sure requests raises an error if it fails + countries = r.json() + + # Convert API response to dict keyed by country name for easy lookup + # This could also be extended using the alternative and foreign language + # names provided by the API + self.countries = {c['name']: c for c in countries} + self.label = nlp.vocab.strings[label] # get entity label ID + + # Set up the PhraseMatcher with Doc patterns for each country name + patterns = [nlp(c) for c in self.countries.keys()] + self.matcher = PhraseMatcher(nlp.vocab) + self.matcher.add('COUNTRIES', None, *patterns) + + # Register attribute on the Token. We'll be overwriting this based on + # the matches, so we're only setting a default value, not a getter. + # If no default value is set, it defaults to None. + Token.set_extension('is_country', default=False) + Token.set_extension('country_capital') + Token.set_extension('country_latlng') + Token.set_extension('country_flag') + + # Register attributes on Doc and Span via a getter that checks if one of + # the contained tokens is set to is_country == True. + Doc.set_extension('has_country', getter=self.has_country) + Span.set_extension('has_country', getter=self.has_country) + + + def __call__(self, doc): + """Apply the pipeline component on a Doc object and modify it if matches + are found. Return the Doc, so it can be processed by the next component + in the pipeline, if available. + """ + matches = self.matcher(doc) + spans = [] # keep the spans for later so we can merge them afterwards + for _, start, end in matches: + # Generate Span representing the entity & set label + entity = Span(doc, start, end, label=self.label) + spans.append(entity) + # Set custom attribute on each token of the entity + # Can be extended with other data returned by the API, like + # currencies, country code, flag, calling code etc. + for token in entity: + token._.set('is_country', True) + token._.set('country_capital', self.countries[entity.text]['capital']) + token._.set('country_latlng', self.countries[entity.text]['latlng']) + token._.set('country_flag', self.countries[entity.text]['flag']) + # Overwrite doc.ents and add entity – be careful not to replace! + doc.ents = list(doc.ents) + [entity] + for span in spans: + # Iterate over all spans and merge them into one token. This is done + # after setting the entities – otherwise, it would cause mismatched + # indices! + span.merge() + return doc # don't forget to return the Doc! + + def has_country(self, tokens): + """Getter for Doc and Span attributes. Returns True if one of the tokens + is a country. Since the getter is only called when we access the + attribute, we can refer to the Token's 'is_country' attribute here, + which is already set in the processing step.""" + return any([t._.get('is_country') for t in tokens]) + + +# For simplicity, we start off with only the blank English Language class and +# no model or pre-defined pipeline loaded. + +nlp = English() +rest_countries = RESTCountriesComponent(nlp) # initialise component +nlp.add_pipe(rest_countries) # add it to the pipeline + +doc = nlp(u"Some text about Colombia and the Czech Republic") + +print('Pipeline', nlp.pipe_names) # pipeline contains component name +print('Doc has countries', doc._.has_country) # Doc contains countries +for token in doc: + if token._.is_country: + print(token.text, token._.country_capital, token._.country_latlng, + token._.country_flag) # country data +print('Entities', [(e.text, e.label_) for e in doc.ents]) # all countries are entities diff --git a/examples/pipeline/custom_component_entities.py b/examples/pipeline/custom_component_entities.py new file mode 100644 index 000000000..a0d9c61ec --- /dev/null +++ b/examples/pipeline/custom_component_entities.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from spacy.lang.en import English +from spacy.matcher import PhraseMatcher +from spacy.tokens import Doc, Span, Token + + +class TechCompanyRecognizer(object): + """Example of a spaCy v2.0 pipeline component that sets entity annotations + based on list of single or multiple-word company names. Companies are + labelled as ORG and their spans are merged into one token. Additionally, + ._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token + respectively.""" + name = 'tech_companies' # component name, will show up in the pipeline + + def __init__(self, nlp, companies=tuple(), label='ORG'): + """Initialise the pipeline component. The shared nlp instance is used + to initialise the matcher with the shared vocab, get the label ID and + generate Doc objects as phrase match patterns. + """ + self.label = nlp.vocab.strings[label] # get entity label ID + + # Set up the PhraseMatcher – it can now take Doc objects as patterns, + # so even if the list of companies is long, it's very efficient + patterns = [nlp(org) for org in companies] + self.matcher = PhraseMatcher(nlp.vocab) + self.matcher.add('TECH_ORGS', None, *patterns) + + # Register attribute on the Token. We'll be overwriting this based on + # the matches, so we're only setting a default value, not a getter. + Token.set_extension('is_tech_org', default=False) + + # Register attributes on Doc and Span via a getter that checks if one of + # the contained tokens is set to is_tech_org == True. + Doc.set_extension('has_tech_org', getter=self.has_tech_org) + Span.set_extension('has_tech_org', getter=self.has_tech_org) + + def __call__(self, doc): + """Apply the pipeline component on a Doc object and modify it if matches + are found. Return the Doc, so it can be processed by the next component + in the pipeline, if available. + """ + matches = self.matcher(doc) + spans = [] # keep the spans for later so we can merge them afterwards + for _, start, end in matches: + # Generate Span representing the entity & set label + entity = Span(doc, start, end, label=self.label) + spans.append(entity) + # Set custom attribute on each token of the entity + for token in entity: + token._.set('is_tech_org', True) + # Overwrite doc.ents and add entity – be careful not to replace! + doc.ents = list(doc.ents) + [entity] + for span in spans: + # Iterate over all spans and merge them into one token. This is done + # after setting the entities – otherwise, it would cause mismatched + # indices! + span.merge() + return doc # don't forget to return the Doc! + + def has_tech_org(self, tokens): + """Getter for Doc and Span attributes. Returns True if one of the tokens + is a tech org. Since the getter is only called when we access the + attribute, we can refer to the Token's 'is_tech_org' attribute here, + which is already set in the processing step.""" + return any([t._.get('is_tech_org') for t in tokens]) + + +# For simplicity, we start off with only the blank English Language class and +# no model or pre-defined pipeline loaded. + +nlp = English() +companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc. +component = TechCompanyRecognizer(nlp, companies) # initialise component +nlp.add_pipe(component, last=True) # add it to the pipeline as the last element + +doc = nlp(u"Alphabet Inc. is the company behind Google.") + +print('Pipeline', nlp.pipe_names) # pipeline contains component name +print('Tokens', [t.text for t in doc]) # company names from the list are merged +print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs +print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org +print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not +print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities diff --git a/examples/training/train_ner_standalone.py b/examples/training/train_ner_standalone.py index e4fb1d1e8..0c5094bb7 100644 --- a/examples/training/train_ner_standalone.py +++ b/examples/training/train_ner_standalone.py @@ -6,7 +6,7 @@ To achieve that, it duplicates some of spaCy's internal functionality. Specifically, in this example, we don't use spaCy's built-in Language class to wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write -our own simle Pipeline class, so that it's easier to see how the pieces +our own simple Pipeline class, so that it's easier to see how the pieces interact. Input data: @@ -142,16 +142,15 @@ def train(nlp, train_examples, dev_examples, nr_epoch=5): inputs, annots = zip(*batch) nlp.update(list(inputs), list(annots), sgd, losses=losses) scores = nlp.evaluate(dev_examples) - report_scores(i, losses['ner'], scores) - scores = nlp.evaluate(dev_examples) - report_scores(channels, i+1, loss, scores) + report_scores(i+1, losses['ner'], scores) def report_scores(i, loss, scores): precision = '%.2f' % scores['ents_p'] recall = '%.2f' % scores['ents_r'] f_measure = '%.2f' % scores['ents_f'] - print('%d %s %s %s' % (int(loss), precision, recall, f_measure)) + print('Epoch %d: %d %s %s %s' % ( + i, int(loss), precision, recall, f_measure)) def read_examples(path): diff --git a/spacy/__main__.py b/spacy/__main__.py index 0ec96e4a1..99d6b116c 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -7,7 +7,7 @@ if __name__ == '__main__': import plac import sys from spacy.cli import download, link, info, package, train, convert, model - from spacy.cli import profile, evaluate + from spacy.cli import profile, evaluate, validate from spacy.util import prints commands = { @@ -20,6 +20,7 @@ if __name__ == '__main__': 'package': package, 'model': model, 'profile': profile, + 'validate': validate } if len(sys.argv) == 1: prints(', '.join(commands), title="Available commands", exits=1) diff --git a/spacy/_ml.py b/spacy/_ml.py index 62e0ceb9a..b07e179f0 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -311,7 +311,7 @@ def link_vectors_to_models(vocab): def Tok2Vec(width, embed_size, **kwargs): pretrained_dims = kwargs.get('pretrained_dims', 0) - cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3) + cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2) cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add, '*': reapply}): diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index ebe185f24..2595dcc03 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -7,3 +7,4 @@ from .train import train from .evaluate import evaluate from .convert import convert from .model import model +from .validate import validate diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 89615bbe8..d9a812a15 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import plac from pathlib import Path -from .converters import conllu2json, iob2json +from .converters import conllu2json, iob2json, conll_ner2json from ..util import prints # Converters are matched by file extension. To add a converter, add a new entry @@ -12,9 +12,10 @@ from ..util import prints # from /converters. CONVERTERS = { - '.conllu': conllu2json, - '.conll': conllu2json, - '.iob': iob2json, + 'conllu': conllu2json, + 'conll': conllu2json, + 'ner': conll_ner2json, + 'iob': iob2json, } @@ -22,9 +23,11 @@ CONVERTERS = { input_file=("input file", "positional", None, str), output_dir=("output directory for converted file", "positional", None, str), n_sents=("Number of sentences per doc", "option", "n", int), + converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str), morphology=("Enable appending morphology to tags", "flag", "m", bool) ) -def convert(cmd, input_file, output_dir, n_sents=1, morphology=False): +def convert(cmd, input_file, output_dir, n_sents=1, morphology=False, + converter='auto'): """ Convert files into JSON format for use with train command and other experiment management functions. @@ -35,9 +38,11 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False): prints(input_path, title="Input file not found", exits=1) if not output_path.exists(): prints(output_path, title="Output directory not found", exits=1) - file_ext = input_path.suffix - if not file_ext in CONVERTERS: - prints("Can't find converter for %s" % input_path.parts[-1], - title="Unknown format", exits=1) - CONVERTERS[file_ext](input_path, output_path, - n_sents=n_sents, use_morphology=morphology) + if converter == 'auto': + converter = input_path.suffix[1:] + if not converter in CONVERTERS: + prints("Can't find converter for %s" % converter, + title="Unknown format", exits=1) + func = CONVERTERS[converter] + func(input_path, output_path, + n_sents=n_sents, use_morphology=morphology) diff --git a/spacy/cli/converters/__init__.py b/spacy/cli/converters/__init__.py index 9026d16c6..02b596d4d 100644 --- a/spacy/cli/converters/__init__.py +++ b/spacy/cli/converters/__init__.py @@ -1,2 +1,3 @@ from .conllu2json import conllu2json from .iob2json import iob2json +from .conll_ner2json import conll_ner2json diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py new file mode 100644 index 000000000..e3bd82e7e --- /dev/null +++ b/spacy/cli/converters/conll_ner2json.py @@ -0,0 +1,50 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...compat import json_dumps, path2str +from ...util import prints +from ...gold import iob_to_biluo + + +def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False): + """ + Convert files in the CoNLL-2003 NER format into JSON format for use with train cli. + """ + docs = read_conll_ner(input_path) + + output_filename = input_path.parts[-1].replace(".conll", "") + ".json" + output_filename = input_path.parts[-1].replace(".conll", "") + ".json" + output_file = output_path / output_filename + with output_file.open('w', encoding='utf-8') as f: + f.write(json_dumps(docs)) + prints("Created %d documents" % len(docs), + title="Generated output file %s" % path2str(output_file)) + + +def read_conll_ner(input_path): + text = input_path.open('r', encoding='utf-8').read() + i = 0 + delimit_docs = '-DOCSTART- -X- O O' + output_docs = [] + for doc in text.strip().split(delimit_docs): + doc = doc.strip() + if not doc: + continue + output_doc = [] + for sent in doc.split('\n\n'): + sent = sent.strip() + if not sent: + continue + lines = [line.strip() for line in sent.split('\n') if line.strip()] + words, tags, chunks, iob_ents = zip(*[line.split() for line in lines]) + biluo_ents = iob_to_biluo(iob_ents) + output_doc.append({'tokens': [ + {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in + zip(words, tags, biluo_ents) + ]}) + output_docs.append({ + 'id': len(output_docs), + 'paragraphs': [{'sentences': output_doc}] + }) + output_doc = [] + return output_docs diff --git a/spacy/cli/train.py b/spacy/cli/train.py index b27087056..2faea72e7 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -44,7 +44,7 @@ numpy.random.seed(0) version=("Model version", "option", "V", str), meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path) ) -def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, +def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False, gold_preproc=False, version="0.0.0", meta_path=None): """ @@ -68,6 +68,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, if not isinstance(meta, dict): prints("Expected dict but got: {}".format(type(meta)), title="Not a valid meta.json format", exits=1) + meta.setdefault('lang', lang) + meta.setdefault('name', 'unnamed') pipeline = ['tagger', 'parser', 'ner'] if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger') @@ -88,9 +90,13 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, n_train_words = corpus.count_train() lang_class = util.get_lang_class(lang) - nlp = lang_class(pipeline=pipeline) + nlp = lang_class() + meta['pipeline'] = pipeline + nlp.meta.update(meta) if vectors: util.load_model(vectors, vocab=nlp.vocab) + for name in pipeline: + nlp.add_pipe(nlp.create_pipe(name), name=name) optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) nlp._optimizer = None @@ -112,17 +118,33 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, util.set_env_log(False) epoch_model_path = output_path / ('model%d' % i) nlp.to_disk(epoch_model_path) - nlp_loaded = lang_class(pipeline=pipeline) - nlp_loaded = nlp_loaded.from_disk(epoch_model_path) - scorer = nlp_loaded.evaluate( - list(corpus.dev_docs( + nlp_loaded = util.load_model_from_path(epoch_model_path) + dev_docs = list(corpus.dev_docs( nlp_loaded, - gold_preproc=gold_preproc))) + gold_preproc=gold_preproc)) + nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) + start_time = timer() + scorer = nlp_loaded.evaluate(dev_docs) + end_time = timer() + if use_gpu < 0: + gpu_wps = None + cpu_wps = nwords/(end_time-start_time) + else: + gpu_wps = nwords/(end_time-start_time) + with Model.use_device('cpu'): + nlp_loaded = util.load_model_from_path(epoch_model_path) + dev_docs = list(corpus.dev_docs( + nlp_loaded, gold_preproc=gold_preproc)) + start_time = timer() + scorer = nlp_loaded.evaluate(dev_docs) + end_time = timer() + cpu_wps = nwords/(end_time-start_time) acc_loc =(output_path / ('model%d' % i) / 'accuracy.json') with acc_loc.open('w') as file_: file_.write(json_dumps(scorer.scores)) meta_loc = output_path / ('model%d' % i) / 'meta.json' meta['accuracy'] = scorer.scores + meta['speed'] = {'nwords': nwords, 'cpu':cpu_wps, 'gpu': gpu_wps} meta['lang'] = nlp.lang meta['pipeline'] = pipeline meta['spacy_version'] = '>=%s' % about.__version__ @@ -132,7 +154,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, with meta_loc.open('w') as file_: file_.write(json_dumps(meta)) util.set_env_log(True) - print_progress(i, losses, scorer.scores) + print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps) finally: print("Saving model...") try: @@ -153,16 +175,17 @@ def _render_parses(i, to_render): file_.write(html) -def print_progress(itn, losses, dev_scores, wps=0.0): +def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0): scores = {} for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', - 'ents_p', 'ents_r', 'ents_f', 'wps']: + 'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']: scores[col] = 0.0 scores['dep_loss'] = losses.get('parser', 0.0) scores['ner_loss'] = losses.get('ner', 0.0) scores['tag_loss'] = losses.get('tagger', 0.0) scores.update(dev_scores) - scores['wps'] = wps + scores['cpu_wps'] = cpu_wps + scores['gpu_wps'] = gpu_wps or 0.0 tpl = '\t'.join(( '{:d}', '{dep_loss:.3f}', @@ -173,7 +196,9 @@ def print_progress(itn, losses, dev_scores, wps=0.0): '{ents_f:.3f}', '{tags_acc:.3f}', '{token_acc:.3f}', - '{wps:.1f}')) + '{cpu_wps:.1f}', + '{gpu_wps:.1f}', + )) print(tpl.format(itn, **scores)) diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py new file mode 100644 index 000000000..c1f992ed6 --- /dev/null +++ b/spacy/cli/validate.py @@ -0,0 +1,123 @@ +# coding: utf8 +from __future__ import unicode_literals + +import requests +import pkg_resources +from pathlib import Path + +from ..compat import path2str, locale_escape +from ..util import prints, get_data_path, read_json +from .. import about + + +def validate(cmd): + """Validate that the currently installed version of spaCy is compatible + with the installed models. Should be run after `pip install -U spacy`. + """ + r = requests.get(about.__compatibility__) + if r.status_code != 200: + prints("Couldn't fetch compatibility table.", + title="Server error (%d)" % r.status_code, exits=1) + compat = r.json()['spacy'] + all_models = set() + for spacy_v, models in dict(compat).items(): + all_models.update(models.keys()) + for model, model_vs in models.items(): + compat[spacy_v][model] = [reformat_version(v) for v in model_vs] + + current_compat = compat[about.__version__] + model_links = get_model_links(current_compat) + model_pkgs = get_model_pkgs(current_compat, all_models) + incompat_links = {l for l, d in model_links.items() if not d['compat']} + incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']} + incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']]) + na_models = [m for m in incompat_models if m not in current_compat] + update_models = [m for m in incompat_models if m in current_compat] + + prints(path2str(Path(__file__).parent.parent), + title="Installed models (spaCy v{})".format(about.__version__)) + if model_links or model_pkgs: + print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', '')) + for name, data in model_pkgs.items(): + print(get_model_row(current_compat, name, data, 'package')) + for name, data in model_links.items(): + print(get_model_row(current_compat, name, data, 'link')) + else: + prints("No models found in your current environment.", exits=0) + + if update_models: + cmd = ' python -m spacy download {}' + print("\n Use the following commands to update the model packages:") + print('\n'.join([cmd.format(pkg) for pkg in update_models])) + + if na_models: + prints("The following models are not available for spaCy v{}: {}" + .format(about.__version__, ', '.join(na_models))) + + if incompat_links: + prints("You may also want to overwrite the incompatible links using " + "the `spacy link` command with `--force`, or remove them from " + "the data directory. Data path: {}" + .format(path2str(get_data_path()))) + + +def get_model_links(compat): + links = {} + data_path = get_data_path() + if data_path: + models = [p for p in data_path.iterdir() if is_model_path(p)] + for model in models: + meta_path = Path(model) / 'meta.json' + if not meta_path.exists(): + continue + meta = read_json(meta_path) + link = model.parts[-1] + name = meta['lang'] + '_' + meta['name'] + links[link] = {'name': name, 'version': meta['version'], + 'compat': is_compat(compat, name, meta['version'])} + return links + + +def get_model_pkgs(compat, all_models): + pkgs = {} + for pkg_name, pkg_data in pkg_resources.working_set.by_key.items(): + package = pkg_name.replace('-', '_') + if package in all_models: + version = pkg_data.version + pkgs[pkg_name] = {'name': package, 'version': version, + 'compat': is_compat(compat, package, version)} + return pkgs + + +def get_model_row(compat, name, data, type='package'): + tpl_row = ' {:<10}' + (' {:<20}' * 4) + tpl_red = '\x1b[38;5;1m{}\x1b[0m' + tpl_green = '\x1b[38;5;2m{}\x1b[0m' + if data['compat']: + comp = tpl_green.format(locale_escape('✔', errors='ignore')) + version = tpl_green.format(data['version']) + else: + comp = '--> {}'.format(compat.get(data['name'], ['n/a'])[0]) + version = tpl_red.format(data['version']) + return get_row(type, name, data['name'], version, comp) + + +def get_row(*args): + tpl_row = ' {:<10}' + (' {:<20}' * 4) + return tpl_row.format(*args) + + +def is_model_path(model_path): + exclude = ['cache', 'pycache', '__pycache__'] + name = model_path.parts[-1] + return model_path.is_dir() and name not in exclude and not name.startswith('.') + + +def is_compat(compat, name, version): + return name in compat and version in compat[name] + + +def reformat_version(version): + if version.endswith('-alpha'): + return version.replace('-alpha', 'a0') + return version.replace('-alpha', 'a') diff --git a/spacy/compat.py b/spacy/compat.py index e6b7c066b..81243ce1b 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -6,6 +6,7 @@ import ftfy import sys import ujson import itertools +import locale from thinc.neural.util import copy_array @@ -113,3 +114,12 @@ def import_file(name, loc): module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) return module + + +def locale_escape(string, errors='replace'): + ''' + Mangle non-supported characters, for savages with ascii terminals. + ''' + encoding = locale.getpreferredencoding() + string = string.encode(encoding, errors).decode('utf8') + return string diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 2512c179f..5729af667 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -213,7 +213,7 @@ class GoldCorpus(object): train_tuples = self.train_tuples if projectivize: train_tuples = nonproj.preprocess_training_data( - self.train_tuples) + self.train_tuples, label_freq_cutoff=100) random.shuffle(train_tuples) gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, max_length=max_length, diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index c2cf12f12..ff560afae 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -16,15 +16,13 @@ from ...util import update_exc class BengaliDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'bn' - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tag_map = TAG_MAP stop_words = STOP_WORDS lemma_rules = LEMMA_RULES - - prefixes = tuple(TOKENIZER_PREFIXES) - suffixes = tuple(TOKENIZER_SUFFIXES) - infixes = tuple(TOKENIZER_INFIXES) + prefixes = TOKENIZER_PREFIXES + suffixes = TOKENIZER_SUFFIXES + infixes = TOKENIZER_INFIXES class Bengali(Language): diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index 99babdc2c..86e47c00d 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -15,9 +15,8 @@ class DanishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'da' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) + stop_words = STOP_WORDS class Danish(Language): diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index 1c64541e6..e8e7a12db 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -12,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG, NORM from ...util import update_exc, add_lookups @@ -22,16 +21,12 @@ class GermanDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda text: 'de' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - infixes = tuple(TOKENIZER_INFIXES) - tag_map = dict(TAG_MAP) - stop_words = set(STOP_WORDS) - syntax_iterators = dict(SYNTAX_ITERATORS) - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + infixes = TOKENIZER_INFIXES + tag_map = TAG_MAP + stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS + lemma_lookup = LOOKUP class German(Language): diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index ec14fecd0..63fd9c2b4 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -7,7 +7,7 @@ from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .morph_rules import MORPH_RULES -from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC +from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS @@ -23,15 +23,15 @@ class EnglishDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda text: 'en' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - tag_map = dict(TAG_MAP) - stop_words = set(STOP_WORDS) - morph_rules = dict(MORPH_RULES) - lemma_rules = dict(LEMMA_RULES) - lemma_index = dict(LEMMA_INDEX) - lemma_exc = dict(LEMMA_EXC) - syntax_iterators = dict(SYNTAX_ITERATORS) + tag_map = TAG_MAP + stop_words = STOP_WORDS + morph_rules = MORPH_RULES + lemma_rules = LEMMA_RULES + lemma_index = LEMMA_INDEX + lemma_exc = LEMMA_EXC + lemma_lookup = LOOKUP + syntax_iterators = SYNTAX_ITERATORS class English(Language): diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 1e7f55be8..661f0bbec 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG, NORM from ...util import update_exc, add_lookups @@ -19,15 +18,11 @@ class SpanishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'es' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - tag_map = dict(TAG_MAP) - stop_words = set(STOP_WORDS) - sytax_iterators = dict(SYNTAX_ITERATORS) - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + tag_map = TAG_MAP + stop_words = STOP_WORDS + sytax_iterators = SYNTAX_ITERATORS + lemma_lookup = LOOKUP class Spanish(Language): diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 931ad5341..7f74495c5 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -15,9 +15,8 @@ class FinnishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'fi' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) + stop_words = STOP_WORDS class Finnish(Language): diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index 06dcf2d45..42acd0736 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG, NORM from ...util import update_exc, add_lookups @@ -21,17 +20,13 @@ class FrenchDefaults(Language.Defaults): lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: 'fr' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) - infixes = tuple(TOKENIZER_INFIXES) - suffixes = tuple(TOKENIZER_SUFFIXES) + stop_words = STOP_WORDS + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES token_match = TOKEN_MATCH - syntax_iterators = dict(SYNTAX_ITERATORS) - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + syntax_iterators = SYNTAX_ITERATORS + lemma_lookup = LOOKUP class French(Language): diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py index a15dc9a05..807794fee 100644 --- a/spacy/lang/he/__init__.py +++ b/spacy/lang/he/__init__.py @@ -12,9 +12,8 @@ from ...util import update_exc class HebrewDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'he' - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) - stop_words = set(STOP_WORDS) + stop_words = STOP_WORDS class Hebrew(Language): diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py index 0fe6a9f5c..35b047900 100644 --- a/spacy/lang/hu/__init__.py +++ b/spacy/lang/hu/__init__.py @@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG, NORM from ...util import update_exc, add_lookups @@ -18,17 +17,13 @@ class HungarianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'hu' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) - prefixes = tuple(TOKENIZER_PREFIXES) - suffixes = tuple(TOKENIZER_SUFFIXES) - infixes = tuple(TOKENIZER_INFIXES) + stop_words = STOP_WORDS + prefixes = TOKENIZER_PREFIXES + suffixes = TOKENIZER_SUFFIXES + infixes = TOKENIZER_INFIXES token_match = TOKEN_MATCH - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + lemma_lookup = LOOKUP class Hungarian(Language): diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py index e0cfa941d..2f21e73cf 100644 --- a/spacy/lang/id/__init__.py +++ b/spacy/lang/id/__init__.py @@ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG from ...util import update_exc @@ -19,19 +18,14 @@ from ...util import update_exc class IndonesianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'id' - lex_attr_getters.update(LEX_ATTRS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) - prefixes = tuple(TOKENIZER_PREFIXES) - suffixes = tuple(TOKENIZER_SUFFIXES) - infixes = tuple(TOKENIZER_INFIXES) - syntax_iterators = dict(SYNTAX_ITERATORS) - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + stop_words = STOP_WORDS + prefixes = TOKENIZER_PREFIXES + suffixes = TOKENIZER_SUFFIXES + infixes = TOKENIZER_INFIXES + syntax_iterators = SYNTAX_ITERATORS + lemma_lookup = LOOKUP class Indonesian(Language): diff --git a/spacy/lang/id/lex_attrs.py b/spacy/lang/id/lex_attrs.py index f6acd8508..fb6a31f99 100644 --- a/spacy/lang/id/lex_attrs.py +++ b/spacy/lang/id/lex_attrs.py @@ -16,8 +16,7 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta', 'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun', 'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun', - 'noniliun', 'desiliun', - ] + 'noniliun', 'desiliun'] def like_num(text): diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 7cc717cb3..6bc47ce92 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -7,7 +7,6 @@ from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG, NORM from ...util import update_exc, add_lookups @@ -16,13 +15,9 @@ class ItalianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'it' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) - stop_words = set(STOP_WORDS) - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + stop_words = STOP_WORDS + lemma_lookup = LOOKUP class Italian(Language): diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index c1b4af263..4250e6809 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -16,9 +16,8 @@ class NorwegianDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'nb' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) + stop_words = STOP_WORDS class Norwegian(Language): diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 98df8d487..13786a7bc 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -16,9 +16,8 @@ class DutchDefaults(Language.Defaults): lex_attr_getters.update(LEX_ATTRS) lex_attr_getters[LANG] = lambda text: 'nl' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) - stop_words = set(STOP_WORDS) + stop_words = STOP_WORDS class Dutch(Language): diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 38a240598..80011f9d8 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -15,9 +15,8 @@ class PolishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'pl' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) + stop_words = STOP_WORDS class Polish(Language): diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index 67539034d..2a8323597 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -9,7 +9,6 @@ from .lemmatizer import LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG, NORM from ...util import update_exc, add_lookups @@ -19,13 +18,9 @@ class PortugueseDefaults(Language.Defaults): lex_attr_getters[LANG] = lambda text: 'pt' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) lex_attr_getters.update(LEX_ATTRS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + stop_words = STOP_WORDS + lemma_lookup = LOOKUP class Portuguese(Language): diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 2d3a640c5..224c105d7 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS from ...language import Language -from ...lemmatizerlookup import Lemmatizer from ...attrs import LANG, NORM from ...util import update_exc, add_lookups @@ -18,13 +17,10 @@ class SwedishDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'sv' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = set(STOP_WORDS) - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) + stop_words = STOP_WORDS + lemma_rules = LEMMA_RULES + lemma_lookup = LOOKUP class Swedish(Language): diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index b6bdb658f..bedec46c8 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -12,24 +12,27 @@ from ...language import Language from ...attrs import LANG, NORM from ...util import update_exc, add_lookups + class ThaiDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'th' - tokenizer_exceptions = TOKENIZER_EXCEPTIONS - tag_map = dict(TAG_MAP) - stop_words = set(STOP_WORDS) + tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS) + tag_map = TAG_MAP + stop_words = STOP_WORDS class Thai(Language): - lang = 'th' - Defaults = ThaiDefaults - def make_doc(self, text): - try: - from pythainlp.tokenize import word_tokenize - except ImportError: - raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " - "https://github.com/wannaphongcom/pythainlp/") - words = [x for x in list(word_tokenize(text,"newmm"))] - return Doc(self.vocab, words=words, spaces=[False]*len(words)) + lang = 'th' + Defaults = ThaiDefaults + + def make_doc(self, text): + try: + from pythainlp.tokenize import word_tokenize + except ImportError: + raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " + "https://github.com/wannaphongcom/pythainlp/") + words = [x for x in list(word_tokenize(text,"newmm"))] + return Doc(self.vocab, words=words, spaces=[False]*len(words)) + __all__ = ['Thai'] diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py index dc63ee33f..017f55ecc 100644 --- a/spacy/lang/xx/__init__.py +++ b/spacy/lang/xx/__init__.py @@ -13,7 +13,6 @@ class MultiLanguageDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'xx' lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) diff --git a/spacy/language.py b/spacy/language.py index c49c64b1d..047c94a37 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,12 +1,9 @@ # coding: utf8 from __future__ import absolute_import, unicode_literals from contextlib import contextmanager -import dill -import numpy from thinc.neural import Model -from thinc.neural.ops import NumpyOps, CupyOps -from thinc.neural.optimizers import Adam, SGD +from thinc.neural.optimizers import Adam import random import ujson from collections import OrderedDict @@ -17,30 +14,27 @@ from .vocab import Vocab from .tagger import Tagger from .lemmatizer import Lemmatizer from .syntax.parser import get_templates -from .syntax import nonproj -from .pipeline import NeuralDependencyParser, EntityRecognizer -from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer -from .pipeline import NeuralLabeller -from .pipeline import SimilarityHook -from .pipeline import TextCategorizer -from . import about +from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger +from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer from .compat import json_dumps, izip +from .scorer import Scorer +from ._ml import link_vectors_to_models from .attrs import IS_STOP from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .lang.lex_attrs import LEX_ATTRS from . import util -from .scorer import Scorer -from ._ml import link_vectors_to_models +from . import about class BaseDefaults(object): @classmethod def create_lemmatizer(cls, nlp=None): - return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules) + return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules, + cls.lemma_lookup) @classmethod def create_vocab(cls, nlp=None): @@ -70,59 +64,7 @@ class BaseDefaults(object): prefix_search=prefix_search, suffix_search=suffix_search, infix_finditer=infix_finditer, token_match=token_match) - @classmethod - def create_tagger(cls, nlp=None, **cfg): - if nlp is None: - return NeuralTagger(cls.create_vocab(nlp), **cfg) - else: - return NeuralTagger(nlp.vocab, **cfg) - - @classmethod - def create_parser(cls, nlp=None, **cfg): - if nlp is None: - return NeuralDependencyParser(cls.create_vocab(nlp), **cfg) - else: - return NeuralDependencyParser(nlp.vocab, **cfg) - - @classmethod - def create_entity(cls, nlp=None, **cfg): - if nlp is None: - return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg) - else: - return NeuralEntityRecognizer(nlp.vocab, **cfg) - - @classmethod - def create_pipeline(cls, nlp=None, disable=tuple()): - meta = nlp.meta if nlp is not None else {} - # Resolve strings, like "cnn", "lstm", etc - pipeline = [] - for entry in meta.get('pipeline', []): - if entry in disable or getattr(entry, 'name', entry) in disable: - continue - factory = cls.Defaults.factories[entry] - pipeline.append(factory(nlp, **meta.get(entry, {}))) - return pipeline - - factories = { - 'make_doc': create_tokenizer, - 'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], - 'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], - 'parser': lambda nlp, **cfg: [ - NeuralDependencyParser(nlp.vocab, **cfg), - nonproj.deprojectivize], - 'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], - 'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)], - 'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)], - # Temporary compatibility -- delete after pivot - 'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], - 'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], - 'dependencies': lambda nlp, **cfg: [ - NeuralDependencyParser(nlp.vocab, **cfg), - nonproj.deprojectivize, - ], - 'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], - } - + pipe_names = ['tensorizer', 'tagger', 'parser', 'ner'] token_match = TOKEN_MATCH prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) @@ -136,6 +78,7 @@ class BaseDefaults(object): lemma_rules = {} lemma_exc = {} lemma_index = {} + lemma_lookup = {} morph_rules = {} lex_attr_getters = LEX_ATTRS syntax_iterators = {} @@ -152,8 +95,17 @@ class Language(object): Defaults = BaseDefaults lang = None - def __init__(self, vocab=True, make_doc=True, pipeline=None, - meta={}, disable=tuple(), **kwargs): + factories = { + 'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), + 'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg), + 'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg), + 'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg), + 'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg), + 'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), + 'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg) + } + + def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs): """Initialise a Language object. vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via @@ -179,28 +131,7 @@ class Language(object): factory = self.Defaults.create_tokenizer make_doc = factory(self, **meta.get('tokenizer', {})) self.tokenizer = make_doc - if pipeline is True: - self.pipeline = self.Defaults.create_pipeline(self, disable) - elif pipeline: - # Careful not to do getattr(p, 'name', None) here - # If we had disable=[None], we'd disable everything! - self.pipeline = [p for p in pipeline - if p not in disable - and getattr(p, 'name', p) not in disable] - # Resolve strings, like "cnn", "lstm", etc - for i, entry in enumerate(self.pipeline): - if entry in self.Defaults.factories: - factory = self.Defaults.factories[entry] - self.pipeline[i] = factory(self, **meta.get(entry, {})) - else: - self.pipeline = [] - flat_list = [] - for pipe in self.pipeline: - if isinstance(pipe, list): - flat_list.extend(pipe) - else: - flat_list.append(pipe) - self.pipeline = flat_list + self.pipeline = [] self._optimizer = None @property @@ -214,11 +145,7 @@ class Language(object): self._meta.setdefault('email', '') self._meta.setdefault('url', '') self._meta.setdefault('license', '') - pipeline = [] - for component in self.pipeline: - if hasattr(component, 'name'): - pipeline.append(component.name) - self._meta['pipeline'] = pipeline + self._meta['pipeline'] = self.pipe_names return self._meta @meta.setter @@ -228,34 +155,144 @@ class Language(object): # Conveniences to access pipeline components @property def tensorizer(self): - return self.get_component('tensorizer') + return self.get_pipe('tensorizer') @property def tagger(self): - return self.get_component('tagger') + return self.get_pipe('tagger') @property def parser(self): - return self.get_component('parser') + return self.get_pipe('parser') @property def entity(self): - return self.get_component('ner') + return self.get_pipe('ner') @property def matcher(self): - return self.get_component('matcher') + return self.get_pipe('matcher') - def get_component(self, name): - if self.pipeline in (True, None): - return None - for proc in self.pipeline: - if hasattr(proc, 'name') and proc.name.endswith(name): - return proc - return None + @property + def pipe_names(self): + """Get names of available pipeline components. + + RETURNS (list): List of component name strings, in order. + """ + return [pipe_name for pipe_name, _ in self.pipeline] + + def get_pipe(self, name): + """Get a pipeline component for a given component name. + + name (unicode): Name of pipeline component to get. + RETURNS (callable): The pipeline component. + """ + for pipe_name, component in self.pipeline: + if pipe_name == name: + return component + msg = "No component '{}' found in pipeline. Available names: {}" + raise KeyError(msg.format(name, self.pipe_names)) + + def create_pipe(self, name, config=dict()): + """Create a pipeline component from a factory. + + name (unicode): Factory name to look up in `Language.factories`. + config (dict): Configuration parameters to initialise component. + RETURNS (callable): Pipeline component. + """ + if name not in self.factories: + raise KeyError("Can't find factory for '{}'.".format(name)) + factory = self.factories[name] + return factory(self, **config) + + def add_pipe(self, component, name=None, before=None, after=None, + first=None, last=None): + """Add a component to the processing pipeline. Valid components are + callables that take a `Doc` object, modify it and return it. Only one of + before, after, first or last can be set. Default behaviour is "last". + + component (callable): The pipeline component. + name (unicode): Name of pipeline component. Overwrites existing + component.name attribute if available. If no name is set and + the component exposes no name attribute, component.__name__ is + used. An error is raised if the name already exists in the pipeline. + before (unicode): Component name to insert component directly before. + after (unicode): Component name to insert component directly after. + first (bool): Insert component first / not first in the pipeline. + last (bool): Insert component last / not last in the pipeline. + + EXAMPLE: + >>> nlp.add_pipe(component, before='ner') + >>> nlp.add_pipe(component, name='custom_name', last=True) + """ + if name is None: + if hasattr(component, 'name'): + name = component.name + elif hasattr(component, '__name__'): + name = component.__name__ + elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'): + name = component.__class__.__name__ + else: + name = repr(component) + if name in self.pipe_names: + raise ValueError("'{}' already exists in pipeline.".format(name)) + if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2: + msg = ("Invalid constraints. You can only set one of the " + "following: before, after, first, last.") + raise ValueError(msg) + pipe = (name, component) + if last or not any([first, before, after]): + self.pipeline.append(pipe) + elif first: + self.pipeline.insert(0, pipe) + elif before and before in self.pipe_names: + self.pipeline.insert(self.pipe_names.index(before), pipe) + elif after and after in self.pipe_names: + self.pipeline.insert(self.pipe_names.index(after), pipe) + else: + msg = "Can't find '{}' in pipeline. Available names: {}" + unfound = before or after + raise ValueError(msg.format(unfound, self.pipe_names)) + + def replace_pipe(self, name, component): + """Replace a component in the pipeline. + + name (unicode): Name of the component to replace. + component (callable): Pipeline component. + """ + if name not in self.pipe_names: + msg = "Can't find '{}' in pipeline. Available names: {}" + raise ValueError(msg.format(name, self.pipe_names)) + self.pipeline[self.pipe_names.index(name)] = (name, component) + + def rename_pipe(self, old_name, new_name): + """Rename a pipeline component. + + old_name (unicode): Name of the component to rename. + new_name (unicode): New name of the component. + """ + if old_name not in self.pipe_names: + msg = "Can't find '{}' in pipeline. Available names: {}" + raise ValueError(msg.format(old_name, self.pipe_names)) + if new_name in self.pipe_names: + msg = "'{}' already exists in pipeline. Existing names: {}" + raise ValueError(msg.format(new_name, self.pipe_names)) + i = self.pipe_names.index(old_name) + self.pipeline[i] = (new_name, self.pipeline[i][1]) + + def remove_pipe(self, name): + """Remove a component from the pipeline. + + name (unicode): Name of the component to remove. + RETURNS (tuple): A `(name, component)` tuple of the removed component. + """ + if name not in self.pipe_names: + msg = "Can't find '{}' in pipeline. Available names: {}" + raise ValueError(msg.format(name, self.pipe_names)) + return self.pipeline.pop(self.pipe_names.index(name)) def __call__(self, text, disable=[]): - """'Apply the pipeline to some text. The text can span multiple sentences, + """Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. Alignment into the original string is preserved. @@ -269,8 +306,7 @@ class Language(object): ('An', 'NN') """ doc = self.make_doc(text) - for proc in self.pipeline: - name = getattr(proc, 'name', None) + for name, proc in self.pipeline: if name in disable: continue doc = proc(doc) @@ -308,7 +344,7 @@ class Language(object): grads[key] = (W, dW) pipes = list(self.pipeline) random.shuffle(pipes) - for proc in pipes: + for name, proc in pipes: if not hasattr(proc, 'update'): continue proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses) @@ -322,7 +358,7 @@ class Language(object): docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects. """ - for proc in self.pipeline: + for name, proc in self.pipeline: if hasattr(proc, 'preprocess_gold'): docs_golds = proc.preprocess_gold(docs_golds) for doc, gold in docs_golds: @@ -354,7 +390,7 @@ class Language(object): get_gold_tuples (function): Function returning gold data **cfg: Config parameters. - returns: An optimizer + RETURNS: An optimizer """ # Populate vocab if get_gold_tuples is not None: @@ -371,7 +407,7 @@ class Language(object): else: device = None link_vectors_to_models(self.vocab) - for proc in self.pipeline: + for name, proc in self.pipeline: if hasattr(proc, 'begin_training'): context = proc.begin_training(get_gold_tuples(), pipeline=self.pipeline) @@ -393,7 +429,7 @@ class Language(object): docs, golds = zip(*docs_golds) docs = list(docs) golds = list(golds) - for pipe in self.pipeline: + for name, pipe in self.pipeline: if not hasattr(pipe, 'pipe'): for doc in docs: pipe(doc) @@ -419,7 +455,7 @@ class Language(object): >>> with nlp.use_params(optimizer.averages): >>> nlp.to_disk('/tmp/checkpoint') """ - contexts = [pipe.use_params(params) for pipe + contexts = [pipe.use_params(params) for name, pipe in self.pipeline if hasattr(pipe, 'use_params')] # TODO: Having trouble with contextlib # Workaround: these aren't actually context managers atm. @@ -466,8 +502,7 @@ class Language(object): yield (doc, context) return docs = (self.make_doc(text) for text in texts) - for proc in self.pipeline: - name = getattr(proc, 'name', None) + for name, proc in self.pipeline: if name in disable: continue if hasattr(proc, 'pipe'): @@ -495,14 +530,14 @@ class Language(object): ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) )) - for proc in self.pipeline: + for name, proc in self.pipeline: if not hasattr(proc, 'name'): continue - if proc.name in disable: + if name in disable: continue if not hasattr(proc, 'to_disk'): continue - serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) + serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) serializers['vocab'] = lambda p: self.vocab.to_disk(p) util.to_disk(path, serializers, {p: False for p in disable}) @@ -526,14 +561,12 @@ class Language(object): ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)), ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) )) - for proc in self.pipeline: - if not hasattr(proc, 'name'): - continue - if proc.name in disable: + for name, proc in self.pipeline: + if name in disable: continue if not hasattr(proc, 'to_disk'): continue - deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) + deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) exclude = {p: False for p in disable} if not (path / 'vocab').exists(): exclude['vocab'] = True @@ -552,8 +585,8 @@ class Language(object): ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)), ('meta', lambda: ujson.dumps(self.meta)) )) - for i, proc in enumerate(self.pipeline): - if getattr(proc, 'name', None) in disable: + for i, (name, proc) in enumerate(self.pipeline): + if name in disable: continue if not hasattr(proc, 'to_bytes'): continue @@ -572,8 +605,8 @@ class Language(object): ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)), ('meta', lambda b: self.meta.update(ujson.loads(b))) )) - for i, proc in enumerate(self.pipeline): - if getattr(proc, 'name', None) in disable: + for i, (name, proc) in enumerate(self.pipeline): + if name in disable: continue if not hasattr(proc, 'from_bytes'): continue diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 312c8db72..53519e4f1 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -10,20 +10,23 @@ class Lemmatizer(object): def load(cls, path, index=None, exc=None, rules=None): return cls(index or {}, exc or {}, rules or {}) - def __init__(self, index, exceptions, rules): - self.index = index - self.exc = exceptions - self.rules = rules + def __init__(self, index=None, exceptions=None, rules=None, lookup=None): + self.index = index if index is not None else {} + self.exc = exceptions if exceptions is not None else {} + self.rules = rules if rules is not None else {} + self.lookup_table = lookup if lookup is not None else {} def __call__(self, string, univ_pos, morphology=None): - if univ_pos == NOUN: + if univ_pos in (NOUN, 'NOUN', 'noun'): univ_pos = 'noun' - elif univ_pos == VERB: + elif univ_pos in (VERB, 'VERB', 'verb'): univ_pos = 'verb' - elif univ_pos == ADJ: + elif univ_pos in (ADJ, 'ADJ', 'adj'): univ_pos = 'adj' - elif univ_pos == PUNCT: + elif univ_pos in (PUNCT, 'PUNCT', 'punct'): univ_pos = 'punct' + else: + return set([string.lower()]) # See Issue #435 for example of where this logic is requied. if self.is_base_form(univ_pos, morphology): return set([string.lower()]) @@ -77,6 +80,11 @@ class Lemmatizer(object): def punct(self, string, morphology=None): return self(string, 'punct', morphology) + def lookup(self, string): + if string in self.lookup_table: + return self.lookup_table[string] + return string + def lemmatize(string, index, exceptions, rules): string = string.lower() diff --git a/spacy/lemmatizerlookup.py b/spacy/lemmatizerlookup.py deleted file mode 100644 index 0c0c693c1..000000000 --- a/spacy/lemmatizerlookup.py +++ /dev/null @@ -1,19 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from .lemmatizer import Lemmatizer - - -class Lemmatizer(Lemmatizer): - @classmethod - def load(cls, path, lookup): - return cls(lookup or {}) - - def __init__(self, lookup): - self.lookup = lookup - - def __call__(self, string, univ_pos, morphology=None): - try: - return set([self.lookup[string]]) - except: - return set([string]) \ No newline at end of file diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 922843d6d..be6711bfd 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -35,6 +35,8 @@ cdef class Morphology: cdef RichTagC* rich_tags cdef PreshMapArray _cache + cdef int assign_untagged(self, TokenC* token) except -1 + cdef int assign_tag(self, TokenC* token, tag) except -1 cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 5ee11c151..4a1a0aa54 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -42,7 +42,7 @@ cdef class Morphology: self.tag_names = tuple(sorted(tag_map.keys())) self.reverse_index = {} - self.rich_tags = self.mem.alloc(self.n_tags, sizeof(RichTagC)) + self.rich_tags = self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): self.tag_map[tag_str] = dict(attrs) attrs = _normalize_props(attrs) @@ -52,6 +52,10 @@ cdef class Morphology: self.rich_tags[i].morph = 0 self.rich_tags[i].pos = attrs[POS] self.reverse_index[self.rich_tags[i].name] = i + # Add a 'null' tag, which we can reference when assign morphology to + # untagged tokens. + self.rich_tags[self.n_tags].id = self.n_tags + self._cache = PreshMapArray(self.n_tags) self.exc = {} if exc is not None: @@ -62,6 +66,15 @@ cdef class Morphology: return (Morphology, (self.strings, self.tag_map, self.lemmatizer, self.exc), None, None) + cdef int assign_untagged(self, TokenC* token) except -1: + """Set morphological attributes on a token without a POS tag. Uses + the lemmatizer's lookup() method, which looks up the string in the + table provided by the language data as lemma_lookup (if available).""" + if token.lemma == 0: + orth_str = self.strings[token.lex.orth] + lemma = self.lemmatizer.lookup(orth_str) + token.lemma = self.strings.add(lemma) + cdef int assign_tag(self, TokenC* token, tag) except -1: if isinstance(tag, basestring): tag = self.strings.add(tag) @@ -72,7 +85,7 @@ cdef class Morphology: token.tag = tag cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: - if tag_id >= self.n_tags: + if tag_id > self.n_tags: raise ValueError("Unknown tag ID: %s" % tag_id) # TODO: It's pretty arbitrary to put this logic here. I guess the justification # is that this is where the specific word and the tag interact. Still, @@ -151,8 +164,6 @@ cdef class Morphology: cdef unicode py_string = self.strings[orth] if self.lemmatizer is None: return self.strings.add(py_string.lower()) - if univ_pos not in (NOUN, VERB, ADJ, PUNCT): - return self.strings.add(py_string.lower()) cdef set lemma_strings cdef unicode lemma_string lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index c39976630..5bb4b090e 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -28,6 +28,7 @@ from thinc.neural._classes.difference import Siamese, CauchySimilarity from .tokens.doc cimport Doc from .syntax.parser cimport Parser as LinearParser from .syntax.nn_parser cimport Parser as NeuralParser +from .syntax import nonproj from .syntax.parser import get_templates as get_feature_templates from .syntax.beam_parser cimport BeamParser from .syntax.ner cimport BiluoPushDown @@ -157,11 +158,13 @@ class BaseThincComponent(object): def to_bytes(self, **exclude): """Serialize the pipe to a bytestring.""" - serialize = OrderedDict(( - ('cfg', lambda: json_dumps(self.cfg)), - ('model', lambda: self.model.to_bytes()), - ('vocab', lambda: self.vocab.to_bytes()) - )) + serialize = OrderedDict() + serialize['cfg'] = lambda: json_dumps(self.cfg) + if self.model in (True, False, None): + serialize['model'] = lambda: self.model + else: + serialize['model'] = self.model.to_bytes + serialize['vocab'] = self.vocab.to_bytes return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, **exclude): @@ -182,11 +185,11 @@ class BaseThincComponent(object): def to_disk(self, path, **exclude): """Serialize the pipe to disk.""" - serialize = OrderedDict(( - ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))), - ('vocab', lambda p: self.vocab.to_disk(p)), - ('model', lambda p: p.open('wb').write(self.model.to_bytes())), - )) + serialize = OrderedDict() + serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg)) + serialize['vocab'] = lambda p: self.vocab.to_disk(p) + if self.model not in (None, True, False): + serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes()) util.to_disk(path, serialize, exclude) def from_disk(self, path, **exclude): @@ -437,13 +440,16 @@ class NeuralTagger(BaseThincComponent): yield def to_bytes(self, **exclude): - serialize = OrderedDict(( - ('model', lambda: self.model.to_bytes()), - ('vocab', lambda: self.vocab.to_bytes()), - ('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map, - use_bin_type=True, - encoding='utf8')) - )) + serialize = OrderedDict() + if self.model in (None, True, False): + serialize['model'] = lambda: self.model + else: + serialize['model'] = self.model.to_bytes + serialize['vocab'] = self.vocab.to_bytes + + serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map, + use_bin_type=True, + encoding='utf8') return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, **exclude): @@ -778,11 +784,19 @@ cdef class DependencyParser(LinearParser): if isinstance(label, basestring): label = self.vocab.strings[label] + @property + def postprocesses(self): + return [nonproj.deprojectivize] + cdef class NeuralDependencyParser(NeuralParser): name = 'parser' TransitionSystem = ArcEager + @property + def postprocesses(self): + return [nonproj.deprojectivize] + def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): for target in []: labeller = NeuralLabeller(self.vocab, target=target) @@ -823,6 +837,11 @@ cdef class BeamDependencyParser(BeamParser): if isinstance(label, basestring): label = self.vocab.strings[label] + @property + def postprocesses(self): + return [nonproj.deprojectivize] + + __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser', 'BeamEntityRecognizer', 'TokenVectorEnoder'] diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 65311f48a..1f4918935 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -241,8 +241,8 @@ cdef class Parser: def Model(cls, nr_class, **cfg): depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1)) token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128)) - hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128)) - parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 1)) + hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200)) + parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2)) embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000)) hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0)) hist_width = util.env_opt('history_width', cfg.get('hist_width', 0)) @@ -779,6 +779,14 @@ cdef class Parser: for i in range(doc.length): doc.c[i] = state.c._sent[i] self.moves.finalize_doc(doc) + for hook in self.postprocesses: + for doc in docs: + hook(doc) + + @property + def postprocesses(self): + # Available for subclasses, e.g. to deprojectivize + return [] def add_label(self, label): resized = False @@ -792,16 +800,25 @@ cdef class Parser: if self.model not in (True, False, None) and resized: # Weights are stored in (nr_out, nr_in) format, so we're basically # just adding rows here. - smaller = self.model[-1]._layers[-1] - larger = Affine(self.moves.n_moves, smaller.nI) - copy_array(larger.W[:smaller.nO], smaller.W) - copy_array(larger.b[:smaller.nO], smaller.b) - self.model[-1]._layers[-1] = larger + if self.model[-1].is_noop: + smaller = self.model[1] + dims = dict(self.model[1]._dims) + dims['nO'] = self.moves.n_moves + larger = self.model[1].__class__(**dims) + copy_array(larger.W[:, :smaller.nO], smaller.W) + copy_array(larger.b[:smaller.nO], smaller.b) + self.model = (self.model[0], larger, self.model[2]) + else: + smaller = self.model[-1]._layers[-1] + larger = Affine(self.moves.n_moves, smaller.nI) + copy_array(larger.W[:smaller.nO], smaller.W) + copy_array(larger.b[:smaller.nO], smaller.b) + self.model[-1]._layers[-1] = larger def begin_training(self, gold_tuples, pipeline=None, **cfg): if 'model' in cfg: self.model = cfg['model'] - gold_tuples = nonproj.preprocess_training_data(gold_tuples) + gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100) actions = self.moves.get_actions(gold_parses=gold_tuples) for action, labels in actions.items(): for label in labels: diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b33a7c008..28b5f4ab9 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -58,8 +58,9 @@ def en_vocab(): @pytest.fixture -def en_parser(): - return util.get_lang_class('en').Defaults.create_parser() +def en_parser(en_vocab): + nlp = util.get_lang_class('en')(en_vocab) + return nlp.create_pipe('parser') @pytest.fixture diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py new file mode 100644 index 000000000..c14fdfbe9 --- /dev/null +++ b/spacy/tests/doc/test_creation.py @@ -0,0 +1,37 @@ +'''Test Doc sets up tokens correctly.''' +from __future__ import unicode_literals +import pytest + +from ...vocab import Vocab +from ...tokens.doc import Doc +from ...lemmatizer import Lemmatizer + + +@pytest.fixture +def lemmatizer(): + return Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'}) + + +@pytest.fixture +def vocab(lemmatizer): + return Vocab(lemmatizer=lemmatizer) + + +def test_empty_doc(vocab): + doc = Doc(vocab) + assert len(doc) == 0 + + +def test_single_word(vocab): + doc = Doc(vocab, words=['a']) + assert doc.text == 'a ' + doc = Doc(vocab, words=['a'], spaces=[False]) + assert doc.text == 'a' + + +def test_lookup_lemmatization(vocab): + doc = Doc(vocab, words=['dogs', 'dogses']) + assert doc[0].text == 'dogs' + assert doc[0].lemma_ == 'dog' + assert doc[1].text == 'dogses' + assert doc[1].lemma_ == 'dogses' diff --git a/spacy/tests/lang/de/test_lemma.py b/spacy/tests/lang/de/test_lemma.py new file mode 100644 index 000000000..39b3b0313 --- /dev/null +++ b/spacy/tests/lang/de/test_lemma.py @@ -0,0 +1,13 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('string,lemma', [('Abgehängten', 'Abgehängte'), + ('engagierte', 'engagieren'), + ('schließt', 'schließen'), + ('vorgebenden', 'vorgebend')]) +def test_lemmatizer_lookup_assigns(de_tokenizer, string, lemma): + tokens = de_tokenizer(string) + assert tokens[0].lemma_ == lemma diff --git a/spacy/tests/lang/en/test_lemmatizer.py b/spacy/tests/lang/en/test_lemmatizer.py index ecde87bed..22c8f2499 100644 --- a/spacy/tests/lang/en/test_lemmatizer.py +++ b/spacy/tests/lang/en/test_lemmatizer.py @@ -57,6 +57,5 @@ def test_en_lemmatizer_punct(en_lemmatizer): def test_en_lemmatizer_lemma_assignment(EN): text = "Bananas in pyjamas are geese." doc = EN.make_doc(text) - assert all(t.lemma_ == '' for t in doc) EN.tagger(doc) assert all(t.lemma_ != '' for t in doc) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index b89cca113..3fbfc96a6 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -22,14 +22,14 @@ def vocab(): @pytest.fixture def parser(vocab): parser = NeuralDependencyParser(vocab) - parser.cfg['token_vector_width'] = 4 - parser.cfg['hidden_width'] = 6 + parser.cfg['token_vector_width'] = 8 + parser.cfg['hidden_width'] = 30 parser.cfg['hist_size'] = 0 parser.add_label('left') parser.begin_training([], **parser.cfg) sgd = Adam(NumpyOps(), 0.001) - for i in range(30): + for i in range(10): losses = {} doc = Doc(vocab, words=['a', 'b', 'c', 'd']) gold = GoldParse(doc, heads=[1, 1, 3, 3], @@ -37,6 +37,8 @@ def parser(vocab): parser.update([doc], [gold], sgd=sgd, losses=losses) return parser +def test_init_parser(parser): + pass def test_add_label(parser): doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) diff --git a/spacy/tests/parser/test_beam_parse.py b/spacy/tests/parser/test_beam_parse.py index da5f43d5e..dd77c6805 100644 --- a/spacy/tests/parser/test_beam_parse.py +++ b/spacy/tests/parser/test_beam_parse.py @@ -1,10 +1,11 @@ -import spacy +# coding: utf8 +from __future__ import unicode_literals + import pytest -@pytest.mark.models -def test_beam_parse(): - nlp = spacy.load('en_core_web_sm') - doc = nlp(u'Australia is a country', disable=['ner']) - ents = nlp.entity(doc, beam_width=2) - print(ents) +@pytest.mark.models('en') +def test_beam_parse(EN): + doc = EN(u'Australia is a country', disable=['ner']) + ents = EN.entity(doc, beam_width=2) + print(ents) diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 77326f797..4c973bd97 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -35,7 +35,7 @@ def parser(vocab): def test_no_sentences(parser): doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) doc = parser(doc) - assert len(list(doc.sents)) == 2 + assert len(list(doc.sents)) >= 1 def test_sents_1(parser): @@ -64,7 +64,7 @@ def test_sents_1_3(parser): doc[1].sent_start = True doc[3].sent_start = True doc = parser(doc) - assert len(list(doc.sents)) == 4 + assert len(list(doc.sents)) >= 3 doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) doc[1].sent_start = True doc[2].sent_start = False diff --git a/spacy/tests/pipeline/__init__.py b/spacy/tests/pipeline/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py new file mode 100644 index 000000000..5ec78aefb --- /dev/null +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -0,0 +1,84 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + +from ...language import Language + + +@pytest.fixture +def nlp(): + return Language() + + +def new_pipe(doc): + return doc + + +def test_add_pipe_no_name(nlp): + nlp.add_pipe(new_pipe) + assert 'new_pipe' in nlp.pipe_names + + +def test_add_pipe_duplicate_name(nlp): + nlp.add_pipe(new_pipe, name='duplicate_name') + with pytest.raises(ValueError): + nlp.add_pipe(new_pipe, name='duplicate_name') + + +@pytest.mark.parametrize('name', ['parser']) +def test_add_pipe_first(nlp, name): + nlp.add_pipe(new_pipe, name=name, first=True) + assert nlp.pipeline[0][0] == name + + +@pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')]) +def test_add_pipe_last(nlp, name1, name2): + nlp.add_pipe(lambda doc: doc, name=name2) + nlp.add_pipe(new_pipe, name=name1, last=True) + assert nlp.pipeline[0][0] != name1 + assert nlp.pipeline[-1][0] == name1 + + +def test_cant_add_pipe_first_and_last(nlp): + with pytest.raises(ValueError): + nlp.add_pipe(new_pipe, first=True, last=True) + + +@pytest.mark.parametrize('name', ['my_component']) +def test_get_pipe(nlp, name): + with pytest.raises(KeyError): + nlp.get_pipe(name) + nlp.add_pipe(new_pipe, name=name) + assert nlp.get_pipe(name) == new_pipe + + +@pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)]) +def test_replace_pipe(nlp, name, replacement): + with pytest.raises(ValueError): + nlp.replace_pipe(name, new_pipe) + nlp.add_pipe(new_pipe, name=name) + nlp.replace_pipe(name, replacement) + assert nlp.get_pipe(name) != new_pipe + assert nlp.get_pipe(name) == replacement + + +@pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')]) +def test_rename_pipe(nlp, old_name, new_name): + with pytest.raises(ValueError): + nlp.rename_pipe(old_name, new_name) + nlp.add_pipe(new_pipe, name=old_name) + nlp.rename_pipe(old_name, new_name) + assert nlp.pipeline[0][0] == new_name + + +@pytest.mark.parametrize('name', ['my_component']) +def test_remove_pipe(nlp, name): + with pytest.raises(ValueError): + nlp.remove_pipe(name) + nlp.add_pipe(new_pipe, name=name) + assert len(nlp.pipeline) == 1 + removed_name, removed_component = nlp.remove_pipe(name) + assert not len(nlp.pipeline) + assert removed_name == name + assert removed_component == new_pipe diff --git a/spacy/tests/regression/test_issue589.py b/spacy/tests/regression/test_issue589.py index 27363739d..96ea4be61 100644 --- a/spacy/tests/regression/test_issue589.py +++ b/spacy/tests/regression/test_issue589.py @@ -7,6 +7,7 @@ from ..util import get_doc import pytest +@pytest.mark.xfail def test_issue589(): vocab = Vocab() vocab.strings.set_frozen(True) diff --git a/spacy/tests/serialize/test_serialize_empty_model.py b/spacy/tests/serialize/test_serialize_empty_model.py new file mode 100644 index 000000000..b614a3648 --- /dev/null +++ b/spacy/tests/serialize/test_serialize_empty_model.py @@ -0,0 +1,9 @@ +import spacy +import spacy.lang.en +from spacy.pipeline import TextCategorizer + +def test_bytes_serialize_issue_1105(): + nlp = spacy.lang.en.English() + tokenizer = nlp.tokenizer + textcat = TextCategorizer(tokenizer.vocab, labels=['ENTITY', 'ACTION', 'MODIFIER']) + textcat_bytes = textcat.to_bytes() diff --git a/spacy/tests/test_underscore.py b/spacy/tests/test_underscore.py new file mode 100644 index 000000000..c7df57b62 --- /dev/null +++ b/spacy/tests/test_underscore.py @@ -0,0 +1,53 @@ +from mock import Mock +from ..tokens.underscore import Underscore + + +def test_create_doc_underscore(): + doc = Mock() + doc.doc = doc + uscore = Underscore(Underscore.doc_extensions, doc) + assert uscore._doc is doc + assert uscore._start is None + assert uscore._end is None + + +def test_doc_underscore_getattr_setattr(): + doc = Mock() + doc.doc = doc + doc.user_data = {} + Underscore.doc_extensions['hello'] = (False, None, None, None) + doc._ = Underscore(Underscore.doc_extensions, doc) + assert doc._.hello == False + doc._.hello = True + assert doc._.hello == True + + +def test_create_span_underscore(): + span = Mock(doc=Mock(), start=0, end=2) + uscore = Underscore(Underscore.span_extensions, span, + start=span.start, end=span.end) + assert uscore._doc is span.doc + assert uscore._start is span.start + assert uscore._end is span.end + + +def test_span_underscore_getter_setter(): + span = Mock(doc=Mock(), start=0, end=2) + Underscore.span_extensions['hello'] = (None, None, + lambda s: (s.start, 'hi'), + lambda s, value: setattr(s, 'start', + value)) + span._ = Underscore(Underscore.span_extensions, span, + start=span.start, end=span.end) + + assert span._.hello == (0, 'hi') + span._.hello = 1 + assert span._.hello == (1, 'hi') + + +def test_token_underscore_method(): + token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: 'cheese') + Underscore.token_extensions['hello'] = (None, token.say_cheese, + None, None) + token._ = Underscore(Underscore.token_extensions, token, start=token.idx) + assert token._.hello() == 'cheese' diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index df75ab3ec..05d393d2b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -30,7 +30,7 @@ from ..util import normalize_slice from ..compat import is_config from .. import about from .. import util - +from .underscore import Underscore DEF PADDING = 5 @@ -64,6 +64,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: else: return Lexeme.get_struct_attr(token.lex, feat_name) + def _get_chunker(lang): try: cls = util.get_lang_class(lang) @@ -73,6 +74,7 @@ def _get_chunker(lang): return None return cls.Defaults.syntax_iterators.get(u'noun_chunks') + cdef class Doc: """A sequence of Token objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to compressed binary strings. @@ -87,6 +89,21 @@ cdef class Doc: >>> from spacy.tokens import Doc >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False]) """ + @classmethod + def set_extension(cls, name, default=None, method=None, + getter=None, setter=None): + nr_defined = sum(t is not None for t in (default, getter, setter, method)) + assert nr_defined == 1 + Underscore.doc_extensions[name] = (default, method, getter, setter) + + @classmethod + def get_extension(cls, name): + return Underscore.doc_extensions.get(name) + + @classmethod + def has_extension(cls, name): + return name in Underscore.doc_extensions + def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): """Create a Doc object. @@ -159,6 +176,10 @@ cdef class Doc: self.is_tagged = True self.is_parsed = True + @property + def _(self): + return Underscore(Underscore.doc_extensions, self) + def __getitem__(self, object i): """Get a `Token` or `Span` object. @@ -512,6 +533,8 @@ cdef class Doc: assert t.lex.orth != 0 t.spacy = has_space self.length += 1 + # Set morphological attributes, e.g. by lemma, if possible + self.vocab.morphology.assign_untagged(t) self._py_tokens.append(None) return t.idx + t.lex.length + t.spacy diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index c6bb1a0bb..3b31c50c0 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -17,10 +17,24 @@ from ..attrs cimport IS_PUNCT, IS_SPACE from ..lexeme cimport Lexeme from ..compat import is_config from .. import about +from .underscore import Underscore cdef class Span: """A slice from a Doc object.""" + @classmethod + def set_extension(cls, name, default=None, method=None, + getter=None, setter=None): + Underscore.span_extensions[name] = (default, method, getter, setter) + + @classmethod + def get_extension(cls, name): + return Underscore.span_extensions.get(name) + + @classmethod + def has_extension(cls, name): + return name in Underscore.span_extensions + def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None, vector_norm=None): """Create a `Span` object from the slice `doc[start : end]`. @@ -111,10 +125,14 @@ cdef class Span: for i in range(self.start, self.end): yield self.doc[i] + @property + def _(self): + return Underscore(Underscore.span_extensions, self, + start=self.start_char, end=self.end_char) def as_doc(self): '''Create a Doc object view of the Span's data. - This is mostly useful for C-typed interfaces. + This is mostly useful for C-typed interfaces. ''' cdef Doc doc = Doc(self.doc.vocab) doc.length = self.end-self.start diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 78ba920dd..9ff59eabe 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -20,10 +20,24 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST from ..attrs cimport LEMMA, POS, TAG, DEP from ..compat import is_config from .. import about +from .underscore import Underscore cdef class Token: """An individual token – i.e. a word, punctuation symbol, whitespace, etc.""" + @classmethod + def set_extension(cls, name, default=None, method=None, + getter=None, setter=None): + Underscore.token_extensions[name] = (default, method, getter, setter) + + @classmethod + def get_extension(cls, name): + return Underscore.span_extensions.get(name) + + @classmethod + def has_extension(cls, name): + return name in Underscore.span_extensions + def __cinit__(self, Vocab vocab, Doc doc, int offset): """Construct a `Token` object. @@ -87,6 +101,11 @@ cdef class Token: else: raise ValueError(op) + @property + def _(self): + return Underscore(Underscore.token_extensions, self, + start=self.idx, end=None) + cpdef bint check_flag(self, attr_id_t flag_id) except -1: """Check the value of a boolean flag. @@ -266,7 +285,7 @@ cdef class Token: def __get__(self): if 'vector_norm' in self.doc.user_token_hooks: return self.doc.user_token_hooks['vector_norm'](self) - vector = self.vector + vector = self.vector return numpy.sqrt((vector ** 2).sum()) property n_lefts: diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py new file mode 100644 index 000000000..6e782647b --- /dev/null +++ b/spacy/tokens/underscore.py @@ -0,0 +1,50 @@ +import functools + +class Underscore(object): + doc_extensions = {} + span_extensions = {} + token_extensions = {} + + def __init__(self, extensions, obj, start=None, end=None): + object.__setattr__(self, '_extensions', extensions) + object.__setattr__(self, '_obj', obj) + # Assumption is that for doc values, _start and _end will both be None + # Span will set non-None values for _start and _end + # Token will have _start be non-None, _end be None + # This lets us key everything into the doc.user_data dictionary, + # (see _get_key), and lets us use a single Underscore class. + object.__setattr__(self, '_doc', obj.doc) + object.__setattr__(self, '_start', start) + object.__setattr__(self, '_end', end) + + def __getattr__(self, name): + if name not in self._extensions: + raise AttributeError(name) + default, method, getter, setter = self._extensions[name] + if getter is not None: + return getter(self._obj) + elif method is not None: + return functools.partial(method, self._obj) + else: + return self._doc.user_data.get(self._get_key(name), default) + + def __setattr__(self, name, value): + if name not in self._extensions: + raise AttributeError(name) + default, method, getter, setter = self._extensions[name] + if setter is not None: + return setter(self._obj, value) + else: + self._doc.user_data[self._get_key(name)] = value + + def set(self, name, value): + return self.__setattr__(name, value) + + def get(self, name): + return self.__getattr__(name) + + def has(self, name): + return name in self._extensions + + def _get_key(self, name): + return ('._.', name, self._start, self._end) diff --git a/spacy/util.py b/spacy/util.py index e1a721a12..50ebc036b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -135,7 +135,18 @@ def load_model_from_path(model_path, meta=False, **overrides): if not meta: meta = get_model_meta(model_path) cls = get_lang_class(meta['lang']) - nlp = cls(pipeline=meta.get('pipeline', True), meta=meta, **overrides) + nlp = cls(meta=meta, **overrides) + pipeline = meta.get('pipeline', []) + disable = overrides.get('disable', []) + if pipeline is True: + pipeline = nlp.Defaults.pipe_names + elif pipeline in (False, None): + pipeline = [] + for name in pipeline: + if name not in disable: + config = meta.get('pipeline_args', {}).get(name, {}) + component = nlp.create_pipe(name, config=config) + nlp.add_pipe(component, name=name) return nlp.from_disk(model_path) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 4876c6b6b..7666889b5 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -149,6 +149,10 @@ mixin code(label, language, prompt, height, icon, wrap) //- Code blocks to display old/new versions +mixin code-wrapper() + span.u-inline-block.u-padding-top.u-width-full + block + mixin code-old() +code(false, false, false, false, "reject").o-block-small block diff --git a/website/api/_top-level/_cli.jade b/website/api/_top-level/_cli.jade index 3a4b4702a..b2a9c574d 100644 --- a/website/api/_top-level/_cli.jade +++ b/website/api/_top-level/_cli.jade @@ -113,6 +113,22 @@ p +cell flag +cell Show help message and available arguments. ++h(3, "validate") Validate + +tag-new(2) + +p + | Find all models installed in the current environment (both packages and + | shortcut links) and check whether they are compatible with the currently + | installed version of spaCy. Should be run after upgrading spaCy via + | #[code pip install -U spacy] to ensure that all installed models are + | can be used with the new version. The command is also useful to detect + | out-of-sync model links resulting from links created in different virtual + | environments. Prints a list of models, the installed versions, the latest + | compatible version (if out of date) and the commands for updating. + ++code(false, "bash", "$"). + spacy validate + +h(3, "convert") Convert p diff --git a/website/api/_top-level/_spacy.jade b/website/api/_top-level/_spacy.jade index c14f62f7e..81ec744ad 100644 --- a/website/api/_top-level/_spacy.jade +++ b/website/api/_top-level/_spacy.jade @@ -43,6 +43,20 @@ p +cell #[code Language] +cell A #[code Language] object with the loaded model. +p + | Essentially, #[code spacy.load()] is a convenience wrapper that reads + | the language ID and pipeline components from a model's #[code meta.json], + | initialises the #[code Language] class, loads in the model data and + | returns it. + ++code("Abstract example"). + cls = util.get_lang_class(lang) # get language for ID, e.g. 'en' + nlp = cls() # initialise the language + for name in pipeline: + component = nlp.create_pipe(name) # create each pipeline component + nlp.add_pipe(component) # add component to pipeline + nlp.from_disk(model_data_path) # load in model data + +infobox("Deprecation note", "⚠️") .o-block | As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy @@ -141,37 +155,3 @@ p +cell returns +cell unicode +cell The explanation, or #[code None] if not found in the glossary. - -+h(3, "spacy.set_factory") spacy.set_factory - +tag function - +tag-new(2) - -p - | Set a factory that returns a custom - | #[+a("/usage/processing-pipelines") processing pipeline] - | component. Factories are useful for creating stateful components, especially ones which depend on shared data. - -+aside-code("Example"). - def my_factory(vocab): - def my_component(doc): - return doc - return my_component - - spacy.set_factory('my_factory', my_factory) - nlp = Language(pipeline=['my_factory']) - -+table(["Name", "Type", "Description"]) - +row - +cell #[code factory_id] - +cell unicode - +cell - | Unique name of factory. If added to a new pipeline, spaCy will - | look up the factory for this ID and use it to create the - | component. - - +row - +cell #[code factory] - +cell callable - +cell - | Callable that takes a #[code Vocab] object and returns a pipeline - | component. diff --git a/website/api/doc.jade b/website/api/doc.jade index 85932c605..dce6b89e0 100644 --- a/website/api/doc.jade +++ b/website/api/doc.jade @@ -138,6 +138,109 @@ p Get the number of tokens in the document. +cell int +cell The number of tokens in the document. ++h(2, "set_extension") Doc.set_extension + +tag classmethod + +tag-new(2) + +p + | Define a custom attribute on the #[code Doc] which becomes available via + | #[code Doc._]. For details, see the documentation on + | #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes]. + ++aside-code("Example"). + from spacy.tokens import Doc + city_getter = lambda doc: doc.text in ('New York', 'Paris', 'Berlin') + Doc.set_extension('has_city', getter=city_getter) + doc = nlp(u'I like New York') + assert doc._.has_city + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell + | Name of the attribute to set by the extension. For example, + | #[code 'my_attr'] will be available as #[code doc._.my_attr]. + + +row + +cell #[code default] + +cell - + +cell + | Optional default value of the attribute if no getter or method + | is defined. + + +row + +cell #[code method] + +cell callable + +cell + | Set a custom method on the object, for example + | #[code doc._.compare(other_doc)]. + + +row + +cell #[code getter] + +cell callable + +cell + | Getter function that takes the object and returns an attribute + | value. Is called when the user accesses the #[code ._] attribute. + + +row + +cell #[code setter] + +cell callable + +cell + | Setter function that takes the #[code Doc] and a value, and + | modifies the object. Is called when the user writes to the + | #[code Doc._] attribute. + ++h(2, "get_extension") Doc.get_extension + +tag classmethod + +tag-new(2) + +p + | Look up a previously registered extension by name. Returns a 4-tuple + | #[code.u-break (default, method, getter, setter)] if the extension is + | registered. Raises a #[code KeyError] otherwise. + ++aside-code("Example"). + from spacy.tokens import Doc + Doc.set_extension('is_city', default=False) + extension = Doc.get_extension('is_city') + assert extension == (False, None, None, None) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the extension. + + +row("foot") + +cell returns + +cell tuple + +cell + | A #[code.u-break (default, method, getter, setter)] tuple of the + | extension. + ++h(2, "has_extension") Doc.has_extension + +tag classmethod + +tag-new(2) + +p Check whether an extension has been registered on the #[code Doc] class. + ++aside-code("Example"). + from spacy.tokens import Doc + Doc.set_extension('is_city', default=False) + assert Doc.has_extension('is_city') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the extension to check. + + +row("foot") + +cell returns + +cell bool + +cell Whether the extension has been registered. + +h(2, "char_span") Doc.char_span +tag method +tag-new(2) diff --git a/website/api/language.jade b/website/api/language.jade index 617c81599..500d6c411 100644 --- a/website/api/language.jade +++ b/website/api/language.jade @@ -4,7 +4,14 @@ include ../_includes/_mixins p | Usually you'll load this once per process as #[code nlp] and pass the - | instance around your application. + | instance around your application. The #[code Language] class is created + | when you call #[+api("spacy#load") #[code spacy.load()]] and contains + | the shared vocabulary and #[+a("/usage/adding-languages") language data], + | optional model data loaded from a #[+a("/models") model package] or + | a path, and a #[+a("/usage/processing-pipelines") processing pipeline] + | containing components like the tagger or parser that are called on a + | document in order. You can also add your own processing pipeline + | components that take a #[code Doc] object, modify it and return it. +h(2, "init") Language.__init__ +tag method @@ -12,9 +19,9 @@ p p Initialise a #[code Language] object. +aside-code("Example"). + from spacy.vocab import Vocab from spacy.language import Language - nlp = Language(pipeline=['token_vectors', 'tags', - 'dependencies']) + nlp = Language(Vocab()) from spacy.lang.en import English nlp = English() @@ -34,14 +41,6 @@ p Initialise a #[code Language] object. | A function that takes text and returns a #[code Doc] object. | Usually a #[code Tokenizer]. - +row - +cell #[code pipeline] - +cell list - +cell - | A list of annotation processes or IDs of annotation, processes, - | e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked - | up in #[code Language.Defaults.factories]. - +row +cell #[code meta] +cell dict @@ -235,7 +234,6 @@ p | Can be called before training to pre-process gold data. By default, it | handles nonprojectivity and adds missing tags to the tag map. - +table(["Name", "Type", "Description"]) +row +cell #[code docs_golds] @@ -247,6 +245,177 @@ p +cell tuple +cell Tuples of #[code Doc] and #[code GoldParse] objects. ++h(2, "create_pipe") Language.create_pipe + +tag method + +tag-new(2) + +p Create a pipeline component from a factory. + ++aside-code("Example"). + parser = nlp.create_pipe('parser') + nlp.add_pipe(parser) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell + | Factory name to look up in + | #[+api("language#class-attributes") #[code Language.factories]]. + + +row + +cell #[code config] + +cell dict + +cell Configuration parameters to initialise component. + + +row("foot") + +cell returns + +cell callable + +cell The pipeline component. + ++h(2, "add_pipe") Language.add_pipe + +tag method + +tag-new(2) + +p + | Add a component to the processing pipeline. Valid components are + | callables that take a #[code Doc] object, modify it and return it. Only + | one of #[code before], #[code after], #[code first] or #[code last] can + | be set. Default behaviour is #[code last=True]. + ++aside-code("Example"). + def component(doc): + # modify Doc and return it + return doc + + nlp.add_pipe(component, before='ner') + nlp.add_pipe(component, name='custom_name', last=True) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code component] + +cell callable + +cell The pipeline component. + + +row + +cell #[code name] + +cell unicode + +cell + | Name of pipeline component. Overwrites existing + | #[code component.name] attribute if available. If no #[code name] + | is set and the component exposes no name attribute, + | #[code component.__name__] is used. An error is raised if the + | name already exists in the pipeline. + + +row + +cell #[code before] + +cell unicode + +cell Component name to insert component directly before. + + +row + +cell #[code after] + +cell unicode + +cell Component name to insert component directly after: + + +row + +cell #[code first] + +cell bool + +cell Insert component first / not first in the pipeline. + + +row + +cell #[code last] + +cell bool + +cell Insert component last / not last in the pipeline. + ++h(2, "get_pipe") Language.get_pipe + +tag method + +tag-new(2) + +p Get a pipeline component for a given component name. + ++aside-code("Example"). + parser = nlp.get_pipe('parser') + custom_component = nlp.get_pipe('custom_component') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the pipeline component to get. + + +row("foot") + +cell returns + +cell callable + +cell The pipeline component. + ++h(2, "replace_pipe") Language.replace_pipe + +tag method + +tag-new(2) + +p Replace a component in the pipeline. + ++aside-code("Example"). + nlp.replace_pipe('parser', my_custom_parser) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the component to replace. + + +row + +cell #[code component] + +cell callable + +cell The pipeline component to inser. + + ++h(2, "rename_pipe") Language.rename_pipe + +tag method + +tag-new(2) + +p + | Rename a component in the pipeline. Useful to create custom names for + | pre-defined and pre-loaded components. To change the default name of + | a component added to the pipeline, you can also use the #[code name] + | argument on #[+api("language#add_pipe") #[code add_pipe]]. + ++aside-code("Example"). + nlp.rename_pipe('parser', 'spacy_parser') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code old_name] + +cell unicode + +cell Name of the component to rename. + + +row + +cell #[code new_name] + +cell unicode + +cell New name of the component. + ++h(2, "remove_pipe") Language.remove_pipe + +tag method + +tag-new(2) + +p + | Remove a component from the pipeline. Returns the removed component name + | and component function. + ++aside-code("Example"). + name, component = nlp.remove_pipe('parser') + assert name == 'parser' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the component to remove. + + +row("foot") + +cell returns + +cell tuple + +cell A #[code (name, component)] tuple of the removed component. + +h(2, "to_disk") Language.to_disk +tag method +tag-new(2) @@ -399,7 +568,15 @@ p Load state from a binary string. +row +cell #[code pipeline] +cell list - +cell Sequence of annotation functions. + +cell + | List of #[code (name, component)] tuples describing the current + | processing pipeline, in order. + + +row + +cell #[code pipe_names] + +tag-new(2) + +cell list + +cell List of pipeline component names, in order. +row +cell #[code meta] @@ -424,3 +601,12 @@ p Load state from a binary string. +cell | Two-letter language ID, i.e. | #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]. + + +row + +cell #[code factories] + +tag-new(2) + +cell dict + +cell + | Factories that create pre-defined pipeline components, e.g. the + | tagger, parser or entity recognizer, keyed by their component + | name. diff --git a/website/api/span.jade b/website/api/span.jade index 067e709f0..6bff45a9b 100644 --- a/website/api/span.jade +++ b/website/api/span.jade @@ -116,6 +116,109 @@ p Get the number of tokens in the span. +cell int +cell The number of tokens in the span. ++h(2, "set_extension") Span.set_extension + +tag classmethod + +tag-new(2) + +p + | Define a custom attribute on the #[code Span] which becomes available via + | #[code Span._]. For details, see the documentation on + | #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes]. + ++aside-code("Example"). + from spacy.tokens import Span + city_getter = lambda span: span.text in ('New York', 'Paris', 'Berlin') + Span.set_extension('has_city', getter=city_getter) + doc = nlp(u'I like New York in Autumn') + assert doc[1:4]._.has_city + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell + | Name of the attribute to set by the extension. For example, + | #[code 'my_attr'] will be available as #[code span._.my_attr]. + + +row + +cell #[code default] + +cell - + +cell + | Optional default value of the attribute if no getter or method + | is defined. + + +row + +cell #[code method] + +cell callable + +cell + | Set a custom method on the object, for example + | #[code span._.compare(other_span)]. + + +row + +cell #[code getter] + +cell callable + +cell + | Getter function that takes the object and returns an attribute + | value. Is called when the user accesses the #[code ._] attribute. + + +row + +cell #[code setter] + +cell callable + +cell + | Setter function that takes the #[code Span] and a value, and + | modifies the object. Is called when the user writes to the + | #[code Span._] attribute. + ++h(2, "get_extension") Span.get_extension + +tag classmethod + +tag-new(2) + +p + | Look up a previously registered extension by name. Returns a 4-tuple + | #[code.u-break (default, method, getter, setter)] if the extension is + | registered. Raises a #[code KeyError] otherwise. + ++aside-code("Example"). + from spacy.tokens import Span + Span.set_extension('is_city', default=False) + extension = Span.get_extension('is_city') + assert extension == (False, None, None, None) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the extension. + + +row("foot") + +cell returns + +cell tuple + +cell + | A #[code.u-break (default, method, getter, setter)] tuple of the + | extension. + ++h(2, "has_extension") Span.has_extension + +tag classmethod + +tag-new(2) + +p Check whether an extension has been registered on the #[code Span] class. + ++aside-code("Example"). + from spacy.tokens import Span + Span.set_extension('is_city', default=False) + assert Span.has_extension('is_city') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the extension to check. + + +row("foot") + +cell returns + +cell bool + +cell Whether the extension has been registered. + +h(2, "similarity") Span.similarity +tag method +tag-model("vectors") diff --git a/website/api/token.jade b/website/api/token.jade index 814a13310..465d44c66 100644 --- a/website/api/token.jade +++ b/website/api/token.jade @@ -51,6 +51,109 @@ p The number of unicode characters in the token, i.e. #[code token.text]. +cell int +cell The number of unicode characters in the token. ++h(2, "set_extension") Token.set_extension + +tag classmethod + +tag-new(2) + +p + | Define a custom attribute on the #[code Token] which becomes available + | via #[code Token._]. For details, see the documentation on + | #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes]. + ++aside-code("Example"). + from spacy.tokens import Token + fruit_getter = lambda token: token.text in ('apple', 'pear', 'banana') + Token.set_extension('is_fruit', getter=fruit_getter) + doc = nlp(u'I have an apple') + assert doc[3]._.is_fruit + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell + | Name of the attribute to set by the extension. For example, + | #[code 'my_attr'] will be available as #[code token._.my_attr]. + + +row + +cell #[code default] + +cell - + +cell + | Optional default value of the attribute if no getter or method + | is defined. + + +row + +cell #[code method] + +cell callable + +cell + | Set a custom method on the object, for example + | #[code token._.compare(other_token)]. + + +row + +cell #[code getter] + +cell callable + +cell + | Getter function that takes the object and returns an attribute + | value. Is called when the user accesses the #[code ._] attribute. + + +row + +cell #[code setter] + +cell callable + +cell + | Setter function that takes the #[code Token] and a value, and + | modifies the object. Is called when the user writes to the + | #[code Token._] attribute. + ++h(2, "get_extension") Token.get_extension + +tag classmethod + +tag-new(2) + +p + | Look up a previously registered extension by name. Returns a 4-tuple + | #[code.u-break (default, method, getter, setter)] if the extension is + | registered. Raises a #[code KeyError] otherwise. + ++aside-code("Example"). + from spacy.tokens import Token + Token.set_extension('is_fruit', default=False) + extension = Token.get_extension('is_fruit') + assert extension == (False, None, None, None) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the extension. + + +row("foot") + +cell returns + +cell tuple + +cell + | A #[code.u-break (default, method, getter, setter)] tuple of the + | extension. + ++h(2, "has_extension") Token.has_extension + +tag classmethod + +tag-new(2) + +p Check whether an extension has been registered on the #[code Token] class. + ++aside-code("Example"). + from spacy.tokens import Token + Token.set_extension('is_fruit', default=False) + assert Token.has_extension('is_fruit') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the extension to check. + + +row("foot") + +cell returns + +cell bool + +cell Whether the extension has been registered. + +h(2, "check_flag") Token.check_flag +tag method diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass index e2ba552b7..91a6251e6 100644 --- a/website/assets/css/_base/_utilities.sass +++ b/website/assets/css/_base/_utilities.sass @@ -143,6 +143,9 @@ //- Layout +.u-width-full + width: 100% + .u-float-left float: left margin-right: 1rem @@ -166,6 +169,9 @@ .u-padding-medium padding: 1.8rem +.u-padding-top + padding-top: 2rem + .u-inline-block display: inline-block diff --git a/website/assets/css/_components/_lists.sass b/website/assets/css/_components/_lists.sass index 2a933c95e..553af6578 100644 --- a/website/assets/css/_components/_lists.sass +++ b/website/assets/css/_components/_lists.sass @@ -25,7 +25,7 @@ display: inline-block font-size: 0.6em font-weight: bold - padding-right: 1.25rem + padding-right: 1em margin-left: -3.75rem text-align: right width: 2.5rem diff --git a/website/usage/_adding-languages/_language-data.jade b/website/usage/_adding-languages/_language-data.jade index 81a6d638e..dc86b7a03 100644 --- a/website/usage/_adding-languages/_language-data.jade +++ b/website/usage/_adding-languages/_language-data.jade @@ -456,24 +456,11 @@ p } p - | To add a lookup lemmatizer to your language, import the #[code LOOKUP] - | table and #[code Lemmatizer], and create a new classmethod: + | To provide a lookup lemmatizer for your language, import the lookup table + | and add it to the #[code Language] class as #[code lemma_lookup]: - -+code("__init__py (excerpt)"). - # other imports here, plus lookup table and lookup lemmatizer - from .lemmatizer import LOOKUP - from ...lemmatizerlookup import Lemmatizer - - class Xxxxx(Language): - lang = 'xx' - - class Defaults(Language.Defaults): - # other language defaults here - - @classmethod - def create_lemmatizer(cls, nlp=None): - return Lemmatizer(LOOKUP) ++code. + lemma_lookup = dict(LOOKUP) +h(3, "tag-map") Tag map diff --git a/website/usage/_data.json b/website/usage/_data.json index b34304ed6..06b0371ae 100644 --- a/website/usage/_data.json +++ b/website/usage/_data.json @@ -103,10 +103,10 @@ "title": "Language Processing Pipelines", "next": "vectors-similarity", "menu": { - "How pipelines work": "pipelines", - "Examples": "examples", + "How Pipelines Work": "pipelines", + "Custom Components": "custom-components", + "Developing Extensions": "extensions", "Multi-threading": "multithreading", - "User Hooks": "user-hooks", "Serialization": "serialization" } }, @@ -195,6 +195,7 @@ "teaser": "Full code examples you can modify and run.", "next": "resources", "menu": { + "Pipeline": "pipeline", "Matching": "matching", "Training": "training", "Deep Learning": "deep-learning" diff --git a/website/usage/_processing-pipelines/_custom-components.jade b/website/usage/_processing-pipelines/_custom-components.jade new file mode 100644 index 000000000..ea3ea9b97 --- /dev/null +++ b/website/usage/_processing-pipelines/_custom-components.jade @@ -0,0 +1,369 @@ +//- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS + +p + | A component receives a #[code Doc] object and can modify it – for example, + | by using the current weights to make a prediction and set some annotation + | on the document. By adding a component to the pipeline, you'll get access + | to the #[code Doc] at any point #[strong during processing] – instead of + | only being able to modify it afterwards. + ++aside-code("Example"). + def my_component(doc): + # do something to the doc here + return doc + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code doc] + +cell #[code Doc] + +cell The #[code Doc] object processed by the previous component. + + +row("foot") + +cell returns + +cell #[code Doc] + +cell The #[code Doc] object processed by this pipeline component. + +p + | Custom components can be added to the pipeline using the + | #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you + | can either specify a component to add it #[strong before or after], tell + | spaCy to add it #[strong first or last] in the pipeline, or define a + | #[strong custom name]. If no name is set and no #[code name] attribute + | is present on your component, the function name is used. + ++code("Adding pipeline components"). + def my_component(doc): + print("After tokenization, this doc has %s tokens." % len(doc)) + if len(doc) < 10: + print("This is a pretty short document.") + return doc + + nlp = spacy.load('en') + nlp.pipeline.add_pipe(my_component, name='print_info', first=True) + print(nlp.pipe_names) # ['print_info', 'tagger', 'parser', 'ner'] + doc = nlp(u"This is a sentence.") + +p + | Of course, you can also wrap your component as a class to allow + | initialising it with custom settings and hold state within the component. + | This is useful for #[strong stateful components], especially ones which + | #[strong depend on shared data]. + ++code. + class MyComponent(object): + name = 'print_info' + + def __init__(vocab, short_limit=10): + self.vocab = nlp.vocab + self.short_limit = short_limit + + def __call__(doc): + if len(doc) < self.short_limit: + print("This is a pretty short document.") + return doc + + my_component = MyComponent(nlp.vocab, short_limit=25) + nlp.add_pipe(my_component, first=True) + ++h(3, "custom-components-attributes") + | Extension attributes on #[code Doc], #[code Span] and #[code Token] + +tag-new(2) + +p + | As of v2.0, spaCy allows you to set any custom attributes and methods + | on the #[code Doc], #[code Span] and #[code Token], which become + | available as #[code Doc._], #[code Span._] and #[code Token._] – for + | example, #[code Token._.my_attr]. This lets you store additional + | information relevant to your application, add new features and + | functionality to spaCy, and implement your own models trained with other + | machine learning libraries. It also lets you take advantage of spaCy's + | data structures and the #[code Doc] object as the "single source of + | truth". + ++aside("Why ._?") + | Writing to a #[code ._] attribute instead of to the #[code Doc] directly + | keeps a clearer separation and makes it easier to ensure backwards + | compatibility. For example, if you've implemented your own #[code .coref] + | property and spaCy claims it one day, it'll break your code. Similarly, + | just by looking at the code, you'll immediately know what's built-in and + | what's custom – for example, #[code doc.sentiment] is spaCy, while + | #[code doc._.sent_score] isn't. + +p + | There are three main types of extensions, which can be defined using the + | #[+api("doc#set_extension") #[code Doc.set_extension]], + | #[+api("span#set_extension") #[code Span.set_extension]] and + | #[+api("token#set_extension") #[code Token.set_extension]] methods. + ++list("numbers") + +item #[strong Attribute extensions]. + | Set a default value for an attribute, which can be overwritten + | manually at any time. Attribute extensions work like "normal" + | variables and are the quickest way to store arbitrary information + | on a #[code Doc], #[code Span] or #[code Token]. + + +code-wrapper + +code. + Doc.set_extension('hello', default=True) + assert doc._.hello + doc._.hello = False + + +item #[strong Property extensions]. + | Define a getter and an optional setter function. If no setter is + | provided, the extension is immutable. Since the getter and setter + | functions are only called when you #[em retrieve] the attribute, + | you can also access values of previously added attribute extensions. + | For example, a #[code Doc] getter can average over #[code Token] + | attributes. For #[code Span] extensions, you'll almost always want + | to use a property – otherwise, you'd have to write to + | #[em every possible] #[code Span] in the #[code Doc] to set up the + | values correctly. + + +code-wrapper + +code. + Doc.set_extension('hello', getter=get_hello_value, setter=set_hello_value) + assert doc._.hello + doc._.hello = 'Hi!' + + +item #[strong Method extensions]. + | Assign a function that becomes available as an object method. Method + | extensions are always immutable. For more details and implementation + | ideas, see + | #[+a("/usage/examples#custom-components-attr-methods") these examples]. + + +code-wrapper + +code.o-no-block. + Doc.set_extension('hello', method=lambda doc, name: 'Hi {}!'.format(name)) + assert doc._.hello('Bob') == 'Hi Bob!' + +p + | Before you can access a custom extension, you need to register it using + | the #[code set_extension] method on the object you want + | to add it to, e.g. the #[code Doc]. Keep in mind that extensions are + | always #[strong added globally] and not just on a particular instance. + | If an attribute of the same name + | already exists, or if you're trying to access an attribute that hasn't + | been registered, spaCy will raise an #[code AttributeError]. + ++code("Example"). + from spacy.tokens import Doc, Span, Token + + fruits = ['apple', 'pear', 'banana', 'orange', 'strawberry'] + is_fruit_getter = lambda token: token.text in fruits + has_fruit_getter = lambda obj: any([t.text in fruits for t in obj]) + + Token.set_extension('is_fruit', getter=is_fruit_getter) + Doc.set_extension('has_fruit', getter=has_fruit_getter) + Span.set_extension('has_fruit', getter=has_fruit_getter) + ++aside-code("Usage example"). + doc = nlp(u"I have an apple and a melon") + assert doc[3]._.is_fruit # get Token attributes + assert not doc[0]._.is_fruit + assert doc._.has_fruit # get Doc attributes + assert doc[1:4]._.has_fruit # get Span attributes + +p + | Once you've registered your custom attribute, you can also use the + | built-in #[code set], #[code get] and #[code has] methods to modify and + | retrieve the attributes. This is especially useful it you want to pass in + | a string instead of calling #[code doc._.my_attr]. + ++table(["Method", "Description", "Valid for", "Example"]) + +row + +cell #[code ._.set()] + +cell Set a value for an attribute. + +cell Attributes, mutable properties. + +cell #[code.u-break token._.set('my_attr', True)] + + +row + +cell #[code ._.get()] + +cell Get the value of an attribute. + +cell Attributes, mutable properties, immutable properties, methods. + +cell #[code.u-break my_attr = span._.get('my_attr')] + + +row + +cell #[code ._.has()] + +cell Check if an attribute exists. + +cell Attributes, mutable properties, immutable properties, methods. + +cell #[code.u-break doc._.has('my_attr')] + ++infobox("How the ._ is implemented") + | Extension definitions – the defaults, methods, getters and setters you + | pass in to #[code set_extension] are stored in class attributes on the + | #[code Underscore] class. If you write to an extension attribute, e.g. + | #[code doc._.hello = True], the data is stored within the + | #[+api("doc#attributes") #[code Doc.user_data]] dictionary. To keep the + | underscore data separate from your other dictionary entries, the string + | #[code "._."] is placed before the name, in a tuple. + ++h(4, "component-example1") Example: Custom sentence segmentation logic + +p + | Let's say you want to implement custom logic to improve spaCy's sentence + | boundary detection. Currently, sentence segmentation is based on the + | dependency parse, which doesn't always produce ideal results. The custom + | logic should therefore be applied #[strong after] tokenization, but + | #[strong before] the dependency parsing – this way, the parser can also + | take advantage of the sentence boundaries. + ++code. + def sbd_component(doc): + for i, token in enumerate(doc[:-2]): + # define sentence start if period + titlecase token + if token.text == '.' and doc[i+1].is_title: + doc[i+1].sent_start = True + return doc + + nlp = spacy.load('en') + nlp.add_pipe(sbd_component, before='parser') # insert before the parser + ++h(4, "component-example2") + | Example: Pipeline component for entity matching and tagging with + | custom attributes + +p + | This example shows how to create a spaCy extension that takes a + | terminology list (in this case, single- and multi-word company names), + | matches the occurences in a document, labels them as #[code ORG] entities, + | merges the tokens and sets custom #[code is_tech_org] and + | #[code has_tech_org] attributes. For efficient matching, the example uses + | the #[+api("phrasematcher") #[code PhraseMatcher]] which accepts + | #[code Doc] objects as match patterns and works well for large + | terminology lists. It also ensures your patterns will always match, even + | when you customise spaCy's tokenization rules. When you call #[code nlp] + | on a text, the custom pipeline component is applied to the #[code Doc] + ++github("spacy", "examples/pipeline/custom_component_entities.py", false, 500) + +p + | Wrapping this functionality in a + | pipeline component allows you to reuse the module with different + | settings, and have all pre-processing taken care of when you call + | #[code nlp] on your text and receive a #[code Doc] object. + ++h(4, "component-example3") + | Example: Pipeline component for GPE entities and country meta data via a + | REST API + +p + | This example shows the implementation of a pipeline component + | that fetches country meta data via the + | #[+a("https://restcountries.eu") REST Countries API] sets entity + | annotations for countries, merges entities into one token and + | sets custom attributes on the #[code Doc], #[code Span] and + | #[code Token] – for example, the capital, latitude/longitude coordinates + | and even the country flag. + ++github("spacy", "examples/pipeline/custom_component_countries_api.py", false, 500) + +p + | In this case, all data can be fetched on initialisation in one request. + | However, if you're working with text that contains incomplete country + | names, spelling mistakes or foreign-language versions, you could also + | implement a #[code like_country]-style getter function that makes a + | request to the search API endpoint and returns the best-matching + | result. + ++h(4, "custom-components-usage-ideas") Other usage ideas + ++list + +item + | #[strong Adding new features and hooking in models]. For example, + | a sentiment analysis model, or your preferred solution for + | lemmatization or sentiment analysis. spaCy's built-in tagger, + | parser and entity recognizer respect annotations that were already + | set on the #[code Doc] in a previous step of the pipeline. + +item + | #[strong Integrating other libraries and APIs]. For example, your + | pipeline component can write additional information and data + | directly to the #[code Doc] or #[code Token] as custom attributes, + | while making sure no information is lost in the process. This can + | be output generated by other libraries and models, or an external + | service with a REST API. + +item + | #[strong Debugging and logging]. For example, a component which + | stores and/or exports relevant information about the current state + | of the processed document, and insert it at any point of your + | pipeline. + ++infobox("Developing third-party extensions") + | The new pipeline management and custom attributes finally make it easy + | to develop your own spaCy extensions and plugins and share them with + | others. Extensions can claim their own #[code ._] namespace and exist as + | standalone packages. If you're developing a tool or library and want to + | make it easy for others to use it with spaCy and add it to their + | pipeline, all you have to do is expose a function that takes a + | #[code Doc], modifies it and returns it. For more details and + | #[strong best practices], see the section on + | #[+a("#extensions") developing spaCy extensions]. + ++h(3, "custom-components-user-hooks") User hooks + +p + | While it's generally recommended to use the #[code Doc._], #[code Span._] + | and #[code Token._] proxies to add your own custom attributes, spaCy + | offers a few exceptions to allow #[strong customising the built-in methods] + | like #[+api("doc#similarity") #[code Doc.similarity]] or + | #[+api("doc#vector") #[code Doc.vector]]. with your own hooks, which can + | rely on statistical models you train yourself. For instance, you can + | provide your own on-the-fly sentence segmentation algorithm or document + | similarity method. + +p + | Hooks let you customize some of the behaviours of the #[code Doc], + | #[code Span] or #[code Token] objects by adding a component to the + | pipeline. For instance, to customize the + | #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a + | component that sets a custom function to + | #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity] + | method will check the #[code user_hooks] dict, and delegate to your + | function if you've set one. Similar results can be achieved by setting + | functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks]. + ++aside("Implementation note") + | The hooks live on the #[code Doc] object because the #[code Span] and + | #[code Token] objects are created lazily, and don't own any data. They + | just proxy to their parent #[code Doc]. This turns out to be convenient + | here — we only have to worry about installing hooks in one place. + ++table(["Name", "Customises"]) + +row + +cell #[code user_hooks] + +cell + +api("doc#vector") #[code Doc.vector] + +api("doc#has_vector") #[code Doc.has_vector] + +api("doc#vector_norm") #[code Doc.vector_norm] + +api("doc#sents") #[code Doc.sents] + + +row + +cell #[code user_token_hooks] + +cell + +api("token#similarity") #[code Token.similarity] + +api("token#vector") #[code Token.vector] + +api("token#has_vector") #[code Token.has_vector] + +api("token#vector_norm") #[code Token.vector_norm] + +api("token#conjuncts") #[code Token.conjuncts] + + +row + +cell #[code user_span_hooks] + +cell + +api("span#similarity") #[code Span.similarity] + +api("span#vector") #[code Span.vector] + +api("span#has_vector") #[code Span.has_vector] + +api("span#vector_norm") #[code Span.vector_norm] + +api("span#root") #[code Span.root] + ++code("Add custom similarity hooks"). + class SimilarityModel(object): + def __init__(self, model): + self._model = model + + def __call__(self, doc): + doc.user_hooks['similarity'] = self.similarity + doc.user_span_hooks['similarity'] = self.similarity + doc.user_token_hooks['similarity'] = self.similarity + + def similarity(self, obj1, obj2): + y = self._model([obj1.vector, obj2.vector]) + return float(y[0]) diff --git a/website/usage/_processing-pipelines/_examples.jade b/website/usage/_processing-pipelines/_examples.jade deleted file mode 100644 index 616bed32c..000000000 --- a/website/usage/_processing-pipelines/_examples.jade +++ /dev/null @@ -1,126 +0,0 @@ -//- 💫 DOCS > USAGE > PROCESSING PIPELINES > EXAMPLES - -p - | To see real-world examples of pipeline factories and components in action, - | you can have a look at the source of spaCy's built-in components, e.g. - | the #[+api("tagger") #[code Tagger]], #[+api("parser") #[code Parser]] or - | #[+api("entityrecognizer") #[code EntityRecongnizer]]. - -+h(3, "example1") Example: Custom sentence segmentation logic - -p - | Let's say you want to implement custom logic to improve spaCy's sentence - | boundary detection. Currently, sentence segmentation is based on the - | dependency parse, which doesn't always produce ideal results. The custom - | logic should therefore be applied #[strong after] tokenization, but - | #[strong before] the dependency parsing – this way, the parser can also - | take advantage of the sentence boundaries. - -+code. - def sbd_component(doc): - for i, token in enumerate(doc[:-2]): - # define sentence start if period + titlecase token - if token.text == '.' and doc[i+1].is_title: - doc[i+1].sent_start = True - return doc - -p - | In this case, we simply want to add the component to the existing - | pipeline of the English model. We can do this by inserting it at index 0 - | of #[code nlp.pipeline]: - -+code. - nlp = spacy.load('en') - nlp.pipeline.insert(0, sbd_component) - -p - | When you call #[code nlp] on some text, spaCy will tokenize it to create - | a #[code Doc] object, and first call #[code sbd_component] on it, followed - | by the model's default pipeline. - -+h(3, "example2") Example: Sentiment model - -p - | Let's say you have trained your own document sentiment model on English - | text. After tokenization, you want spaCy to first execute the - | #[strong default tensorizer], followed by a custom - | #[strong sentiment component] that adds a #[code .sentiment] - | property to the #[code Doc], containing your model's sentiment precition. - -p - | Your component class will have a #[code from_disk()] method that spaCy - | calls to load the model data. When called, the component will compute - | the sentiment score, add it to the #[code Doc] and return the modified - | document. Optionally, the component can include an #[code update()] method - | to allow training the model. - -+code. - import pickle - from pathlib import Path - - class SentimentComponent(object): - def __init__(self, vocab): - self.weights = None - - def __call__(self, doc): - doc.sentiment = sum(self.weights*doc.vector) # set sentiment property - return doc - - def from_disk(self, path): # path = model path + factory ID ('sentiment') - self.weights = pickle.load(Path(path) / 'weights.bin') # load weights - return self - - def update(self, doc, gold): # update weights – allows training! - prediction = sum(self.weights*doc.vector) - self.weights -= 0.001*doc.vector*(prediction-gold.sentiment) - -p - | The factory will initialise the component with the #[code Vocab] object. - | To be able to add it to your model's pipeline as #[code 'sentiment'], - | it also needs to be registered via - | #[+api("spacy#set_factory") #[code set_factory()]]. - -+code. - def sentiment_factory(vocab): - component = SentimentComponent(vocab) # initialise component - return component - - spacy.set_factory('sentiment', sentiment_factory) - -p - | The above code should be #[strong shipped with your model]. You can use - | the #[+api("cli#package") #[code package]] command to create all required - | files and directories. The model package will include an - | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]] - | with a #[code load()] method, that will initialise the language class with - | the model's pipeline and call the #[code from_disk()] method to load - | the model data. - -p - | In the model package's meta.json, specify the language class and pipeline - | IDs: - -+code("meta.json (excerpt)", "json"). - { - "name": "sentiment_model", - "lang": "en", - "version": "1.0.0", - "spacy_version": ">=2.0.0,<3.0.0", - "pipeline": ["tensorizer", "sentiment"] - } - -p - | When you load your new model, spaCy will call the model's #[code load()] - | method. This will return a #[code Language] object with a pipeline - | containing the default tensorizer, and the sentiment component returned - | by your custom #[code "sentiment"] factory. - -+code. - nlp = spacy.load('en_sentiment_model') - doc = nlp(u'I love pizza') - assert doc.sentiment - -+infobox("Saving and loading models") - | For more information and a detailed guide on how to package your model, - | see the documentation on - | #[+a("/usage/training#saving-loading") saving and loading models]. diff --git a/website/usage/_processing-pipelines/_extensions.jade b/website/usage/_processing-pipelines/_extensions.jade new file mode 100644 index 000000000..a1d8168e0 --- /dev/null +++ b/website/usage/_processing-pipelines/_extensions.jade @@ -0,0 +1,110 @@ +//- 💫 DOCS > USAGE > PROCESSING PIPELINES > DEVELOPING EXTENSIONS + +p + | We're very excited about all the new possibilities for community + | extensions and plugins in spaCy v2.0, and we can't wait to see what + | you build with it! To get you started, here are a few tips, tricks and + | best practices: + ++list + +item + | Make sure to choose a #[strong descriptive and specific name] for + | your pipeline component class, and set it as its #[code name] + | attribute. Avoid names that are too common or likely to clash with + | built-in or a user's other custom components. While it's fine to call + | your package "spacy_my_extension", avoid component names including + | "spacy", since this can easily lead to confusion. + + +code-wrapper + +code-new name = 'myapp_lemmatizer' + +code-old name = 'lemmatizer' + + +item + | When writing to #[code Doc], #[code Token] or #[code Span] objects, + | #[strong use getter functions] wherever possible, and avoid setting + | values explicitly. Tokens and spans don't own any data themselves, + | so you should provide a function that allows them to compute the + | values instead of writing static properties to individual objects. + + +code-wrapper + +code-new. + is_fruit = lambda token: token.text in ('apple', 'orange') + Token.set_extension('is_fruit', getter=is_fruit) + +code-old. + token._.set_extension('is_fruit', default=False) + if token.text in ('apple', 'orange'): + token._.set('is_fruit', True) + + +item + | Always add your custom attributes to the #[strong global] #[code Doc] + | #[code Token] or #[code Span] objects, not a particular instance of + | them. Add the attributes #[strong as early as possible], e.g. in + | your extension's #[code __init__] method or in the global scope of + | your module. This means that in the case of namespace collisions, + | the user will see an error immediately, not just when they run their + | pipeline. + + +code-wrapper + +code-new. + from spacy.tokens import Doc + def __init__(attr='my_attr'): + Doc.set_extension(attr, getter=self.get_doc_attr) + +code-old. + def __call__(doc): + doc.set_extension('my_attr', getter=self.get_doc_attr) + + +item + | If your extension is setting properties on the #[code Doc], + | #[code Token] or #[code Span], include an option to + | #[strong let the user to change those attribute names]. This makes + | it easier to avoid namespace collisions and accommodate users with + | different naming preferences. We recommend adding an #[code attrs] + | argument to the #[code __init__] method of your class so you can + | write the names to class attributes and reuse them across your + | component. + + +code-wrapper + +code-new Doc.set_extension(self.doc_attr, default='some value') + +code-old Doc.set_extension('my_doc_attr', default='some value') + + +item + | Ideally, extensions should be #[strong standalone packages] with + | spaCy and optionally, other packages specified as a dependency. They + | can freely assign to their own #[code ._] namespace, but should stick + | to that. If your extension's only job is to provide a better + | #[code .similarity] implementation, and your docs state this + | explicitly, there's no problem with writing to the + | #[+a("#custom-components-user-hooks") #[code user_hooks]], and + | overwriting spaCy's built-in method. However, a third-party + | extension should #[strong never silently overwrite built-ins], or + | attributes set by other extensions. + + +item + | If you're looking to publish a model that depends on a custom + | pipeline component, you can either #[strong require it] in the model + | package's dependencies, or – if the component is specific and + | lightweight – choose to #[strong ship it with your model package] + | and add it to the #[code Language] instance returned by the + | model's #[code load()] method. For examples of this, check out the + | implementations of spaCy's + | #[+api("util#load_model_from_init_py") #[code load_model_from_init_py()]] + | and #[+api("util#load_model_from_path") #[code load_model_from_path()]] + | utility functions. + + +code-wrapper + +code-new. + nlp.add_pipe(my_custom_component) + return nlp.from_disk(model_path) + + +item + | Once you're ready to share your extension with others, make sure to + | #[strong add docs and installation instructions] (you can + | always link to this page for more info). Make it easy for others to + | install and use your extension, for example by uploading it to + | #[+a("https://pypi.python.org") PyPi]. If you're sharing your code on + | GitHub, don't forget to tag it + | with #[+a("https://github.com/search?q=topic%3Aspacy") #[code spacy]] + | and #[+a("https://github.com/search?q=topic%3Aspacy-pipeline") #[code spacy-pipeline]] + | to help people find it. If you post it on Twitter, feel free to tag + | #[+a("https://twitter.com/" + SOCIAL.twitter) @#{SOCIAL.twitter}] + | so we can check it out. diff --git a/website/usage/_processing-pipelines/_pipelines.jade b/website/usage/_processing-pipelines/_pipelines.jade index d09ed4ead..3c1c28af1 100644 --- a/website/usage/_processing-pipelines/_pipelines.jade +++ b/website/usage/_processing-pipelines/_pipelines.jade @@ -11,7 +11,7 @@ p p | When you load a model, spaCy first consults the model's - | #[+a("/usage/saving-loading#models-generating") meta.json]. The + | #[+a("/usage/saving-loading#models-generating") #[code meta.json]]. The | meta typically includes the model details, the ID of a language class, | and an optional list of pipeline components. spaCy then does the | following: @@ -21,24 +21,26 @@ p "name": "example_model", "lang": "en" "description": "Example model for spaCy", - "pipeline": ["tensorizer", "tagger"] + "pipeline": ["tagger", "parser"] } +list("numbers") - +item - | Look up #[strong pipeline IDs] in the available - | #[strong pipeline factories]. - +item - | Initialise the #[strong pipeline components] by calling their - | factories with the #[code Vocab] as an argument. This gives each - | factory and component access to the pipeline's shared data, like - | strings, morphology and annotation scheme. +item | Load the #[strong language class and data] for the given ID via - | #[+api("util.get_lang_class") #[code get_lang_class]]. + | #[+api("util.get_lang_class") #[code get_lang_class]] and initialise + | it. The #[code Language] class contains the shared vocabulary, + | tokenization rules and the language-specific annotation scheme. +item - | Pass the path to the #[strong model data] to the #[code Language] - | class and return it. + | Iterate over the #[strong pipeline names] and create each component + | using #[+api("language#create_pipe") #[code create_pipe]], which + | looks them up in #[code Language.factories]. + +item + | Add each pipeline component to the pipeline in order, using + | #[+api("language#add_pipe") #[code add_pipe]]. + +item + | Make the #[strong model data] available to the #[code Language] class + | by calling #[+api("language#from_disk") #[code from_disk]] with the + | path to the model data ditectory. p | So when you call this... @@ -47,12 +49,12 @@ p nlp = spacy.load('en') p - | ... the model tells spaCy to use the pipeline + | ... the model tells spaCy to use the language #[code "en"] and the pipeline | #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will - | then look up each string in its internal factories registry and - | initialise the individual components. It'll then load - | #[code spacy.lang.en.English], pass it the path to the model's data - | directory, and return it for you to use as the #[code nlp] object. + | then initialise #[code spacy.lang.en.English], and create each pipeline + | component and add it to the processing pipeline. It'll then load in the + | model's data from its data ditectory and return the modified + | #[code Language] class for you to use as the #[code nlp] object. p | Fundamentally, a #[+a("/models") spaCy model] consists of three @@ -73,9 +75,12 @@ p pipeline = ['tensorizer', 'tagger', 'parser', 'ner'] data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0' - cls = spacy.util.get_lang_class(lang) # 1. get Language instance, e.g. English() - nlp = cls(pipeline=pipeline) # 2. initialise it with the pipeline - nlp.from_disk(model_data_path) # 3. load in the binary data + cls = spacy.util.get_lang_class(lang) # 1. get Language instance, e.g. English() + nlp = cls() # 2. initialise it + for name in pipeline: + component = nlp.create_pipe(name) # 3. create the pipeline components + nlp.add_pipe(component) # 4. add the component to the pipeline + nlp.from_disk(model_data_path) # 5. load in the binary data p | When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and @@ -87,124 +92,23 @@ p | document, which is then processed by the component next in the pipeline. +code("The pipeline under the hood"). - doc = nlp.make_doc(u'This is a sentence') - for proc in nlp.pipeline: - doc = proc(doc) - -+h(3, "creating") Creating pipeline components and factories + doc = nlp.make_doc(u'This is a sentence') # create a Doc from raw text + for name, proc in nlp.pipeline: # iterate over components in order + doc = proc(doc) # apply each component p - | spaCy lets you customise the pipeline with your own components. Components - | are functions that receive a #[code Doc] object, modify and return it. - | If your component is stateful, you'll want to create a new one for each - | pipeline. You can do that by defining and registering a factory which - | receives the shared #[code Vocab] object and returns a component. - -+h(4, "creating-component") Creating a component - -p - | A component receives a #[code Doc] object and - | #[strong performs the actual processing] – for example, using the current - | weights to make a prediction and set some annotation on the document. By - | adding a component to the pipeline, you'll get access to the #[code Doc] - | at any point #[strong during] processing – instead of only being able to - | modify it afterwards. - -+aside-code("Example"). - def my_component(doc): - # do something to the doc here - return doc - -+table(["Argument", "Type", "Description"]) - +row - +cell #[code doc] - +cell #[code Doc] - +cell The #[code Doc] object processed by the previous component. - - +row("foot") - +cell returns - +cell #[code Doc] - +cell The #[code Doc] object processed by this pipeline component. - -p - | When creating a new #[code Language] class, you can pass it a list of - | pipeline component functions to execute in that order. You can also - | add it to an existing pipeline by modifying #[code nlp.pipeline] – just - | be careful not to overwrite a pipeline or its components by accident! + | The current processing pipeline is available as #[code nlp.pipeline], + | which returns a list of #[code (name, component)] tuples, or + | #[code nlp.pipe_names], which only returns a list of human-readable + | component names. +code. - # Create a new Language object with a pipeline - from spacy.language import Language - nlp = Language(pipeline=[my_component]) + nlp.pipeline + # [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)] + nlp.pipe_names + # ['tagger', 'parser', 'ner'] - # Modify an existing pipeline - nlp = spacy.load('en') - nlp.pipeline.append(my_component) - -+h(4, "creating-factory") Creating a factory - -p - | A factory is a #[strong function that returns a pipeline component]. - | It's called with the #[code Vocab] object, to give it access to the - | shared data between components – for example, the strings, morphology, - | vectors or annotation scheme. Factories are useful for creating - | #[strong stateful components], especially ones which - | #[strong depend on shared data]. - -+aside-code("Example"). - def my_factory(vocab): - # load some state - def my_component(doc): - # process the doc - return doc - return my_component - -+table(["Argument", "Type", "Description"]) - +row - +cell #[code vocab] - +cell #[code Vocab] - +cell - | Shared data between components, including strings, morphology, - | vectors etc. - - +row("foot") - +cell returns - +cell callable - +cell The pipeline component. - -p - | By creating a factory, you're essentially telling spaCy how to get the - | pipeline component #[strong once the vocab is available]. Factories need to - | be registered via #[+api("spacy#set_factory") #[code set_factory()]] and - | by assigning them a unique ID. This ID can be added to the pipeline as a - | string. When creating a pipeline, you're free to mix strings and - | callable components: - -+code. - spacy.set_factory('my_factory', my_factory) - nlp = Language(pipeline=['my_factory', my_other_component]) - -p - | If spaCy comes across a string in the pipeline, it will try to resolve it - | by looking it up in the available factories. The factory will then be - | initialised with the #[code Vocab]. Providing factory names instead of - | callables also makes it easy to specify them in the model's - | #[+a("/usage/saving-loading#models-generating") meta.json]. If you're - | training your own model and want to use one of spaCy's default components, - | you won't have to worry about finding and implementing it either – to use - | the default tagger, simply add #[code "tagger"] to the pipeline, and - | #[strong spaCy will know what to do]. - -+infobox("Important note") - | Because factories are #[strong resolved on initialisation] of the - | #[code Language] class, it's #[strong not possible] to add them to the - | pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only - | works with individual component functions. To use factories, you need to - | create a new #[code Language] object, or generate a - | #[+a("/usage/training#models-generating") model package] with - | a custom pipeline. - -+h(3, "disabling") Disabling pipeline components ++h(3, "disabling") Disabling and modifying pipeline components p | If you don't need a particular component of the pipeline – for @@ -217,16 +121,19 @@ p +code. nlp = spacy.load('en', disable['parser', 'tagger']) nlp = English().from_disk('/model', disable=['tensorizer', 'ner']) - doc = nlp(u"I don't want parsed", disable=['parser']) p - | Note that you can't write directly to #[code nlp.pipeline], as this list - | holds the #[em actual components], not the IDs. However, if you know the - | order of the components, you can still slice the list: + | You can also use the #[+api("language#remove_pipe") #[code remove_pipe]] + | method to remove pipeline components from an existing pipeline, the + | #[+api("language#rename_pipe") #[code rename_pipe]] method to rename them, + | or the #[+api("language#replace_pipe") #[code replace_pipe]] method + | to replace them with a custom component entirely (more details on this + | in the section on #[+a("#custom-components") custom components]. +code. - nlp = spacy.load('en') - nlp.pipeline = nlp.pipeline[:2] # only use the first two components + nlp.remove_pipe('parser') + nlp.rename_pipe('ner', 'entityrecognizer') + nlp.replace_pipe('tagger', my_custom_tagger) +infobox("Important note: disabling pipeline components") .o-block @@ -234,12 +141,14 @@ p | processing pipeline components, the #[code parser], #[code tagger] | and #[code entity] keyword arguments have been replaced with | #[code disable], which takes a list of pipeline component names. - | This lets you disable both default and custom components when loading + | This lets you disable pre-defined components when loading | a model, or initialising a Language class via | #[+api("language-from_disk") #[code from_disk]]. + +code-new. - nlp = spacy.load('en', disable=['tagger', 'ner']) - doc = nlp(u"I don't want parsed", disable=['parser']) + nlp = spacy.load('en', disable=['ner']) + nlp.remove_pipe('parser') + doc = nlp(u"I don't want parsed") +code-old. nlp = spacy.load('en', tagger=False, entity=False) doc = nlp(u"I don't want parsed", parse=False) diff --git a/website/usage/_processing-pipelines/_user-hooks.jade b/website/usage/_processing-pipelines/_user-hooks.jade deleted file mode 100644 index e7dce53fe..000000000 --- a/website/usage/_processing-pipelines/_user-hooks.jade +++ /dev/null @@ -1,61 +0,0 @@ -//- 💫 DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS - -p - | Hooks let you customize some of the behaviours of the #[code Doc], - | #[code Span] or #[code Token] objects by adding a component to the - | pipeline. For instance, to customize the - | #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a - | component that sets a custom function to - | #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity] - | method will check the #[code user_hooks] dict, and delegate to your - | function if you've set one. Similar results can be achieved by setting - | functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks]. - -+code("Polymorphic similarity example"). - span.similarity(doc) - token.similarity(span) - doc1.similarity(doc2) - -p - | By default, this just averages the vectors for each document, and - | computes their cosine. Obviously, spaCy should make it easy for you to - | install your own similarity model. This introduces a tricky design - | challenge. The current solution is to add three more dicts to the - | #[code Doc] object: - -+aside("Implementation note") - | The hooks live on the #[code Doc] object because the #[code Span] and - | #[code Token] objects are created lazily, and don't own any data. They - | just proxy to their parent #[code Doc]. This turns out to be convenient - | here — we only have to worry about installing hooks in one place. - -+table(["Name", "Description"]) - +row - +cell #[code user_hooks] - +cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents] - - +row - +cell #[code user_token_hooks] - +cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts] - - +row - +cell #[code user_span_hooks] - +cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root] - -p - | To sum up, here's an example of hooking in custom #[code .similarity()] - | methods: - -+code("Add custom similarity hooks"). - class SimilarityModel(object): - def __init__(self, model): - self._model = model - - def __call__(self, doc): - doc.user_hooks['similarity'] = self.similarity - doc.user_span_hooks['similarity'] = self.similarity - doc.user_token_hooks['similarity'] = self.similarity - - def similarity(self, obj1, obj2): - y = self._model([obj1.vector, obj2.vector]) - return float(y[0]) diff --git a/website/usage/_spacy-101/_lightning-tour.jade b/website/usage/_spacy-101/_lightning-tour.jade index 061ec7758..ecf57fbc2 100644 --- a/website/usage/_spacy-101/_lightning-tour.jade +++ b/website/usage/_spacy-101/_lightning-tour.jade @@ -175,7 +175,7 @@ p +code. import spacy - from spacy.tokens.doc import Doc + from spacy.tokens import Doc from spacy.vocab import Vocab nlp = spacy.load('en') diff --git a/website/usage/_visualizers/_html.jade b/website/usage/_visualizers/_html.jade index 595192442..648a6de80 100644 --- a/website/usage/_visualizers/_html.jade +++ b/website/usage/_visualizers/_html.jade @@ -61,7 +61,7 @@ p output_path.open('w', encoding='utf-8').write(svg) p - | The above code will generate the dependency visualizations and them to + | The above code will generate the dependency visualizations as to | two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg]. diff --git a/website/usage/examples.jade b/website/usage/examples.jade index 75d05e339..5dfeaf2a7 100644 --- a/website/usage/examples.jade +++ b/website/usage/examples.jade @@ -2,6 +2,44 @@ include ../_includes/_mixins ++section("pipeline") + +h(3, "custom-components-entities") Custom pipeline components and attribute extensions + +tag-new(2) + + p + | This example shows the implementation of a pipeline component + | that sets entity annotations based on a list of single or + | multiple-word company names, merges entities into one token and + | sets custom attributes on the #[code Doc], #[code Span] and + | #[code Token]. + + +github("spacy", "examples/pipeline/custom_component_entities.py") + + +h(3, "custom-components-api") + | Custom pipeline components and attribute extensions via a REST API + +tag-new(2) + + p + | This example shows the implementation of a pipeline component + | that fetches country meta data via the + | #[+a("https://restcountries.eu") REST Countries API] sets entity + | annotations for countries, merges entities into one token and + | sets custom attributes on the #[code Doc], #[code Span] and + | #[code Token] – for example, the capital, latitude/longitude + | coordinates and the country flag. + + +github("spacy", "examples/pipeline/custom_component_countries_api.py") + + +h(3, "custom-components-attr-methods") Custom method extensions + +tag-new(2) + + p + | A collection of snippets showing examples of extensions adding + | custom methods to the #[code Doc], #[code Token] and + | #[code Span]. + + +github("spacy", "examples/pipeline/custom_attr_methods.py") + +section("matching") +h(3, "matcher") Using spaCy's rule-based matcher diff --git a/website/usage/processing-pipelines.jade b/website/usage/processing-pipelines.jade index 0bb96780e..045a32ddb 100644 --- a/website/usage/processing-pipelines.jade +++ b/website/usage/processing-pipelines.jade @@ -8,18 +8,18 @@ include _spacy-101/_pipelines +h(2, "pipelines") How pipelines work include _processing-pipelines/_pipelines -+section("examples") - +h(2, "examples") Examples - include _processing-pipelines/_examples ++section("custom-components") + +h(2, "custom-components") Creating custom pipeline components + include _processing-pipelines/_custom-components + ++section("extensions") + +h(2, "extensions") Developing spaCy extensions + include _processing-pipelines/_extensions +section("multithreading") +h(2, "multithreading") Multi-threading include _processing-pipelines/_multithreading -+section("user-hooks") - +h(2, "user-hooks") User hooks - include _processing-pipelines/_user-hooks - +section("serialization") +h(2, "serialization") Serialization include _processing-pipelines/_serialization diff --git a/website/usage/v2.jade b/website/usage/v2.jade index 8737c0b76..66304c860 100644 --- a/website/usage/v2.jade +++ b/website/usage/v2.jade @@ -102,30 +102,36 @@ p +h(3, "features-pipelines") Improved processing pipelines +aside-code("Example"). - # Modify an existing pipeline - nlp = spacy.load('en') - nlp.pipeline.append(my_component) + # Set custom attributes + Doc.set_extension('my_attr', default=False) + Token.set_extension('my_attr', getter=my_token_getter) + assert doc._.my_attr, token._.my_attr - # Register a factory to create a component - spacy.set_factory('my_factory', my_factory) - nlp = Language(pipeline=['my_factory', mycomponent]) + # Add components to the pipeline + my_component = lambda doc: doc + nlp.add_pipe(my_component) p | It's now much easier to #[strong customise the pipeline] with your own - | components, functions that receive a #[code Doc] object, modify and - | return it. If your component is stateful, you can define and register a - | factory which receives the shared #[code Vocab] object and returns a - |  component. spaCy's default components can be added to your pipeline by - | using their string IDs. This way, you won't have to worry about finding - | and implementing them – simply add #[code "tagger"] to the pipeline, - | and spaCy will know what to do. + | components: functions that receive a #[code Doc] object, modify and + | return it. Extensions let you write any + | #[strong attributes, properties and methods] to the #[code Doc], + | #[code Token] and #[code Span]. You can add data, implement new + | features, integrate other libraries with spaCy or plug in your own + | machine learning models. +image include ../assets/img/pipeline.svg +infobox - | #[+label-inline API:] #[+api("language") #[code Language]] - | #[+label-inline Usage:] #[+a("/usage/language-processing-pipeline") Processing text] + | #[+label-inline API:] #[+api("language") #[code Language]], + | #[+api("doc#set_extension") #[code Doc.set_extension]], + | #[+api("span#set_extension") #[code Span.set_extension]], + | #[+api("token#set_extension") #[code Token.set_extension]] + | #[+label-inline Usage:] + | #[+a("/usage/processing-pipelines") Processing pipelines] + | #[+label-inline Code:] + | #[+src("/usage/examples#section-pipeline") Pipeline examples] +h(3, "features-text-classification") Text classification @@ -478,15 +484,16 @@ p p | If you've been using custom pipeline components, check out the new | guide on #[+a("/usage/language-processing-pipelines") processing pipelines]. - | Appending functions to the pipeline still works – but you might be able - | to make this more convenient by registering "component factories". - | Components of the processing pipeline can now be disabled by passing a - | list of their names to the #[code disable] keyword argument on loading - | or processing. + | Appending functions to the pipeline still works – but the + | #[+api("language#add_pipe") #[code add_pipe]] methods now makes this + | much more convenient. Components of the processing pipeline can now + | be disabled by passing a list of their names to the #[code disable] + | keyword argument on load, or by simply demoving them from the + | pipeline alltogether. +code-new. nlp = spacy.load('en', disable=['tagger', 'ner']) - doc = nlp(u"I don't want parsed", disable=['parser']) + nlp.remove_pipe('parser') +code-old. nlp = spacy.load('en', tagger=False, entity=False) doc = nlp(u"I don't want parsed", parse=False)