Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 16:07:41 +03:00)

Merge branch 'develop' into feature/fix-matcher-operators
This commit is contained in commit a928ae2f35.

examples/pipeline/custom_attr_methods.py (new file, 52 lines)
							|  | @ -0,0 +1,52 @@ | |||
| # coding: utf-8 | ||||
| """This example contains several snippets of methods that can be set via custom | ||||
| Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like | ||||
| they're "bound" to the object and are partially applied – i.e. the object | ||||
| they're called on is passed in as the first argument.""" | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from spacy.lang.en import English | ||||
| from spacy.tokens import Doc, Span | ||||
| from spacy import displacy | ||||
| from pathlib import Path | ||||
| 
 | ||||
| 
 | ||||
| def to_html(doc, output='/tmp', style='dep'): | ||||
|     """Doc method extension for saving the current state as a displaCy | ||||
|     visualization. | ||||
|     """ | ||||
|     # generate filename from first six non-punct tokens | ||||
|     file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html' | ||||
|     output_path = Path(output) / file_name | ||||
|     html = displacy.render(doc, style=style, page=True)  # render markup | ||||
|     output_path.open('w', encoding='utf-8').write(html)  # save to file | ||||
|     print('Saved HTML to {}'.format(output_path)) | ||||
| 
 | ||||
| 
 | ||||
| Doc.set_extension('to_html', method=to_html) | ||||
| 
 | ||||
| nlp = English() | ||||
| doc = nlp(u"This is a sentence about Apple.") | ||||
| # add entity manually for demo purposes, to make it work without a model | ||||
| doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])] | ||||
| doc._.to_html(style='ent') | ||||
| 
 | ||||
| 
 | ||||
| def overlap_tokens(doc, other_doc): | ||||
|     """Get the tokens from the original Doc that are also in the comparison Doc. | ||||
|     """ | ||||
|     overlap = [] | ||||
|     other_tokens = [token.text for token in other_doc] | ||||
|     for token in doc: | ||||
|         if token.text in other_tokens: | ||||
|             overlap.append(token) | ||||
|     return overlap | ||||
| 
 | ||||
| 
 | ||||
| Doc.set_extension('overlap', method=overlap_tokens) | ||||
| 
 | ||||
| nlp = English() | ||||
| doc1 = nlp(u"Peach emoji is where it has always been.") | ||||
| doc2 = nlp(u"Peach is the superior emoji.") | ||||
| tokens = doc1._.overlap(doc2) | ||||
| print(tokens) | ||||
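The script above registers methods; `Doc.set_extension` also accepts a `getter` for computed, read-only attributes, as the other example files in this commit do. A minimal sketch for contrast (the `n_non_punct` attribute name is made up):

    from spacy.lang.en import English
    from spacy.tokens import Doc

    # Getter-based extension: evaluated on each access and takes no arguments
    # beyond the object itself, unlike the partially applied methods above.
    Doc.set_extension('n_non_punct',
                      getter=lambda doc: sum(not t.is_punct for t in doc))

    nlp = English()
    doc = nlp(u"Hello, world!")
    print(doc._.n_non_punct)  # 2 ("Hello" and "world")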
							
								
								
									
examples/pipeline/custom_component_countries_api.py (new file, 108 lines)
							|  | @ -0,0 +1,108 @@ | |||
| # coding: utf-8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import requests | ||||
| 
 | ||||
| from spacy.lang.en import English | ||||
| from spacy.matcher import PhraseMatcher | ||||
| from spacy.tokens import Doc, Span, Token | ||||
| 
 | ||||
| 
 | ||||
| class RESTCountriesComponent(object): | ||||
|     """Example of a spaCy v2.0 pipeline component that requests all countries | ||||
|     via the REST Countries API, merges country names into one token, assigns | ||||
|     entity labels and sets attributes on country tokens, e.g. the capital and | ||||
|     lat/lng coordinates. Can be extended with more details from the API. | ||||
| 
 | ||||
|     REST Countries API: https://restcountries.eu | ||||
|     API License: Mozilla Public License MPL 2.0 | ||||
|     """ | ||||
|     name = 'rest_countries' # component name, will show up in the pipeline | ||||
| 
 | ||||
|     def __init__(self, nlp, label='GPE'): | ||||
|         """Initialise the pipeline component. The shared nlp instance is used | ||||
|         to initialise the matcher with the shared vocab, get the label ID and | ||||
|         generate Doc objects as phrase match patterns. | ||||
|         """ | ||||
|         # Make request once on initialisation and store the data | ||||
|         r = requests.get('https://restcountries.eu/rest/v2/all') | ||||
|         r.raise_for_status()  # make sure requests raises an error if it fails | ||||
|         countries = r.json() | ||||
| 
 | ||||
|         # Convert API response to dict keyed by country name for easy lookup | ||||
|         # This could also be extended using the alternative and foreign language | ||||
|         # names provided by the API | ||||
|         self.countries = {c['name']: c for c in countries} | ||||
|         self.label = nlp.vocab.strings[label]  # get entity label ID | ||||
| 
 | ||||
|         # Set up the PhraseMatcher with Doc patterns for each country name | ||||
|         patterns = [nlp(c) for c in self.countries.keys()] | ||||
|         self.matcher = PhraseMatcher(nlp.vocab) | ||||
|         self.matcher.add('COUNTRIES', None, *patterns) | ||||
| 
 | ||||
|         # Register attribute on the Token. We'll be overwriting this based on | ||||
|         # the matches, so we're only setting a default value, not a getter. | ||||
|         # If no default value is set, it defaults to None. | ||||
|         Token.set_extension('is_country', default=False) | ||||
|         Token.set_extension('country_capital') | ||||
|         Token.set_extension('country_latlng') | ||||
|         Token.set_extension('country_flag') | ||||
| 
 | ||||
|         # Register attributes on Doc and Span via a getter that checks if one of | ||||
|         # the contained tokens is set to is_country == True. | ||||
|         Doc.set_extension('has_country', getter=self.has_country) | ||||
|         Span.set_extension('has_country', getter=self.has_country) | ||||
| 
 | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         """Apply the pipeline component on a Doc object and modify it if matches | ||||
|         are found. Return the Doc, so it can be processed by the next component | ||||
|         in the pipeline, if available. | ||||
|         """ | ||||
|         matches = self.matcher(doc) | ||||
|         spans = []  # keep the spans for later so we can merge them afterwards | ||||
|         for _, start, end in matches: | ||||
|             # Generate Span representing the entity & set label | ||||
|             entity = Span(doc, start, end, label=self.label) | ||||
|             spans.append(entity) | ||||
|             # Set custom attribute on each token of the entity | ||||
|             # Can be extended with other data returned by the API, like | ||||
|             # currencies, country code, flag, calling code etc. | ||||
|             for token in entity: | ||||
|                 token._.set('is_country', True) | ||||
|                 token._.set('country_capital', self.countries[entity.text]['capital']) | ||||
|                 token._.set('country_latlng', self.countries[entity.text]['latlng']) | ||||
|                 token._.set('country_flag', self.countries[entity.text]['flag']) | ||||
|             # Overwrite doc.ents and add entity – be careful not to replace! | ||||
|             doc.ents = list(doc.ents) + [entity] | ||||
|         for span in spans: | ||||
|             # Iterate over all spans and merge them into one token. This is done | ||||
|             # after setting the entities – otherwise, it would cause mismatched | ||||
|             # indices! | ||||
|             span.merge() | ||||
|         return doc  # don't forget to return the Doc! | ||||
| 
 | ||||
|     def has_country(self, tokens): | ||||
|         """Getter for Doc and Span attributes. Returns True if one of the tokens | ||||
|         is a country. Since the getter is only called when we access the | ||||
|         attribute, we can refer to the Token's 'is_country' attribute here, | ||||
|         which is already set in the processing step.""" | ||||
|         return any([t._.get('is_country') for t in tokens]) | ||||
| 
 | ||||
| 
 | ||||
| # For simplicity, we start off with only the blank English Language class and | ||||
| # no model or pre-defined pipeline loaded. | ||||
| 
 | ||||
| nlp = English() | ||||
| rest_countries = RESTCountriesComponent(nlp)  # initialise component | ||||
| nlp.add_pipe(rest_countries) # add it to the pipeline | ||||
| 
 | ||||
| doc = nlp(u"Some text about Colombia and the Czech Republic") | ||||
| 
 | ||||
| print('Pipeline', nlp.pipe_names)  # pipeline contains component name | ||||
| print('Doc has countries', doc._.has_country)  # Doc contains countries | ||||
| for token in doc: | ||||
|     if token._.is_country: | ||||
|         print(token.text, token._.country_capital, token._.country_latlng, | ||||
|               token._.country_flag)  # country data | ||||
| print('Entities', [(e.text, e.label_) for e in doc.ents])  # all countries are entities | ||||
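The comment above, "be careful not to replace!", is what lets this component coexist with entities set earlier in the pipeline. A minimal sketch of the difference, using a made-up sentence and the same blank English setup:

    from spacy.lang.en import English
    from spacy.tokens import Span

    nlp = English()
    doc = nlp(u"Facebook was founded before Apple.")
    org = nlp.vocab.strings['ORG']
    doc.ents = [Span(doc, 0, 1, label=org)]                   # pre-existing entity
    doc.ents = list(doc.ents) + [Span(doc, 4, 5, label=org)]  # append: keeps both
    print([(e.text, e.label_) for e in doc.ents])
    # Assigning doc.ents = [Span(doc, 4, 5, label=org)] directly would have
    # discarded the first entity.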
							
								
								
									
examples/pipeline/custom_component_entities.py (new file, 85 lines)
							|  | @ -0,0 +1,85 @@ | |||
| # coding: utf-8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from spacy.lang.en import English | ||||
| from spacy.matcher import PhraseMatcher | ||||
| from spacy.tokens import Doc, Span, Token | ||||
| 
 | ||||
| 
 | ||||
| class TechCompanyRecognizer(object): | ||||
|     """Example of a spaCy v2.0 pipeline component that sets entity annotations | ||||
|     based on list of single or multiple-word company names. Companies are | ||||
|     labelled as ORG and their spans are merged into one token. Additionally, | ||||
|     ._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token | ||||
|     respectively.""" | ||||
|     name = 'tech_companies'  # component name, will show up in the pipeline | ||||
| 
 | ||||
|     def __init__(self, nlp, companies=tuple(), label='ORG'): | ||||
|         """Initialise the pipeline component. The shared nlp instance is used | ||||
|         to initialise the matcher with the shared vocab, get the label ID and | ||||
|         generate Doc objects as phrase match patterns. | ||||
|         """ | ||||
|         self.label = nlp.vocab.strings[label]  # get entity label ID | ||||
| 
 | ||||
|         # Set up the PhraseMatcher – it can now take Doc objects as patterns, | ||||
|         # so even if the list of companies is long, it's very efficient | ||||
|         patterns = [nlp(org) for org in companies] | ||||
|         self.matcher = PhraseMatcher(nlp.vocab) | ||||
|         self.matcher.add('TECH_ORGS', None, *patterns) | ||||
| 
 | ||||
|         # Register attribute on the Token. We'll be overwriting this based on | ||||
|         # the matches, so we're only setting a default value, not a getter. | ||||
|         Token.set_extension('is_tech_org', default=False) | ||||
| 
 | ||||
|         # Register attributes on Doc and Span via a getter that checks if one of | ||||
|         # the contained tokens is set to is_tech_org == True. | ||||
|         Doc.set_extension('has_tech_org', getter=self.has_tech_org) | ||||
|         Span.set_extension('has_tech_org', getter=self.has_tech_org) | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         """Apply the pipeline component on a Doc object and modify it if matches | ||||
|         are found. Return the Doc, so it can be processed by the next component | ||||
|         in the pipeline, if available. | ||||
|         """ | ||||
|         matches = self.matcher(doc) | ||||
|         spans = []  # keep the spans for later so we can merge them afterwards | ||||
|         for _, start, end in matches: | ||||
|             # Generate Span representing the entity & set label | ||||
|             entity = Span(doc, start, end, label=self.label) | ||||
|             spans.append(entity) | ||||
|             # Set custom attribute on each token of the entity | ||||
|             for token in entity: | ||||
|                 token._.set('is_tech_org', True) | ||||
|             # Overwrite doc.ents and add entity – be careful not to replace! | ||||
|             doc.ents = list(doc.ents) + [entity] | ||||
|         for span in spans: | ||||
|             # Iterate over all spans and merge them into one token. This is done | ||||
|             # after setting the entities – otherwise, it would cause mismatched | ||||
|             # indices! | ||||
|             span.merge() | ||||
|         return doc  # don't forget to return the Doc! | ||||
| 
 | ||||
|     def has_tech_org(self, tokens): | ||||
|         """Getter for Doc and Span attributes. Returns True if one of the tokens | ||||
|         is a tech org. Since the getter is only called when we access the | ||||
|         attribute, we can refer to the Token's 'is_tech_org' attribute here, | ||||
|         which is already set in the processing step.""" | ||||
|         return any([t._.get('is_tech_org') for t in tokens]) | ||||
| 
 | ||||
| 
 | ||||
| # For simplicity, we start off with only the blank English Language class and | ||||
| # no model or pre-defined pipeline loaded. | ||||
| 
 | ||||
| nlp = English() | ||||
| companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc. | ||||
| component = TechCompanyRecognizer(nlp, companies)  # initialise component | ||||
| nlp.add_pipe(component, last=True)  # add it to the pipeline as the last element | ||||
| 
 | ||||
| doc = nlp(u"Alphabet Inc. is the company behind Google.") | ||||
| 
 | ||||
| print('Pipeline', nlp.pipe_names)  # pipeline contains component name | ||||
| print('Tokens', [t.text for t in doc])  # company names from the list are merged | ||||
| print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs | ||||
| print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org | ||||
| print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not | ||||
| print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities | ||||
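Because `has_tech_org` is registered on Span as well as Doc, slices of the document expose it too; a small illustrative follow-up, assuming the script above has just run:

    span = doc[2:]  # "the company behind Google." after the merge above
    print('Span has_tech_org', span._.has_tech_org)  # True, via the same getter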
|  | @ -6,7 +6,7 @@ To achieve that, it duplicates some of spaCy's internal functionality. | |||
| 
 | ||||
| Specifically, in this example, we don't use spaCy's built-in Language class to | ||||
| wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write | ||||
| our own simle Pipeline class, so that it's easier to see how the pieces | ||||
| our own simple Pipeline class, so that it's easier to see how the pieces | ||||
| interact. | ||||
| 
 | ||||
| Input data: | ||||
|  | @ -142,16 +142,15 @@ def train(nlp, train_examples, dev_examples, nr_epoch=5): | |||
|             inputs, annots = zip(*batch) | ||||
|             nlp.update(list(inputs), list(annots), sgd, losses=losses) | ||||
|         scores = nlp.evaluate(dev_examples) | ||||
|         report_scores(i, losses['ner'], scores) | ||||
|     scores = nlp.evaluate(dev_examples) | ||||
|     report_scores(channels, i+1, loss, scores) | ||||
|         report_scores(i+1, losses['ner'], scores) | ||||
| 
 | ||||
| 
 | ||||
| def report_scores(i, loss, scores): | ||||
|     precision = '%.2f' % scores['ents_p'] | ||||
|     recall = '%.2f' % scores['ents_r'] | ||||
|     f_measure = '%.2f' % scores['ents_f'] | ||||
|     print('%d %s %s %s' % (int(loss), precision, recall, f_measure)) | ||||
|     print('Epoch %d: %d %s %s %s' % ( | ||||
|         i, int(loss), precision, recall, f_measure)) | ||||
| 
 | ||||
| 
 | ||||
| def read_examples(path): | ||||
|  |  | |||
|  | @ -7,7 +7,7 @@ if __name__ == '__main__': | |||
|     import plac | ||||
|     import sys | ||||
|     from spacy.cli import download, link, info, package, train, convert, model | ||||
|     from spacy.cli import profile, evaluate | ||||
|     from spacy.cli import profile, evaluate, validate | ||||
|     from spacy.util import prints | ||||
| 
 | ||||
|     commands = { | ||||
|  | @ -20,6 +20,7 @@ if __name__ == '__main__': | |||
|         'package': package, | ||||
|         'model': model, | ||||
|         'profile': profile, | ||||
|         'validate': validate | ||||
|     } | ||||
|     if len(sys.argv) == 1: | ||||
|         prints(', '.join(commands), title="Available commands", exits=1) | ||||
|  |  | |||
|  | @ -311,7 +311,7 @@ def link_vectors_to_models(vocab): | |||
| 
 | ||||
| def Tok2Vec(width, embed_size, **kwargs): | ||||
|     pretrained_dims = kwargs.get('pretrained_dims', 0) | ||||
|     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3) | ||||
|     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2) | ||||
|     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] | ||||
|     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add, | ||||
|                                  '*': reapply}): | ||||
|  |  | |||
|  | @ -7,3 +7,4 @@ from .train import train | |||
| from .evaluate import evaluate | ||||
| from .convert import convert | ||||
| from .model import model | ||||
| from .validate import validate | ||||
|  |  | |||
|  | @ -4,7 +4,7 @@ from __future__ import unicode_literals | |||
| import plac | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from .converters import conllu2json, iob2json | ||||
| from .converters import conllu2json, iob2json, conll_ner2json | ||||
| from ..util import prints | ||||
| 
 | ||||
| # Converters are matched by file extension. To add a converter, add a new entry | ||||
|  | @ -12,9 +12,10 @@ from ..util import prints | |||
| # from /converters. | ||||
| 
 | ||||
| CONVERTERS = { | ||||
|     '.conllu': conllu2json, | ||||
|     '.conll': conllu2json, | ||||
|     '.iob': iob2json, | ||||
|     'conllu': conllu2json, | ||||
|     'conll': conllu2json, | ||||
|     'ner': conll_ner2json, | ||||
|     'iob': iob2json, | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
|  | @ -22,9 +23,11 @@ CONVERTERS = { | |||
|     input_file=("input file", "positional", None, str), | ||||
|     output_dir=("output directory for converted file", "positional", None, str), | ||||
|     n_sents=("Number of sentences per doc", "option", "n", int), | ||||
|     converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str), | ||||
|     morphology=("Enable appending morphology to tags", "flag", "m", bool) | ||||
| ) | ||||
| def convert(cmd, input_file, output_dir, n_sents=1, morphology=False): | ||||
| def convert(cmd, input_file, output_dir, n_sents=1, morphology=False, | ||||
|             converter='auto'): | ||||
|     """ | ||||
|     Convert files into JSON format for use with train command and other | ||||
|     experiment management functions. | ||||
|  | @ -35,9 +38,11 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False): | |||
|         prints(input_path, title="Input file not found", exits=1) | ||||
|     if not output_path.exists(): | ||||
|         prints(output_path, title="Output directory not found", exits=1) | ||||
|     file_ext = input_path.suffix | ||||
|     if not file_ext in CONVERTERS: | ||||
|         prints("Can't find converter for %s" % input_path.parts[-1], | ||||
|                title="Unknown format", exits=1) | ||||
|     CONVERTERS[file_ext](input_path, output_path, | ||||
|             n_sents=n_sents, use_morphology=morphology) | ||||
|     if converter == 'auto': | ||||
|         converter = input_path.suffix[1:] | ||||
|     if not converter in CONVERTERS: | ||||
|             prints("Can't find converter for %s" % converter, | ||||
|                 title="Unknown format", exits=1) | ||||
|     func = CONVERTERS[converter] | ||||
|     func(input_path, output_path, | ||||
|          n_sents=n_sents, use_morphology=morphology) | ||||
|  |  | |||
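With this change the converter is chosen by name, and 'auto' falls back to the input file's extension, which is what makes the new 'ner' converter reachable at all. A minimal sketch of the lookup logic introduced above, with an illustrative file name:

    from pathlib import Path

    converter = 'auto'
    input_path = Path('eng.train.conll')   # hypothetical input file
    if converter == 'auto':
        converter = input_path.suffix[1:]  # '.conll' -> 'conll'
    print(converter in {'conllu', 'conll', 'ner', 'iob'})  # True

On the command line this corresponds to the new `--converter`/`-c` option declared above, e.g. `-c ner` to force the CoNLL NER converter added in this commit.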
|  | @ -1,2 +1,3 @@ | |||
| from .conllu2json import conllu2json | ||||
| from .iob2json import iob2json | ||||
| from .conll_ner2json import conll_ner2json | ||||
|  |  | |||
							
								
								
									
spacy/cli/converters/conll_ner2json.py (new file, 50 lines)
							|  | @ -0,0 +1,50 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ...compat import json_dumps, path2str | ||||
| from ...util import prints | ||||
| from ...gold import iob_to_biluo | ||||
| 
 | ||||
| 
 | ||||
| def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False): | ||||
|     """ | ||||
|     Convert files in the CoNLL-2003 NER format into JSON format for use with train cli. | ||||
|     """ | ||||
|     docs = read_conll_ner(input_path) | ||||
| 
 | ||||
|     output_filename = input_path.parts[-1].replace(".conll", "") + ".json" | ||||
|     output_filename = input_path.parts[-1].replace(".conll", "") + ".json" | ||||
|     output_file = output_path / output_filename | ||||
|     with output_file.open('w', encoding='utf-8') as f: | ||||
|         f.write(json_dumps(docs)) | ||||
|     prints("Created %d documents" % len(docs), | ||||
|            title="Generated output file %s" % path2str(output_file)) | ||||
| 
 | ||||
| 
 | ||||
| def read_conll_ner(input_path): | ||||
|     text = input_path.open('r', encoding='utf-8').read() | ||||
|     i = 0 | ||||
|     delimit_docs = '-DOCSTART- -X- O O' | ||||
|     output_docs = [] | ||||
|     for doc in text.strip().split(delimit_docs): | ||||
|         doc = doc.strip() | ||||
|         if not doc: | ||||
|             continue | ||||
|         output_doc = [] | ||||
|         for sent in doc.split('\n\n'): | ||||
|             sent = sent.strip() | ||||
|             if not sent: | ||||
|                 continue | ||||
|             lines = [line.strip() for line in sent.split('\n') if line.strip()] | ||||
|             words, tags, chunks, iob_ents = zip(*[line.split() for line in lines]) | ||||
|             biluo_ents = iob_to_biluo(iob_ents) | ||||
|             output_doc.append({'tokens': [ | ||||
|                 {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in | ||||
|                 zip(words, tags, biluo_ents) | ||||
|             ]}) | ||||
|         output_docs.append({ | ||||
|             'id': len(output_docs), | ||||
|             'paragraphs': [{'sentences': output_doc}] | ||||
|         }) | ||||
|         output_doc = [] | ||||
|     return output_docs | ||||
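`read_conll_ner` splits documents on the literal '-DOCSTART- -X- O O' line, sentences on blank lines, and expects four whitespace-separated columns per token: word, POS tag, chunk tag and IOB entity tag, with the last column converted to BILUO. A sketch of input it would accept; the tokens and tags are illustrative rather than taken from the real corpus:

    sample = """-DOCSTART- -X- O O

    West NNP B-NP B-MISC
    Germany NNP I-NP I-MISC
    won VBD B-VP O
    . . O O
    """
    # Each token line becomes {'orth': word, 'tag': tag, 'ner': biluo_tag},
    # with the IOB tags B-MISC/I-MISC rewritten to B-MISC/L-MISC by iob_to_biluo().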
|  | @ -44,7 +44,7 @@ numpy.random.seed(0) | |||
|     version=("Model version", "option", "V", str), | ||||
|     meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path) | ||||
| ) | ||||
| def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, | ||||
| def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, | ||||
|           use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False, | ||||
|           gold_preproc=False, version="0.0.0", meta_path=None): | ||||
|     """ | ||||
|  | @ -68,6 +68,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, | |||
|     if not isinstance(meta, dict): | ||||
|         prints("Expected dict but got: {}".format(type(meta)), | ||||
|                title="Not a valid meta.json format", exits=1) | ||||
|     meta.setdefault('lang', lang) | ||||
|     meta.setdefault('name', 'unnamed') | ||||
| 
 | ||||
|     pipeline = ['tagger', 'parser', 'ner'] | ||||
|     if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger') | ||||
|  | @ -88,9 +90,13 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, | |||
|     n_train_words = corpus.count_train() | ||||
| 
 | ||||
|     lang_class = util.get_lang_class(lang) | ||||
|     nlp = lang_class(pipeline=pipeline) | ||||
|     nlp = lang_class() | ||||
|     meta['pipeline'] = pipeline | ||||
|     nlp.meta.update(meta) | ||||
|     if vectors: | ||||
|         util.load_model(vectors, vocab=nlp.vocab) | ||||
|     for name in pipeline: | ||||
|         nlp.add_pipe(nlp.create_pipe(name), name=name) | ||||
|     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) | ||||
|     nlp._optimizer = None | ||||
| 
 | ||||
|  | @ -112,17 +118,33 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, | |||
|                 util.set_env_log(False) | ||||
|                 epoch_model_path = output_path / ('model%d' % i) | ||||
|                 nlp.to_disk(epoch_model_path) | ||||
|                 nlp_loaded = lang_class(pipeline=pipeline) | ||||
|                 nlp_loaded = nlp_loaded.from_disk(epoch_model_path) | ||||
|                 scorer = nlp_loaded.evaluate( | ||||
|                             list(corpus.dev_docs( | ||||
|                 nlp_loaded = util.load_model_from_path(epoch_model_path) | ||||
|                 dev_docs = list(corpus.dev_docs( | ||||
|                                 nlp_loaded, | ||||
|                                 gold_preproc=gold_preproc))) | ||||
|                                 gold_preproc=gold_preproc)) | ||||
|                 nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) | ||||
|                 start_time = timer() | ||||
|                 scorer = nlp_loaded.evaluate(dev_docs) | ||||
|                 end_time = timer() | ||||
|                 if use_gpu < 0: | ||||
|                     gpu_wps = None | ||||
|                     cpu_wps = nwords/(end_time-start_time) | ||||
|                 else: | ||||
|                     gpu_wps = nwords/(end_time-start_time) | ||||
|                     with Model.use_device('cpu'): | ||||
|                         nlp_loaded = util.load_model_from_path(epoch_model_path) | ||||
|                         dev_docs = list(corpus.dev_docs( | ||||
|                                         nlp_loaded, gold_preproc=gold_preproc)) | ||||
|                         start_time = timer() | ||||
|                         scorer = nlp_loaded.evaluate(dev_docs) | ||||
|                         end_time = timer() | ||||
|                         cpu_wps = nwords/(end_time-start_time) | ||||
|                 acc_loc =(output_path / ('model%d' % i) / 'accuracy.json') | ||||
|                 with acc_loc.open('w') as file_: | ||||
|                     file_.write(json_dumps(scorer.scores)) | ||||
|                 meta_loc = output_path / ('model%d' % i) / 'meta.json' | ||||
|                 meta['accuracy'] = scorer.scores | ||||
|                 meta['speed'] = {'nwords': nwords, 'cpu':cpu_wps, 'gpu': gpu_wps} | ||||
|                 meta['lang'] = nlp.lang | ||||
|                 meta['pipeline'] = pipeline | ||||
|                 meta['spacy_version'] = '>=%s' % about.__version__ | ||||
|  | @ -132,7 +154,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, | |||
|                 with meta_loc.open('w') as file_: | ||||
|                     file_.write(json_dumps(meta)) | ||||
|                 util.set_env_log(True) | ||||
|             print_progress(i, losses, scorer.scores) | ||||
|             print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps) | ||||
|     finally: | ||||
|         print("Saving model...") | ||||
|         try: | ||||
|  | @ -153,16 +175,17 @@ def _render_parses(i, to_render): | |||
|         file_.write(html) | ||||
| 
 | ||||
| 
 | ||||
| def print_progress(itn, losses, dev_scores, wps=0.0): | ||||
| def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0): | ||||
|     scores = {} | ||||
|     for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', | ||||
|                 'ents_p', 'ents_r', 'ents_f', 'wps']: | ||||
|                 'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']: | ||||
|         scores[col] = 0.0 | ||||
|     scores['dep_loss'] = losses.get('parser', 0.0) | ||||
|     scores['ner_loss'] = losses.get('ner', 0.0) | ||||
|     scores['tag_loss'] = losses.get('tagger', 0.0) | ||||
|     scores.update(dev_scores) | ||||
|     scores['wps'] = wps | ||||
|     scores['cpu_wps'] = cpu_wps | ||||
|     scores['gpu_wps'] = gpu_wps or 0.0 | ||||
|     tpl = '\t'.join(( | ||||
|         '{:d}', | ||||
|         '{dep_loss:.3f}', | ||||
|  | @ -173,7 +196,9 @@ def print_progress(itn, losses, dev_scores, wps=0.0): | |||
|         '{ents_f:.3f}', | ||||
|         '{tags_acc:.3f}', | ||||
|         '{token_acc:.3f}', | ||||
|         '{wps:.1f}')) | ||||
|         '{cpu_wps:.1f}', | ||||
|         '{gpu_wps:.1f}', | ||||
|     )) | ||||
|     print(tpl.format(itn, **scores)) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
spacy/cli/validate.py (new file, 123 lines)
							|  | @ -0,0 +1,123 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import requests | ||||
| import pkg_resources | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from ..compat import path2str, locale_escape | ||||
| from ..util import prints, get_data_path, read_json | ||||
| from .. import about | ||||
| 
 | ||||
| 
 | ||||
| def validate(cmd): | ||||
|     """Validate that the currently installed version of spaCy is compatible | ||||
|     with the installed models. Should be run after `pip install -U spacy`. | ||||
|     """ | ||||
|     r = requests.get(about.__compatibility__) | ||||
|     if r.status_code != 200: | ||||
|         prints("Couldn't fetch compatibility table.", | ||||
|                title="Server error (%d)" % r.status_code, exits=1) | ||||
|     compat = r.json()['spacy'] | ||||
|     all_models = set() | ||||
|     for spacy_v, models in dict(compat).items(): | ||||
|         all_models.update(models.keys()) | ||||
|         for model, model_vs in models.items(): | ||||
|             compat[spacy_v][model] = [reformat_version(v) for v in model_vs] | ||||
| 
 | ||||
|     current_compat = compat[about.__version__] | ||||
|     model_links = get_model_links(current_compat) | ||||
|     model_pkgs = get_model_pkgs(current_compat, all_models) | ||||
|     incompat_links = {l for l, d in model_links.items() if not d['compat']} | ||||
|     incompat_models = {d['name'] for _, d in model_pkgs.items() if not d['compat']} | ||||
|     incompat_models.update([d['name'] for _, d in model_links.items() if not d['compat']]) | ||||
|     na_models = [m for m in incompat_models if m not in current_compat] | ||||
|     update_models = [m for m in incompat_models if m in current_compat] | ||||
| 
 | ||||
|     prints(path2str(Path(__file__).parent.parent), | ||||
|            title="Installed models (spaCy v{})".format(about.__version__)) | ||||
|     if model_links or model_pkgs: | ||||
|         print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', '')) | ||||
|         for name, data in model_pkgs.items(): | ||||
|             print(get_model_row(current_compat, name, data, 'package')) | ||||
|         for name, data in model_links.items(): | ||||
|             print(get_model_row(current_compat, name, data, 'link')) | ||||
|     else: | ||||
|         prints("No models found in your current environment.", exits=0) | ||||
| 
 | ||||
|     if update_models: | ||||
|         cmd = '    python -m spacy download {}' | ||||
|         print("\n    Use the following commands to update the model packages:") | ||||
|         print('\n'.join([cmd.format(pkg) for pkg in update_models])) | ||||
| 
 | ||||
|     if na_models: | ||||
|         prints("The following models are not available for spaCy v{}: {}" | ||||
|                .format(about.__version__, ', '.join(na_models))) | ||||
| 
 | ||||
|     if incompat_links: | ||||
|         prints("You may also want to overwrite the incompatible links using " | ||||
|                "the `spacy link` command with `--force`, or remove them from " | ||||
|                "the data directory. Data path: {}" | ||||
|                .format(path2str(get_data_path()))) | ||||
| 
 | ||||
| 
 | ||||
| def get_model_links(compat): | ||||
|     links = {} | ||||
|     data_path = get_data_path() | ||||
|     if data_path: | ||||
|         models = [p for p in data_path.iterdir() if is_model_path(p)] | ||||
|         for model in models: | ||||
|             meta_path = Path(model) / 'meta.json' | ||||
|             if not meta_path.exists(): | ||||
|                 continue | ||||
|             meta = read_json(meta_path) | ||||
|             link = model.parts[-1] | ||||
|             name = meta['lang'] + '_' + meta['name'] | ||||
|             links[link] = {'name': name, 'version': meta['version'], | ||||
|                            'compat': is_compat(compat, name, meta['version'])} | ||||
|     return links | ||||
| 
 | ||||
| 
 | ||||
| def get_model_pkgs(compat, all_models): | ||||
|     pkgs = {} | ||||
|     for pkg_name, pkg_data in pkg_resources.working_set.by_key.items(): | ||||
|         package = pkg_name.replace('-', '_') | ||||
|         if package in all_models: | ||||
|             version = pkg_data.version | ||||
|             pkgs[pkg_name] = {'name': package, 'version': version, | ||||
|                               'compat': is_compat(compat, package, version)} | ||||
|     return pkgs | ||||
| 
 | ||||
| 
 | ||||
| def get_model_row(compat, name, data, type='package'): | ||||
|     tpl_row = '    {:<10}' + ('  {:<20}' * 4) | ||||
|     tpl_red = '\x1b[38;5;1m{}\x1b[0m' | ||||
|     tpl_green = '\x1b[38;5;2m{}\x1b[0m' | ||||
|     if data['compat']: | ||||
|         comp = tpl_green.format(locale_escape('✔', errors='ignore')) | ||||
|         version = tpl_green.format(data['version']) | ||||
|     else: | ||||
|         comp = '--> {}'.format(compat.get(data['name'], ['n/a'])[0]) | ||||
|         version = tpl_red.format(data['version']) | ||||
|     return get_row(type, name, data['name'], version, comp) | ||||
| 
 | ||||
| 
 | ||||
| def get_row(*args): | ||||
|     tpl_row = '    {:<10}' + ('  {:<20}' * 4) | ||||
|     return tpl_row.format(*args) | ||||
| 
 | ||||
| 
 | ||||
| def is_model_path(model_path): | ||||
|     exclude = ['cache', 'pycache', '__pycache__'] | ||||
|     name = model_path.parts[-1] | ||||
|     return model_path.is_dir() and name not in exclude and not name.startswith('.') | ||||
| 
 | ||||
| 
 | ||||
| def is_compat(compat, name, version): | ||||
|     return name in compat and version in compat[name] | ||||
| 
 | ||||
| 
 | ||||
| def reformat_version(version): | ||||
|     if version.endswith('-alpha'): | ||||
|         return version.replace('-alpha', 'a0') | ||||
|     return version.replace('-alpha', 'a') | ||||
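`reformat_version` maps the '-alpha' suffixes used in the compatibility table onto pip-style pre-release versions; a quick self-contained check (the function body is copied from above, the inputs are illustrative):

    def reformat_version(version):
        if version.endswith('-alpha'):
            return version.replace('-alpha', 'a0')
        return version.replace('-alpha', 'a')

    print(reformat_version('2.0.0-alpha'))  # -> 2.0.0a0
    print(reformat_version('2.0.0'))        # -> 2.0.0 (returned unchanged)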
|  | @ -6,6 +6,7 @@ import ftfy | |||
| import sys | ||||
| import ujson | ||||
| import itertools | ||||
| import locale | ||||
| 
 | ||||
| from thinc.neural.util import copy_array | ||||
| 
 | ||||
|  | @ -113,3 +114,12 @@ def import_file(name, loc): | |||
|         module = importlib.util.module_from_spec(spec) | ||||
|         spec.loader.exec_module(module) | ||||
|         return module | ||||
| 
 | ||||
| 
 | ||||
| def locale_escape(string, errors='replace'): | ||||
|     ''' | ||||
|     Mangle non-supported characters, for savages with ascii terminals. | ||||
|     ''' | ||||
|     encoding = locale.getpreferredencoding() | ||||
|     string = string.encode(encoding, errors).decode('utf8') | ||||
|     return string | ||||
|  |  | |||
|  | @ -213,7 +213,7 @@ class GoldCorpus(object): | |||
|         train_tuples = self.train_tuples | ||||
|         if projectivize: | ||||
|             train_tuples = nonproj.preprocess_training_data( | ||||
|                                self.train_tuples) | ||||
|                                self.train_tuples, label_freq_cutoff=100) | ||||
|         random.shuffle(train_tuples) | ||||
|         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, | ||||
|                                         max_length=max_length, | ||||
|  |  | |||
|  | @ -16,15 +16,13 @@ from ...util import update_exc | |||
| class BengaliDefaults(Language.Defaults): | ||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'bn' | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     tag_map = TAG_MAP | ||||
|     stop_words = STOP_WORDS | ||||
|     lemma_rules = LEMMA_RULES | ||||
| 
 | ||||
|     prefixes = tuple(TOKENIZER_PREFIXES) | ||||
|     suffixes = tuple(TOKENIZER_SUFFIXES) | ||||
|     infixes = tuple(TOKENIZER_INFIXES) | ||||
|     prefixes = TOKENIZER_PREFIXES | ||||
|     suffixes = TOKENIZER_SUFFIXES | ||||
|     infixes = TOKENIZER_INFIXES | ||||
| 
 | ||||
| 
 | ||||
| class Bengali(Language): | ||||
|  |  | |||
|  | @ -15,9 +15,8 @@ class DanishDefaults(Language.Defaults): | |||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'da' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     stop_words = set(STOP_WORDS) | ||||
|     stop_words = STOP_WORDS | ||||
| 
 | ||||
| 
 | ||||
| class Danish(Language): | ||||
|  |  | |||
|  | @ -12,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS | |||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ..norm_exceptions import BASE_NORMS | ||||
| from ...language import Language | ||||
| from ...lemmatizerlookup import Lemmatizer | ||||
| from ...attrs import LANG, NORM | ||||
| from ...util import update_exc, add_lookups | ||||
| 
 | ||||
|  | @ -22,16 +21,12 @@ class GermanDefaults(Language.Defaults): | |||
|     lex_attr_getters[LANG] = lambda text: 'de' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], | ||||
|                                          NORM_EXCEPTIONS, BASE_NORMS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     infixes = tuple(TOKENIZER_INFIXES) | ||||
|     tag_map = dict(TAG_MAP) | ||||
|     stop_words = set(STOP_WORDS) | ||||
|     syntax_iterators = dict(SYNTAX_ITERATORS) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_lemmatizer(cls, nlp=None): | ||||
|         return Lemmatizer(LOOKUP) | ||||
|     infixes = TOKENIZER_INFIXES | ||||
|     tag_map = TAG_MAP | ||||
|     stop_words = STOP_WORDS | ||||
|     syntax_iterators = SYNTAX_ITERATORS | ||||
|     lemma_lookup = LOOKUP | ||||
| 
 | ||||
| 
 | ||||
| class German(Language): | ||||
|  |  | |||
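The German hunk above shows the pattern applied to the rest of the languages in this commit: tokenizer, tag and lemma resources become plain class attributes, including the new `lemma_lookup` consumed by `BaseDefaults.create_lemmatizer` in the language.py hunk at the end of this diff, instead of per-language `create_lemmatizer()` overrides. A minimal sketch of what a language's defaults look like under the new scheme; the language code and tables are made up:

    from spacy.attrs import LANG
    from spacy.language import Language

    class MyLanguageDefaults(Language.Defaults):
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'zz'      # illustrative language code
        stop_words = {'the', 'a', 'an'}                 # illustrative
        lemma_lookup = {'was': 'be', 'mice': 'mouse'}   # plain lookup table

    class MyLanguage(Language):
        lang = 'zz'
        Defaults = MyLanguageDefaults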
|  | @ -7,7 +7,7 @@ from .tag_map import TAG_MAP | |||
| from .stop_words import STOP_WORDS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .morph_rules import MORPH_RULES | ||||
| from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC | ||||
| from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| 
 | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
|  | @ -23,15 +23,15 @@ class EnglishDefaults(Language.Defaults): | |||
|     lex_attr_getters[LANG] = lambda text: 'en' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], | ||||
|                                          BASE_NORMS, NORM_EXCEPTIONS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     tag_map = dict(TAG_MAP) | ||||
|     stop_words = set(STOP_WORDS) | ||||
|     morph_rules = dict(MORPH_RULES) | ||||
|     lemma_rules = dict(LEMMA_RULES) | ||||
|     lemma_index = dict(LEMMA_INDEX) | ||||
|     lemma_exc = dict(LEMMA_EXC) | ||||
|     syntax_iterators = dict(SYNTAX_ITERATORS) | ||||
|     tag_map = TAG_MAP | ||||
|     stop_words = STOP_WORDS | ||||
|     morph_rules = MORPH_RULES | ||||
|     lemma_rules = LEMMA_RULES | ||||
|     lemma_index = LEMMA_INDEX | ||||
|     lemma_exc = LEMMA_EXC | ||||
|     lemma_lookup = LOOKUP | ||||
|     syntax_iterators = SYNTAX_ITERATORS | ||||
| 
 | ||||
| 
 | ||||
| class English(Language): | ||||
|  |  | |||
|  | @ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS | |||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ..norm_exceptions import BASE_NORMS | ||||
| from ...language import Language | ||||
| from ...lemmatizerlookup import Lemmatizer | ||||
| from ...attrs import LANG, NORM | ||||
| from ...util import update_exc, add_lookups | ||||
| 
 | ||||
|  | @ -19,15 +18,11 @@ class SpanishDefaults(Language.Defaults): | |||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'es' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     tag_map = dict(TAG_MAP) | ||||
|     stop_words = set(STOP_WORDS) | ||||
|     sytax_iterators = dict(SYNTAX_ITERATORS) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_lemmatizer(cls, nlp=None): | ||||
|         return Lemmatizer(LOOKUP) | ||||
|     tag_map = TAG_MAP | ||||
|     stop_words = STOP_WORDS | ||||
|     sytax_iterators = SYNTAX_ITERATORS | ||||
|     lemma_lookup = LOOKUP | ||||
| 
 | ||||
| 
 | ||||
| class Spanish(Language): | ||||
|  |  | |||
|  | @ -15,9 +15,8 @@ class FinnishDefaults(Language.Defaults): | |||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'fi' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     stop_words = set(STOP_WORDS) | ||||
|     stop_words = STOP_WORDS | ||||
| 
 | ||||
| 
 | ||||
| class Finnish(Language): | ||||
|  |  | |||
|  | @ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS | |||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ..norm_exceptions import BASE_NORMS | ||||
| from ...language import Language | ||||
| from ...lemmatizerlookup import Lemmatizer | ||||
| from ...attrs import LANG, NORM | ||||
| from ...util import update_exc, add_lookups | ||||
| 
 | ||||
|  | @ -21,17 +20,13 @@ class FrenchDefaults(Language.Defaults): | |||
|     lex_attr_getters.update(LEX_ATTRS) | ||||
|     lex_attr_getters[LANG] = lambda text: 'fr' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     stop_words = set(STOP_WORDS) | ||||
|     infixes = tuple(TOKENIZER_INFIXES) | ||||
|     suffixes = tuple(TOKENIZER_SUFFIXES) | ||||
|     stop_words = STOP_WORDS | ||||
|     infixes = TOKENIZER_INFIXES | ||||
|     suffixes = TOKENIZER_SUFFIXES | ||||
|     token_match = TOKEN_MATCH | ||||
|     syntax_iterators = dict(SYNTAX_ITERATORS) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_lemmatizer(cls, nlp=None): | ||||
|         return Lemmatizer(LOOKUP) | ||||
|     syntax_iterators = SYNTAX_ITERATORS | ||||
|     lemma_lookup = LOOKUP | ||||
| 
 | ||||
| 
 | ||||
| class French(Language): | ||||
|  |  | |||
|  | @ -12,9 +12,8 @@ from ...util import update_exc | |||
| class HebrewDefaults(Language.Defaults): | ||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'he' | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) | ||||
|     stop_words = set(STOP_WORDS) | ||||
|     stop_words = STOP_WORDS | ||||
| 
 | ||||
| 
 | ||||
| class Hebrew(Language): | ||||
|  |  | |||
|  | @ -9,7 +9,6 @@ from .lemmatizer import LOOKUP | |||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ..norm_exceptions import BASE_NORMS | ||||
| from ...language import Language | ||||
| from ...lemmatizerlookup import Lemmatizer | ||||
| from ...attrs import LANG, NORM | ||||
| from ...util import update_exc, add_lookups | ||||
| 
 | ||||
|  | @ -18,17 +17,13 @@ class HungarianDefaults(Language.Defaults): | |||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'hu' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     stop_words = set(STOP_WORDS) | ||||
|     prefixes = tuple(TOKENIZER_PREFIXES) | ||||
|     suffixes = tuple(TOKENIZER_SUFFIXES) | ||||
|     infixes = tuple(TOKENIZER_INFIXES) | ||||
|     stop_words = STOP_WORDS | ||||
|     prefixes = TOKENIZER_PREFIXES | ||||
|     suffixes = TOKENIZER_SUFFIXES | ||||
|     infixes = TOKENIZER_INFIXES | ||||
|     token_match = TOKEN_MATCH | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_lemmatizer(cls, nlp=None): | ||||
|         return Lemmatizer(LOOKUP) | ||||
|     lemma_lookup = LOOKUP | ||||
| 
 | ||||
| 
 | ||||
| class Hungarian(Language): | ||||
|  |  | |||
|  | @ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS | |||
| 
 | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ...language import Language | ||||
| from ...lemmatizerlookup import Lemmatizer | ||||
| from ...attrs import LANG | ||||
| from ...util import update_exc | ||||
| 
 | ||||
|  | @ -19,19 +18,14 @@ from ...util import update_exc | |||
| class IndonesianDefaults(Language.Defaults): | ||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'id' | ||||
| 
 | ||||
|     lex_attr_getters.update(LEX_ATTRS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     stop_words = set(STOP_WORDS) | ||||
|     prefixes = tuple(TOKENIZER_PREFIXES) | ||||
|     suffixes = tuple(TOKENIZER_SUFFIXES) | ||||
|     infixes = tuple(TOKENIZER_INFIXES) | ||||
|     syntax_iterators = dict(SYNTAX_ITERATORS) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_lemmatizer(cls, nlp=None): | ||||
|         return Lemmatizer(LOOKUP) | ||||
|     stop_words = STOP_WORDS | ||||
|     prefixes = TOKENIZER_PREFIXES | ||||
|     suffixes = TOKENIZER_SUFFIXES | ||||
|     infixes = TOKENIZER_INFIXES | ||||
|     syntax_iterators = SYNTAX_ITERATORS | ||||
|     lemma_lookup = LOOKUP | ||||
| 
 | ||||
| 
 | ||||
| class Indonesian(Language): | ||||
|  |  | |||
|  | @ -16,8 +16,7 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', | |||
|               'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta', | ||||
|               'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun', | ||||
|               'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun', | ||||
|               'noniliun', 'desiliun', | ||||
|               ] | ||||
|               'noniliun', 'desiliun'] | ||||
| 
 | ||||
| 
 | ||||
| def like_num(text): | ||||
|  |  | |||
|  | @ -7,7 +7,6 @@ from .lemmatizer import LOOKUP | |||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ..norm_exceptions import BASE_NORMS | ||||
| from ...language import Language | ||||
| from ...lemmatizerlookup import Lemmatizer | ||||
| from ...attrs import LANG, NORM | ||||
| from ...util import update_exc, add_lookups | ||||
| 
 | ||||
|  | @ -16,13 +15,9 @@ class ItalianDefaults(Language.Defaults): | |||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'it' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) | ||||
|     stop_words = set(STOP_WORDS) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_lemmatizer(cls, nlp=None): | ||||
|         return Lemmatizer(LOOKUP) | ||||
|     stop_words = STOP_WORDS | ||||
|     lemma_lookup = LOOKUP | ||||
| 
 | ||||
| 
 | ||||
| class Italian(Language): | ||||
|  |  | |||
|  | @ -16,9 +16,8 @@ class NorwegianDefaults(Language.Defaults): | |||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'nb' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     stop_words = set(STOP_WORDS) | ||||
|     stop_words = STOP_WORDS | ||||
| 
 | ||||
| 
 | ||||
| class Norwegian(Language): | ||||
|  |  | |||
|  | @ -16,9 +16,8 @@ class DutchDefaults(Language.Defaults): | |||
|     lex_attr_getters.update(LEX_ATTRS) | ||||
|     lex_attr_getters[LANG] = lambda text: 'nl' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) | ||||
|     stop_words = set(STOP_WORDS) | ||||
|     stop_words = STOP_WORDS | ||||
| 
 | ||||
| 
 | ||||
| class Dutch(Language): | ||||
|  |  | |||
|  | @ -15,9 +15,8 @@ class PolishDefaults(Language.Defaults): | |||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'pl' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     stop_words = set(STOP_WORDS) | ||||
|     stop_words = STOP_WORDS | ||||
| 
 | ||||
| 
 | ||||
| class Polish(Language): | ||||
|  |  | |||
|  | @ -9,7 +9,6 @@ from .lemmatizer import LOOKUP | |||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ..norm_exceptions import BASE_NORMS | ||||
| from ...language import Language | ||||
| from ...lemmatizerlookup import Lemmatizer | ||||
| from ...attrs import LANG, NORM | ||||
| from ...util import update_exc, add_lookups | ||||
| 
 | ||||
|  | @ -19,13 +18,9 @@ class PortugueseDefaults(Language.Defaults): | |||
|     lex_attr_getters[LANG] = lambda text: 'pt' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
|     lex_attr_getters.update(LEX_ATTRS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     stop_words = set(STOP_WORDS) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_lemmatizer(cls, nlp=None): | ||||
|         return Lemmatizer(LOOKUP) | ||||
|     stop_words = STOP_WORDS | ||||
|     lemma_lookup = LOOKUP | ||||
| 
 | ||||
| 
 | ||||
| class Portuguese(Language): | ||||
|  |  | |||
|  | @ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP | |||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ..norm_exceptions import BASE_NORMS | ||||
| from ...language import Language | ||||
| from ...lemmatizerlookup import Lemmatizer | ||||
| from ...attrs import LANG, NORM | ||||
| from ...util import update_exc, add_lookups | ||||
| 
 | ||||
|  | @ -18,13 +17,10 @@ class SwedishDefaults(Language.Defaults): | |||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'sv' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     stop_words = set(STOP_WORDS) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_lemmatizer(cls, nlp=None): | ||||
|         return Lemmatizer(LOOKUP) | ||||
|     stop_words = STOP_WORDS | ||||
|     lemma_rules = LEMMA_RULES | ||||
|     lemma_lookup = LOOKUP | ||||
| 
 | ||||
| 
 | ||||
| class Swedish(Language): | ||||
|  |  | |||
|  | @ -12,24 +12,27 @@ from ...language import Language | |||
| from ...attrs import LANG, NORM | ||||
| from ...util import update_exc, add_lookups | ||||
| 
 | ||||
| 
 | ||||
| class ThaiDefaults(Language.Defaults): | ||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'th' | ||||
|     tokenizer_exceptions = TOKENIZER_EXCEPTIONS | ||||
|     tag_map = dict(TAG_MAP) | ||||
|     stop_words = set(STOP_WORDS) | ||||
|     tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS) | ||||
|     tag_map = TAG_MAP | ||||
|     stop_words = STOP_WORDS | ||||
| 
 | ||||
| 
 | ||||
| class Thai(Language): | ||||
| 	lang = 'th' | ||||
| 	Defaults = ThaiDefaults | ||||
| 	def make_doc(self, text): | ||||
| 		try: | ||||
| 			from pythainlp.tokenize import word_tokenize | ||||
| 		except ImportError: | ||||
| 			raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " | ||||
| 								"https://github.com/wannaphongcom/pythainlp/") | ||||
| 		words = [x for x in list(word_tokenize(text,"newmm"))] | ||||
| 		return Doc(self.vocab, words=words, spaces=[False]*len(words)) | ||||
|     lang = 'th' | ||||
|     Defaults = ThaiDefaults | ||||
| 
 | ||||
|     def make_doc(self, text): | ||||
|         try: | ||||
|             from pythainlp.tokenize import word_tokenize | ||||
|         except ImportError: | ||||
|             raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " | ||||
|                               "https://github.com/wannaphongcom/pythainlp/") | ||||
|         words = [x for x in list(word_tokenize(text,"newmm"))] | ||||
|         return Doc(self.vocab, words=words, spaces=[False]*len(words)) | ||||
| 
 | ||||
| 
 | ||||
| __all__ = ['Thai'] | ||||
|  |  | |||
|  | @ -13,7 +13,6 @@ class MultiLanguageDefaults(Language.Defaults): | |||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'xx' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,12 +1,9 @@ | |||
| # coding: utf8 | ||||
| from __future__ import absolute_import, unicode_literals | ||||
| from contextlib import contextmanager | ||||
| import dill | ||||
| 
 | ||||
| import numpy | ||||
| from thinc.neural import Model | ||||
| from thinc.neural.ops import NumpyOps, CupyOps | ||||
| from thinc.neural.optimizers import Adam, SGD | ||||
| from thinc.neural.optimizers import Adam | ||||
| import random | ||||
| import ujson | ||||
| from collections import OrderedDict | ||||
|  | @ -17,30 +14,27 @@ from .vocab import Vocab | |||
| from .tagger import Tagger | ||||
| from .lemmatizer import Lemmatizer | ||||
| from .syntax.parser import get_templates | ||||
| from .syntax import nonproj | ||||
| 
 | ||||
| from .pipeline import NeuralDependencyParser, EntityRecognizer | ||||
| from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer | ||||
| from .pipeline import NeuralLabeller | ||||
| from .pipeline import SimilarityHook | ||||
| from .pipeline import TextCategorizer | ||||
| from . import about | ||||
| from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger | ||||
| from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer | ||||
| 
 | ||||
| from .compat import json_dumps, izip | ||||
| from .scorer import Scorer | ||||
| from ._ml import link_vectors_to_models | ||||
| from .attrs import IS_STOP | ||||
| from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | ||||
| from .lang.tokenizer_exceptions import TOKEN_MATCH | ||||
| from .lang.tag_map import TAG_MAP | ||||
| from .lang.lex_attrs import LEX_ATTRS | ||||
| from . import util | ||||
| from .scorer import Scorer | ||||
| from ._ml import link_vectors_to_models | ||||
| from . import about | ||||
| 
 | ||||
| 
 | ||||
| class BaseDefaults(object): | ||||
|     @classmethod | ||||
|     def create_lemmatizer(cls, nlp=None): | ||||
|         return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules) | ||||
|         return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules, | ||||
|                           cls.lemma_lookup) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_vocab(cls, nlp=None): | ||||
|  | @ -70,59 +64,7 @@ class BaseDefaults(object): | |||
|                          prefix_search=prefix_search, suffix_search=suffix_search, | ||||
|                          infix_finditer=infix_finditer, token_match=token_match) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_tagger(cls, nlp=None, **cfg): | ||||
|         if nlp is None: | ||||
|             return NeuralTagger(cls.create_vocab(nlp), **cfg) | ||||
|         else: | ||||
|             return NeuralTagger(nlp.vocab, **cfg) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_parser(cls, nlp=None, **cfg): | ||||
|         if nlp is None: | ||||
|             return NeuralDependencyParser(cls.create_vocab(nlp), **cfg) | ||||
|         else: | ||||
|             return NeuralDependencyParser(nlp.vocab, **cfg) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_entity(cls, nlp=None, **cfg): | ||||
|         if nlp is None: | ||||
|             return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg) | ||||
|         else: | ||||
|             return NeuralEntityRecognizer(nlp.vocab, **cfg) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_pipeline(cls, nlp=None, disable=tuple()): | ||||
|         meta = nlp.meta if nlp is not None else {} | ||||
|         # Resolve strings, like "cnn", "lstm", etc | ||||
|         pipeline = [] | ||||
|         for entry in meta.get('pipeline', []): | ||||
|             if entry in disable or getattr(entry, 'name', entry) in disable: | ||||
|                 continue | ||||
|             factory = cls.Defaults.factories[entry] | ||||
|             pipeline.append(factory(nlp, **meta.get(entry, {}))) | ||||
|         return pipeline | ||||
| 
 | ||||
|     factories = { | ||||
|         'make_doc': create_tokenizer, | ||||
|         'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], | ||||
|         'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], | ||||
|         'parser': lambda nlp, **cfg: [ | ||||
|             NeuralDependencyParser(nlp.vocab, **cfg), | ||||
|             nonproj.deprojectivize], | ||||
|         'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], | ||||
|         'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)], | ||||
|         'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)], | ||||
|         # Temporary compatibility -- delete after pivot | ||||
|         'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], | ||||
|         'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], | ||||
|         'dependencies': lambda nlp, **cfg: [ | ||||
|             NeuralDependencyParser(nlp.vocab, **cfg), | ||||
|             nonproj.deprojectivize, | ||||
|         ], | ||||
|         'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], | ||||
|     } | ||||
| 
 | ||||
|     pipe_names = ['tensorizer', 'tagger', 'parser', 'ner'] | ||||
|     token_match = TOKEN_MATCH | ||||
|     prefixes = tuple(TOKENIZER_PREFIXES) | ||||
|     suffixes = tuple(TOKENIZER_SUFFIXES) | ||||
|  | @ -136,6 +78,7 @@ class BaseDefaults(object): | |||
|     lemma_rules = {} | ||||
|     lemma_exc = {} | ||||
|     lemma_index = {} | ||||
|     lemma_lookup = {} | ||||
|     morph_rules = {} | ||||
|     lex_attr_getters = LEX_ATTRS | ||||
|     syntax_iterators = {} | ||||
|  | @ -152,8 +95,17 @@ class Language(object): | |||
|     Defaults = BaseDefaults | ||||
|     lang = None | ||||
| 
 | ||||
|     def __init__(self, vocab=True, make_doc=True, pipeline=None, | ||||
|                  meta={}, disable=tuple(), **kwargs): | ||||
|     factories = { | ||||
|         'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), | ||||
|         'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg), | ||||
|         'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg), | ||||
|         'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg), | ||||
|         'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg), | ||||
|         'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), | ||||
|         'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg) | ||||
|     } | ||||
| 
 | ||||
|     def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs): | ||||
|         """Initialise a Language object. | ||||
| 
 | ||||
|         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via | ||||
|  | @ -179,28 +131,7 @@ class Language(object): | |||
|             factory = self.Defaults.create_tokenizer | ||||
|             make_doc = factory(self, **meta.get('tokenizer', {})) | ||||
|         self.tokenizer = make_doc | ||||
|         if pipeline is True: | ||||
|             self.pipeline = self.Defaults.create_pipeline(self, disable) | ||||
|         elif pipeline: | ||||
|             # Careful not to do getattr(p, 'name', None) here | ||||
|             # If we had disable=[None], we'd disable everything! | ||||
|             self.pipeline = [p for p in pipeline | ||||
|                              if p not in disable | ||||
|                              and getattr(p, 'name', p) not in disable] | ||||
|             # Resolve strings, like "cnn", "lstm", etc | ||||
|             for i, entry in enumerate(self.pipeline): | ||||
|                 if entry in self.Defaults.factories: | ||||
|                     factory = self.Defaults.factories[entry] | ||||
|                     self.pipeline[i] = factory(self, **meta.get(entry, {})) | ||||
|         else: | ||||
|             self.pipeline = [] | ||||
|         flat_list = [] | ||||
|         for pipe in self.pipeline: | ||||
|             if isinstance(pipe, list): | ||||
|                 flat_list.extend(pipe) | ||||
|             else: | ||||
|                 flat_list.append(pipe) | ||||
|         self.pipeline = flat_list | ||||
|         self.pipeline = [] | ||||
|         self._optimizer = None | ||||
| 
 | ||||
|     @property | ||||
|  | @ -214,11 +145,7 @@ class Language(object): | |||
|         self._meta.setdefault('email', '') | ||||
|         self._meta.setdefault('url', '') | ||||
|         self._meta.setdefault('license', '') | ||||
|         pipeline = [] | ||||
|         for component in self.pipeline: | ||||
|             if hasattr(component, 'name'): | ||||
|                 pipeline.append(component.name) | ||||
|         self._meta['pipeline'] = pipeline | ||||
|         self._meta['pipeline'] = self.pipe_names | ||||
|         return self._meta | ||||
| 
 | ||||
|     @meta.setter | ||||
|  | @ -228,34 +155,144 @@ class Language(object): | |||
|     # Conveniences to access pipeline components | ||||
|     @property | ||||
|     def tensorizer(self): | ||||
|         return self.get_component('tensorizer') | ||||
|         return self.get_pipe('tensorizer') | ||||
| 
 | ||||
|     @property | ||||
|     def tagger(self): | ||||
|         return self.get_component('tagger') | ||||
|         return self.get_pipe('tagger') | ||||
| 
 | ||||
|     @property | ||||
|     def parser(self): | ||||
|         return self.get_component('parser') | ||||
|         return self.get_pipe('parser') | ||||
| 
 | ||||
|     @property | ||||
|     def entity(self): | ||||
|         return self.get_component('ner') | ||||
|         return self.get_pipe('ner') | ||||
| 
 | ||||
|     @property | ||||
|     def matcher(self): | ||||
|         return self.get_component('matcher') | ||||
|         return self.get_pipe('matcher') | ||||
| 
 | ||||
|     def get_component(self, name): | ||||
|         if self.pipeline in (True, None): | ||||
|             return None | ||||
|         for proc in self.pipeline: | ||||
|             if hasattr(proc, 'name') and proc.name.endswith(name): | ||||
|                 return proc | ||||
|         return None | ||||
|     @property | ||||
|     def pipe_names(self): | ||||
|         """Get names of available pipeline components. | ||||
| 
 | ||||
|         RETURNS (list): List of component name strings, in order. | ||||
|         """ | ||||
|         return [pipe_name for pipe_name, _ in self.pipeline] | ||||
| 
 | ||||
|     def get_pipe(self, name): | ||||
|         """Get a pipeline component for a given component name. | ||||
| 
 | ||||
|         name (unicode): Name of pipeline component to get. | ||||
|         RETURNS (callable): The pipeline component. | ||||
|         """ | ||||
|         for pipe_name, component in self.pipeline: | ||||
|             if pipe_name == name: | ||||
|                 return component | ||||
|         msg = "No component '{}' found in pipeline. Available names: {}" | ||||
|         raise KeyError(msg.format(name, self.pipe_names)) | ||||
| 
 | ||||
|     def create_pipe(self, name, config=dict()): | ||||
|         """Create a pipeline component from a factory. | ||||
| 
 | ||||
|         name (unicode): Factory name to look up in `Language.factories`. | ||||
|         config (dict): Configuration parameters to initialise component. | ||||
|         RETURNS (callable): Pipeline component. | ||||
|         """ | ||||
|         if name not in self.factories: | ||||
|             raise KeyError("Can't find factory for '{}'.".format(name)) | ||||
|         factory = self.factories[name] | ||||
|         return factory(self, **config) | ||||
| 
 | ||||
|     def add_pipe(self, component, name=None, before=None, after=None, | ||||
|                  first=None, last=None): | ||||
|         """Add a component to the processing pipeline. Valid components are | ||||
|         callables that take a `Doc` object, modify it and return it. Only one of | ||||
|         before, after, first or last can be set. Default behaviour is "last". | ||||
| 
 | ||||
|         component (callable): The pipeline component. | ||||
|         name (unicode): Name of pipeline component. Overwrites existing | ||||
|             component.name attribute if available. If no name is set and | ||||
|             the component exposes no name attribute, component.__name__ is | ||||
|             used. An error is raised if the name already exists in the pipeline. | ||||
|         before (unicode): Component name to insert component directly before. | ||||
|         after (unicode): Component name to insert component directly after. | ||||
|         first (bool): Insert component first / not first in the pipeline. | ||||
|         last (bool): Insert component last / not last in the pipeline. | ||||
| 
 | ||||
|         EXAMPLE: | ||||
|             >>> nlp.add_pipe(component, before='ner') | ||||
|             >>> nlp.add_pipe(component, name='custom_name', last=True) | ||||
|         """ | ||||
|         if name is None: | ||||
|             if hasattr(component, 'name'): | ||||
|                 name = component.name | ||||
|             elif hasattr(component, '__name__'): | ||||
|                 name = component.__name__ | ||||
|             elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'): | ||||
|                 name = component.__class__.__name__ | ||||
|             else: | ||||
|                 name = repr(component) | ||||
|         if name in self.pipe_names: | ||||
|             raise ValueError("'{}' already exists in pipeline.".format(name)) | ||||
|         if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2: | ||||
|             msg = ("Invalid constraints. You can only set one of the " | ||||
|                    "following: before, after, first, last.") | ||||
|             raise ValueError(msg) | ||||
|         pipe = (name, component) | ||||
|         if last or not any([first, before, after]): | ||||
|             self.pipeline.append(pipe) | ||||
|         elif first: | ||||
|             self.pipeline.insert(0, pipe) | ||||
|         elif before and before in self.pipe_names: | ||||
|             self.pipeline.insert(self.pipe_names.index(before), pipe) | ||||
|         elif after and after in self.pipe_names: | ||||
|             self.pipeline.insert(self.pipe_names.index(after) + 1, pipe) | ||||
|         else: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             unfound = before or after | ||||
|             raise ValueError(msg.format(unfound, self.pipe_names)) | ||||
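
To make the intended workflow concrete, here is a rough sketch that combines create_pipe() and add_pipe() on a blank pipeline. The custom component and its name are invented for illustration; only the API calls shown in this diff are assumed.

from __future__ import unicode_literals
from spacy.lang.en import English

nlp = English()                              # blank pipeline

def print_length(doc):
    # toy custom component: report the token count, then pass the Doc along
    print('Doc length:', len(doc))
    return doc

nlp.add_pipe(print_length, first=True)       # name falls back to the function's __name__
tagger = nlp.create_pipe('tagger')           # instantiate from Language.factories
nlp.add_pipe(tagger, last=True)              # NeuralTagger exposes name = 'tagger'
print(nlp.pipe_names)                        # ['print_length', 'tagger']
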
| 
 | ||||
|     def replace_pipe(self, name, component): | ||||
|         """Replace a component in the pipeline. | ||||
| 
 | ||||
|         name (unicode): Name of the component to replace. | ||||
|         component (callable): Pipeline component. | ||||
|         """ | ||||
|         if name not in self.pipe_names: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             raise ValueError(msg.format(name, self.pipe_names)) | ||||
|         self.pipeline[self.pipe_names.index(name)] = (name, component) | ||||
| 
 | ||||
|     def rename_pipe(self, old_name, new_name): | ||||
|         """Rename a pipeline component. | ||||
| 
 | ||||
|         old_name (unicode): Name of the component to rename. | ||||
|         new_name (unicode): New name of the component. | ||||
|         """ | ||||
|         if old_name not in self.pipe_names: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             raise ValueError(msg.format(old_name, self.pipe_names)) | ||||
|         if new_name in self.pipe_names: | ||||
|             msg = "'{}' already exists in pipeline. Existing names: {}" | ||||
|             raise ValueError(msg.format(new_name, self.pipe_names)) | ||||
|         i = self.pipe_names.index(old_name) | ||||
|         self.pipeline[i] = (new_name, self.pipeline[i][1]) | ||||
| 
 | ||||
|     def remove_pipe(self, name): | ||||
|         """Remove a component from the pipeline. | ||||
| 
 | ||||
|         name (unicode): Name of the component to remove. | ||||
|         RETURNS (tuple): A `(name, component)` tuple of the removed component. | ||||
|         """ | ||||
|         if name not in self.pipe_names: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             raise ValueError(msg.format(name, self.pipe_names)) | ||||
|         return self.pipeline.pop(self.pipe_names.index(name)) | ||||
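
A similar sketch for the lookup and bookkeeping helpers above (get_pipe, replace_pipe, rename_pipe, remove_pipe); the components and names are again placeholders:

from __future__ import unicode_literals
from spacy.lang.en import English

def print_info(doc):
    return doc                               # placeholder component, for illustration only

def print_info_v2(doc):
    return doc                               # placeholder replacement component

nlp = English()
nlp.add_pipe(print_info, name='print_info')
nlp.replace_pipe('print_info', print_info_v2)           # swap the callable, keep the slot name
nlp.rename_pipe('print_info', 'print_info_v2')          # rename the slot
component = nlp.get_pipe('print_info_v2')               # look a component up by name
removed_name, removed_component = nlp.remove_pipe('print_info_v2')
assert nlp.pipe_names == []                              # pipeline is empty again
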
| 
 | ||||
|     def __call__(self, text, disable=[]): | ||||
|         """'Apply the pipeline to some text. The text can span multiple sentences, | ||||
|         """Apply the pipeline to some text. The text can span multiple sentences, | ||||
|         and can contain arbitrary whitespace. Alignment into the original string | ||||
|         is preserved. | ||||
| 
 | ||||
|  | @ -269,8 +306,7 @@ class Language(object): | |||
|             ('An', 'NN') | ||||
|         """ | ||||
|         doc = self.make_doc(text) | ||||
|         for proc in self.pipeline: | ||||
|             name = getattr(proc, 'name', None) | ||||
|         for name, proc in self.pipeline: | ||||
|             if name in disable: | ||||
|                 continue | ||||
|             doc = proc(doc) | ||||
|  | @ -308,7 +344,7 @@ class Language(object): | |||
|             grads[key] = (W, dW) | ||||
|         pipes = list(self.pipeline) | ||||
|         random.shuffle(pipes) | ||||
|         for proc in pipes: | ||||
|         for name, proc in pipes: | ||||
|             if not hasattr(proc, 'update'): | ||||
|                 continue | ||||
|             proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses) | ||||
|  | @ -322,7 +358,7 @@ class Language(object): | |||
|         docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. | ||||
|         YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects. | ||||
|         """ | ||||
|         for proc in self.pipeline: | ||||
|         for name, proc in self.pipeline: | ||||
|             if hasattr(proc, 'preprocess_gold'): | ||||
|                 docs_golds = proc.preprocess_gold(docs_golds) | ||||
|         for doc, gold in docs_golds: | ||||
|  | @ -354,7 +390,7 @@ class Language(object): | |||
| 
 | ||||
|         get_gold_tuples (function): Function returning gold data | ||||
|         **cfg: Config parameters. | ||||
|         returns: An optimizer | ||||
|         RETURNS: An optimizer | ||||
|         """ | ||||
|         # Populate vocab | ||||
|         if get_gold_tuples is not None: | ||||
|  | @ -371,7 +407,7 @@ class Language(object): | |||
|         else: | ||||
|             device = None | ||||
|         link_vectors_to_models(self.vocab) | ||||
|         for proc in self.pipeline: | ||||
|         for name, proc in self.pipeline: | ||||
|             if hasattr(proc, 'begin_training'): | ||||
|                 context = proc.begin_training(get_gold_tuples(), | ||||
|                                               pipeline=self.pipeline) | ||||
|  | @ -393,7 +429,7 @@ class Language(object): | |||
|         docs, golds = zip(*docs_golds) | ||||
|         docs = list(docs) | ||||
|         golds = list(golds) | ||||
|         for pipe in self.pipeline: | ||||
|         for name, pipe in self.pipeline: | ||||
|             if not hasattr(pipe, 'pipe'): | ||||
|                 for doc in docs: | ||||
|                     pipe(doc) | ||||
|  | @ -419,7 +455,7 @@ class Language(object): | |||
|             >>> with nlp.use_params(optimizer.averages): | ||||
|             >>>     nlp.to_disk('/tmp/checkpoint') | ||||
|         """ | ||||
|         contexts = [pipe.use_params(params) for pipe | ||||
|         contexts = [pipe.use_params(params) for name, pipe | ||||
|                     in self.pipeline if hasattr(pipe, 'use_params')] | ||||
|         # TODO: Having trouble with contextlib | ||||
|         # Workaround: these aren't actually context managers atm. | ||||
|  | @ -466,8 +502,7 @@ class Language(object): | |||
|                 yield (doc, context) | ||||
|             return | ||||
|         docs = (self.make_doc(text) for text in texts) | ||||
|         for proc in self.pipeline: | ||||
|             name = getattr(proc, 'name', None) | ||||
|         for name, proc in self.pipeline: | ||||
|             if name in disable: | ||||
|                 continue | ||||
|             if hasattr(proc, 'pipe'): | ||||
|  | @ -495,14 +530,14 @@ class Language(object): | |||
|             ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), | ||||
|             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) | ||||
|         )) | ||||
|         for proc in self.pipeline: | ||||
|         for name, proc in self.pipeline: | ||||
|             if not hasattr(proc, 'name'): | ||||
|                 continue | ||||
|             if proc.name in disable: | ||||
|             if name in disable: | ||||
|                 continue | ||||
|             if not hasattr(proc, 'to_disk'): | ||||
|                 continue | ||||
|             serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) | ||||
|             serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) | ||||
|         serializers['vocab'] = lambda p: self.vocab.to_disk(p) | ||||
|         util.to_disk(path, serializers, {p: False for p in disable}) | ||||
| 
 | ||||
|  | @ -526,14 +561,12 @@ class Language(object): | |||
|             ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)), | ||||
|             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) | ||||
|         )) | ||||
|         for proc in self.pipeline: | ||||
|             if not hasattr(proc, 'name'): | ||||
|                 continue | ||||
|             if proc.name in disable: | ||||
|         for name, proc in self.pipeline: | ||||
|             if name in disable: | ||||
|                 continue | ||||
|             if not hasattr(proc, 'to_disk'): | ||||
|                 continue | ||||
|             deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) | ||||
|             deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) | ||||
|         exclude = {p: False for p in disable} | ||||
|         if not (path / 'vocab').exists(): | ||||
|             exclude['vocab'] = True | ||||
|  | @ -552,8 +585,8 @@ class Language(object): | |||
|             ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)), | ||||
|             ('meta', lambda: ujson.dumps(self.meta)) | ||||
|         )) | ||||
|         for i, proc in enumerate(self.pipeline): | ||||
|             if getattr(proc, 'name', None) in disable: | ||||
|         for i, (name, proc) in enumerate(self.pipeline): | ||||
|             if name in disable: | ||||
|                 continue | ||||
|             if not hasattr(proc, 'to_bytes'): | ||||
|                 continue | ||||
|  | @ -572,8 +605,8 @@ class Language(object): | |||
|             ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)), | ||||
|             ('meta', lambda b: self.meta.update(ujson.loads(b))) | ||||
|         )) | ||||
|         for i, proc in enumerate(self.pipeline): | ||||
|             if getattr(proc, 'name', None) in disable: | ||||
|         for i, (name, proc) in enumerate(self.pipeline): | ||||
|             if name in disable: | ||||
|                 continue | ||||
|             if not hasattr(proc, 'from_bytes'): | ||||
|                 continue | ||||
|  |  | |||
|  | @ -10,20 +10,23 @@ class Lemmatizer(object): | |||
|     def load(cls, path, index=None, exc=None, rules=None): | ||||
|         return cls(index or {}, exc or {}, rules or {}) | ||||
| 
 | ||||
|     def __init__(self, index, exceptions, rules): | ||||
|         self.index = index | ||||
|         self.exc = exceptions | ||||
|         self.rules = rules | ||||
|     def __init__(self, index=None, exceptions=None, rules=None, lookup=None): | ||||
|         self.index = index if index is not None else {} | ||||
|         self.exc = exceptions if exceptions is not None else {} | ||||
|         self.rules = rules if rules is not None else {} | ||||
|         self.lookup_table = lookup if lookup is not None else {} | ||||
| 
 | ||||
|     def __call__(self, string, univ_pos, morphology=None): | ||||
|         if univ_pos == NOUN: | ||||
|         if univ_pos in (NOUN, 'NOUN', 'noun'): | ||||
|             univ_pos = 'noun' | ||||
|         elif univ_pos == VERB: | ||||
|         elif univ_pos in (VERB, 'VERB', 'verb'): | ||||
|             univ_pos = 'verb' | ||||
|         elif univ_pos == ADJ: | ||||
|         elif univ_pos in (ADJ, 'ADJ', 'adj'): | ||||
|             univ_pos = 'adj' | ||||
|         elif univ_pos == PUNCT: | ||||
|         elif univ_pos in (PUNCT, 'PUNCT', 'punct'): | ||||
|             univ_pos = 'punct' | ||||
|         else: | ||||
|             return set([string.lower()]) | ||||
|         # See Issue #435 for an example of where this logic is required. | ||||
|         if self.is_base_form(univ_pos, morphology): | ||||
|             return set([string.lower()]) | ||||
|  | @ -77,6 +80,11 @@ class Lemmatizer(object): | |||
|     def punct(self, string, morphology=None): | ||||
|         return self(string, 'punct', morphology) | ||||
| 
 | ||||
|     def lookup(self, string): | ||||
|         if string in self.lookup_table: | ||||
|             return self.lookup_table[string] | ||||
|         return string | ||||
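
The lookup table acts as a cheap fallback when no POS-driven rules apply. A quick sketch of the intended behaviour, with invented table entries:

from spacy.lemmatizer import Lemmatizer

lemmatizer = Lemmatizer(lookup={'dogs': 'dog', 'mice': 'mouse'})
assert lemmatizer.lookup('dogs') == 'dog'
assert lemmatizer.lookup('cats') == 'cats'   # unknown strings come back unchanged
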
| 
 | ||||
| 
 | ||||
| def lemmatize(string, index, exceptions, rules): | ||||
|     string = string.lower() | ||||
|  |  | |||
|  | @ -1,19 +0,0 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .lemmatizer import Lemmatizer | ||||
| 
 | ||||
| 
 | ||||
| class Lemmatizer(Lemmatizer): | ||||
|     @classmethod | ||||
|     def load(cls, path, lookup): | ||||
|         return cls(lookup or {}) | ||||
| 
 | ||||
|     def __init__(self, lookup): | ||||
|         self.lookup = lookup | ||||
| 
 | ||||
|     def __call__(self, string, univ_pos, morphology=None): | ||||
|         try: | ||||
|             return set([self.lookup[string]]) | ||||
|         except: | ||||
|             return set([string]) | ||||
|  | @ -35,6 +35,8 @@ cdef class Morphology: | |||
|     cdef RichTagC* rich_tags | ||||
|     cdef PreshMapArray _cache | ||||
| 
 | ||||
|     cdef int assign_untagged(self, TokenC* token) except -1 | ||||
| 
 | ||||
|     cdef int assign_tag(self, TokenC* token, tag) except -1 | ||||
| 
 | ||||
|     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 | ||||
|  |  | |||
|  | @ -42,7 +42,7 @@ cdef class Morphology: | |||
|         self.tag_names = tuple(sorted(tag_map.keys())) | ||||
|         self.reverse_index = {} | ||||
| 
 | ||||
|         self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC)) | ||||
|         self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) | ||||
|         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): | ||||
|             self.tag_map[tag_str] = dict(attrs) | ||||
|             attrs = _normalize_props(attrs) | ||||
|  | @ -52,6 +52,10 @@ cdef class Morphology: | |||
|             self.rich_tags[i].morph = 0 | ||||
|             self.rich_tags[i].pos = attrs[POS] | ||||
|             self.reverse_index[self.rich_tags[i].name] = i | ||||
|         # Add a 'null' tag, which we can reference when assigning morphology to | ||||
|         # untagged tokens. | ||||
|         self.rich_tags[self.n_tags].id = self.n_tags | ||||
| 
 | ||||
|         self._cache = PreshMapArray(self.n_tags) | ||||
|         self.exc = {} | ||||
|         if exc is not None: | ||||
|  | @ -62,6 +66,15 @@ cdef class Morphology: | |||
|         return (Morphology, (self.strings, self.tag_map, self.lemmatizer, | ||||
|                              self.exc), None, None) | ||||
| 
 | ||||
|     cdef int assign_untagged(self, TokenC* token) except -1: | ||||
|         """Set morphological attributes on a token without a POS tag. Uses | ||||
|         the lemmatizer's lookup() method, which looks up the string in the | ||||
|         table provided by the language data as lemma_lookup (if available).""" | ||||
|         if token.lemma == 0: | ||||
|             orth_str = self.strings[token.lex.orth] | ||||
|             lemma = self.lemmatizer.lookup(orth_str) | ||||
|             token.lemma = self.strings.add(lemma) | ||||
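
In practice this means a language that ships a lemma_lookup table gets lemmas assigned at tokenization time, before any tagger runs. A rough sketch using the German data (this mirrors the new test_lemma.py test added further down and assumes the German lookup table is present in this install):

from __future__ import unicode_literals
from spacy.lang.de import German

nlp = German()
doc = nlp.make_doc(u'schließt')
print(doc[0].lemma_)   # 'schließen', resolved from the lookup table without running a tagger
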
| 
 | ||||
|     cdef int assign_tag(self, TokenC* token, tag) except -1: | ||||
|         if isinstance(tag, basestring): | ||||
|             tag = self.strings.add(tag) | ||||
|  | @ -72,7 +85,7 @@ cdef class Morphology: | |||
|             token.tag = tag | ||||
| 
 | ||||
|     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: | ||||
|         if tag_id >= self.n_tags: | ||||
|         if tag_id > self.n_tags: | ||||
|             raise ValueError("Unknown tag ID: %s" % tag_id) | ||||
|         # TODO: It's pretty arbitrary to put this logic here. I guess the justification | ||||
|         # is that this is where the specific word and the tag interact. Still, | ||||
|  | @ -151,8 +164,6 @@ cdef class Morphology: | |||
|         cdef unicode py_string = self.strings[orth] | ||||
|         if self.lemmatizer is None: | ||||
|             return self.strings.add(py_string.lower()) | ||||
|         if univ_pos not in (NOUN, VERB, ADJ, PUNCT): | ||||
|             return self.strings.add(py_string.lower()) | ||||
|         cdef set lemma_strings | ||||
|         cdef unicode lemma_string | ||||
|         lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) | ||||
|  |  | |||
|  | @ -28,6 +28,7 @@ from thinc.neural._classes.difference import Siamese, CauchySimilarity | |||
| from .tokens.doc cimport Doc | ||||
| from .syntax.parser cimport Parser as LinearParser | ||||
| from .syntax.nn_parser cimport Parser as NeuralParser | ||||
| from .syntax import nonproj | ||||
| from .syntax.parser import get_templates as get_feature_templates | ||||
| from .syntax.beam_parser cimport BeamParser | ||||
| from .syntax.ner cimport BiluoPushDown | ||||
|  | @ -157,11 +158,13 @@ class BaseThincComponent(object): | |||
| 
 | ||||
|     def to_bytes(self, **exclude): | ||||
|         """Serialize the pipe to a bytestring.""" | ||||
|         serialize = OrderedDict(( | ||||
|             ('cfg', lambda: json_dumps(self.cfg)), | ||||
|             ('model', lambda: self.model.to_bytes()), | ||||
|             ('vocab', lambda: self.vocab.to_bytes()) | ||||
|         )) | ||||
|         serialize = OrderedDict() | ||||
|         serialize['cfg'] = lambda: json_dumps(self.cfg) | ||||
|         if self.model in (True, False, None): | ||||
|             serialize['model'] = lambda: self.model | ||||
|         else: | ||||
|             serialize['model'] = self.model.to_bytes | ||||
|         serialize['vocab'] = self.vocab.to_bytes | ||||
|         return util.to_bytes(serialize, exclude) | ||||
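
These guards let a pipe be serialized before its model has been allocated, which is what the new empty-model regression test below exercises. A condensed sketch of the same idea (label names are arbitrary):

from spacy.lang.en import English
from spacy.pipeline import TextCategorizer

nlp = English()
textcat = TextCategorizer(nlp.vocab, labels=['ENTITY', 'ACTION'])
data = textcat.to_bytes()   # succeeds even though textcat.model is still the `True` placeholder
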
| 
 | ||||
|     def from_bytes(self, bytes_data, **exclude): | ||||
|  | @ -182,11 +185,11 @@ class BaseThincComponent(object): | |||
| 
 | ||||
|     def to_disk(self, path, **exclude): | ||||
|         """Serialize the pipe to disk.""" | ||||
|         serialize = OrderedDict(( | ||||
|             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))), | ||||
|             ('vocab', lambda p: self.vocab.to_disk(p)), | ||||
|             ('model', lambda p: p.open('wb').write(self.model.to_bytes())), | ||||
|         )) | ||||
|         serialize = OrderedDict() | ||||
|         serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg)) | ||||
|         serialize['vocab'] = lambda p: self.vocab.to_disk(p) | ||||
|         if self.model not in (None, True, False): | ||||
|             serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes()) | ||||
|         util.to_disk(path, serialize, exclude) | ||||
| 
 | ||||
|     def from_disk(self, path, **exclude): | ||||
|  | @ -437,13 +440,16 @@ class NeuralTagger(BaseThincComponent): | |||
|             yield | ||||
| 
 | ||||
|     def to_bytes(self, **exclude): | ||||
|         serialize = OrderedDict(( | ||||
|             ('model', lambda: self.model.to_bytes()), | ||||
|             ('vocab', lambda: self.vocab.to_bytes()), | ||||
|             ('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map, | ||||
|                                              use_bin_type=True, | ||||
|                                              encoding='utf8')) | ||||
|         )) | ||||
|         serialize = OrderedDict() | ||||
|         if self.model in (None, True, False): | ||||
|             serialize['model'] = lambda: self.model | ||||
|         else: | ||||
|             serialize['model'] = self.model.to_bytes | ||||
|         serialize['vocab'] = self.vocab.to_bytes | ||||
| 
 | ||||
|         serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map, | ||||
|                                                      use_bin_type=True, | ||||
|                                                      encoding='utf8') | ||||
|         return util.to_bytes(serialize, exclude) | ||||
| 
 | ||||
|     def from_bytes(self, bytes_data, **exclude): | ||||
|  | @ -778,11 +784,19 @@ cdef class DependencyParser(LinearParser): | |||
|         if isinstance(label, basestring): | ||||
|             label = self.vocab.strings[label] | ||||
| 
 | ||||
|     @property | ||||
|     def postprocesses(self): | ||||
|         return [nonproj.deprojectivize] | ||||
| 
 | ||||
| 
 | ||||
| cdef class NeuralDependencyParser(NeuralParser): | ||||
|     name = 'parser' | ||||
|     TransitionSystem = ArcEager | ||||
| 
 | ||||
|     @property | ||||
|     def postprocesses(self): | ||||
|         return [nonproj.deprojectivize] | ||||
| 
 | ||||
|     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): | ||||
|         for target in []: | ||||
|             labeller = NeuralLabeller(self.vocab, target=target) | ||||
|  | @ -823,6 +837,11 @@ cdef class BeamDependencyParser(BeamParser): | |||
|         if isinstance(label, basestring): | ||||
|             label = self.vocab.strings[label] | ||||
| 
 | ||||
|     @property | ||||
|     def postprocesses(self): | ||||
|         return [nonproj.deprojectivize] | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser', | ||||
|            'BeamEntityRecognizer', 'TokenVectorEncoder'] | ||||
|  |  | |||
|  | @ -241,8 +241,8 @@ cdef class Parser: | |||
|     def Model(cls, nr_class, **cfg): | ||||
|         depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1)) | ||||
|         token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128)) | ||||
|         hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128)) | ||||
|         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 1)) | ||||
|         hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200)) | ||||
|         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2)) | ||||
|         embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000)) | ||||
|         hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0)) | ||||
|         hist_width = util.env_opt('history_width', cfg.get('hist_width', 0)) | ||||
|  | @ -779,6 +779,14 @@ cdef class Parser: | |||
|             for i in range(doc.length): | ||||
|                 doc.c[i] = state.c._sent[i] | ||||
|             self.moves.finalize_doc(doc) | ||||
|             for hook in self.postprocesses: | ||||
|                 for doc in docs: | ||||
|                     hook(doc) | ||||
| 
 | ||||
|     @property | ||||
|     def postprocesses(self): | ||||
|         # Available for subclasses, e.g. to deprojectivize | ||||
|         return [] | ||||
| 
 | ||||
|     def add_label(self, label): | ||||
|         resized = False | ||||
|  | @ -792,16 +800,25 @@ cdef class Parser: | |||
|         if self.model not in (True, False, None) and resized: | ||||
|             # Weights are stored in (nr_out, nr_in) format, so we're basically | ||||
|             # just adding rows here. | ||||
|             smaller = self.model[-1]._layers[-1] | ||||
|             larger = Affine(self.moves.n_moves, smaller.nI) | ||||
|             copy_array(larger.W[:smaller.nO], smaller.W) | ||||
|             copy_array(larger.b[:smaller.nO], smaller.b) | ||||
|             self.model[-1]._layers[-1] = larger | ||||
|             if self.model[-1].is_noop: | ||||
|                 smaller = self.model[1] | ||||
|                 dims = dict(self.model[1]._dims) | ||||
|                 dims['nO'] = self.moves.n_moves | ||||
|                 larger = self.model[1].__class__(**dims) | ||||
|                 copy_array(larger.W[:, :smaller.nO], smaller.W) | ||||
|                 copy_array(larger.b[:smaller.nO], smaller.b) | ||||
|                 self.model = (self.model[0], larger, self.model[2]) | ||||
|             else: | ||||
|                 smaller = self.model[-1]._layers[-1] | ||||
|                 larger = Affine(self.moves.n_moves, smaller.nI) | ||||
|                 copy_array(larger.W[:smaller.nO], smaller.W) | ||||
|                 copy_array(larger.b[:smaller.nO], smaller.b) | ||||
|                 self.model[-1]._layers[-1] = larger | ||||
| 
 | ||||
|     def begin_training(self, gold_tuples, pipeline=None, **cfg): | ||||
|         if 'model' in cfg: | ||||
|             self.model = cfg['model'] | ||||
|         gold_tuples = nonproj.preprocess_training_data(gold_tuples) | ||||
|         gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100) | ||||
|         actions = self.moves.get_actions(gold_parses=gold_tuples) | ||||
|         for action, labels in actions.items(): | ||||
|             for label in labels: | ||||
|  |  | |||
|  | @ -58,8 +58,9 @@ def en_vocab(): | |||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def en_parser(): | ||||
|     return util.get_lang_class('en').Defaults.create_parser() | ||||
| def en_parser(en_vocab): | ||||
|     nlp = util.get_lang_class('en')(en_vocab) | ||||
|     return nlp.create_pipe('parser') | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
|  |  | |||
							
								
								
									
37 spacy/tests/doc/test_creation.py (Normal file)
|  | @ -0,0 +1,37 @@ | |||
| '''Test Doc sets up tokens correctly.''' | ||||
| from __future__ import unicode_literals | ||||
| import pytest | ||||
| 
 | ||||
| from ...vocab import Vocab | ||||
| from ...tokens.doc import Doc | ||||
| from ...lemmatizer import Lemmatizer | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def lemmatizer(): | ||||
|     return Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'}) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def vocab(lemmatizer): | ||||
|     return Vocab(lemmatizer=lemmatizer) | ||||
| 
 | ||||
| 
 | ||||
| def test_empty_doc(vocab): | ||||
|     doc = Doc(vocab) | ||||
|     assert len(doc) == 0 | ||||
| 
 | ||||
| 
 | ||||
| def test_single_word(vocab): | ||||
|     doc = Doc(vocab, words=['a']) | ||||
|     assert doc.text == 'a ' | ||||
|     doc = Doc(vocab, words=['a'], spaces=[False]) | ||||
|     assert doc.text == 'a' | ||||
| 
 | ||||
| 
 | ||||
| def test_lookup_lemmatization(vocab): | ||||
|     doc = Doc(vocab, words=['dogs', 'dogses']) | ||||
|     assert doc[0].text == 'dogs' | ||||
|     assert doc[0].lemma_ == 'dog' | ||||
|     assert doc[1].text == 'dogses' | ||||
|     assert doc[1].lemma_ == 'dogses' | ||||
							
								
								
									
13 spacy/tests/lang/de/test_lemma.py (Normal file)
|  | @ -0,0 +1,13 @@ | |||
| # coding: utf-8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('string,lemma', [('Abgehängten', 'Abgehängte'), | ||||
|                                           ('engagierte', 'engagieren'), | ||||
|                                           ('schließt', 'schließen'), | ||||
|                                           ('vorgebenden', 'vorgebend')]) | ||||
| def test_lemmatizer_lookup_assigns(de_tokenizer, string, lemma): | ||||
|     tokens = de_tokenizer(string) | ||||
|     assert tokens[0].lemma_ == lemma | ||||
|  | @ -57,6 +57,5 @@ def test_en_lemmatizer_punct(en_lemmatizer): | |||
| def test_en_lemmatizer_lemma_assignment(EN): | ||||
|     text = "Bananas in pyjamas are geese." | ||||
|     doc = EN.make_doc(text) | ||||
|     assert all(t.lemma_ == '' for t in doc) | ||||
|     EN.tagger(doc) | ||||
|     assert all(t.lemma_ != '' for t in doc) | ||||
|  |  | |||
|  | @ -22,14 +22,14 @@ def vocab(): | |||
| @pytest.fixture | ||||
| def parser(vocab): | ||||
|     parser = NeuralDependencyParser(vocab) | ||||
|     parser.cfg['token_vector_width'] = 4 | ||||
|     parser.cfg['hidden_width'] = 6 | ||||
|     parser.cfg['token_vector_width'] = 8 | ||||
|     parser.cfg['hidden_width'] = 30 | ||||
|     parser.cfg['hist_size'] = 0 | ||||
|     parser.add_label('left') | ||||
|     parser.begin_training([], **parser.cfg) | ||||
|     sgd = Adam(NumpyOps(), 0.001) | ||||
| 
 | ||||
|     for i in range(30): | ||||
|     for i in range(10): | ||||
|         losses = {} | ||||
|         doc = Doc(vocab, words=['a', 'b', 'c', 'd']) | ||||
|         gold = GoldParse(doc, heads=[1, 1, 3, 3], | ||||
|  | @ -37,6 +37,8 @@ def parser(vocab): | |||
|         parser.update([doc], [gold], sgd=sgd, losses=losses) | ||||
|     return parser | ||||
| 
 | ||||
| def test_init_parser(parser): | ||||
|     pass | ||||
| 
 | ||||
| def test_add_label(parser): | ||||
|     doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) | ||||
|  |  | |||
|  | @ -1,10 +1,11 @@ | |||
| import spacy | ||||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| @pytest.mark.models | ||||
| def test_beam_parse(): | ||||
|     nlp = spacy.load('en_core_web_sm') | ||||
|     doc = nlp(u'Australia is a country', disable=['ner']) | ||||
|     ents = nlp.entity(doc, beam_width=2) | ||||
|     print(ents) | ||||
| 
 | ||||
| @pytest.mark.models('en') | ||||
| def test_beam_parse(EN): | ||||
|     doc = EN(u'Australia is a country', disable=['ner']) | ||||
|     ents = EN.entity(doc, beam_width=2) | ||||
|     print(ents) | ||||
|  |  | |||
|  | @ -35,7 +35,7 @@ def parser(vocab): | |||
| def test_no_sentences(parser): | ||||
|     doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) | ||||
|     doc = parser(doc) | ||||
|     assert len(list(doc.sents)) == 2 | ||||
|     assert len(list(doc.sents)) >= 1 | ||||
| 
 | ||||
| 
 | ||||
| def test_sents_1(parser): | ||||
|  | @ -64,7 +64,7 @@ def test_sents_1_3(parser): | |||
|     doc[1].sent_start = True | ||||
|     doc[3].sent_start = True | ||||
|     doc = parser(doc) | ||||
|     assert len(list(doc.sents)) == 4 | ||||
|     assert len(list(doc.sents)) >= 3 | ||||
|     doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) | ||||
|     doc[1].sent_start = True | ||||
|     doc[2].sent_start = False | ||||
|  |  | |||
							
								
								
									
0 spacy/tests/pipeline/__init__.py (Normal file)
84 spacy/tests/pipeline/test_pipe_methods.py (Normal file)
|  | @ -0,0 +1,84 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| from ...language import Language | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def nlp(): | ||||
|     return Language() | ||||
| 
 | ||||
| 
 | ||||
| def new_pipe(doc): | ||||
|     return doc | ||||
| 
 | ||||
| 
 | ||||
| def test_add_pipe_no_name(nlp): | ||||
|     nlp.add_pipe(new_pipe) | ||||
|     assert 'new_pipe' in nlp.pipe_names | ||||
| 
 | ||||
| 
 | ||||
| def test_add_pipe_duplicate_name(nlp): | ||||
|     nlp.add_pipe(new_pipe, name='duplicate_name') | ||||
|     with pytest.raises(ValueError): | ||||
|         nlp.add_pipe(new_pipe, name='duplicate_name') | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('name', ['parser']) | ||||
| def test_add_pipe_first(nlp, name): | ||||
|     nlp.add_pipe(new_pipe, name=name, first=True) | ||||
|     assert nlp.pipeline[0][0] == name | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')]) | ||||
| def test_add_pipe_last(nlp, name1, name2): | ||||
|     nlp.add_pipe(lambda doc: doc, name=name2) | ||||
|     nlp.add_pipe(new_pipe, name=name1, last=True) | ||||
|     assert nlp.pipeline[0][0] != name1 | ||||
|     assert nlp.pipeline[-1][0] == name1 | ||||
| 
 | ||||
| 
 | ||||
| def test_cant_add_pipe_first_and_last(nlp): | ||||
|     with pytest.raises(ValueError): | ||||
|         nlp.add_pipe(new_pipe, first=True, last=True) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('name', ['my_component']) | ||||
| def test_get_pipe(nlp, name): | ||||
|     with pytest.raises(KeyError): | ||||
|         nlp.get_pipe(name) | ||||
|     nlp.add_pipe(new_pipe, name=name) | ||||
|     assert nlp.get_pipe(name) == new_pipe | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)]) | ||||
| def test_replace_pipe(nlp, name, replacement): | ||||
|     with pytest.raises(ValueError): | ||||
|         nlp.replace_pipe(name, new_pipe) | ||||
|     nlp.add_pipe(new_pipe, name=name) | ||||
|     nlp.replace_pipe(name, replacement) | ||||
|     assert nlp.get_pipe(name) != new_pipe | ||||
|     assert nlp.get_pipe(name) == replacement | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')]) | ||||
| def test_rename_pipe(nlp, old_name, new_name): | ||||
|     with pytest.raises(ValueError): | ||||
|         nlp.rename_pipe(old_name, new_name) | ||||
|     nlp.add_pipe(new_pipe, name=old_name) | ||||
|     nlp.rename_pipe(old_name, new_name) | ||||
|     assert nlp.pipeline[0][0] == new_name | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('name', ['my_component']) | ||||
| def test_remove_pipe(nlp, name): | ||||
|     with pytest.raises(ValueError): | ||||
|         nlp.remove_pipe(name) | ||||
|     nlp.add_pipe(new_pipe, name=name) | ||||
|     assert len(nlp.pipeline) == 1 | ||||
|     removed_name, removed_component = nlp.remove_pipe(name) | ||||
|     assert not len(nlp.pipeline) | ||||
|     assert removed_name == name | ||||
|     assert removed_component == new_pipe | ||||
|  | @ -7,6 +7,7 @@ from ..util import get_doc | |||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.xfail | ||||
| def test_issue589(): | ||||
|     vocab = Vocab() | ||||
|     vocab.strings.set_frozen(True) | ||||
|  |  | |||
							
								
								
									
9 spacy/tests/serialize/test_serialize_empty_model.py (Normal file)
|  | @ -0,0 +1,9 @@ | |||
| import spacy | ||||
| import spacy.lang.en | ||||
| from spacy.pipeline import TextCategorizer | ||||
| 
 | ||||
| def test_bytes_serialize_issue_1105(): | ||||
|     nlp = spacy.lang.en.English() | ||||
|     tokenizer = nlp.tokenizer | ||||
|     textcat = TextCategorizer(tokenizer.vocab, labels=['ENTITY', 'ACTION', 'MODIFIER']) | ||||
|     textcat_bytes = textcat.to_bytes() | ||||
							
								
								
									
53 spacy/tests/test_underscore.py (Normal file)
|  | @ -0,0 +1,53 @@ | |||
| from mock import Mock | ||||
| from ..tokens.underscore import Underscore | ||||
| 
 | ||||
| 
 | ||||
| def test_create_doc_underscore(): | ||||
|     doc = Mock() | ||||
|     doc.doc = doc | ||||
|     uscore = Underscore(Underscore.doc_extensions, doc) | ||||
|     assert uscore._doc is doc | ||||
|     assert uscore._start is None | ||||
|     assert uscore._end is None | ||||
| 
 | ||||
| 
 | ||||
| def test_doc_underscore_getattr_setattr(): | ||||
|     doc = Mock() | ||||
|     doc.doc = doc | ||||
|     doc.user_data = {} | ||||
|     Underscore.doc_extensions['hello'] = (False, None, None, None) | ||||
|     doc._ = Underscore(Underscore.doc_extensions, doc) | ||||
|     assert doc._.hello == False | ||||
|     doc._.hello = True | ||||
|     assert doc._.hello == True | ||||
| 
 | ||||
| 
 | ||||
| def test_create_span_underscore(): | ||||
|     span = Mock(doc=Mock(), start=0, end=2) | ||||
|     uscore = Underscore(Underscore.span_extensions, span, | ||||
|                         start=span.start, end=span.end) | ||||
|     assert uscore._doc is span.doc | ||||
|     assert uscore._start is span.start | ||||
|     assert uscore._end is span.end | ||||
| 
 | ||||
| 
 | ||||
| def test_span_underscore_getter_setter(): | ||||
|     span = Mock(doc=Mock(), start=0, end=2) | ||||
|     Underscore.span_extensions['hello'] = (None, None, | ||||
|                                            lambda s: (s.start, 'hi'), | ||||
|                                            lambda s, value: setattr(s, 'start', | ||||
|                                                                     value)) | ||||
|     span._ = Underscore(Underscore.span_extensions, span, | ||||
|                         start=span.start, end=span.end) | ||||
| 
 | ||||
|     assert span._.hello == (0, 'hi') | ||||
|     span._.hello = 1 | ||||
|     assert span._.hello == (1, 'hi') | ||||
| 
 | ||||
| 
 | ||||
| def test_token_underscore_method(): | ||||
|     token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: 'cheese') | ||||
|     Underscore.token_extensions['hello'] = (None, token.say_cheese, | ||||
|                                             None, None) | ||||
|     token._ = Underscore(Underscore.token_extensions, token, start=token.idx) | ||||
|     assert token._.hello() == 'cheese' | ||||
|  | @ -30,7 +30,7 @@ from ..util import normalize_slice | |||
| from ..compat import is_config | ||||
| from .. import about | ||||
| from .. import util | ||||
| 
 | ||||
| from .underscore import Underscore | ||||
| 
 | ||||
| DEF PADDING = 5 | ||||
| 
 | ||||
|  | @ -64,6 +64,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: | |||
|     else: | ||||
|         return Lexeme.get_struct_attr(token.lex, feat_name) | ||||
| 
 | ||||
| 
 | ||||
| def _get_chunker(lang): | ||||
|     try: | ||||
|         cls = util.get_lang_class(lang) | ||||
|  | @ -73,6 +74,7 @@ def _get_chunker(lang): | |||
|         return None | ||||
|     return cls.Defaults.syntax_iterators.get(u'noun_chunks') | ||||
| 
 | ||||
| 
 | ||||
| cdef class Doc: | ||||
|     """A sequence of Token objects. Access sentences and named entities, export | ||||
|     annotations to numpy arrays, losslessly serialize to compressed binary strings. | ||||
|  | @ -87,6 +89,21 @@ cdef class Doc: | |||
|         >>> from spacy.tokens import Doc | ||||
|         >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False]) | ||||
|     """ | ||||
|     @classmethod | ||||
|     def set_extension(cls, name, default=None, method=None, | ||||
|                       getter=None, setter=None): | ||||
|         nr_defined = sum(t is not None for t in (default, getter, setter, method)) | ||||
|         assert nr_defined == 1 | ||||
|         Underscore.doc_extensions[name] = (default, method, getter, setter)  | ||||
| 
 | ||||
|     @classmethod | ||||
|     def get_extension(cls, name): | ||||
|         return Underscore.doc_extensions.get(name) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def has_extension(cls, name): | ||||
|         return name in Underscore.doc_extensions | ||||
| 
 | ||||
|     def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): | ||||
|         """Create a Doc object. | ||||
| 
 | ||||
|  | @ -159,6 +176,10 @@ cdef class Doc: | |||
|             self.is_tagged = True | ||||
|             self.is_parsed = True | ||||
| 
 | ||||
|     @property | ||||
|     def _(self): | ||||
|         return Underscore(Underscore.doc_extensions, self) | ||||
| 
 | ||||
|     def __getitem__(self, object i): | ||||
|         """Get a `Token` or `Span` object. | ||||
| 
 | ||||
|  | @ -512,6 +533,8 @@ cdef class Doc: | |||
|         assert t.lex.orth != 0 | ||||
|         t.spacy = has_space | ||||
|         self.length += 1 | ||||
|         # Set morphological attributes, e.g. by lemma, if possible | ||||
|         self.vocab.morphology.assign_untagged(t) | ||||
|         self._py_tokens.append(None) | ||||
|         return t.idx + t.lex.length + t.spacy | ||||
| 
 | ||||
|  |  | |||
|  | @ -17,10 +17,24 @@ from ..attrs cimport IS_PUNCT, IS_SPACE | |||
| from ..lexeme cimport Lexeme | ||||
| from ..compat import is_config | ||||
| from .. import about | ||||
| from .underscore import Underscore | ||||
| 
 | ||||
| 
 | ||||
| cdef class Span: | ||||
|     """A slice from a Doc object.""" | ||||
|     @classmethod | ||||
|     def set_extension(cls, name, default=None, method=None, | ||||
|                       getter=None, setter=None): | ||||
|         Underscore.span_extensions[name] = (default, method, getter, setter) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def get_extension(cls, name): | ||||
|         return Underscore.span_extensions.get(name) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def has_extension(cls, name): | ||||
|         return name in Underscore.span_extensions | ||||
| 
 | ||||
|     def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None, | ||||
|                   vector_norm=None): | ||||
|         """Create a `Span` object from the slice `doc[start : end]`. | ||||
|  | @ -111,10 +125,14 @@ cdef class Span: | |||
|         for i in range(self.start, self.end): | ||||
|             yield self.doc[i] | ||||
| 
 | ||||
|     @property | ||||
|     def _(self): | ||||
|         return Underscore(Underscore.span_extensions, self, | ||||
|                           start=self.start_char, end=self.end_char) | ||||
|     def as_doc(self): | ||||
|         '''Create a Doc object view of the Span's data. | ||||
| 
 | ||||
|         This is mostly useful for C-typed interfaces.  | ||||
|         This is mostly useful for C-typed interfaces. | ||||
|         ''' | ||||
|         cdef Doc doc = Doc(self.doc.vocab) | ||||
|         doc.length = self.end-self.start | ||||
|  |  | |||
|  | @ -20,10 +20,24 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST | |||
| from ..attrs cimport LEMMA, POS, TAG, DEP | ||||
| from ..compat import is_config | ||||
| from .. import about | ||||
| from .underscore import Underscore | ||||
| 
 | ||||
| 
 | ||||
| cdef class Token: | ||||
|     """An individual token – i.e. a word, punctuation symbol, whitespace, etc.""" | ||||
|     @classmethod | ||||
|     def set_extension(cls, name, default=None, method=None, | ||||
|                       getter=None, setter=None): | ||||
|         Underscore.token_extensions[name] = (default, method, getter, setter) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def get_extension(cls, name): | ||||
|         return Underscore.token_extensions.get(name) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def has_extension(cls, name): | ||||
|         return name in Underscore.token_extensions | ||||
| 
 | ||||
|     def __cinit__(self, Vocab vocab, Doc doc, int offset): | ||||
|         """Construct a `Token` object. | ||||
| 
 | ||||
|  | @ -87,6 +101,11 @@ cdef class Token: | |||
|         else: | ||||
|             raise ValueError(op) | ||||
| 
 | ||||
|     @property | ||||
|     def _(self): | ||||
|         return Underscore(Underscore.token_extensions, self, | ||||
|                           start=self.idx, end=None) | ||||
| 
 | ||||
|     cpdef bint check_flag(self, attr_id_t flag_id) except -1: | ||||
|         """Check the value of a boolean flag. | ||||
| 
 | ||||
|  | @ -266,7 +285,7 @@ cdef class Token: | |||
|         def __get__(self): | ||||
|             if 'vector_norm' in self.doc.user_token_hooks: | ||||
|                 return self.doc.user_token_hooks['vector_norm'](self) | ||||
|             vector = self.vector  | ||||
|             vector = self.vector | ||||
|             return numpy.sqrt((vector ** 2).sum()) | ||||
| 
 | ||||
|     property n_lefts: | ||||
|  |  | |||
							
								
								
									
50  spacy/tokens/underscore.py  Normal file
|  | @ -0,0 +1,50 @@ | |||
| import functools | ||||
| 
 | ||||
| class Underscore(object): | ||||
|     doc_extensions = {} | ||||
|     span_extensions = {} | ||||
|     token_extensions = {} | ||||
| 
 | ||||
|     def __init__(self, extensions, obj, start=None, end=None): | ||||
|         object.__setattr__(self, '_extensions', extensions) | ||||
|         object.__setattr__(self, '_obj', obj) | ||||
|         # Assumption is that for doc values, _start and _end will both be None | ||||
|         # Span will set non-None values for _start and _end | ||||
|         # Token will have _start be non-None, _end be None | ||||
|         # This lets us key everything into the doc.user_data dictionary, | ||||
|         # (see _get_key), and lets us use a single Underscore class. | ||||
|         object.__setattr__(self, '_doc', obj.doc) | ||||
|         object.__setattr__(self, '_start', start) | ||||
|         object.__setattr__(self, '_end', end) | ||||
| 
 | ||||
|     def __getattr__(self, name): | ||||
|         if name not in self._extensions: | ||||
|             raise AttributeError(name) | ||||
|         default, method, getter, setter = self._extensions[name] | ||||
|         if getter is not None: | ||||
|             return getter(self._obj) | ||||
|         elif method is not None: | ||||
|             return functools.partial(method, self._obj) | ||||
|         else: | ||||
|             return self._doc.user_data.get(self._get_key(name), default) | ||||
| 
 | ||||
|     def __setattr__(self, name, value): | ||||
|         if name not in self._extensions: | ||||
|             raise AttributeError(name) | ||||
|         default, method, getter, setter = self._extensions[name] | ||||
|         if setter is not None: | ||||
|             return setter(self._obj, value) | ||||
|         else: | ||||
|             self._doc.user_data[self._get_key(name)] = value | ||||
| 
 | ||||
|     def set(self, name, value): | ||||
|         return self.__setattr__(name, value) | ||||
| 
 | ||||
|     def get(self, name): | ||||
|         return self.__getattr__(name) | ||||
| 
 | ||||
|     def has(self, name): | ||||
|         return name in self._extensions | ||||
| 
 | ||||
|     def _get_key(self, name): | ||||
|         return ('._.', name, self._start, self._end) | ||||
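As a quick illustration of the keying scheme above, here is a minimal sketch (assuming spaCy v2.0 with the Underscore class as defined in this file; the attribute name 'hello' is just an example) of how a value written via doc._ lands in Doc.user_data under a ('._.', name, start, end) tuple:

    # Minimal sketch: extension values live on the parent Doc's user_data,
    # keyed by ('._.', name, start, end). The attribute name is illustrative.
    from spacy.lang.en import English
    from spacy.tokens import Doc

    Doc.set_extension('hello', default=False)

    nlp = English()
    doc = nlp(u"A short sentence.")
    doc._.hello = True
    # for a Doc, both start and end are None in the key
    assert doc.user_data[('._.', 'hello', None, None)] is True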
|  | @ -135,7 +135,18 @@ def load_model_from_path(model_path, meta=False, **overrides): | |||
|     if not meta: | ||||
|         meta = get_model_meta(model_path) | ||||
|     cls = get_lang_class(meta['lang']) | ||||
|     nlp = cls(pipeline=meta.get('pipeline', True), meta=meta, **overrides) | ||||
|     nlp = cls(meta=meta, **overrides) | ||||
|     pipeline = meta.get('pipeline', []) | ||||
|     disable = overrides.get('disable', []) | ||||
|     if pipeline is True: | ||||
|         pipeline = nlp.Defaults.pipe_names | ||||
|     elif pipeline in (False, None): | ||||
|         pipeline = [] | ||||
|     for name in pipeline: | ||||
|         if name not in disable: | ||||
|             config = meta.get('pipeline_args', {}).get(name, {}) | ||||
|             component = nlp.create_pipe(name, config=config) | ||||
|             nlp.add_pipe(component, name=name) | ||||
|     return nlp.from_disk(model_path) | ||||
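For illustration, a rough sketch of what the loader above does with a model's meta.json; the component names and the empty 'ner' config are assumptions, not values from a real model package:

    # Rough sketch of the pipeline construction performed by the loader above
    # (component names and the per-component config are illustrative only).
    from spacy.lang.en import English

    meta = {'lang': 'en',
            'pipeline': ['tagger', 'parser', 'ner'],
            'pipeline_args': {'ner': {}}}  # optional per-component config

    nlp = English(meta=meta)
    for name in meta['pipeline']:
        config = meta.get('pipeline_args', {}).get(name, {})
        component = nlp.create_pipe(name, config=config)
        nlp.add_pipe(component, name=name)
    # a real loader would now call nlp.from_disk(model_path)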
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -149,6 +149,10 @@ mixin code(label, language, prompt, height, icon, wrap) | |||
| 
 | ||||
| //- Code blocks to display old/new versions | ||||
| 
 | ||||
| mixin code-wrapper() | ||||
|     span.u-inline-block.u-padding-top.u-width-full | ||||
|         block | ||||
| 
 | ||||
| mixin code-old() | ||||
|     +code(false, false, false, false, "reject").o-block-small | ||||
|         block | ||||
|  |  | |||
|  | @ -113,6 +113,22 @@ p | |||
|         +cell flag | ||||
|         +cell Show help message and available arguments. | ||||
| 
 | ||||
| +h(3, "validate") Validate | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Find all models installed in the current environment (both packages and | ||||
|     |  shortcut links) and check whether they are compatible with the currently | ||||
|     |  installed version of spaCy. Should be run after upgrading spaCy via | ||||
|     |  #[code pip install -U spacy] to ensure that all installed models | ||||
|     |  can be used with the new version. The command is also useful to detect | ||||
|     |  out-of-sync model links resulting from links created in different virtual | ||||
|     |  environments. Prints a list of models, the installed versions, the latest | ||||
|     |  compatible version (if out of date) and the commands for updating. | ||||
| 
 | ||||
| +code(false, "bash", "$"). | ||||
|     spacy validate | ||||
| 
 | ||||
| +h(3, "convert") Convert | ||||
| 
 | ||||
| p | ||||
|  |  | |||
|  | @ -43,6 +43,20 @@ p | |||
|         +cell #[code Language] | ||||
|         +cell A #[code Language] object with the loaded model. | ||||
| 
 | ||||
| p | ||||
|     |  Essentially, #[code spacy.load()] is a convenience wrapper that reads | ||||
|     |  the language ID and pipeline components from a model's #[code meta.json], | ||||
|     |  initialises the #[code Language] class, loads in the model data and | ||||
|     |  returns it. | ||||
| 
 | ||||
| +code("Abstract example"). | ||||
|     cls = util.get_lang_class(lang)         #  get language for ID, e.g. 'en' | ||||
|     nlp = cls()                             #  initialise the language | ||||
|     for name in pipeline: | ||||
|         component = nlp.create_pipe(name)   #  create each pipeline component | ||||
|         nlp.add_pipe(component)             #  add component to pipeline | ||||
|     nlp.from_disk(model_data_path)          #  load in model data | ||||
| 
 | ||||
| +infobox("Deprecation note", "⚠️") | ||||
|     .o-block | ||||
|         |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy | ||||
|  | @ -141,37 +155,3 @@ p | |||
|         +cell returns | ||||
|         +cell unicode | ||||
|         +cell The explanation, or #[code None] if not found in the glossary. | ||||
| 
 | ||||
| +h(3, "spacy.set_factory") spacy.set_factory | ||||
|     +tag function | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Set a factory that returns a custom | ||||
|     |  #[+a("/usage/processing-pipelines") processing pipeline] | ||||
|     |  component. Factories are useful for creating stateful components, especially ones which depend on shared data. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     def my_factory(vocab): | ||||
|         def my_component(doc): | ||||
|             return doc | ||||
|         return my_component | ||||
| 
 | ||||
|     spacy.set_factory('my_factory', my_factory) | ||||
|     nlp = Language(pipeline=['my_factory']) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code factory_id] | ||||
|         +cell unicode | ||||
|         +cell | ||||
|             |  Unique name of factory. If added to a new pipeline, spaCy will | ||||
|             |  look up the factory for this ID and use it to create the | ||||
|             |  component. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code factory] | ||||
|         +cell callable | ||||
|         +cell | ||||
|             |  Callable that takes a #[code Vocab] object and returns a pipeline | ||||
|             |  component. | ||||
|  |  | |||
|  | @ -138,6 +138,109 @@ p Get the number of tokens in the document. | |||
|         +cell int | ||||
|         +cell The number of tokens in the document. | ||||
| 
 | ||||
| +h(2, "set_extension") Doc.set_extension | ||||
|     +tag classmethod | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Define a custom attribute on the #[code Doc] which becomes available via | ||||
|     |  #[code Doc._]. For details, see the documentation on | ||||
|     |  #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.tokens import Doc | ||||
|     city_getter = lambda doc: doc.text in ('New York', 'Paris', 'Berlin') | ||||
|     Doc.set_extension('has_city', getter=city_getter) | ||||
|     doc = nlp(u'I like New York') | ||||
|     assert doc._.has_city | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell | ||||
|             |  Name of the attribute to set by the extension. For example, | ||||
|             |  #[code 'my_attr'] will be available as #[code doc._.my_attr]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code default] | ||||
|         +cell - | ||||
|         +cell | ||||
|             |  Optional default value of the attribute if no getter or method | ||||
|             |  is defined. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code method] | ||||
|         +cell callable | ||||
|         +cell | ||||
|             |  Set a custom method on the object, for example | ||||
|             |  #[code doc._.compare(other_doc)]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code getter] | ||||
|         +cell callable | ||||
|         +cell | ||||
|             |  Getter function that takes the object and returns an attribute | ||||
|             |  value. Is called when the user accesses the #[code ._] attribute. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code setter] | ||||
|         +cell callable | ||||
|         +cell | ||||
|             |  Setter function that takes the #[code Doc] and a value, and | ||||
|             |  modifies the object. Is called when the user writes to the | ||||
|             |  #[code Doc._] attribute. | ||||
| 
 | ||||
| +h(2, "get_extension") Doc.get_extension | ||||
|     +tag classmethod | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Look up a previously registered extension by name. Returns a 4-tuple | ||||
|     |  #[code.u-break (default, method, getter, setter)] if the extension is | ||||
|     |  registered. Raises a #[code KeyError] otherwise. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.tokens import Doc | ||||
|     Doc.set_extension('is_city', default=False) | ||||
|     extension = Doc.get_extension('is_city') | ||||
|     assert extension == (False, None, None, None) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell Name of the extension. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell tuple | ||||
|         +cell | ||||
|             |  A #[code.u-break (default, method, getter, setter)] tuple of the | ||||
|             |  extension. | ||||
| 
 | ||||
| +h(2, "has_extension") Doc.has_extension | ||||
|     +tag classmethod | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p Check whether an extension has been registered on the #[code Doc] class. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.tokens import Doc | ||||
|     Doc.set_extension('is_city', default=False) | ||||
|     assert Doc.has_extension('is_city') | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell Name of the extension to check. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell bool | ||||
|         +cell Whether the extension has been registered. | ||||
| 
 | ||||
| +h(2, "char_span") Doc.char_span | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
|  |  | |||
|  | @ -4,7 +4,14 @@ include ../_includes/_mixins | |||
| 
 | ||||
| p | ||||
|     |  Usually you'll load this once per process as #[code nlp] and pass the | ||||
|     |  instance around your application. | ||||
|     |  instance around your application. The #[code Language] class is created | ||||
|     |  when you call #[+api("spacy#load") #[code spacy.load()]] and contains | ||||
|     |  the shared vocabulary and #[+a("/usage/adding-languages") language data], | ||||
|     |  optional model data loaded from a #[+a("/models") model package] or | ||||
|     |  a path, and a #[+a("/usage/processing-pipelines") processing pipeline] | ||||
|     |  containing components like the tagger or parser that are called on a | ||||
|     |  document in order. You can also add your own processing pipeline | ||||
|     |  components that take a #[code Doc] object, modify it and return it. | ||||
| 
 | ||||
| +h(2, "init") Language.__init__ | ||||
|     +tag method | ||||
|  | @ -12,9 +19,9 @@ p | |||
| p Initialise a #[code Language] object. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.vocab import Vocab | ||||
|     from spacy.language import Language | ||||
|     nlp = Language(pipeline=['token_vectors', 'tags', | ||||
|                              'dependencies']) | ||||
|     nlp = Language(Vocab()) | ||||
| 
 | ||||
|     from spacy.lang.en import English | ||||
|     nlp = English() | ||||
|  | @ -34,14 +41,6 @@ p Initialise a #[code Language] object. | |||
|             |  A function that takes text and returns a #[code Doc] object. | ||||
|             |  Usually a #[code Tokenizer]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code pipeline] | ||||
|         +cell list | ||||
|         +cell | ||||
|             |  A list of annotation processes or IDs of annotation, processes, | ||||
|             |  e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked | ||||
|             |  up in #[code Language.Defaults.factories]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code meta] | ||||
|         +cell dict | ||||
|  | @ -235,7 +234,6 @@ p | |||
|     |  Can be called before training to pre-process gold data. By default, it | ||||
|     |  handles nonprojectivity and adds missing tags to the tag map. | ||||
| 
 | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code docs_golds] | ||||
|  | @ -247,6 +245,177 @@ p | |||
|         +cell tuple | ||||
|         +cell Tuples of #[code Doc] and #[code GoldParse] objects. | ||||
| 
 | ||||
| +h(2, "create_pipe") Language.create_pipe | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p Create a pipeline component from a factory. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     parser = nlp.create_pipe('parser') | ||||
|     nlp.add_pipe(parser) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell | ||||
|             |  Factory name to look up in | ||||
|             |  #[+api("language#class-attributes") #[code Language.factories]]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code config] | ||||
|         +cell dict | ||||
|         +cell Configuration parameters to initialise component. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell callable | ||||
|         +cell The pipeline component. | ||||
| 
 | ||||
| +h(2, "add_pipe") Language.add_pipe | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Add a component to the processing pipeline. Valid components are | ||||
|     |  callables that take a #[code Doc] object, modify it and return it. Only | ||||
|     |  one of #[code before], #[code after], #[code first] or #[code last] can | ||||
|     |  be set. Default behaviour is #[code last=True]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     def component(doc): | ||||
|         # modify Doc and return it | ||||
|         return doc | ||||
| 
 | ||||
|     nlp.add_pipe(component, before='ner') | ||||
|     nlp.add_pipe(component, name='custom_name', last=True) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code component] | ||||
|         +cell callable | ||||
|         +cell The pipeline component. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell | ||||
|             |  Name of pipeline component. Overwrites existing | ||||
|             |  #[code component.name] attribute if available. If no #[code name] | ||||
|             |  is set and the component exposes no name attribute, | ||||
|             |  #[code component.__name__] is used. An error is raised if the | ||||
|             |  name already exists in the pipeline. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code before] | ||||
|         +cell unicode | ||||
|         +cell Component name to insert component directly before. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code after] | ||||
|         +cell unicode | ||||
|         +cell Component name to insert component directly after. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code first] | ||||
|         +cell bool | ||||
|         +cell Insert component first / not first in the pipeline. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code last] | ||||
|         +cell bool | ||||
|         +cell Insert component last / not last in the pipeline. | ||||
| 
 | ||||
| +h(2, "get_pipe") Language.get_pipe | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p Get a pipeline component for a given component name. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     parser = nlp.get_pipe('parser') | ||||
|     custom_component = nlp.get_pipe('custom_component') | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell Name of the pipeline component to get. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell callable | ||||
|         +cell The pipeline component. | ||||
| 
 | ||||
| +h(2, "replace_pipe") Language.replace_pipe | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p Replace a component in the pipeline. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     nlp.replace_pipe('parser', my_custom_parser) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell Name of the component to replace. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code component] | ||||
|         +cell callable | ||||
|         +cell The pipeline component to insert. | ||||
| 
 | ||||
| 
 | ||||
| +h(2, "rename_pipe") Language.rename_pipe | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Rename a component in the pipeline. Useful to create custom names for | ||||
|     |  pre-defined and pre-loaded components. To change the default name of | ||||
|     |  a component added to the pipeline, you can also use the #[code name] | ||||
|     |  argument on #[+api("language#add_pipe") #[code add_pipe]]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     nlp.rename_pipe('parser', 'spacy_parser') | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code old_name] | ||||
|         +cell unicode | ||||
|         +cell Name of the component to rename. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code new_name] | ||||
|         +cell unicode | ||||
|         +cell New name of the component. | ||||
| 
 | ||||
| +h(2, "remove_pipe") Language.remove_pipe | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Remove a component from the pipeline. Returns the removed component name | ||||
|     |  and component function. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     name, component = nlp.remove_pipe('parser') | ||||
|     assert name == 'parser' | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell Name of the component to remove. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell tuple | ||||
|         +cell A #[code (name, component)] tuple of the removed component. | ||||
| 
 | ||||
| +h(2, "to_disk") Language.to_disk | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
|  | @ -399,7 +568,15 @@ p Load state from a binary string. | |||
|     +row | ||||
|         +cell #[code pipeline] | ||||
|         +cell list | ||||
|         +cell Sequence of annotation functions. | ||||
|         +cell | ||||
|             |  List of #[code (name, component)] tuples describing the current | ||||
|             |  processing pipeline, in order. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code pipe_names] | ||||
|             +tag-new(2) | ||||
|         +cell list | ||||
|         +cell List of pipeline component names, in order. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code meta] | ||||
|  | @ -424,3 +601,12 @@ p Load state from a binary string. | |||
|         +cell | ||||
|             |  Two-letter language ID, i.e. | ||||
|             |  #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code factories] | ||||
|             +tag-new(2) | ||||
|         +cell dict | ||||
|         +cell | ||||
|             |  Factories that create pre-defined pipeline components, e.g. the | ||||
|             |  tagger, parser or entity recognizer, keyed by their component | ||||
|             |  name. | ||||
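As a rough illustration of the attributes listed above (the component names shown assume a default English model and may differ):

    # pipe_names lists component names; pipeline holds (name, func) tuples
    import spacy

    nlp = spacy.load('en')
    print(nlp.pipe_names)       # e.g. ['tagger', 'parser', 'ner']
    name, func = nlp.pipeline[0]
    print(name)                 # e.g. 'tagger'
    print(callable(func))       # True: the component callable itself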
|  |  | |||
|  | @ -116,6 +116,109 @@ p Get the number of tokens in the span. | |||
|         +cell int | ||||
|         +cell The number of tokens in the span. | ||||
| 
 | ||||
| +h(2, "set_extension") Span.set_extension | ||||
|     +tag classmethod | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Define a custom attribute on the #[code Span] which becomes available via | ||||
|     |  #[code Span._]. For details, see the documentation on | ||||
|     |  #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.tokens import Span | ||||
|     city_getter = lambda span: span.text in ('New York', 'Paris', 'Berlin') | ||||
|     Span.set_extension('has_city', getter=city_getter) | ||||
|     doc = nlp(u'I like New York in Autumn') | ||||
|     assert doc[1:4]._.has_city | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell | ||||
|             |  Name of the attribute to set by the extension. For example, | ||||
|             |  #[code 'my_attr'] will be available as #[code span._.my_attr]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code default] | ||||
|         +cell - | ||||
|         +cell | ||||
|             |  Optional default value of the attribute if no getter or method | ||||
|             |  is defined. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code method] | ||||
|         +cell callable | ||||
|         +cell | ||||
|             |  Set a custom method on the object, for example | ||||
|             |  #[code span._.compare(other_span)]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code getter] | ||||
|         +cell callable | ||||
|         +cell | ||||
|             |  Getter function that takes the object and returns an attribute | ||||
|             |  value. Is called when the user accesses the #[code ._] attribute. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code setter] | ||||
|         +cell callable | ||||
|         +cell | ||||
|             |  Setter function that takes the #[code Span] and a value, and | ||||
|             |  modifies the object. Is called when the user writes to the | ||||
|             |  #[code Span._] attribute. | ||||
| 
 | ||||
| +h(2, "get_extension") Span.get_extension | ||||
|     +tag classmethod | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Look up a previously registered extension by name. Returns a 4-tuple | ||||
|     |  #[code.u-break (default, method, getter, setter)] if the extension is | ||||
|     |  registered. Raises a #[code KeyError] otherwise. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.tokens import Span | ||||
|     Span.set_extension('is_city', default=False) | ||||
|     extension = Span.get_extension('is_city') | ||||
|     assert extension == (False, None, None, None) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell Name of the extension. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell tuple | ||||
|         +cell | ||||
|             |  A #[code.u-break (default, method, getter, setter)] tuple of the | ||||
|             |  extension. | ||||
| 
 | ||||
| +h(2, "has_extension") Span.has_extension | ||||
|     +tag classmethod | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p Check whether an extension has been registered on the #[code Span] class. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.tokens import Span | ||||
|     Span.set_extension('is_city', default=False) | ||||
|     assert Span.has_extension('is_city') | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell Name of the extension to check. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell bool | ||||
|         +cell Whether the extension has been registered. | ||||
| 
 | ||||
| +h(2, "similarity") Span.similarity | ||||
|     +tag method | ||||
|     +tag-model("vectors") | ||||
|  |  | |||
|  | @ -51,6 +51,109 @@ p The number of unicode characters in the token, i.e. #[code token.text]. | |||
|         +cell int | ||||
|         +cell The number of unicode characters in the token. | ||||
| 
 | ||||
| +h(2, "set_extension") Token.set_extension | ||||
|     +tag classmethod | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Define a custom attribute on the #[code Token] which becomes available | ||||
|     |  via #[code Token._]. For details, see the documentation on | ||||
|     |  #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.tokens import Token | ||||
|     fruit_getter = lambda token: token.text in ('apple', 'pear', 'banana') | ||||
|     Token.set_extension('is_fruit', getter=fruit_getter) | ||||
|     doc = nlp(u'I have an apple') | ||||
|     assert doc[3]._.is_fruit | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell | ||||
|             |  Name of the attribute to set by the extension. For example, | ||||
|             |  #[code 'my_attr'] will be available as #[code token._.my_attr]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code default] | ||||
|         +cell - | ||||
|         +cell | ||||
|             |  Optional default value of the attribute if no getter or method | ||||
|             |  is defined. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code method] | ||||
|         +cell callable | ||||
|         +cell | ||||
|             |  Set a custom method on the object, for example | ||||
|             |  #[code token._.compare(other_token)]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code getter] | ||||
|         +cell callable | ||||
|         +cell | ||||
|             |  Getter function that takes the object and returns an attribute | ||||
|             |  value. Is called when the user accesses the #[code ._] attribute. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code setter] | ||||
|         +cell callable | ||||
|         +cell | ||||
|             |  Setter function that takes the #[code Token] and a value, and | ||||
|             |  modifies the object. Is called when the user writes to the | ||||
|             |  #[code Token._] attribute. | ||||
| 
 | ||||
| +h(2, "get_extension") Token.get_extension | ||||
|     +tag classmethod | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Look up a previously registered extension by name. Returns a 4-tuple | ||||
|     |  #[code.u-break (default, method, getter, setter)] if the extension is | ||||
|     |  registered. Raises a #[code KeyError] otherwise. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.tokens import Token | ||||
|     Token.set_extension('is_fruit', default=False) | ||||
|     extension = Token.get_extension('is_fruit') | ||||
|     assert extension == (False, None, None, None) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell Name of the extension. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell tuple | ||||
|         +cell | ||||
|             |  A #[code.u-break (default, method, getter, setter)] tuple of the | ||||
|             |  extension. | ||||
| 
 | ||||
| +h(2, "has_extension") Token.has_extension | ||||
|     +tag classmethod | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p Check whether an extension has been registered on the #[code Token] class. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.tokens import Token | ||||
|     Token.set_extension('is_fruit', default=False) | ||||
|     assert Token.has_extension('is_fruit') | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell Name of the extension to check. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell bool | ||||
|         +cell Whether the extension has been registered. | ||||
| 
 | ||||
| +h(2, "check_flag") Token.check_flag | ||||
|     +tag method | ||||
| 
 | ||||
|  |  | |||
|  | @ -143,6 +143,9 @@ | |||
| 
 | ||||
| //- Layout | ||||
| 
 | ||||
| .u-width-full | ||||
|     width: 100% | ||||
| 
 | ||||
| .u-float-left | ||||
|     float: left | ||||
|     margin-right: 1rem | ||||
|  | @ -166,6 +169,9 @@ | |||
| .u-padding-medium | ||||
|     padding: 1.8rem | ||||
| 
 | ||||
| .u-padding-top | ||||
|     padding-top: 2rem | ||||
| 
 | ||||
| .u-inline-block | ||||
|     display: inline-block | ||||
| 
 | ||||
|  |  | |||
|  | @ -25,7 +25,7 @@ | |||
|         display: inline-block | ||||
|         font-size: 0.6em | ||||
|         font-weight: bold | ||||
|         padding-right: 1.25rem | ||||
|         padding-right: 1em | ||||
|         margin-left: -3.75rem | ||||
|         text-align: right | ||||
|         width: 2.5rem | ||||
|  |  | |||
|  | @ -456,24 +456,11 @@ p | |||
|     } | ||||
| 
 | ||||
| p | ||||
|     |  To add a lookup lemmatizer to your language, import the #[code LOOKUP] | ||||
|     |  table and #[code Lemmatizer], and create a new classmethod: | ||||
|     |  To provide a lookup lemmatizer for your language, import the lookup table | ||||
|     |  and add it to the #[code Language] class as #[code lemma_lookup]: | ||||
| 
 | ||||
| 
 | ||||
| +code("__init__py (excerpt)"). | ||||
|     # other imports here, plus lookup table and lookup lemmatizer | ||||
|     from .lemmatizer import LOOKUP | ||||
|     from ...lemmatizerlookup import Lemmatizer | ||||
| 
 | ||||
|     class Xxxxx(Language): | ||||
|         lang = 'xx' | ||||
| 
 | ||||
|         class Defaults(Language.Defaults): | ||||
|             # other language defaults here | ||||
| 
 | ||||
|             @classmethod | ||||
|             def create_lemmatizer(cls, nlp=None): | ||||
|                 return Lemmatizer(LOOKUP) | ||||
| +code. | ||||
|     lemma_lookup = dict(LOOKUP) | ||||
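A slightly fuller sketch of where this assignment typically lives, based on how other spaCy languages structure their __init__.py (the 'Xxxxx' class name and 'xx' code are placeholders, and the relative import assumes the language's own lemmatizer module):

    # __init__.py (sketch): attach the lookup table to the language defaults.
    # 'Xxxxx' and 'xx' are placeholders for your language.
    from .lemmatizer import LOOKUP
    from ...language import Language

    class Xxxxx(Language):
        lang = 'xx'

        class Defaults(Language.Defaults):
            # other language defaults here
            lemma_lookup = dict(LOOKUP)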
| 
 | ||||
| +h(3, "tag-map") Tag map | ||||
| 
 | ||||
|  |  | |||
|  | @ -103,10 +103,10 @@ | |||
|         "title": "Language Processing Pipelines", | ||||
|         "next": "vectors-similarity", | ||||
|         "menu": { | ||||
|             "How pipelines work": "pipelines", | ||||
|             "Examples": "examples", | ||||
|             "How Pipelines Work": "pipelines", | ||||
|             "Custom Components": "custom-components", | ||||
|             "Developing Extensions": "extensions", | ||||
|             "Multi-threading": "multithreading", | ||||
|             "User Hooks": "user-hooks", | ||||
|             "Serialization": "serialization" | ||||
|         } | ||||
|     }, | ||||
|  | @ -195,6 +195,7 @@ | |||
|         "teaser": "Full code examples you can modify and run.", | ||||
|         "next": "resources", | ||||
|         "menu": { | ||||
|             "Pipeline": "pipeline", | ||||
|             "Matching": "matching", | ||||
|             "Training": "training", | ||||
|             "Deep Learning": "deep-learning" | ||||
|  |  | |||
							
								
								
									
369  website/usage/_processing-pipelines/_custom-components.jade  Normal file
|  | @ -0,0 +1,369 @@ | |||
| //- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS | ||||
| 
 | ||||
| p | ||||
|     |  A component receives a #[code Doc] object and can modify it – for example, | ||||
|     |  by using the current weights to make a prediction and set some annotation | ||||
|     |  on the document. By adding a component to the pipeline, you'll get access | ||||
|     |  to the #[code Doc] at any point #[strong during processing] – instead of | ||||
|     |  only being able to modify it afterwards. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     def my_component(doc): | ||||
|         # do something to the doc here | ||||
|         return doc | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code doc] | ||||
|         +cell #[code Doc] | ||||
|         +cell The #[code Doc] object processed by the previous component. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell #[code Doc] | ||||
|         +cell The #[code Doc] object processed by this pipeline component. | ||||
| 
 | ||||
| p | ||||
|     |  Custom components can be added to the pipeline using the | ||||
|     |  #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you | ||||
|     |  can either specify a component to add it #[strong before or after], tell | ||||
|     |  spaCy to add it #[strong first or last] in the pipeline, or define a | ||||
|     |  #[strong custom name]. If no name is set and no #[code name] attribute | ||||
|     |  is present on your component, the function name is used. | ||||
| 
 | ||||
| +code("Adding pipeline components"). | ||||
|     def my_component(doc): | ||||
|         print("After tokenization, this doc has %s tokens." % len(doc)) | ||||
|         if len(doc) < 10: | ||||
|             print("This is a pretty short document.") | ||||
|         return doc | ||||
| 
 | ||||
|     nlp = spacy.load('en') | ||||
|     nlp.add_pipe(my_component, name='print_info', first=True) | ||||
|     print(nlp.pipe_names) # ['print_info', 'tagger', 'parser', 'ner'] | ||||
|     doc = nlp(u"This is a sentence.") | ||||
| 
 | ||||
| p | ||||
|     |  Of course, you can also wrap your component as a class to allow | ||||
|     |  initialising it with custom settings and hold state within the component. | ||||
|     |  This is useful for #[strong stateful components], especially ones which | ||||
|     |  #[strong depend on shared data]. | ||||
| 
 | ||||
| +code. | ||||
|     class MyComponent(object): | ||||
|         name = 'print_info' | ||||
| 
 | ||||
|         def __init__(self, vocab, short_limit=10): | ||||
|             self.vocab = vocab | ||||
|             self.short_limit = short_limit | ||||
| 
 | ||||
|         def __call__(self, doc): | ||||
|             if len(doc) < self.short_limit: | ||||
|                 print("This is a pretty short document.") | ||||
|             return doc | ||||
| 
 | ||||
|     my_component = MyComponent(nlp.vocab, short_limit=25) | ||||
|     nlp.add_pipe(my_component, first=True) | ||||
| 
 | ||||
| +h(3, "custom-components-attributes") | ||||
|     |  Extension attributes on #[code Doc], #[code Span] and #[code Token] | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  As of v2.0, spaCy allows you to set any custom attributes and methods | ||||
|     |  on the #[code Doc], #[code Span] and #[code Token], which become | ||||
|     |  available as #[code Doc._], #[code Span._] and #[code Token._] – for | ||||
|     |  example, #[code Token._.my_attr]. This lets you store additional | ||||
|     |  information relevant to your application, add new features and | ||||
|     |  functionality to spaCy, and implement your own models trained with other | ||||
|     |  machine learning libraries. It also lets you take advantage of spaCy's | ||||
|     |  data structures and the #[code Doc] object as the "single source of | ||||
|     |  truth". | ||||
| 
 | ||||
| +aside("Why ._?") | ||||
|     |  Writing to a #[code ._] attribute instead of to the #[code Doc] directly | ||||
|     |  keeps a clearer separation and makes it easier to ensure backwards | ||||
|     |  compatibility. For example, if you've implemented your own #[code .coref] | ||||
|     |  property and spaCy claims it one day, it'll break your code. Similarly, | ||||
|     |  just by looking at the code, you'll immediately know what's built-in and | ||||
|     |  what's custom – for example, #[code doc.sentiment] is spaCy, while | ||||
|     |  #[code doc._.sent_score] isn't. | ||||
| 
 | ||||
| p | ||||
|     |  There are three main types of extensions, which can be defined using the | ||||
|     |  #[+api("doc#set_extension") #[code Doc.set_extension]], | ||||
|     |  #[+api("span#set_extension") #[code Span.set_extension]] and | ||||
|     |  #[+api("token#set_extension") #[code Token.set_extension]] methods. | ||||
| 
 | ||||
| +list("numbers") | ||||
|     +item #[strong Attribute extensions]. | ||||
|         |  Set a default value for an attribute, which can be overwritten | ||||
|         |  manually at any time. Attribute extensions work like "normal" | ||||
|         |  variables and are the quickest way to store arbitrary information | ||||
|         |  on a #[code Doc], #[code Span] or #[code Token]. | ||||
| 
 | ||||
|         +code-wrapper | ||||
|             +code. | ||||
|                 Doc.set_extension('hello', default=True) | ||||
|                 assert doc._.hello | ||||
|                 doc._.hello = False | ||||
| 
 | ||||
|     +item #[strong Property extensions]. | ||||
|         |  Define a getter and an optional setter function. If no setter is | ||||
|         |  provided, the extension is immutable. Since the getter and setter | ||||
|         |  functions are only called when you #[em retrieve] the attribute, | ||||
|         |  you can also access values of previously added attribute extensions. | ||||
|         |  For example, a #[code Doc] getter can average over #[code Token] | ||||
|         |   attributes. For #[code Span] extensions, you'll almost always want | ||||
|         |  to use a property – otherwise, you'd have to write to | ||||
|         |  #[em every possible] #[code Span] in the #[code Doc] to set up the | ||||
|         |  values correctly. | ||||
| 
 | ||||
|         +code-wrapper | ||||
|             +code. | ||||
|                 Doc.set_extension('hello', getter=get_hello_value, setter=set_hello_value) | ||||
|                 assert doc._.hello | ||||
|                 doc._.hello = 'Hi!' | ||||
| 
 | ||||
|     +item #[strong Method extensions]. | ||||
|         |  Assign a function that becomes available as an object method. Method | ||||
|         |  extensions are always immutable. For more details and implementation | ||||
|         |  ideas, see | ||||
|         |  #[+a("/usage/examples#custom-components-attr-methods") these examples]. | ||||
| 
 | ||||
|         +code-wrapper | ||||
|             +code.o-no-block. | ||||
|                 Doc.set_extension('hello', method=lambda doc, name: 'Hi {}!'.format(name)) | ||||
|                 assert doc._.hello('Bob') == 'Hi Bob!' | ||||
| 
 | ||||
| p | ||||
|     |  Before you can access a custom extension, you need to register it using | ||||
|     |  the #[code set_extension] method on the object you want | ||||
|     |  to add it to, e.g. the #[code Doc]. Keep in mind that extensions are | ||||
|     |  always #[strong added globally] and not just on a particular instance. | ||||
|     |  If an attribute of the same name | ||||
|     |  already exists, or if you're trying to access an attribute that hasn't | ||||
|     |  been registered, spaCy will raise an #[code AttributeError]. | ||||
| 
 | ||||
| +code("Example"). | ||||
|     from spacy.tokens import Doc, Span, Token | ||||
| 
 | ||||
|     fruits = ['apple', 'pear', 'banana', 'orange', 'strawberry'] | ||||
|     is_fruit_getter = lambda token: token.text in fruits | ||||
|     has_fruit_getter = lambda obj: any([t.text in fruits for t in obj]) | ||||
| 
 | ||||
|     Token.set_extension('is_fruit', getter=is_fruit_getter) | ||||
|     Doc.set_extension('has_fruit', getter=has_fruit_getter) | ||||
|     Span.set_extension('has_fruit', getter=has_fruit_getter) | ||||
| 
 | ||||
| +aside-code("Usage example"). | ||||
|     doc = nlp(u"I have an apple and a melon") | ||||
|     assert doc[3]._.is_fruit      # get Token attributes | ||||
|     assert not doc[0]._.is_fruit | ||||
|     assert doc._.has_fruit        # get Doc attributes | ||||
|     assert doc[1:4]._.has_fruit   # get Span attributes | ||||
| 
 | ||||
| p | ||||
|     |  Once you've registered your custom attribute, you can also use the | ||||
|     |  built-in #[code set], #[code get] and #[code has] methods to modify and | ||||
|     |  retrieve the attributes. This is especially useful if you want to pass in | ||||
|     |  a string instead of calling #[code doc._.my_attr]. | ||||
| 
 | ||||
| +table(["Method", "Description", "Valid for", "Example"]) | ||||
|     +row | ||||
|         +cell #[code ._.set()] | ||||
|         +cell Set a value for an attribute. | ||||
|         +cell Attributes, mutable properties. | ||||
|         +cell #[code.u-break token._.set('my_attr', True)] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code ._.get()] | ||||
|         +cell Get the value of an attribute. | ||||
|         +cell Attributes, mutable properties, immutable properties, methods. | ||||
|         +cell #[code.u-break my_attr = span._.get('my_attr')] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code ._.has()] | ||||
|         +cell Check if an attribute exists. | ||||
|         +cell Attributes, mutable properties, immutable properties, methods. | ||||
|         +cell #[code.u-break doc._.has('my_attr')] | ||||
| 
 | ||||
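A short sketch tying these helpers together; the attribute name 'my_attr' is just an example:

    # Using the built-in ._ helpers instead of direct attribute access
    from spacy.lang.en import English
    from spacy.tokens import Doc

    Doc.set_extension('my_attr', default=False)

    nlp = English()
    doc = nlp(u"A short example.")
    doc._.set('my_attr', True)     # same as doc._.my_attr = True
    assert doc._.get('my_attr')    # same as doc._.my_attr
    assert doc._.has('my_attr')    # True, because the extension is registered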
| +infobox("How the ._ is implemented") | ||||
|     |  Extension definitions – the defaults, methods, getters and setters you | ||||
|     |  pass in to #[code set_extension] – are stored in class attributes on the | ||||
|     |  #[code Underscore] class. If you write to an extension attribute, e.g. | ||||
|     |  #[code doc._.hello = True], the data is stored within the | ||||
|     |  #[+api("doc#attributes") #[code Doc.user_data]] dictionary. To keep the | ||||
|     |  underscore data separate from your other dictionary entries, the string | ||||
|     |  #[code "._."] is placed before the name, in a tuple. | ||||
| 
 | ||||
| +h(4, "component-example1") Example: Custom sentence segmentation logic | ||||
| 
 | ||||
| p | ||||
|     |  Let's say you want to implement custom logic to improve spaCy's sentence | ||||
|     |  boundary detection. Currently, sentence segmentation is based on the | ||||
|     |  dependency parse, which doesn't always produce ideal results. The custom | ||||
|     |  logic should therefore be applied #[strong after] tokenization, but | ||||
|     |  #[strong before] the dependency parsing – this way, the parser can also | ||||
|     |  take advantage of the sentence boundaries. | ||||
| 
 | ||||
| +code. | ||||
|     def sbd_component(doc): | ||||
|         for i, token in enumerate(doc[:-2]): | ||||
|             # define sentence start if period + titlecase token | ||||
|             if token.text == '.' and doc[i+1].is_title: | ||||
|                 doc[i+1].sent_start = True | ||||
|         return doc | ||||
| 
 | ||||
|     nlp = spacy.load('en') | ||||
|     nlp.add_pipe(sbd_component, before='parser')  # insert before the parser | ||||
| 
 | ||||
| +h(4, "component-example2") | ||||
|     |  Example: Pipeline component for entity matching and tagging with | ||||
|     |  custom attributes | ||||
| 
 | ||||
| p | ||||
|     |  This example shows how to create a spaCy extension that takes a | ||||
|     |  terminology list (in this case, single- and multi-word company names), | ||||
|     |  matches the occurrences in a document, labels them as #[code ORG] entities, | ||||
|     |  merges the tokens and sets custom #[code is_tech_org] and | ||||
|     |  #[code has_tech_org] attributes. For efficient matching, the example uses | ||||
|     |  the #[+api("phrasematcher") #[code PhraseMatcher]] which accepts | ||||
|     |  #[code Doc] objects as match patterns and works well for large | ||||
|     |  terminology lists. It also ensures your patterns will always match, even | ||||
|     |  when you customise spaCy's tokenization rules. When you call #[code nlp] | ||||
|     |  on a text, the custom pipeline component is applied to the #[code Doc]. | ||||
| 
 | ||||
| +github("spacy", "examples/pipeline/custom_component_entities.py", false, 500) | ||||
| 
 | ||||
| p | ||||
|     |  Wrapping this functionality in a | ||||
|     |  pipeline component allows you to reuse the module with different | ||||
|     |  settings, and have all pre-processing taken care of when you call | ||||
|     |  #[code nlp] on your text and receive a #[code Doc] object. | ||||
| 
 | ||||
| +h(4, "component-example3") | ||||
|     |  Example: Pipeline component for GPE entities and country meta data via a | ||||
|     |  REST API | ||||
| 
 | ||||
| p | ||||
|     |  This example shows the implementation of a pipeline component | ||||
|     |  that fetches country meta data via the | ||||
|     |  #[+a("https://restcountries.eu") REST Countries API] sets entity | ||||
|     |  annotations for countries, merges entities into one token and | ||||
|     |  sets custom attributes on the #[code Doc], #[code Span] and | ||||
|     |  #[code Token] – for example, the capital, latitude/longitude coordinates | ||||
|     |  and even the country flag. | ||||
| 
 | ||||
| +github("spacy", "examples/pipeline/custom_component_countries_api.py", false, 500) | ||||
| 
 | ||||
| p | ||||
|     |  In this case, all data can be fetched on initialisation in one request. | ||||
|     |  However, if you're working with text that contains incomplete country | ||||
|     |  names, spelling mistakes or foreign-language versions, you could also | ||||
|     |  implement a #[code like_country]-style getter function that makes a | ||||
|     |  request to the search API endpoint and returns the best-matching | ||||
|     |  result. | ||||
| 
 | ||||
| +h(4, "custom-components-usage-ideas") Other usage ideas | ||||
| 
 | ||||
| +list | ||||
|     +item | ||||
|         |  #[strong Adding new features and hooking in models]. For example, | ||||
|         |  a sentiment analysis model, or your preferred solution for | ||||
|         |  lemmatization. spaCy's built-in tagger, | ||||
|         |  parser and entity recognizer respect annotations that were already | ||||
|         |  set on the #[code Doc] in a previous step of the pipeline. | ||||
|     +item | ||||
|         |  #[strong Integrating other libraries and APIs]. For example, your | ||||
|         |  pipeline component can write additional information and data | ||||
|         |  directly to the #[code Doc] or #[code Token] as custom attributes, | ||||
|         |  while making sure no information is lost in the process. This can | ||||
|         |  be output generated by other libraries and models, or an external | ||||
|         |  service with a REST API. | ||||
|     +item | ||||
|         |  #[strong Debugging and logging]. For example, a component which | ||||
|         |  stores and/or exports relevant information about the current state | ||||
|         |  of the processed document, which you can insert at any point of your | ||||
|         |  pipeline. | ||||
| 
 | ||||
| +infobox("Developing third-party extensions") | ||||
|     |  The new pipeline management and custom attributes finally make it easy | ||||
|     |  to develop your own spaCy extensions and plugins and share them with | ||||
|     |  others. Extensions can claim their own #[code ._] namespace and exist as | ||||
|     |  standalone packages. If you're developing a tool or library and want to | ||||
|     |  make it easy for others to use it with spaCy and add it to their | ||||
|     |  pipeline, all you have to do is expose a function that takes a | ||||
|     |  #[code Doc], modifies it and returns it. For more details and | ||||
|     |  #[strong best practices], see the section on | ||||
|     |  #[+a("#extensions") developing spaCy extensions]. | ||||
| 
 | ||||
| +h(3, "custom-components-user-hooks") User hooks | ||||
| 
 | ||||
| p | ||||
|     |  While it's generally recommended to use the #[code Doc._], #[code Span._] | ||||
|     |  and #[code Token._] proxies to add your own custom attributes, spaCy | ||||
|     |  offers a few exceptions to allow #[strong customising the built-in methods] | ||||
|     |  like #[+api("doc#similarity") #[code Doc.similarity]] or | ||||
|     |  #[+api("doc#vector") #[code Doc.vector]]. with your own hooks, which can | ||||
|     |  rely on statistical models you train yourself. For instance, you can | ||||
|     |  provide your own on-the-fly sentence segmentation algorithm or document | ||||
|     |  similarity method. | ||||
| 
 | ||||
| p | ||||
|     |  Hooks let you customize some of the behaviours of the #[code Doc], | ||||
|     |  #[code Span] or #[code Token] objects by adding a component to the | ||||
|     |  pipeline. For instance, to customize the | ||||
|     |  #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a | ||||
|     |  component that sets a custom function to | ||||
|     |  #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity] | ||||
|     |  method will check the #[code user_hooks] dict, and delegate to your | ||||
|     |  function if you've set one. Similar results can be achieved by setting | ||||
|     |  functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks]. | ||||
| 
 | ||||
| +aside("Implementation note") | ||||
|     |  The hooks live on the #[code Doc] object because the #[code Span] and | ||||
|     |  #[code Token] objects are created lazily, and don't own any data. They | ||||
|     |  just proxy to their parent #[code Doc]. This turns out to be convenient | ||||
|     |  here — we only have to worry about installing hooks in one place. | ||||
| 
 | ||||
| +table(["Name", "Customises"]) | ||||
|     +row | ||||
|         +cell #[code user_hooks] | ||||
|         +cell | ||||
|             +api("doc#vector") #[code Doc.vector] | ||||
|             +api("doc#has_vector") #[code Doc.has_vector] | ||||
|             +api("doc#vector_norm") #[code Doc.vector_norm] | ||||
|             +api("doc#sents") #[code Doc.sents] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code user_token_hooks] | ||||
|         +cell | ||||
|             +api("token#similarity") #[code Token.similarity] | ||||
|             +api("token#vector") #[code Token.vector] | ||||
|             +api("token#has_vector") #[code Token.has_vector] | ||||
|             +api("token#vector_norm") #[code Token.vector_norm] | ||||
|             +api("token#conjuncts") #[code Token.conjuncts] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code user_span_hooks] | ||||
|         +cell | ||||
|             +api("span#similarity") #[code Span.similarity] | ||||
|             +api("span#vector") #[code Span.vector] | ||||
|             +api("span#has_vector") #[code Span.has_vector] | ||||
|             +api("span#vector_norm") #[code Span.vector_norm] | ||||
|             +api("span#root") #[code Span.root] | ||||
| 
 | ||||
| +code("Add custom similarity hooks"). | ||||
|     class SimilarityModel(object): | ||||
|         def __init__(self, model): | ||||
|             self._model = model | ||||
| 
 | ||||
|         def __call__(self, doc): | ||||
|             doc.user_hooks['similarity'] = self.similarity | ||||
|             doc.user_span_hooks['similarity'] = self.similarity | ||||
|             doc.user_token_hooks['similarity'] = self.similarity | ||||
| 
 | ||||
|         def similarity(self, obj1, obj2): | ||||
|             y = self._model([obj1.vector, obj2.vector]) | ||||
|             return float(y[0]) | ||||
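A hypothetical usage sketch for the component above; 'dummy_model' stands in for a real trained similarity model:

    # Wiring the hook component into a pipeline (dummy_model is a stand-in).
    from spacy.lang.en import English

    dummy_model = lambda vectors: [0.5]

    nlp = English()
    nlp.add_pipe(SimilarityModel(dummy_model), name='similarity_hook', last=True)

    doc1 = nlp(u"A sentence.")
    doc2 = nlp(u"Another sentence.")
    print(doc1.similarity(doc2))   # delegates to the custom hook, prints 0.5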
|  | @ -1,126 +0,0 @@ | |||
| //- 💫 DOCS > USAGE > PROCESSING PIPELINES > EXAMPLES | ||||
| 
 | ||||
| p | ||||
|     |  To see real-world examples of pipeline factories and components in action, | ||||
|     |  you can have a look at the source of spaCy's built-in components, e.g. | ||||
|     |  the #[+api("tagger") #[code Tagger]], #[+api("parser") #[code Parser]] or | ||||
|     |  #[+api("entityrecognizer") #[code EntityRecongnizer]]. | ||||
| 
 | ||||
| +h(3, "example1") Example: Custom sentence segmentation logic | ||||
| 
 | ||||
| p | ||||
|     |  Let's say you want to implement custom logic to improve spaCy's sentence | ||||
|     |  boundary detection. Currently, sentence segmentation is based on the | ||||
|     |  dependency parse, which doesn't always produce ideal results. The custom | ||||
|     |  logic should therefore be applied #[strong after] tokenization, but | ||||
|     |  #[strong before] the dependency parsing – this way, the parser can also | ||||
|     |  take advantage of the sentence boundaries. | ||||
| 
 | ||||
| +code. | ||||
|     def sbd_component(doc): | ||||
|         for i, token in enumerate(doc[:-2]): | ||||
|             # define sentence start if period + titlecase token | ||||
|             if token.text == '.' and doc[i+1].is_title: | ||||
|                 doc[i+1].sent_start = True | ||||
|         return doc | ||||
| 
 | ||||
| p | ||||
|     |  In this case, we simply want to add the component to the existing | ||||
|     |  pipeline of the English model. We can do this by inserting it at index 0 | ||||
|     |  of #[code nlp.pipeline]: | ||||
| 
 | ||||
| +code. | ||||
|     nlp = spacy.load('en') | ||||
|     nlp.pipeline.insert(0, sbd_component) | ||||
| 
 | ||||
| p | ||||
|     |  When you call #[code nlp] on some text, spaCy will tokenize it to create | ||||
|     |  a #[code Doc] object, and first call #[code sbd_component] on it, followed | ||||
|     |  by the model's default pipeline. | ||||
| 
 | ||||
| +h(3, "example2") Example: Sentiment model | ||||
| 
 | ||||
| p | ||||
|     |  Let's say you have trained your own document sentiment model on English | ||||
|     |  text. After tokenization, you want spaCy to first execute the | ||||
|     |  #[strong default tensorizer], followed by a custom | ||||
|     |  #[strong sentiment component] that adds a #[code .sentiment] | ||||
|     |  property to the #[code Doc], containing your model's sentiment prediction. | ||||
| 
 | ||||
| p | ||||
|     |  Your component class will have a #[code from_disk()] method that spaCy | ||||
|     |  calls to load the model data. When called, the component will compute | ||||
|     |  the sentiment score, add it to the #[code Doc] and return the modified | ||||
|     |  document. Optionally, the component can include an #[code update()] method | ||||
|     |  to allow training the model. | ||||
| 
 | ||||
| +code. | ||||
|     import pickle | ||||
|     from pathlib import Path | ||||
| 
 | ||||
|     class SentimentComponent(object): | ||||
|         def __init__(self, vocab): | ||||
|             self.weights = None | ||||
| 
 | ||||
|         def __call__(self, doc): | ||||
|             doc.sentiment = sum(self.weights*doc.vector) # set sentiment property | ||||
|             return doc | ||||
| 
 | ||||
|         def from_disk(self, path): # path = model path + factory ID ('sentiment') | ||||
|             self.weights = pickle.load((Path(path) / 'weights.bin').open('rb')) # load weights | ||||
|             return self | ||||
| 
 | ||||
|         def update(self, doc, gold): # update weights – allows training! | ||||
|             prediction = sum(self.weights*doc.vector) | ||||
|             self.weights -= 0.001*doc.vector*(prediction-gold.sentiment) | ||||
| 
 | ||||
| p | ||||
|     |  The factory will initialise the component with the #[code Vocab] object. | ||||
|     |  To be able to add it to your model's pipeline as #[code 'sentiment'], | ||||
|     |  it also needs to be registered via | ||||
|     |  #[+api("spacy#set_factory") #[code set_factory()]]. | ||||
| 
 | ||||
| +code. | ||||
|     def sentiment_factory(vocab): | ||||
|         component = SentimentComponent(vocab) # initialise component | ||||
|         return component | ||||
| 
 | ||||
|     spacy.set_factory('sentiment', sentiment_factory) | ||||
| 
 | ||||
| p | ||||
|     |  The above code should be #[strong shipped with your model]. You can use | ||||
|     |  the #[+api("cli#package") #[code package]] command to create all required | ||||
|     |  files and directories. The model package will include an | ||||
|     |  #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]] | ||||
|     |  with a #[code load()] method that will initialise the language class with | ||||
|     |  the model's pipeline and call the #[code from_disk()] method to load | ||||
|     |  the model data. | ||||
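| 
 | ||||
| p | ||||
|     |  As a rough sketch (the generated template reads the language and | ||||
|     |  pipeline IDs from the #[code meta.json] instead of hard-coding them), | ||||
|     |  such a #[code load()] method might look like this: | ||||
| 
 | ||||
| +code. | ||||
|     from pathlib import Path | ||||
|     import spacy | ||||
| 
 | ||||
|     def load(**overrides): | ||||
|         # the 'sentiment' factory must already be registered via set_factory() | ||||
|         model_path = Path(__file__).parent  # package directory containing the data | ||||
|         cls = spacy.util.get_lang_class('en')  # language class specified in the meta | ||||
|         nlp = cls(pipeline=['tensorizer', 'sentiment'])  # pipeline IDs from the meta | ||||
|         return nlp.from_disk(model_path)  # load the binary data and return the nlp object | ||||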
| 
 | ||||
| p | ||||
|     |  In the model package's meta.json, specify the language class and pipeline | ||||
|     |  IDs: | ||||
| 
 | ||||
| +code("meta.json (excerpt)", "json"). | ||||
|     { | ||||
|         "name": "sentiment_model", | ||||
|         "lang": "en", | ||||
|         "version": "1.0.0", | ||||
|         "spacy_version": ">=2.0.0,<3.0.0", | ||||
|         "pipeline": ["tensorizer", "sentiment"] | ||||
|     } | ||||
| 
 | ||||
| p | ||||
|     |  When you load your new model, spaCy will call the model's #[code load()] | ||||
|     |  method. This will return a #[code Language] object with a pipeline | ||||
|     |  containing the default tensorizer, and the sentiment component returned | ||||
|     |  by your custom #[code "sentiment"] factory. | ||||
| 
 | ||||
| +code. | ||||
|     nlp = spacy.load('en_sentiment_model') | ||||
|     doc = nlp(u'I love pizza') | ||||
|     assert doc.sentiment | ||||
| 
 | ||||
| +infobox("Saving and loading models") | ||||
|     |  For more information and a detailed guide on how to package your model, | ||||
|     |  see the documentation on | ||||
|     |  #[+a("/usage/training#saving-loading") saving and loading models]. | ||||
							
								
								
									
110  website/usage/_processing-pipelines/_extensions.jade  Normal file
							|  | @ -0,0 +1,110 @@ | |||
| //- 💫 DOCS > USAGE > PROCESSING PIPELINES > DEVELOPING EXTENSIONS | ||||
| 
 | ||||
| p | ||||
|     |  We're very excited about all the new possibilities for community | ||||
|     |  extensions and plugins in spaCy v2.0, and we can't wait to see what | ||||
|     |  you build with it! To get you started, here are a few tips, tricks and | ||||
|     |  best practices: | ||||
| 
 | ||||
| +list | ||||
|     +item | ||||
|         |  Make sure to choose a #[strong descriptive and specific name] for | ||||
|         |  your pipeline component class, and set it as its #[code name] | ||||
|         |  attribute. Avoid names that are too common or likely to clash with | ||||
|         |  built-in or a user's other custom components. While it's fine to call | ||||
|         |  your package "spacy_my_extension", avoid component names including | ||||
|         |  "spacy", since this can easily lead to confusion. | ||||
| 
 | ||||
|         +code-wrapper | ||||
|             +code-new name = 'myapp_lemmatizer' | ||||
|             +code-old name = 'lemmatizer' | ||||
| 
 | ||||
|     +item | ||||
|         |  When writing to #[code Doc], #[code Token] or #[code Span] objects, | ||||
|         |  #[strong use getter functions] wherever possible, and avoid setting | ||||
|         |  values explicitly. Tokens and spans don't own any data themselves, | ||||
|         |  so you should provide a function that allows them to compute the | ||||
|         |  values instead of writing static properties to individual objects. | ||||
| 
 | ||||
|         +code-wrapper | ||||
|             +code-new. | ||||
|                 is_fruit = lambda token: token.text in ('apple', 'orange') | ||||
|                 Token.set_extension('is_fruit', getter=is_fruit) | ||||
|             +code-old. | ||||
|                 token._.set_extension('is_fruit', default=False) | ||||
|                 if token.text in ('apple', 'orange'): | ||||
|                     token._.set('is_fruit', True) | ||||
| 
 | ||||
|     +item | ||||
|         |  Always add your custom attributes to the #[strong global] #[code Doc], | ||||
|         |  #[code Token] or #[code Span] objects, not a particular instance of | ||||
|         |  them. Add the attributes #[strong as early as possible], e.g. in | ||||
|         |  your extension's #[code __init__] method or in the global scope of | ||||
|         |  your module. This means that in the case of namespace collisions, | ||||
|         |  the user will see an error immediately, not just when they run their | ||||
|         |  pipeline. | ||||
| 
 | ||||
|         +code-wrapper | ||||
|             +code-new. | ||||
|                 from spacy.tokens import Doc | ||||
|                 def __init__(attr='my_attr'): | ||||
|                     Doc.set_extension(attr, getter=self.get_doc_attr) | ||||
|             +code-old. | ||||
|                 def __call__(doc): | ||||
|                     doc.set_extension('my_attr', getter=self.get_doc_attr) | ||||
| 
 | ||||
|     +item | ||||
|         |  If your extension is setting properties on the #[code Doc], | ||||
|         |  #[code Token] or #[code Span], include an option to | ||||
|         |  #[strong let the user change those attribute names]. This makes | ||||
|         |  it easier to avoid namespace collisions and accommodate users with | ||||
|         |  different naming preferences. We recommend adding an #[code attrs] | ||||
|         |  argument to the #[code __init__] method of your class so you can | ||||
|         |  write the names to class attributes and reuse them across your | ||||
|         |  component (a short sketch of this pattern follows after this list). | ||||
| 
 | ||||
|         +code-wrapper | ||||
|             +code-new Doc.set_extension(self.doc_attr, default='some value') | ||||
|             +code-old Doc.set_extension('my_doc_attr', default='some value') | ||||
| 
 | ||||
|     +item | ||||
|         |  Ideally, extensions should be #[strong standalone packages] with | ||||
|         |  spaCy and, optionally, other packages specified as dependencies. They | ||||
|         |  can freely assign to their own #[code ._] namespace, but should stick | ||||
|         |  to that. If your extension's only job is to provide a better | ||||
|         |  #[code .similarity] implementation, and your docs state this | ||||
|         |  explicitly, there's no problem with writing to the | ||||
|         |  #[+a("#custom-components-user-hooks") #[code user_hooks]], and | ||||
|         |  overwriting spaCy's built-in method. However, a third-party | ||||
|         |  extension should #[strong never silently overwrite built-ins], or | ||||
|         |  attributes set by other extensions. | ||||
| 
 | ||||
|     +item | ||||
|         |  If you're looking to publish a model that depends on a custom | ||||
|         |  pipeline component, you can either #[strong require it] in the model | ||||
|         |  package's dependencies, or – if the component is specific and | ||||
|         |  lightweight – choose to #[strong ship it with your model package] | ||||
|         |  and add it to the #[code Language] instance returned by the | ||||
|         |  model's #[code load()] method. For examples of this, check out the | ||||
|         |  implementations of spaCy's | ||||
|         |  #[+api("util#load_model_from_init_py") #[code load_model_from_init_py()]] | ||||
|         |  and  #[+api("util#load_model_from_path") #[code load_model_from_path()]] | ||||
|         |  utility functions. | ||||
| 
 | ||||
|         +code-wrapper | ||||
|             +code-new. | ||||
|                 nlp.add_pipe(my_custom_component) | ||||
|                 return nlp.from_disk(model_path) | ||||
| 
 | ||||
|     +item | ||||
|         |  Once you're ready to share your extension with others, make sure to | ||||
|         |  #[strong add docs and installation instructions] (you can | ||||
|         |  always link to this page for more info). Make it easy for others to | ||||
|         |  install and use your extension, for example by uploading it to | ||||
|         |  #[+a("https://pypi.python.org") PyPi]. If you're sharing your code on | ||||
|         |  GitHub, don't forget to tag it | ||||
|         |  with #[+a("https://github.com/search?q=topic%3Aspacy") #[code spacy]] | ||||
|         |  and #[+a("https://github.com/search?q=topic%3Aspacy-pipeline") #[code spacy-pipeline]] | ||||
|         |  to help people find it. If you post it on Twitter, feel free to tag | ||||
|         |  #[+a("https://twitter.com/" + SOCIAL.twitter) @#{SOCIAL.twitter}] | ||||
|         |  so we can check it out. | ||||
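| 
 | ||||
| p | ||||
|     |  As a minimal sketch of the configurable attribute names described | ||||
|     |  above (the component and attribute names here are made up for | ||||
|     |  illustration), a component can store the user-supplied names and | ||||
|     |  reuse them consistently: | ||||
| 
 | ||||
| +code("Configurable attribute names (sketch)"). | ||||
|     from spacy.tokens import Doc, Token | ||||
| 
 | ||||
|     class MyComponent(object): | ||||
|         name = 'my_component'  # made-up name for illustration | ||||
| 
 | ||||
|         def __init__(self, attrs=('my_doc_attr', 'my_token_attr')): | ||||
|             self.doc_attr, self.token_attr = attrs  # let users override the names | ||||
|             Doc.set_extension(self.doc_attr, default=None) | ||||
|             Token.set_extension(self.token_attr, default=False) | ||||
| 
 | ||||
|         def __call__(self, doc): | ||||
|             doc._.set(self.doc_attr, 'some value')  # write via the stored name | ||||
|             return doc | ||||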
|  | @ -11,7 +11,7 @@ p | |||
| 
 | ||||
| p | ||||
|     |  When you load a model, spaCy first consults the model's | ||||
|     |  #[+a("/usage/saving-loading#models-generating") meta.json]. The | ||||
|     |  #[+a("/usage/saving-loading#models-generating") #[code meta.json]]. The | ||||
|     |  meta typically includes the model details, the ID of a language class, | ||||
|     |  and an optional list of pipeline components. spaCy then does the | ||||
|     |  following: | ||||
|  | @ -21,24 +21,26 @@ p | |||
|         "name": "example_model", | ||||
|         "lang": "en" | ||||
|         "description": "Example model for spaCy", | ||||
|         "pipeline": ["tensorizer", "tagger"] | ||||
|         "pipeline": ["tagger", "parser"] | ||||
|     } | ||||
| 
 | ||||
| +list("numbers") | ||||
|     +item | ||||
|         |  Look up #[strong pipeline IDs] in the available | ||||
|         |  #[strong pipeline factories]. | ||||
|     +item | ||||
|         |  Initialise the #[strong pipeline components] by calling their | ||||
|         |  factories with the #[code Vocab] as an argument. This gives each | ||||
|         |  factory and component access to the pipeline's shared data, like | ||||
|         |  strings, morphology and annotation scheme. | ||||
|     +item | ||||
|         |  Load the #[strong language class and data] for the given ID via | ||||
|         |  #[+api("util.get_lang_class") #[code get_lang_class]]. | ||||
|         |  #[+api("util.get_lang_class") #[code get_lang_class]] and initialise | ||||
|         |  it. The #[code Language] class contains the shared vocabulary, | ||||
|         |  tokenization rules and the language-specific annotation scheme. | ||||
|     +item | ||||
|         |  Pass the path to the #[strong model data] to the #[code Language] | ||||
|         |  class and return it. | ||||
|         |  Iterate over the #[strong pipeline names] and create each component | ||||
|         |  using #[+api("language#create_pipe") #[code create_pipe]], which | ||||
|         |  looks them up in #[code Language.factories]. | ||||
|     +item | ||||
|         |  Add each pipeline component to the pipeline in order, using | ||||
|         |  #[+api("language#add_pipe") #[code add_pipe]]. | ||||
|     +item | ||||
|         |  Make the #[strong model data] available to the #[code Language] class | ||||
|         |  by calling #[+api("language#from_disk") #[code from_disk]] with the | ||||
|         |  path to the model data directory. | ||||
| 
 | ||||
| p | ||||
|     |  So when you call this... | ||||
|  | @ -47,12 +49,12 @@ p | |||
|     nlp = spacy.load('en') | ||||
| 
 | ||||
| p | ||||
|     | ... the model tells spaCy to use the pipeline | ||||
|     | ... the model tells spaCy to use the language #[code "en"] and the pipeline | ||||
|     |  #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will | ||||
|     |  then look up each string in its internal factories registry and | ||||
|     |  initialise the individual components. It'll then load | ||||
|     |  #[code spacy.lang.en.English], pass it the path to the model's data | ||||
|     |  directory, and return it for you to use as the #[code nlp] object. | ||||
|     |  then initialise #[code spacy.lang.en.English], and create each pipeline | ||||
|     |  component and add it to the processing pipeline. It'll then load in the | ||||
|     |  model's data from its data directory and return the modified | ||||
|     |  #[code Language] class for you to use as the #[code nlp] object. | ||||
| 
 | ||||
| p | ||||
|     |  Fundamentally, a #[+a("/models") spaCy model] consists of three | ||||
|  | @ -73,9 +75,12 @@ p | |||
|     pipeline = ['tensorizer', 'tagger', 'parser', 'ner'] | ||||
|     data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0' | ||||
| 
 | ||||
|     cls = spacy.util.get_lang_class(lang)  # 1. get Language instance, e.g. English() | ||||
|     nlp = cls(pipeline=pipeline)           # 2. initialise it with the pipeline | ||||
|     nlp.from_disk(model_data_path)         # 3. load in the binary data | ||||
|     cls = spacy.util.get_lang_class(lang)   # 1. get the Language class, e.g. English | ||||
|     nlp = cls()                             # 2. initialise it | ||||
|     for name in pipeline: | ||||
|         component = nlp.create_pipe(name)   # 3. create the pipeline components | ||||
|         nlp.add_pipe(component)             # 4. add the component to the pipeline | ||||
|     nlp.from_disk(model_data_path)          # 5. load in the binary data | ||||
| 
 | ||||
| p | ||||
|     |  When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and | ||||
|  | @ -87,124 +92,23 @@ p | |||
|     |  document, which is then processed by the component next in the pipeline. | ||||
| 
 | ||||
| +code("The pipeline under the hood"). | ||||
|     doc = nlp.make_doc(u'This is a sentence') | ||||
|     for proc in nlp.pipeline: | ||||
|         doc = proc(doc) | ||||
| 
 | ||||
| +h(3, "creating") Creating pipeline components and factories | ||||
|     doc = nlp.make_doc(u'This is a sentence')   # create a Doc from raw text | ||||
|     for name, proc in nlp.pipeline:             # iterate over components in order | ||||
|         doc = proc(doc)                         # apply each component | ||||
| 
 | ||||
| p | ||||
|     |  spaCy lets you customise the pipeline with your own components. Components | ||||
|     |  are functions that receive a #[code Doc] object, modify and return it. | ||||
|     |  If your component is stateful, you'll want to create a new one for each | ||||
|     |  pipeline. You can do that by defining and registering a factory which | ||||
|     |  receives the shared #[code Vocab] object and returns a component. | ||||
| 
 | ||||
| +h(4, "creating-component") Creating a  component | ||||
| 
 | ||||
| p | ||||
|     |  A component receives a #[code Doc] object and | ||||
|     |  #[strong performs the actual processing] – for example, using the current | ||||
|     |  weights to make a prediction and set some annotation on the document. By | ||||
|     |  adding a component to the pipeline, you'll get access to the #[code Doc] | ||||
|     |  at any point #[strong during] processing – instead of only being able to | ||||
|     |  modify it afterwards. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     def my_component(doc): | ||||
|         # do something to the doc here | ||||
|         return doc | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code doc] | ||||
|         +cell #[code Doc] | ||||
|         +cell The #[code Doc] object processed by the previous component. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell #[code Doc] | ||||
|         +cell The #[code Doc] object processed by this pipeline component. | ||||
| 
 | ||||
| p | ||||
|     |  When creating a new #[code Language] class, you can pass it a list of | ||||
|     |  pipeline component functions to execute in that order. You can also | ||||
|     |  add it to an existing pipeline by modifying #[code nlp.pipeline] – just | ||||
|     |  be careful not to overwrite a pipeline or its components by accident! | ||||
|     |  The current processing pipeline is available as #[code nlp.pipeline], | ||||
|     |  which returns a list of #[code (name, component)] tuples, or | ||||
|     |  #[code nlp.pipe_names], which only returns a list of human-readable | ||||
|     |  component names. | ||||
| 
 | ||||
| +code. | ||||
|     # Create a new Language object with a pipeline | ||||
|     from spacy.language import Language | ||||
|     nlp = Language(pipeline=[my_component]) | ||||
|     nlp.pipeline | ||||
|     # [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)] | ||||
|     nlp.pipe_names | ||||
|     # ['tagger', 'parser', 'ner'] | ||||
| 
 | ||||
|     # Modify an existing pipeline | ||||
|     nlp = spacy.load('en') | ||||
|     nlp.pipeline.append(my_component) | ||||
| 
 | ||||
| +h(4, "creating-factory") Creating a factory | ||||
| 
 | ||||
| p | ||||
|     |  A factory is a #[strong function that returns a pipeline component]. | ||||
|     |  It's called with the #[code Vocab] object, to give it access to the | ||||
|     |  shared data between components – for example, the strings, morphology, | ||||
|     |  vectors or annotation scheme. Factories are useful for creating | ||||
|     |  #[strong stateful components], especially ones which | ||||
|     |  #[strong depend on shared data]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     def my_factory(vocab): | ||||
|         # load some state | ||||
|         def my_component(doc): | ||||
|             # process the doc | ||||
|             return doc | ||||
|         return my_component | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code vocab] | ||||
|         +cell #[code Vocab] | ||||
|         +cell | ||||
|             |  Shared data between components, including strings, morphology, | ||||
|             |  vectors etc. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell callable | ||||
|         +cell The pipeline component. | ||||
| 
 | ||||
| p | ||||
|     |  By creating a factory, you're essentially telling spaCy how to get the | ||||
|     |  pipeline component #[strong once the vocab is available]. Factories need to | ||||
|     |  be registered via #[+api("spacy#set_factory") #[code set_factory()]] and | ||||
|     |  by assigning them a unique ID. This ID can be added to the pipeline as a | ||||
|     |  string. When creating a pipeline, you're free to mix strings and | ||||
|     |  callable components: | ||||
| 
 | ||||
| +code. | ||||
|     spacy.set_factory('my_factory', my_factory) | ||||
|     nlp = Language(pipeline=['my_factory', my_other_component]) | ||||
| 
 | ||||
| p | ||||
|     |  If spaCy comes across a string in the pipeline, it will try to resolve it | ||||
|     |  by looking it up in the available factories. The factory will then be | ||||
|     |  initialised with the #[code Vocab]. Providing factory names instead of | ||||
|     |  callables also makes it easy to specify them in the model's | ||||
|     |  #[+a("/usage/saving-loading#models-generating") meta.json]. If you're | ||||
|     |  training your own model and want to use one of spaCy's default components, | ||||
|     |  you won't have to worry about finding and implementing it either – to use | ||||
|     |  the default tagger, simply add #[code "tagger"] to the pipeline, and | ||||
|     |  #[strong spaCy will know what to do]. | ||||
| 
 | ||||
| +infobox("Important note") | ||||
|     |  Because factories are #[strong resolved on initialisation] of the | ||||
|     |  #[code Language] class, it's #[strong not possible] to add them to the | ||||
|     |  pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only | ||||
|     |  works with individual component functions. To use factories, you need to | ||||
|     |  create a new #[code Language] object, or generate a | ||||
|     |  #[+a("/usage/training#models-generating") model package] with | ||||
|     |  a custom pipeline. | ||||
| 
 | ||||
| +h(3, "disabling") Disabling pipeline components | ||||
| +h(3, "disabling") Disabling and modifying pipeline components | ||||
| 
 | ||||
| p | ||||
|     |  If you don't need a particular component of the pipeline – for | ||||
|  | @ -217,16 +121,19 @@ p | |||
| +code. | ||||
|     nlp = spacy.load('en', disable=['parser', 'tagger']) | ||||
|     nlp = English().from_disk('/model', disable=['tensorizer', 'ner']) | ||||
|     doc = nlp(u"I don't want parsed", disable=['parser']) | ||||
| 
 | ||||
| p | ||||
|     |  Note that you can't write directly to #[code nlp.pipeline], as this list | ||||
|     |  holds the #[em actual components], not the IDs. However, if you know the | ||||
|     |  order of the components, you can still slice the list: | ||||
|     |  You can also use the #[+api("language#remove_pipe") #[code remove_pipe]] | ||||
|     |  method to remove pipeline components from an existing pipeline, the | ||||
|     |  #[+api("language#rename_pipe") #[code rename_pipe]] method to rename them, | ||||
|     |  or the #[+api("language#replace_pipe") #[code replace_pipe]] method | ||||
|     |  to replace them with a custom component entirely (more details on this | ||||
|     |  in the section on #[+a("#custom-components") custom components]. | ||||
| 
 | ||||
| +code. | ||||
|     nlp = spacy.load('en') | ||||
|     nlp.pipeline = nlp.pipeline[:2] # only use the first two components | ||||
|     nlp.remove_pipe('parser') | ||||
|     nlp.rename_pipe('ner', 'entityrecognizer') | ||||
|     nlp.replace_pipe('tagger', my_custom_tagger) | ||||
| 
 | ||||
| +infobox("Important note: disabling pipeline components") | ||||
|     .o-block | ||||
|  | @ -234,12 +141,14 @@ p | |||
|         |  processing pipeline components, the #[code parser], #[code tagger] | ||||
|         |  and #[code entity] keyword arguments have been replaced with | ||||
|         |  #[code disable], which takes a list of pipeline component names. | ||||
|         |  This lets you disable both default and custom components when loading | ||||
|         |  This lets you disable pre-defined components when loading | ||||
|         |  a model, or initialising a Language class via | ||||
|         |  #[+api("language-from_disk") #[code from_disk]]. | ||||
| 
 | ||||
|     +code-new. | ||||
|         nlp = spacy.load('en', disable=['tagger', 'ner']) | ||||
|         doc = nlp(u"I don't want parsed", disable=['parser']) | ||||
|         nlp = spacy.load('en', disable=['ner']) | ||||
|         nlp.remove_pipe('parser') | ||||
|         doc = nlp(u"I don't want parsed") | ||||
|     +code-old. | ||||
|         nlp = spacy.load('en', tagger=False, entity=False) | ||||
|         doc = nlp(u"I don't want parsed", parse=False) | ||||
|  |  | |||
|  | @ -1,61 +0,0 @@ | |||
| //- 💫 DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS | ||||
| 
 | ||||
| p | ||||
|     |  Hooks let you customize some of the behaviours of the #[code Doc], | ||||
|     |  #[code Span] or #[code Token] objects by adding a component to the | ||||
|     |  pipeline. For instance, to customize the | ||||
|     |  #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a | ||||
|     |  component that sets a custom function to | ||||
|     |  #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity] | ||||
|     |  method will check the #[code user_hooks] dict, and delegate to your | ||||
|     |  function if you've set one. Similar results can be achieved by setting | ||||
|     |  functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks]. | ||||
| 
 | ||||
| +code("Polymorphic similarity example"). | ||||
|     span.similarity(doc) | ||||
|     token.similarity(span) | ||||
|     doc1.similarity(doc2) | ||||
| 
 | ||||
| p | ||||
|     |  By default, this just averages the vectors for each document, and | ||||
|     |  computes their cosine. Obviously, spaCy should make it easy for you to | ||||
|     |  install your own similarity model. This introduces a tricky design | ||||
|     |  challenge. The current solution is to add three more dicts to the | ||||
|     |  #[code Doc] object: | ||||
| 
 | ||||
| +aside("Implementation note") | ||||
|     |  The hooks live on the #[code Doc] object because the #[code Span] and | ||||
|     |  #[code Token] objects are created lazily, and don't own any data. They | ||||
|     |  just proxy to their parent #[code Doc]. This turns out to be convenient | ||||
|     |  here — we only have to worry about installing hooks in one place. | ||||
| 
 | ||||
| +table(["Name", "Description"]) | ||||
|     +row | ||||
|         +cell #[code user_hooks] | ||||
|         +cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code user_token_hooks] | ||||
|         +cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code user_span_hooks] | ||||
|         +cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root] | ||||
| 
 | ||||
| p | ||||
|     |  To sum up, here's an example of hooking in custom #[code .similarity()] | ||||
|     |  methods: | ||||
| 
 | ||||
| +code("Add custom similarity hooks"). | ||||
|     class SimilarityModel(object): | ||||
|         def __init__(self, model): | ||||
|             self._model = model | ||||
| 
 | ||||
|         def __call__(self, doc): | ||||
|             doc.user_hooks['similarity'] = self.similarity | ||||
|             doc.user_span_hooks['similarity'] = self.similarity | ||||
|             doc.user_token_hooks['similarity'] = self.similarity | ||||
| 
 | ||||
|         def similarity(self, obj1, obj2): | ||||
|             y = self._model([obj1.vector, obj2.vector]) | ||||
|             return float(y[0]) | ||||
|  | @ -175,7 +175,7 @@ p | |||
| 
 | ||||
| +code. | ||||
|     import spacy | ||||
|     from spacy.tokens.doc import Doc | ||||
|     from spacy.tokens import Doc | ||||
|     from spacy.vocab import Vocab | ||||
| 
 | ||||
|     nlp = spacy.load('en') | ||||
|  |  | |||
|  | @ -61,7 +61,7 @@ p | |||
|         output_path.open('w', encoding='utf-8').write(svg) | ||||
| 
 | ||||
| p | ||||
|     |  The above code will generate the dependency visualizations and them to | ||||
|     |  The above code will generate the dependency visualizations and save them to | ||||
|     |  two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg]. | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -2,6 +2,44 @@ | |||
| 
 | ||||
| include ../_includes/_mixins | ||||
| 
 | ||||
| +section("pipeline") | ||||
|     +h(3, "custom-components-entities") Custom pipeline components and attribute extensions | ||||
|         +tag-new(2) | ||||
| 
 | ||||
|     p | ||||
|         |  This example shows the implementation of a pipeline component | ||||
|         |  that sets entity annotations based on a list of single or | ||||
|         |  multiple-word company names, merges entities into one token and | ||||
|         |  sets custom attributes on the #[code Doc], #[code Span] and | ||||
|         |  #[code Token]. | ||||
| 
 | ||||
|     +github("spacy", "examples/pipeline/custom_component_entities.py") | ||||
| 
 | ||||
|     +h(3, "custom-components-api") | ||||
|         |  Custom pipeline components and attribute extensions via a REST API | ||||
|         +tag-new(2) | ||||
| 
 | ||||
|     p | ||||
|         |  This example shows the implementation of a pipeline component | ||||
|         |  that fetches country metadata via the | ||||
|         |  #[+a("https://restcountries.eu") REST Countries API], sets entity | ||||
|         |  annotations for countries, merges entities into one token and | ||||
|         |  sets custom attributes on the #[code Doc], #[code Span] and | ||||
|         |  #[code Token] – for example, the capital, latitude/longitude | ||||
|         |  coordinates and the country flag. | ||||
| 
 | ||||
|     +github("spacy", "examples/pipeline/custom_component_countries_api.py") | ||||
| 
 | ||||
|     +h(3, "custom-components-attr-methods") Custom method extensions | ||||
|         +tag-new(2) | ||||
| 
 | ||||
|     p | ||||
|         |  A collection of snippets showing examples of extensions adding | ||||
|         |  custom methods to the #[code Doc], #[code Token] and | ||||
|         |  #[code Span]. | ||||
| 
 | ||||
|     +github("spacy", "examples/pipeline/custom_attr_methods.py") | ||||
| 
 | ||||
| +section("matching") | ||||
|     +h(3, "matcher") Using spaCy's rule-based matcher | ||||
| 
 | ||||
|  |  | |||
|  | @ -8,18 +8,18 @@ include _spacy-101/_pipelines | |||
|     +h(2, "pipelines") How pipelines work | ||||
|     include _processing-pipelines/_pipelines | ||||
| 
 | ||||
| +section("examples") | ||||
|     +h(2, "examples") Examples | ||||
|     include _processing-pipelines/_examples | ||||
| +section("custom-components") | ||||
|     +h(2, "custom-components") Creating custom pipeline components | ||||
|     include _processing-pipelines/_custom-components | ||||
| 
 | ||||
| +section("extensions") | ||||
|     +h(2, "extensions") Developing spaCy extensions | ||||
|     include _processing-pipelines/_extensions | ||||
| 
 | ||||
| +section("multithreading") | ||||
|     +h(2, "multithreading") Multi-threading | ||||
|     include _processing-pipelines/_multithreading | ||||
| 
 | ||||
| +section("user-hooks") | ||||
|     +h(2, "user-hooks") User hooks | ||||
|     include _processing-pipelines/_user-hooks | ||||
| 
 | ||||
| +section("serialization") | ||||
|     +h(2, "serialization") Serialization | ||||
|     include _processing-pipelines/_serialization | ||||
|  |  | |||
|  | @ -102,30 +102,36 @@ p | |||
|     +h(3, "features-pipelines") Improved processing pipelines | ||||
| 
 | ||||
|     +aside-code("Example"). | ||||
|         # Modify an existing pipeline | ||||
|         nlp = spacy.load('en') | ||||
|         nlp.pipeline.append(my_component) | ||||
|         # Set custom attributes | ||||
|         Doc.set_extension('my_attr', default=False) | ||||
|         Token.set_extension('my_attr', getter=my_token_getter) | ||||
|         assert doc._.my_attr, token._.my_attr | ||||
| 
 | ||||
|         # Register a factory to create a component | ||||
|         spacy.set_factory('my_factory', my_factory) | ||||
|         nlp = Language(pipeline=['my_factory', mycomponent]) | ||||
|         # Add components to the pipeline | ||||
|         my_component = lambda doc: doc | ||||
|         nlp.add_pipe(my_component) | ||||
| 
 | ||||
|     p | ||||
|         |  It's now much easier to #[strong customise the pipeline] with your own | ||||
|         |  components, functions that receive a #[code Doc] object, modify and | ||||
|         |  return it. If your component is stateful, you can define and register a | ||||
|         |  factory which receives the shared #[code Vocab] object and returns a | ||||
|         |  component. spaCy's default components can be added to your pipeline by | ||||
|         |  using their string IDs. This way, you won't have to worry about finding | ||||
|         |  and implementing them – simply add #[code "tagger"] to the pipeline, | ||||
|         |  and spaCy will know what to do. | ||||
|         |  components: functions that receive a #[code Doc] object, modify and | ||||
|         |  return it. Extensions let you write any | ||||
|         |  #[strong attributes, properties and methods] to the #[code Doc], | ||||
|         |  #[code Token] and #[code Span]. You can add data, implement new | ||||
|         |  features, integrate other libraries with spaCy or plug in your own | ||||
|         |  machine learning models. | ||||
| 
 | ||||
|     +image | ||||
|         include ../assets/img/pipeline.svg | ||||
| 
 | ||||
|     +infobox | ||||
|         |  #[+label-inline API:] #[+api("language") #[code Language]] | ||||
|         |  #[+label-inline Usage:] #[+a("/usage/language-processing-pipeline") Processing text] | ||||
|         |  #[+label-inline API:] #[+api("language") #[code Language]], | ||||
|         |  #[+api("doc#set_extension") #[code Doc.set_extension]], | ||||
|         |  #[+api("span#set_extension") #[code Span.set_extension]], | ||||
|         |  #[+api("token#set_extension") #[code Token.set_extension]] | ||||
|         |  #[+label-inline Usage:] | ||||
|         |  #[+a("/usage/processing-pipelines") Processing pipelines] | ||||
|         |  #[+label-inline Code:] | ||||
|         |  #[+src("/usage/examples#section-pipeline") Pipeline examples] | ||||
| 
 | ||||
|     +h(3, "features-text-classification") Text classification | ||||
| 
 | ||||
|  | @ -478,15 +484,16 @@ p | |||
|     p | ||||
|         |  If you've been using custom pipeline components, check out the new | ||||
|         |  guide on #[+a("/usage/language-processing-pipelines") processing pipelines]. | ||||
|         |  Appending functions to the pipeline still works – but you might be able | ||||
|         |  to make this more convenient by registering "component factories". | ||||
|         |  Components of the processing pipeline can now be disabled by passing a | ||||
|         |  list of their names to the #[code disable] keyword argument on loading | ||||
|         |  or processing. | ||||
|         |  Appending functions to the pipeline still works – but the | ||||
|         |  #[+api("language#add_pipe") #[code add_pipe]] methods now makes this | ||||
|         |  much more convenient. Components of the processing pipeline can now | ||||
|         |  be disabled by passing a list of their names to the #[code disable] | ||||
|         |  keyword argument on load, or by simply removing them from the | ||||
|         |  pipeline altogether. | ||||
| 
 | ||||
|     +code-new. | ||||
|         nlp = spacy.load('en', disable=['tagger', 'ner']) | ||||
|         doc = nlp(u"I don't want parsed", disable=['parser']) | ||||
|         nlp.remove_pipe('parser') | ||||
|     +code-old. | ||||
|         nlp = spacy.load('en', tagger=False, entity=False) | ||||
|         doc = nlp(u"I don't want parsed", parse=False) | ||||
|  |  | |||