mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	Merge branch 'develop' into feature/fix-matcher-operators
This commit is contained in:
		
						commit
						a928ae2f35
					
				
							
								
								
									
										52
									
								
								examples/pipeline/custom_attr_methods.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								examples/pipeline/custom_attr_methods.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,52 @@ | ||||||
|  | # coding: utf-8 | ||||||
|  | """This example contains several snippets of methods that can be set via custom | ||||||
|  | Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like | ||||||
|  | they're "bound" to the object and are partially applied – i.e. the object | ||||||
|  | they're called on is passed in as the first argument.""" | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from spacy.lang.en import English | ||||||
|  | from spacy.tokens import Doc, Span | ||||||
|  | from spacy import displacy | ||||||
|  | from pathlib import Path | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def to_html(doc, output='/tmp', style='dep'):
    """Doc method extension for saving the current state as a displaCy
    visualization.

    doc: the Doc the method is called on (passed in as the first argument).
    output: directory the HTML file is written into.
    style: visualisation style handed through to displacy.render.

    Writes '<tokens joined by "-">.html' into the output directory and prints
    the resulting path. Returns None.
    """
    # generate filename from the non-punctuation tokens among the first six
    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
    output_path = Path(output) / file_name
    html = displacy.render(doc, style=style, page=True)  # render markup
    # use a context manager so the handle is flushed and closed right away
    # instead of being left to the garbage collector
    with output_path.open('w', encoding='utf-8') as file_:
        file_.write(html)  # save to file
    print('Saved HTML to {}'.format(output_path))
|  | 
 | ||||||
|  | 
 | ||||||
# Register to_html as a method extension on Doc. It is then reachable as
# doc._.to_html(...), with the Doc itself passed in as the first argument.
Doc.set_extension('to_html', method=to_html)

nlp = English()
doc = nlp(u"This is a sentence about Apple.")
# add entity manually for demo purposes, to make it work without a model
# (Span(doc, 5, 6) covers the single token at index 5 - presumably "Apple")
doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
# style='ent' overrides to_html's default style='dep'; output keeps its default
doc._.to_html(style='ent')
|  | 
 | ||||||
|  | 
 | ||||||
def overlap_tokens(doc, other_doc):
    """Get the tokens from the original Doc that are also in the comparison Doc.

    doc: the Doc the method is called on (passed in as the first argument).
    other_doc: the Doc to compare against.

    Tokens are compared by their text. Returns a list of tokens from doc, in
    their original order, whose text also occurs in other_doc.
    """
    # Build the lookup once as a set so each membership test is O(1) rather
    # than a scan over every token of the comparison Doc.
    other_texts = {token.text for token in other_doc}
    return [token for token in doc if token.text in other_texts]
|  | 
 | ||||||
|  | 
 | ||||||
# Register overlap_tokens as the Doc method extension 'overlap'.
Doc.set_extension('overlap', method=overlap_tokens)

nlp = English()
doc1 = nlp(u"Peach emoji is where it has always been.")
doc2 = nlp(u"Peach is the superior emoji.")
# doc1 is passed in as the first argument (doc), doc2 becomes other_doc
tokens = doc1._.overlap(doc2)
print(tokens)
							
								
								
									
										108
									
								
								examples/pipeline/custom_component_countries_api.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										108
									
								
								examples/pipeline/custom_component_countries_api.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,108 @@ | ||||||
|  | # coding: utf-8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import requests | ||||||
|  | 
 | ||||||
|  | from spacy.lang.en import English | ||||||
|  | from spacy.matcher import PhraseMatcher | ||||||
|  | from spacy.tokens import Doc, Span, Token | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class RESTCountriesComponent(object):
    """Example of a spaCy v2.0 pipeline component that requests all countries
    via the REST Countries API, merges country names into one token, assigns
    entity labels and sets attributes on country tokens, e.g. the capital and
    lat/lng coordinates. Can be extended with more details from the API.

    REST Countries API: https://restcountries.eu
    API License: Mozilla Public License MPL 2.0
    """
    name = 'rest_countries'  # component name, will show up in the pipeline

    def __init__(self, nlp, label='GPE', timeout=10):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.

        nlp: the shared Language instance.
        label: entity label assigned to matched country names.
        timeout: seconds to wait for the API before requests gives up, so a
            stalled connection cannot block initialisation forever.

        Raises whatever requests raises if the request fails or times out.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all',
                         timeout=timeout)
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        # If no default value is set, it defaults to None.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital')
        Token.set_extension('country_latlng')
        Token.set_extension('country_flag')

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Look the country record up once per match instead of three
            # times for every token of the match.
            country = self.countries[entity.text]
            # Set custom attribute on each token of the entity
            # Can be extended with other data returned by the API, like
            # currencies, country code, flag, calling code etc.
            for token in entity:
                token._.set('is_country', True)
                token._.set('country_capital', country['capital'])
                token._.set('country_latlng', country['latlng'])
                token._.set('country_flag', country['flag'])
        if spans:
            # Extend doc.ents with all new entities in a single assignment -
            # be careful not to replace the existing ones!
            doc.ents = list(doc.ents) + spans
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities - otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_country(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a country. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_country' attribute here,
        which is already set in the processing step."""
        # A generator lets any() stop at the first country token instead of
        # building a full list first.
        return any(t._.get('is_country') for t in tokens)
|  | 
 | ||||||
|  | 
 | ||||||
|  | # For simplicity, we start off with only the blank English Language class and | ||||||
|  | # no model or pre-defined pipeline loaded. | ||||||
|  | 
 | ||||||
nlp = English()
# Constructing the component performs the HTTP request to the REST Countries
# API (see RESTCountriesComponent.__init__), so this line needs network access.
rest_countries = RESTCountriesComponent(nlp)  # initialise component
nlp.add_pipe(rest_countries)  # add it to the pipeline

doc = nlp(u"Some text about Colombia and the Czech Republic")

print('Pipeline', nlp.pipe_names)  # pipeline contains component name
print('Doc has countries', doc._.has_country)  # Doc contains countries
for token in doc:
    # only tokens flagged by the component carry the country attributes
    if token._.is_country:
        print(token.text, token._.country_capital, token._.country_latlng,
              token._.country_flag)  # country data
print('Entities', [(e.text, e.label_) for e in doc.ents])  # all countries are entities
							
								
								
									
										85
									
								
								examples/pipeline/custom_component_entities.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										85
									
								
								examples/pipeline/custom_component_entities.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,85 @@ | ||||||
|  | # coding: utf-8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from spacy.lang.en import English | ||||||
|  | from spacy.matcher import PhraseMatcher | ||||||
|  | from spacy.tokens import Doc, Span, Token | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class TechCompanyRecognizer(object):
    """Example of a spaCy v2.0 pipeline component that sets entity annotations
    based on list of single or multiple-word company names. Companies are
    labelled as ORG and their spans are merged into one token. Additionally,
    ._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token
    respectively."""
    name = 'tech_companies'  # component name, will show up in the pipeline

    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.

        nlp: the shared Language instance.
        companies: iterable of company name strings to match.
        label: entity label assigned to matched company names.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher - it can now take Doc objects as patterns,
        # so even if the list of companies is long, it's very efficient
        patterns = [nlp(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_tech_org', default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_tech_org == True.
        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        Span.set_extension('has_tech_org', getter=self.has_tech_org)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set('is_tech_org', True)
        if spans:
            # Extend doc.ents with all new entities in a single assignment -
            # be careful not to replace the existing ones!
            doc.ents = list(doc.ents) + spans
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities - otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_tech_org(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a tech org. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_tech_org' attribute here,
        which is already set in the processing step."""
        # A generator lets any() stop at the first tech org token instead of
        # building a full list first.
        return any(t._.get('is_tech_org') for t in tokens)
|  | 
 | ||||||
|  | 
 | ||||||
|  | # For simplicity, we start off with only the blank English Language class and | ||||||
|  | # no model or pre-defined pipeline loaded. | ||||||
|  | 
 | ||||||
nlp = English()
companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
# The component turns each company name into a phrase pattern on construction
component = TechCompanyRecognizer(nlp, companies)  # initialise component
nlp.add_pipe(component, last=True)  # add it to the pipeline as the last element

doc = nlp(u"Alphabet Inc. is the company behind Google.")

print('Pipeline', nlp.pipe_names)  # pipeline contains component name
print('Tokens', [t.text for t in doc])  # company names from the list are merged
print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
|  | @ -6,7 +6,7 @@ To achieve that, it duplicates some of spaCy's internal functionality. | ||||||
| 
 | 
 | ||||||
| Specifically, in this example, we don't use spaCy's built-in Language class to | Specifically, in this example, we don't use spaCy's built-in Language class to | ||||||
| wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write | wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write | ||||||
| our own simle Pipeline class, so that it's easier to see how the pieces | our own simple Pipeline class, so that it's easier to see how the pieces | ||||||
| interact. | interact. | ||||||
| 
 | 
 | ||||||
| Input data: | Input data: | ||||||
|  | @ -142,16 +142,15 @@ def train(nlp, train_examples, dev_examples, nr_epoch=5): | ||||||
|             inputs, annots = zip(*batch) |             inputs, annots = zip(*batch) | ||||||
|             nlp.update(list(inputs), list(annots), sgd, losses=losses) |             nlp.update(list(inputs), list(annots), sgd, losses=losses) | ||||||
|         scores = nlp.evaluate(dev_examples) |         scores = nlp.evaluate(dev_examples) | ||||||
|         report_scores(i, losses['ner'], scores) |         report_scores(i+1, losses['ner'], scores) | ||||||
|     scores = nlp.evaluate(dev_examples) |  | ||||||
|     report_scores(channels, i+1, loss, scores) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def report_scores(i, loss, scores): | def report_scores(i, loss, scores): | ||||||
|     precision = '%.2f' % scores['ents_p'] |     precision = '%.2f' % scores['ents_p'] | ||||||
|     recall = '%.2f' % scores['ents_r'] |     recall = '%.2f' % scores['ents_r'] | ||||||
|     f_measure = '%.2f' % scores['ents_f'] |     f_measure = '%.2f' % scores['ents_f'] | ||||||
|     print('%d %s %s %s' % (int(loss), precision, recall, f_measure)) |     print('Epoch %d: %d %s %s %s' % ( | ||||||
|  |         i, int(loss), precision, recall, f_measure)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def read_examples(path): | def read_examples(path): | ||||||
|  |  | ||||||
|  | @ -7,7 +7,7 @@ if __name__ == '__main__': | ||||||
|     import plac |     import plac | ||||||
|     import sys |     import sys | ||||||
|     from spacy.cli import download, link, info, package, train, convert, model |     from spacy.cli import download, link, info, package, train, convert, model | ||||||
|     from spacy.cli import profile, evaluate |     from spacy.cli import profile, evaluate, validate | ||||||
|     from spacy.util import prints |     from spacy.util import prints | ||||||
| 
 | 
 | ||||||
|     commands = { |     commands = { | ||||||
|  | @ -20,6 +20,7 @@ if __name__ == '__main__': | ||||||
|         'package': package, |         'package': package, | ||||||
|         'model': model, |         'model': model, | ||||||
|         'profile': profile, |         'profile': profile, | ||||||
|  |         'validate': validate | ||||||
|     } |     } | ||||||
|     if len(sys.argv) == 1: |     if len(sys.argv) == 1: | ||||||
|         prints(', '.join(commands), title="Available commands", exits=1) |         prints(', '.join(commands), title="Available commands", exits=1) | ||||||
|  |  | ||||||
|  | @ -311,7 +311,7 @@ def link_vectors_to_models(vocab): | ||||||
| 
 | 
 | ||||||
| def Tok2Vec(width, embed_size, **kwargs): | def Tok2Vec(width, embed_size, **kwargs): | ||||||
|     pretrained_dims = kwargs.get('pretrained_dims', 0) |     pretrained_dims = kwargs.get('pretrained_dims', 0) | ||||||
|     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3) |     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2) | ||||||
|     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] |     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] | ||||||
|     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add, |     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add, | ||||||
|                                  '*': reapply}): |                                  '*': reapply}): | ||||||
|  |  | ||||||
|  | @ -7,3 +7,4 @@ from .train import train | ||||||
| from .evaluate import evaluate | from .evaluate import evaluate | ||||||
| from .convert import convert | from .convert import convert | ||||||
| from .model import model | from .model import model | ||||||
|  | from .validate import validate | ||||||
|  |  | ||||||
|  | @ -4,7 +4,7 @@ from __future__ import unicode_literals | ||||||
| import plac | import plac | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| 
 | 
 | ||||||
| from .converters import conllu2json, iob2json | from .converters import conllu2json, iob2json, conll_ner2json | ||||||
| from ..util import prints | from ..util import prints | ||||||
| 
 | 
 | ||||||
| # Converters are matched by file extension. To add a converter, add a new entry | # Converters are matched by file extension. To add a converter, add a new entry | ||||||
|  | @ -12,9 +12,10 @@ from ..util import prints | ||||||
| # from /converters. | # from /converters. | ||||||
| 
 | 
 | ||||||
| CONVERTERS = { | CONVERTERS = { | ||||||
|     '.conllu': conllu2json, |     'conllu': conllu2json, | ||||||
|     '.conll': conllu2json, |     'conll': conllu2json, | ||||||
|     '.iob': iob2json, |     'ner': conll_ner2json, | ||||||
|  |     'iob': iob2json, | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -22,9 +23,11 @@ CONVERTERS = { | ||||||
|     input_file=("input file", "positional", None, str), |     input_file=("input file", "positional", None, str), | ||||||
|     output_dir=("output directory for converted file", "positional", None, str), |     output_dir=("output directory for converted file", "positional", None, str), | ||||||
|     n_sents=("Number of sentences per doc", "option", "n", int), |     n_sents=("Number of sentences per doc", "option", "n", int), | ||||||
|  |     converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str), | ||||||
|     morphology=("Enable appending morphology to tags", "flag", "m", bool) |     morphology=("Enable appending morphology to tags", "flag", "m", bool) | ||||||
| ) | ) | ||||||
| def convert(cmd, input_file, output_dir, n_sents=1, morphology=False): | def convert(cmd, input_file, output_dir, n_sents=1, morphology=False, | ||||||
|  |             converter='auto'): | ||||||
|     """ |     """ | ||||||
|     Convert files into JSON format for use with train command and other |     Convert files into JSON format for use with train command and other | ||||||
|     experiment management functions. |     experiment management functions. | ||||||
|  | @ -35,9 +38,11 @@ def convert(cmd, input_file, output_dir, n_sents=1, morphology=False): | ||||||
|         prints(input_path, title="Input file not found", exits=1) |         prints(input_path, title="Input file not found", exits=1) | ||||||
|     if not output_path.exists(): |     if not output_path.exists(): | ||||||
|         prints(output_path, title="Output directory not found", exits=1) |         prints(output_path, title="Output directory not found", exits=1) | ||||||
|     file_ext = input_path.suffix |     if converter == 'auto': | ||||||
|     if not file_ext in CONVERTERS: |         converter = input_path.suffix[1:] | ||||||
|         prints("Can't find converter for %s" % input_path.parts[-1], |     if not converter in CONVERTERS: | ||||||
|  |             prints("Can't find converter for %s" % converter, | ||||||
|                 title="Unknown format", exits=1) |                 title="Unknown format", exits=1) | ||||||
|     CONVERTERS[file_ext](input_path, output_path, |     func = CONVERTERS[converter] | ||||||
|  |     func(input_path, output_path, | ||||||
|          n_sents=n_sents, use_morphology=morphology) |          n_sents=n_sents, use_morphology=morphology) | ||||||
|  |  | ||||||
|  | @ -1,2 +1,3 @@ | ||||||
| from .conllu2json import conllu2json | from .conllu2json import conllu2json | ||||||
| from .iob2json import iob2json | from .iob2json import iob2json | ||||||
|  | from .conll_ner2json import conll_ner2json | ||||||
|  |  | ||||||
							
								
								
									
										50
									
								
								spacy/cli/converters/conll_ner2json.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								spacy/cli/converters/conll_ner2json.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,50 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ...compat import json_dumps, path2str | ||||||
|  | from ...util import prints | ||||||
|  | from ...gold import iob_to_biluo | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
    """
    Convert files in the CoNLL-2003 NER format into JSON format for use with train cli.

    input_path: path object of the file to convert (uses .parts and .open,
        so presumably a pathlib.Path).
    output_path: path object of the directory the JSON file is written to.
    n_sents: accepted so all converters share one call signature; not used
        by this converter.
    use_morphology: accepted for the same reason; not used by this converter.

    The output file is named after the input file, with ".conll" removed and
    ".json" appended. Returns None.
    """
    docs = read_conll_ner(input_path)

    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
    output_file = output_path / output_filename
    with output_file.open('w', encoding='utf-8') as f:
        f.write(json_dumps(docs))
    prints("Created %d documents" % len(docs),
           title="Generated output file %s" % path2str(output_file))
|  | 
 | ||||||
|  | 
 | ||||||
def read_conll_ner(input_path):
    """Read a CoNLL-2003 style NER file and return its documents as dicts.

    input_path: path object of the file to read (must provide .open).

    The text is split into documents on the '-DOCSTART- -X- O O' marker and
    into sentences on blank lines. Every non-empty line must consist of
    exactly four whitespace-separated columns (word, tag, chunk, IOB entity
    tag); any other column count raises a ValueError when unpacking.

    Returns a list of dicts of the form
    {'id': <index>, 'paragraphs': [{'sentences': [{'tokens': [...]}]}]},
    where each token is {'orth': ..., 'tag': ..., 'ner': ...} and the 'ner'
    value is the entity tag after conversion by iob_to_biluo.
    """
    # read through a context manager so the file handle is closed promptly
    with input_path.open('r', encoding='utf-8') as file_:
        text = file_.read()
    delimit_docs = '-DOCSTART- -X- O O'
    output_docs = []
    for doc in text.strip().split(delimit_docs):
        doc = doc.strip()
        if not doc:
            continue  # skip the empty chunk before the first marker
        output_doc = []
        for sent in doc.split('\n\n'):
            sent = sent.strip()
            if not sent:
                continue
            lines = [line.strip() for line in sent.split('\n') if line.strip()]
            # transpose the rows into per-column tuples; the chunk column is
            # read but not written to the output
            words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
            biluo_ents = iob_to_biluo(iob_ents)
            output_doc.append({'tokens': [
                {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
                zip(words, tags, biluo_ents)
            ]})
        output_docs.append({
            'id': len(output_docs),
            'paragraphs': [{'sentences': output_doc}]
        })
    return output_docs
|  | @ -44,7 +44,7 @@ numpy.random.seed(0) | ||||||
|     version=("Model version", "option", "V", str), |     version=("Model version", "option", "V", str), | ||||||
|     meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path) |     meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path) | ||||||
| ) | ) | ||||||
| def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, | def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, | ||||||
|           use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False, |           use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False, | ||||||
|           gold_preproc=False, version="0.0.0", meta_path=None): |           gold_preproc=False, version="0.0.0", meta_path=None): | ||||||
|     """ |     """ | ||||||
|  | @ -68,6 +68,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, | ||||||
|     if not isinstance(meta, dict): |     if not isinstance(meta, dict): | ||||||
|         prints("Expected dict but got: {}".format(type(meta)), |         prints("Expected dict but got: {}".format(type(meta)), | ||||||
|                title="Not a valid meta.json format", exits=1) |                title="Not a valid meta.json format", exits=1) | ||||||
|  |     meta.setdefault('lang', lang) | ||||||
|  |     meta.setdefault('name', 'unnamed') | ||||||
| 
 | 
 | ||||||
|     pipeline = ['tagger', 'parser', 'ner'] |     pipeline = ['tagger', 'parser', 'ner'] | ||||||
|     if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger') |     if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger') | ||||||
|  | @ -88,9 +90,13 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, | ||||||
|     n_train_words = corpus.count_train() |     n_train_words = corpus.count_train() | ||||||
| 
 | 
 | ||||||
|     lang_class = util.get_lang_class(lang) |     lang_class = util.get_lang_class(lang) | ||||||
|     nlp = lang_class(pipeline=pipeline) |     nlp = lang_class() | ||||||
|  |     meta['pipeline'] = pipeline | ||||||
|  |     nlp.meta.update(meta) | ||||||
|     if vectors: |     if vectors: | ||||||
|         util.load_model(vectors, vocab=nlp.vocab) |         util.load_model(vectors, vocab=nlp.vocab) | ||||||
|  |     for name in pipeline: | ||||||
|  |         nlp.add_pipe(nlp.create_pipe(name), name=name) | ||||||
|     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) |     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) | ||||||
|     nlp._optimizer = None |     nlp._optimizer = None | ||||||
| 
 | 
 | ||||||
|  | @ -112,17 +118,33 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, | ||||||
|                 util.set_env_log(False) |                 util.set_env_log(False) | ||||||
|                 epoch_model_path = output_path / ('model%d' % i) |                 epoch_model_path = output_path / ('model%d' % i) | ||||||
|                 nlp.to_disk(epoch_model_path) |                 nlp.to_disk(epoch_model_path) | ||||||
|                 nlp_loaded = lang_class(pipeline=pipeline) |                 nlp_loaded = util.load_model_from_path(epoch_model_path) | ||||||
|                 nlp_loaded = nlp_loaded.from_disk(epoch_model_path) |                 dev_docs = list(corpus.dev_docs( | ||||||
|                 scorer = nlp_loaded.evaluate( |  | ||||||
|                             list(corpus.dev_docs( |  | ||||||
|                                 nlp_loaded, |                                 nlp_loaded, | ||||||
|                                 gold_preproc=gold_preproc))) |                                 gold_preproc=gold_preproc)) | ||||||
|  |                 nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) | ||||||
|  |                 start_time = timer() | ||||||
|  |                 scorer = nlp_loaded.evaluate(dev_docs) | ||||||
|  |                 end_time = timer() | ||||||
|  |                 if use_gpu < 0: | ||||||
|  |                     gpu_wps = None | ||||||
|  |                     cpu_wps = nwords/(end_time-start_time) | ||||||
|  |                 else: | ||||||
|  |                     gpu_wps = nwords/(end_time-start_time) | ||||||
|  |                     with Model.use_device('cpu'): | ||||||
|  |                         nlp_loaded = util.load_model_from_path(epoch_model_path) | ||||||
|  |                         dev_docs = list(corpus.dev_docs( | ||||||
|  |                                         nlp_loaded, gold_preproc=gold_preproc)) | ||||||
|  |                         start_time = timer() | ||||||
|  |                         scorer = nlp_loaded.evaluate(dev_docs) | ||||||
|  |                         end_time = timer() | ||||||
|  |                         cpu_wps = nwords/(end_time-start_time) | ||||||
|                 acc_loc =(output_path / ('model%d' % i) / 'accuracy.json') |                 acc_loc =(output_path / ('model%d' % i) / 'accuracy.json') | ||||||
|                 with acc_loc.open('w') as file_: |                 with acc_loc.open('w') as file_: | ||||||
|                     file_.write(json_dumps(scorer.scores)) |                     file_.write(json_dumps(scorer.scores)) | ||||||
|                 meta_loc = output_path / ('model%d' % i) / 'meta.json' |                 meta_loc = output_path / ('model%d' % i) / 'meta.json' | ||||||
|                 meta['accuracy'] = scorer.scores |                 meta['accuracy'] = scorer.scores | ||||||
|  |                 meta['speed'] = {'nwords': nwords, 'cpu':cpu_wps, 'gpu': gpu_wps} | ||||||
|                 meta['lang'] = nlp.lang |                 meta['lang'] = nlp.lang | ||||||
|                 meta['pipeline'] = pipeline |                 meta['pipeline'] = pipeline | ||||||
|                 meta['spacy_version'] = '>=%s' % about.__version__ |                 meta['spacy_version'] = '>=%s' % about.__version__ | ||||||
|  | @ -132,7 +154,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, | ||||||
|                 with meta_loc.open('w') as file_: |                 with meta_loc.open('w') as file_: | ||||||
|                     file_.write(json_dumps(meta)) |                     file_.write(json_dumps(meta)) | ||||||
|                 util.set_env_log(True) |                 util.set_env_log(True) | ||||||
|             print_progress(i, losses, scorer.scores) |             print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps) | ||||||
|     finally: |     finally: | ||||||
|         print("Saving model...") |         print("Saving model...") | ||||||
|         try: |         try: | ||||||
|  | @ -153,16 +175,17 @@ def _render_parses(i, to_render): | ||||||
|         file_.write(html) |         file_.write(html) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def print_progress(itn, losses, dev_scores, wps=0.0): | def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0): | ||||||
|     scores = {} |     scores = {} | ||||||
|     for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', |     for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', | ||||||
|                 'ents_p', 'ents_r', 'ents_f', 'wps']: |                 'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']: | ||||||
|         scores[col] = 0.0 |         scores[col] = 0.0 | ||||||
|     scores['dep_loss'] = losses.get('parser', 0.0) |     scores['dep_loss'] = losses.get('parser', 0.0) | ||||||
|     scores['ner_loss'] = losses.get('ner', 0.0) |     scores['ner_loss'] = losses.get('ner', 0.0) | ||||||
|     scores['tag_loss'] = losses.get('tagger', 0.0) |     scores['tag_loss'] = losses.get('tagger', 0.0) | ||||||
|     scores.update(dev_scores) |     scores.update(dev_scores) | ||||||
|     scores['wps'] = wps |     scores['cpu_wps'] = cpu_wps | ||||||
|  |     scores['gpu_wps'] = gpu_wps or 0.0 | ||||||
|     tpl = '\t'.join(( |     tpl = '\t'.join(( | ||||||
|         '{:d}', |         '{:d}', | ||||||
|         '{dep_loss:.3f}', |         '{dep_loss:.3f}', | ||||||
|  | @ -173,7 +196,9 @@ def print_progress(itn, losses, dev_scores, wps=0.0): | ||||||
|         '{ents_f:.3f}', |         '{ents_f:.3f}', | ||||||
|         '{tags_acc:.3f}', |         '{tags_acc:.3f}', | ||||||
|         '{token_acc:.3f}', |         '{token_acc:.3f}', | ||||||
|         '{wps:.1f}')) |         '{cpu_wps:.1f}', | ||||||
|  |         '{gpu_wps:.1f}', | ||||||
|  |     )) | ||||||
|     print(tpl.format(itn, **scores)) |     print(tpl.format(itn, **scores)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
										123
									
								
								spacy/cli/validate.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										123
									
								
								spacy/cli/validate.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,123 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import requests | ||||||
|  | import pkg_resources | ||||||
|  | from pathlib import Path | ||||||
|  | 
 | ||||||
|  | from ..compat import path2str, locale_escape | ||||||
|  | from ..util import prints, get_data_path, read_json | ||||||
|  | from .. import about | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def validate(cmd):
    """Validate that the currently installed version of spaCy is compatible
    with the installed models. Should be run after `pip install -U spacy`.

    Fetches the compatibility table from `about.__compatibility__`, prints a
    table of the installed model packages and links, and then prints hints
    for the ones that are not compatible with the running spaCy version.

    cmd: Not read by this function.
        NOTE(review): presumably supplied by the CLI dispatcher -- confirm.
    """
    r = requests.get(about.__compatibility__)
    if r.status_code != 200:
        prints("Couldn't fetch compatibility table.",
               title="Server error (%d)" % r.status_code, exits=1)
    compat = r.json()['spacy']
    all_models = set()
    # Rewrite every listed model version in place via reformat_version() and
    # collect the names of all models known to any spaCy version. Only values
    # of existing keys are replaced, so iterating while assigning is safe.
    for models in compat.values():
        all_models.update(models)
        for model, model_vs in models.items():
            models[model] = [reformat_version(v) for v in model_vs]

    current_compat = compat.get(about.__version__)
    if current_compat is None:
        # A version missing from the table used to surface as a bare KeyError
        # traceback; report it the same way as the server error above.
        prints(about.__compatibility__, exits=1,
               title="Can't find spaCy v{} in compatibility table"
               .format(about.__version__))
    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)
    incompat_links = {link for link, d in model_links.items()
                      if not d['compat']}
    incompat_models = {d['name'] for d in model_pkgs.values()
                       if not d['compat']}
    incompat_models.update(d['name'] for d in model_links.values()
                           if not d['compat'])
    # Sorted so the printed hints are stable from run to run (set iteration
    # order is not).
    na_models = sorted(m for m in incompat_models if m not in current_compat)
    update_models = sorted(m for m in incompat_models if m in current_compat)

    prints(path2str(Path(__file__).parent.parent),
           title="Installed models (spaCy v{})".format(about.__version__))
    if model_links or model_pkgs:
        print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
        for name, data in model_pkgs.items():
            print(get_model_row(current_compat, name, data, 'package'))
        for name, data in model_links.items():
            print(get_model_row(current_compat, name, data, 'link'))
    else:
        prints("No models found in your current environment.", exits=0)

    if update_models:
        # Named cmd_tpl so the `cmd` parameter is no longer shadowed.
        cmd_tpl = '    python -m spacy download {}'
        print("\n    Use the following commands to update the model packages:")
        print('\n'.join([cmd_tpl.format(pkg) for pkg in update_models]))

    if na_models:
        prints("The following models are not available for spaCy v{}: {}"
               .format(about.__version__, ', '.join(na_models)))

    if incompat_links:
        prints("You may also want to overwrite the incompatible links using "
               "the `spacy link` command with `--force`, or remove them from "
               "the data directory. Data path: {}"
               .format(path2str(get_data_path())))
|  | 
 | ||||||
|  | 
 | ||||||
def get_model_links(compat):
    """Describe the model directories found in the data path.

    compat (dict): Compatibility entries handed on to `is_compat`.
    RETURNS (dict): One entry per model directory that contains a meta.json,
        keyed by the directory name, each holding the model's 'name',
        'version' and 'compat' flag. Empty if there is no data path.
    """
    data_path = get_data_path()
    if not data_path:
        return {}
    found = {}
    candidates = [entry for entry in data_path.iterdir()
                  if is_model_path(entry)]
    for model_dir in candidates:
        meta_file = Path(model_dir, 'meta.json')
        if meta_file.exists():
            meta = read_json(meta_file)
            # The link is the directory name; the model name is assembled
            # from the language and name fields of its meta.json.
            link_name = model_dir.parts[-1]
            model_name = meta['lang'] + '_' + meta['name']
            model_version = meta['version']
            found[link_name] = {
                'name': model_name,
                'version': model_version,
                'compat': is_compat(compat, model_name, model_version),
            }
    return found
|  | 
 | ||||||
|  | 
 | ||||||
def get_model_pkgs(compat, all_models):
    """Describe the installed packages whose names match known models.

    compat (dict): Compatibility entries handed on to `is_compat`.
    all_models (set): Names of all models listed in the compatibility table.
    RETURNS (dict): One entry per matching installed package, keyed by the
        package key from the working set, each holding the model's 'name',
        'version' and 'compat' flag.
    """
    installed = {}
    for dist_key, dist in pkg_resources.working_set.by_key.items():
        # Compare using underscores in place of the hyphens in the key.
        model_name = dist_key.replace('-', '_')
        if model_name not in all_models:
            continue
        model_version = dist.version
        installed[dist_key] = {
            'name': model_name,
            'version': model_version,
            'compat': is_compat(compat, model_name, model_version),
        }
    return installed
|  | 
 | ||||||
|  | 
 | ||||||
def get_model_row(compat, name, data, type='package'):
    """Format one table row for an installed model package or link.

    compat (dict): Maps model names to their list of compatible versions.
    name (unicode): Package or link name shown in the NAME column.
    data (dict): Entry with the keys 'name', 'version' and 'compat'.
    type (unicode): Label shown in the TYPE column.
    RETURNS (unicode): The row, laid out by `get_row`.
    """
    # ANSI colour escapes used to highlight the version column.
    tpl_red = '\x1b[38;5;1m{}\x1b[0m'
    tpl_green = '\x1b[38;5;2m{}\x1b[0m'
    if data['compat']:
        # errors='ignore' drops the check mark where the locale encoding
        # cannot represent it.
        comp = tpl_green.format(locale_escape('✔', errors='ignore'))
        version = tpl_green.format(data['version'])
    else:
        # Point at the first listed compatible version. A model that is
        # missing from the table, or listed with no versions at all, shows
        # 'n/a' instead of raising IndexError on the empty list.
        versions = compat.get(data['name']) or ['n/a']
        comp = '--> {}'.format(versions[0])
        version = tpl_red.format(data['version'])
    return get_row(type, name, data['name'], version, comp)
|  | 
 | ||||||
|  | 
 | ||||||
def get_row(*args):
    """Lay out five values as one left-aligned table row.

    The row is indented by four spaces; the first column is padded to 10
    characters and the remaining four to 20 each, with two spaces in front
    of each of them.

    *args: The five cell values, in column order.
    RETURNS (unicode): The formatted row.
    """
    columns = ['    {:<10}'] + ['  {:<20}'] * 4
    return ''.join(columns).format(*args)
|  | 
 | ||||||
|  | 
 | ||||||
def is_model_path(model_path):
    """Check whether a path could be a model directory.

    model_path (Path): The path to check.
    RETURNS (bool): True if the path is a directory that is neither a cache
        directory nor hidden (name starting with a dot).
    """
    dir_name = model_path.parts[-1]
    if not model_path.is_dir():
        return False
    if dir_name in ('cache', 'pycache', '__pycache__'):
        return False
    return not dir_name.startswith('.')
|  | 
 | ||||||
|  | 
 | ||||||
def is_compat(compat, name, version):
    """Check whether a model version is listed as compatible.

    compat (dict): Maps model names to a collection of compatible versions.
    name (unicode): The model name to look up.
    version (unicode): The installed model version.
    RETURNS (bool): True if the model is listed and includes the version.
    """
    if name not in compat:
        return False
    return version in compat[name]
|  | 
 | ||||||
|  | 
 | ||||||
def reformat_version(version):
    """Rewrite '-alpha' version markers in the compact 'a' form.

    A trailing '-alpha' becomes 'a0'; an '-alpha' followed by more text
    becomes 'a'. Versions without the marker are returned unchanged.

    version (unicode): The version string to rewrite.
    RETURNS (unicode): The rewritten version string.
    """
    marker = '-alpha'
    replacement = 'a0' if version.endswith(marker) else 'a'
    return version.replace(marker, replacement)
|  | @ -6,6 +6,7 @@ import ftfy | ||||||
| import sys | import sys | ||||||
| import ujson | import ujson | ||||||
| import itertools | import itertools | ||||||
|  | import locale | ||||||
| 
 | 
 | ||||||
| from thinc.neural.util import copy_array | from thinc.neural.util import copy_array | ||||||
| 
 | 
 | ||||||
|  | @ -113,3 +114,12 @@ def import_file(name, loc): | ||||||
|         module = importlib.util.module_from_spec(spec) |         module = importlib.util.module_from_spec(spec) | ||||||
|         spec.loader.exec_module(module) |         spec.loader.exec_module(module) | ||||||
|         return module |         return module | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def locale_escape(string, errors='replace'):
    """Mangle characters the preferred locale encoding cannot represent.

    string (unicode): The text to make safe for output.
    errors (unicode): Codec error handler used when encoding, for example
        'replace' or 'ignore'.
    RETURNS (unicode): The text after a round trip through the encoding
        reported by `locale.getpreferredencoding()`.
    """
    encoding = locale.getpreferredencoding()
    # Decode with the same codec that did the encoding. Decoding as UTF-8
    # raised UnicodeDecodeError (or produced garbage) whenever the locale
    # encoding was neither ASCII nor UTF-8.
    return string.encode(encoding, errors).decode(encoding)
|  |  | ||||||
|  | @ -213,7 +213,7 @@ class GoldCorpus(object): | ||||||
|         train_tuples = self.train_tuples |         train_tuples = self.train_tuples | ||||||
|         if projectivize: |         if projectivize: | ||||||
|             train_tuples = nonproj.preprocess_training_data( |             train_tuples = nonproj.preprocess_training_data( | ||||||
|                                self.train_tuples) |                                self.train_tuples, label_freq_cutoff=100) | ||||||
|         random.shuffle(train_tuples) |         random.shuffle(train_tuples) | ||||||
|         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, |         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, | ||||||
|                                         max_length=max_length, |                                         max_length=max_length, | ||||||
|  |  | ||||||
|  | @ -16,15 +16,13 @@ from ...util import update_exc | ||||||
| class BengaliDefaults(Language.Defaults): | class BengaliDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'bn' |     lex_attr_getters[LANG] = lambda text: 'bn' | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     tag_map = TAG_MAP |     tag_map = TAG_MAP | ||||||
|     stop_words = STOP_WORDS |     stop_words = STOP_WORDS | ||||||
|     lemma_rules = LEMMA_RULES |     lemma_rules = LEMMA_RULES | ||||||
| 
 |     prefixes = TOKENIZER_PREFIXES | ||||||
|     prefixes = tuple(TOKENIZER_PREFIXES) |     suffixes = TOKENIZER_SUFFIXES | ||||||
|     suffixes = tuple(TOKENIZER_SUFFIXES) |     infixes = TOKENIZER_INFIXES | ||||||
|     infixes = tuple(TOKENIZER_INFIXES) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Bengali(Language): | class Bengali(Language): | ||||||
|  |  | ||||||
|  | @ -15,9 +15,8 @@ class DanishDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'da' |     lex_attr_getters[LANG] = lambda text: 'da' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Danish(Language): | class Danish(Language): | ||||||
|  |  | ||||||
|  | @ -12,7 +12,6 @@ from .syntax_iterators import SYNTAX_ITERATORS | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ..norm_exceptions import BASE_NORMS | from ..norm_exceptions import BASE_NORMS | ||||||
| from ...language import Language | from ...language import Language | ||||||
| from ...lemmatizerlookup import Lemmatizer |  | ||||||
| from ...attrs import LANG, NORM | from ...attrs import LANG, NORM | ||||||
| from ...util import update_exc, add_lookups | from ...util import update_exc, add_lookups | ||||||
| 
 | 
 | ||||||
|  | @ -22,16 +21,12 @@ class GermanDefaults(Language.Defaults): | ||||||
|     lex_attr_getters[LANG] = lambda text: 'de' |     lex_attr_getters[LANG] = lambda text: 'de' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], | ||||||
|                                          NORM_EXCEPTIONS, BASE_NORMS) |                                          NORM_EXCEPTIONS, BASE_NORMS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     infixes = tuple(TOKENIZER_INFIXES) |     infixes = TOKENIZER_INFIXES | ||||||
|     tag_map = dict(TAG_MAP) |     tag_map = TAG_MAP | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
|     syntax_iterators = dict(SYNTAX_ITERATORS) |     syntax_iterators = SYNTAX_ITERATORS | ||||||
| 
 |     lemma_lookup = LOOKUP | ||||||
|     @classmethod |  | ||||||
|     def create_lemmatizer(cls, nlp=None): |  | ||||||
|         return Lemmatizer(LOOKUP) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class German(Language): | class German(Language): | ||||||
|  |  | ||||||
|  | @ -7,7 +7,7 @@ from .tag_map import TAG_MAP | ||||||
| from .stop_words import STOP_WORDS | from .stop_words import STOP_WORDS | ||||||
| from .lex_attrs import LEX_ATTRS | from .lex_attrs import LEX_ATTRS | ||||||
| from .morph_rules import MORPH_RULES | from .morph_rules import MORPH_RULES | ||||||
| from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC | from .lemmatizer import LEMMA_RULES, LEMMA_INDEX, LEMMA_EXC, LOOKUP | ||||||
| from .syntax_iterators import SYNTAX_ITERATORS | from .syntax_iterators import SYNTAX_ITERATORS | ||||||
| 
 | 
 | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
|  | @ -23,15 +23,15 @@ class EnglishDefaults(Language.Defaults): | ||||||
|     lex_attr_getters[LANG] = lambda text: 'en' |     lex_attr_getters[LANG] = lambda text: 'en' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], | ||||||
|                                          BASE_NORMS, NORM_EXCEPTIONS) |                                          BASE_NORMS, NORM_EXCEPTIONS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     tag_map = dict(TAG_MAP) |     tag_map = TAG_MAP | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
|     morph_rules = dict(MORPH_RULES) |     morph_rules = MORPH_RULES | ||||||
|     lemma_rules = dict(LEMMA_RULES) |     lemma_rules = LEMMA_RULES | ||||||
|     lemma_index = dict(LEMMA_INDEX) |     lemma_index = LEMMA_INDEX | ||||||
|     lemma_exc = dict(LEMMA_EXC) |     lemma_exc = LEMMA_EXC | ||||||
|     syntax_iterators = dict(SYNTAX_ITERATORS) |     lemma_lookup = LOOKUP | ||||||
|  |     syntax_iterators = SYNTAX_ITERATORS | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class English(Language): | class English(Language): | ||||||
|  |  | ||||||
|  | @ -10,7 +10,6 @@ from .syntax_iterators import SYNTAX_ITERATORS | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ..norm_exceptions import BASE_NORMS | from ..norm_exceptions import BASE_NORMS | ||||||
| from ...language import Language | from ...language import Language | ||||||
| from ...lemmatizerlookup import Lemmatizer |  | ||||||
| from ...attrs import LANG, NORM | from ...attrs import LANG, NORM | ||||||
| from ...util import update_exc, add_lookups | from ...util import update_exc, add_lookups | ||||||
| 
 | 
 | ||||||
|  | @ -19,15 +18,11 @@ class SpanishDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'es' |     lex_attr_getters[LANG] = lambda text: 'es' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     tag_map = dict(TAG_MAP) |     tag_map = TAG_MAP | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
|     sytax_iterators = dict(SYNTAX_ITERATORS) |     sytax_iterators = SYNTAX_ITERATORS | ||||||
| 
 |     lemma_lookup = LOOKUP | ||||||
|     @classmethod |  | ||||||
|     def create_lemmatizer(cls, nlp=None): |  | ||||||
|         return Lemmatizer(LOOKUP) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Spanish(Language): | class Spanish(Language): | ||||||
|  |  | ||||||
|  | @ -15,9 +15,8 @@ class FinnishDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'fi' |     lex_attr_getters[LANG] = lambda text: 'fi' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Finnish(Language): | class Finnish(Language): | ||||||
|  |  | ||||||
|  | @ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ..norm_exceptions import BASE_NORMS | from ..norm_exceptions import BASE_NORMS | ||||||
| from ...language import Language | from ...language import Language | ||||||
| from ...lemmatizerlookup import Lemmatizer |  | ||||||
| from ...attrs import LANG, NORM | from ...attrs import LANG, NORM | ||||||
| from ...util import update_exc, add_lookups | from ...util import update_exc, add_lookups | ||||||
| 
 | 
 | ||||||
|  | @ -21,17 +20,13 @@ class FrenchDefaults(Language.Defaults): | ||||||
|     lex_attr_getters.update(LEX_ATTRS) |     lex_attr_getters.update(LEX_ATTRS) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'fr' |     lex_attr_getters[LANG] = lambda text: 'fr' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
|     infixes = tuple(TOKENIZER_INFIXES) |     infixes = TOKENIZER_INFIXES | ||||||
|     suffixes = tuple(TOKENIZER_SUFFIXES) |     suffixes = TOKENIZER_SUFFIXES | ||||||
|     token_match = TOKEN_MATCH |     token_match = TOKEN_MATCH | ||||||
|     syntax_iterators = dict(SYNTAX_ITERATORS) |     syntax_iterators = SYNTAX_ITERATORS | ||||||
| 
 |     lemma_lookup = LOOKUP | ||||||
|     @classmethod |  | ||||||
|     def create_lemmatizer(cls, nlp=None): |  | ||||||
|         return Lemmatizer(LOOKUP) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class French(Language): | class French(Language): | ||||||
|  |  | ||||||
|  | @ -12,9 +12,8 @@ from ...util import update_exc | ||||||
| class HebrewDefaults(Language.Defaults): | class HebrewDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'he' |     lex_attr_getters[LANG] = lambda text: 'he' | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Hebrew(Language): | class Hebrew(Language): | ||||||
|  |  | ||||||
|  | @ -9,7 +9,6 @@ from .lemmatizer import LOOKUP | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ..norm_exceptions import BASE_NORMS | from ..norm_exceptions import BASE_NORMS | ||||||
| from ...language import Language | from ...language import Language | ||||||
| from ...lemmatizerlookup import Lemmatizer |  | ||||||
| from ...attrs import LANG, NORM | from ...attrs import LANG, NORM | ||||||
| from ...util import update_exc, add_lookups | from ...util import update_exc, add_lookups | ||||||
| 
 | 
 | ||||||
|  | @ -18,17 +17,13 @@ class HungarianDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'hu' |     lex_attr_getters[LANG] = lambda text: 'hu' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
|     prefixes = tuple(TOKENIZER_PREFIXES) |     prefixes = TOKENIZER_PREFIXES | ||||||
|     suffixes = tuple(TOKENIZER_SUFFIXES) |     suffixes = TOKENIZER_SUFFIXES | ||||||
|     infixes = tuple(TOKENIZER_INFIXES) |     infixes = TOKENIZER_INFIXES | ||||||
|     token_match = TOKEN_MATCH |     token_match = TOKEN_MATCH | ||||||
| 
 |     lemma_lookup = LOOKUP | ||||||
|     @classmethod |  | ||||||
|     def create_lemmatizer(cls, nlp=None): |  | ||||||
|         return Lemmatizer(LOOKUP) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Hungarian(Language): | class Hungarian(Language): | ||||||
|  |  | ||||||
|  | @ -11,7 +11,6 @@ from .syntax_iterators import SYNTAX_ITERATORS | ||||||
| 
 | 
 | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ...language import Language | from ...language import Language | ||||||
| from ...lemmatizerlookup import Lemmatizer |  | ||||||
| from ...attrs import LANG | from ...attrs import LANG | ||||||
| from ...util import update_exc | from ...util import update_exc | ||||||
| 
 | 
 | ||||||
|  | @ -19,19 +18,14 @@ from ...util import update_exc | ||||||
| class IndonesianDefaults(Language.Defaults): | class IndonesianDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'id' |     lex_attr_getters[LANG] = lambda text: 'id' | ||||||
| 
 |  | ||||||
|     lex_attr_getters.update(LEX_ATTRS) |     lex_attr_getters.update(LEX_ATTRS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
|     prefixes = tuple(TOKENIZER_PREFIXES) |     prefixes = TOKENIZER_PREFIXES | ||||||
|     suffixes = tuple(TOKENIZER_SUFFIXES) |     suffixes = TOKENIZER_SUFFIXES | ||||||
|     infixes = tuple(TOKENIZER_INFIXES) |     infixes = TOKENIZER_INFIXES | ||||||
|     syntax_iterators = dict(SYNTAX_ITERATORS) |     syntax_iterators = SYNTAX_ITERATORS | ||||||
| 
 |     lemma_lookup = LOOKUP | ||||||
|     @classmethod |  | ||||||
|     def create_lemmatizer(cls, nlp=None): |  | ||||||
|         return Lemmatizer(LOOKUP) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Indonesian(Language): | class Indonesian(Language): | ||||||
|  |  | ||||||
|  | @ -16,8 +16,7 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', | ||||||
|               'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta', |               'sembilanbelas', 'duapuluh', 'seratus', 'seribu', 'sejuta', | ||||||
|               'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun', |               'ribu', 'rb', 'juta', 'jt', 'miliar', 'biliun', 'triliun', | ||||||
|               'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun', |               'kuadriliun', 'kuintiliun', 'sekstiliun', 'septiliun', 'oktiliun', | ||||||
|               'noniliun', 'desiliun', |               'noniliun', 'desiliun'] | ||||||
|               ] |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def like_num(text): | def like_num(text): | ||||||
|  |  | ||||||
|  | @ -7,7 +7,6 @@ from .lemmatizer import LOOKUP | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ..norm_exceptions import BASE_NORMS | from ..norm_exceptions import BASE_NORMS | ||||||
| from ...language import Language | from ...language import Language | ||||||
| from ...lemmatizerlookup import Lemmatizer |  | ||||||
| from ...attrs import LANG, NORM | from ...attrs import LANG, NORM | ||||||
| from ...util import update_exc, add_lookups | from ...util import update_exc, add_lookups | ||||||
| 
 | 
 | ||||||
|  | @ -16,13 +15,9 @@ class ItalianDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'it' |     lex_attr_getters[LANG] = lambda text: 'it' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
| 
 |     lemma_lookup = LOOKUP | ||||||
|     @classmethod |  | ||||||
|     def create_lemmatizer(cls, nlp=None): |  | ||||||
|         return Lemmatizer(LOOKUP) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Italian(Language): | class Italian(Language): | ||||||
|  |  | ||||||
|  | @ -16,9 +16,8 @@ class NorwegianDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'nb' |     lex_attr_getters[LANG] = lambda text: 'nb' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Norwegian(Language): | class Norwegian(Language): | ||||||
|  |  | ||||||
|  | @ -16,9 +16,8 @@ class DutchDefaults(Language.Defaults): | ||||||
|     lex_attr_getters.update(LEX_ATTRS) |     lex_attr_getters.update(LEX_ATTRS) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'nl' |     lex_attr_getters[LANG] = lambda text: 'nl' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Dutch(Language): | class Dutch(Language): | ||||||
|  |  | ||||||
|  | @ -15,9 +15,8 @@ class PolishDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'pl' |     lex_attr_getters[LANG] = lambda text: 'pl' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Polish(Language): | class Polish(Language): | ||||||
|  |  | ||||||
|  | @ -9,7 +9,6 @@ from .lemmatizer import LOOKUP | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ..norm_exceptions import BASE_NORMS | from ..norm_exceptions import BASE_NORMS | ||||||
| from ...language import Language | from ...language import Language | ||||||
| from ...lemmatizerlookup import Lemmatizer |  | ||||||
| from ...attrs import LANG, NORM | from ...attrs import LANG, NORM | ||||||
| from ...util import update_exc, add_lookups | from ...util import update_exc, add_lookups | ||||||
| 
 | 
 | ||||||
|  | @ -19,13 +18,9 @@ class PortugueseDefaults(Language.Defaults): | ||||||
|     lex_attr_getters[LANG] = lambda text: 'pt' |     lex_attr_getters[LANG] = lambda text: 'pt' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
|     lex_attr_getters.update(LEX_ATTRS) |     lex_attr_getters.update(LEX_ATTRS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
| 
 |     lemma_lookup = LOOKUP | ||||||
|     @classmethod |  | ||||||
|     def create_lemmatizer(cls, nlp=None): |  | ||||||
|         return Lemmatizer(LOOKUP) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Portuguese(Language): | class Portuguese(Language): | ||||||
|  |  | ||||||
|  | @ -9,7 +9,6 @@ from .lemmatizer import LEMMA_RULES, LOOKUP | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ..norm_exceptions import BASE_NORMS | from ..norm_exceptions import BASE_NORMS | ||||||
| from ...language import Language | from ...language import Language | ||||||
| from ...lemmatizerlookup import Lemmatizer |  | ||||||
| from ...attrs import LANG, NORM | from ...attrs import LANG, NORM | ||||||
| from ...util import update_exc, add_lookups | from ...util import update_exc, add_lookups | ||||||
| 
 | 
 | ||||||
|  | @ -18,13 +17,10 @@ class SwedishDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'sv' |     lex_attr_getters[LANG] = lambda text: 'sv' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
| 
 |     lemma_rules = LEMMA_RULES | ||||||
|     @classmethod |     lemma_lookup = LOOKUP | ||||||
|     def create_lemmatizer(cls, nlp=None): |  | ||||||
|         return Lemmatizer(LOOKUP) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Swedish(Language): | class Swedish(Language): | ||||||
|  |  | ||||||
|  | @ -12,17 +12,19 @@ from ...language import Language | ||||||
| from ...attrs import LANG, NORM | from ...attrs import LANG, NORM | ||||||
| from ...util import update_exc, add_lookups | from ...util import update_exc, add_lookups | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| class ThaiDefaults(Language.Defaults): | class ThaiDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'th' |     lex_attr_getters[LANG] = lambda text: 'th' | ||||||
|     tokenizer_exceptions = TOKENIZER_EXCEPTIONS |     tokenizer_exceptions = dict(TOKENIZER_EXCEPTIONS) | ||||||
|     tag_map = dict(TAG_MAP) |     tag_map = TAG_MAP | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = STOP_WORDS | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Thai(Language): | class Thai(Language): | ||||||
|     lang = 'th' |     lang = 'th' | ||||||
|     Defaults = ThaiDefaults |     Defaults = ThaiDefaults | ||||||
|  | 
 | ||||||
|     def make_doc(self, text): |     def make_doc(self, text): | ||||||
|         try: |         try: | ||||||
|             from pythainlp.tokenize import word_tokenize |             from pythainlp.tokenize import word_tokenize | ||||||
|  | @ -32,4 +34,5 @@ class Thai(Language): | ||||||
|         words = [x for x in list(word_tokenize(text,"newmm"))] |         words = [x for x in list(word_tokenize(text,"newmm"))] | ||||||
|         return Doc(self.vocab, words=words, spaces=[False]*len(words)) |         return Doc(self.vocab, words=words, spaces=[False]*len(words)) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| __all__ = ['Thai'] | __all__ = ['Thai'] | ||||||
|  |  | ||||||
|  | @ -13,7 +13,6 @@ class MultiLanguageDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'xx' |     lex_attr_getters[LANG] = lambda text: 'xx' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 |  | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,12 +1,9 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import absolute_import, unicode_literals | from __future__ import absolute_import, unicode_literals | ||||||
| from contextlib import contextmanager | from contextlib import contextmanager | ||||||
| import dill |  | ||||||
| 
 | 
 | ||||||
| import numpy |  | ||||||
| from thinc.neural import Model | from thinc.neural import Model | ||||||
| from thinc.neural.ops import NumpyOps, CupyOps | from thinc.neural.optimizers import Adam | ||||||
| from thinc.neural.optimizers import Adam, SGD |  | ||||||
| import random | import random | ||||||
| import ujson | import ujson | ||||||
| from collections import OrderedDict | from collections import OrderedDict | ||||||
|  | @ -17,30 +14,27 @@ from .vocab import Vocab | ||||||
| from .tagger import Tagger | from .tagger import Tagger | ||||||
| from .lemmatizer import Lemmatizer | from .lemmatizer import Lemmatizer | ||||||
| from .syntax.parser import get_templates | from .syntax.parser import get_templates | ||||||
| from .syntax import nonproj |  | ||||||
| 
 | 
 | ||||||
| from .pipeline import NeuralDependencyParser, EntityRecognizer | from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger | ||||||
| from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer | from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer | ||||||
| from .pipeline import NeuralLabeller |  | ||||||
| from .pipeline import SimilarityHook |  | ||||||
| from .pipeline import TextCategorizer |  | ||||||
| from . import about |  | ||||||
| 
 | 
 | ||||||
| from .compat import json_dumps, izip | from .compat import json_dumps, izip | ||||||
|  | from .scorer import Scorer | ||||||
|  | from ._ml import link_vectors_to_models | ||||||
| from .attrs import IS_STOP | from .attrs import IS_STOP | ||||||
| from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | ||||||
| from .lang.tokenizer_exceptions import TOKEN_MATCH | from .lang.tokenizer_exceptions import TOKEN_MATCH | ||||||
| from .lang.tag_map import TAG_MAP | from .lang.tag_map import TAG_MAP | ||||||
| from .lang.lex_attrs import LEX_ATTRS | from .lang.lex_attrs import LEX_ATTRS | ||||||
| from . import util | from . import util | ||||||
| from .scorer import Scorer | from . import about | ||||||
| from ._ml import link_vectors_to_models |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class BaseDefaults(object): | class BaseDefaults(object): | ||||||
|     @classmethod |     @classmethod | ||||||
|     def create_lemmatizer(cls, nlp=None): |     def create_lemmatizer(cls, nlp=None): | ||||||
|         return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules) |         return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules, | ||||||
|  |                           cls.lemma_lookup) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def create_vocab(cls, nlp=None): |     def create_vocab(cls, nlp=None): | ||||||
|  | @ -70,59 +64,7 @@ class BaseDefaults(object): | ||||||
|                          prefix_search=prefix_search, suffix_search=suffix_search, |                          prefix_search=prefix_search, suffix_search=suffix_search, | ||||||
|                          infix_finditer=infix_finditer, token_match=token_match) |                          infix_finditer=infix_finditer, token_match=token_match) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     pipe_names = ['tensorizer', 'tagger', 'parser', 'ner'] | ||||||
|     def create_tagger(cls, nlp=None, **cfg): |  | ||||||
|         if nlp is None: |  | ||||||
|             return NeuralTagger(cls.create_vocab(nlp), **cfg) |  | ||||||
|         else: |  | ||||||
|             return NeuralTagger(nlp.vocab, **cfg) |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def create_parser(cls, nlp=None, **cfg): |  | ||||||
|         if nlp is None: |  | ||||||
|             return NeuralDependencyParser(cls.create_vocab(nlp), **cfg) |  | ||||||
|         else: |  | ||||||
|             return NeuralDependencyParser(nlp.vocab, **cfg) |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def create_entity(cls, nlp=None, **cfg): |  | ||||||
|         if nlp is None: |  | ||||||
|             return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg) |  | ||||||
|         else: |  | ||||||
|             return NeuralEntityRecognizer(nlp.vocab, **cfg) |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def create_pipeline(cls, nlp=None, disable=tuple()): |  | ||||||
|         meta = nlp.meta if nlp is not None else {} |  | ||||||
|         # Resolve strings, like "cnn", "lstm", etc |  | ||||||
|         pipeline = [] |  | ||||||
|         for entry in meta.get('pipeline', []): |  | ||||||
|             if entry in disable or getattr(entry, 'name', entry) in disable: |  | ||||||
|                 continue |  | ||||||
|             factory = cls.Defaults.factories[entry] |  | ||||||
|             pipeline.append(factory(nlp, **meta.get(entry, {}))) |  | ||||||
|         return pipeline |  | ||||||
| 
 |  | ||||||
|     factories = { |  | ||||||
|         'make_doc': create_tokenizer, |  | ||||||
|         'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], |  | ||||||
|         'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], |  | ||||||
|         'parser': lambda nlp, **cfg: [ |  | ||||||
|             NeuralDependencyParser(nlp.vocab, **cfg), |  | ||||||
|             nonproj.deprojectivize], |  | ||||||
|         'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], |  | ||||||
|         'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)], |  | ||||||
|         'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)], |  | ||||||
|         # Temporary compatibility -- delete after pivot |  | ||||||
|         'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], |  | ||||||
|         'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], |  | ||||||
|         'dependencies': lambda nlp, **cfg: [ |  | ||||||
|             NeuralDependencyParser(nlp.vocab, **cfg), |  | ||||||
|             nonproj.deprojectivize, |  | ||||||
|         ], |  | ||||||
|         'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     token_match = TOKEN_MATCH |     token_match = TOKEN_MATCH | ||||||
|     prefixes = tuple(TOKENIZER_PREFIXES) |     prefixes = tuple(TOKENIZER_PREFIXES) | ||||||
|     suffixes = tuple(TOKENIZER_SUFFIXES) |     suffixes = tuple(TOKENIZER_SUFFIXES) | ||||||
|  | @ -136,6 +78,7 @@ class BaseDefaults(object): | ||||||
|     lemma_rules = {} |     lemma_rules = {} | ||||||
|     lemma_exc = {} |     lemma_exc = {} | ||||||
|     lemma_index = {} |     lemma_index = {} | ||||||
|  |     lemma_lookup = {} | ||||||
|     morph_rules = {} |     morph_rules = {} | ||||||
|     lex_attr_getters = LEX_ATTRS |     lex_attr_getters = LEX_ATTRS | ||||||
|     syntax_iterators = {} |     syntax_iterators = {} | ||||||
|  | @ -152,8 +95,17 @@ class Language(object): | ||||||
|     Defaults = BaseDefaults |     Defaults = BaseDefaults | ||||||
|     lang = None |     lang = None | ||||||
| 
 | 
 | ||||||
|     def __init__(self, vocab=True, make_doc=True, pipeline=None, |     factories = { | ||||||
|                  meta={}, disable=tuple(), **kwargs): |         'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), | ||||||
|  |         'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg), | ||||||
|  |         'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg), | ||||||
|  |         'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg), | ||||||
|  |         'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg), | ||||||
|  |         'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), | ||||||
|  |         'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg) | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs): | ||||||
|         """Initialise a Language object. |         """Initialise a Language object. | ||||||
| 
 | 
 | ||||||
|         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via |         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via | ||||||
|  | @ -179,28 +131,7 @@ class Language(object): | ||||||
|             factory = self.Defaults.create_tokenizer |             factory = self.Defaults.create_tokenizer | ||||||
|             make_doc = factory(self, **meta.get('tokenizer', {})) |             make_doc = factory(self, **meta.get('tokenizer', {})) | ||||||
|         self.tokenizer = make_doc |         self.tokenizer = make_doc | ||||||
|         if pipeline is True: |  | ||||||
|             self.pipeline = self.Defaults.create_pipeline(self, disable) |  | ||||||
|         elif pipeline: |  | ||||||
|             # Careful not to do getattr(p, 'name', None) here |  | ||||||
|             # If we had disable=[None], we'd disable everything! |  | ||||||
|             self.pipeline = [p for p in pipeline |  | ||||||
|                              if p not in disable |  | ||||||
|                              and getattr(p, 'name', p) not in disable] |  | ||||||
|             # Resolve strings, like "cnn", "lstm", etc |  | ||||||
|             for i, entry in enumerate(self.pipeline): |  | ||||||
|                 if entry in self.Defaults.factories: |  | ||||||
|                     factory = self.Defaults.factories[entry] |  | ||||||
|                     self.pipeline[i] = factory(self, **meta.get(entry, {})) |  | ||||||
|         else: |  | ||||||
|         self.pipeline = [] |         self.pipeline = [] | ||||||
|         flat_list = [] |  | ||||||
|         for pipe in self.pipeline: |  | ||||||
|             if isinstance(pipe, list): |  | ||||||
|                 flat_list.extend(pipe) |  | ||||||
|             else: |  | ||||||
|                 flat_list.append(pipe) |  | ||||||
|         self.pipeline = flat_list |  | ||||||
|         self._optimizer = None |         self._optimizer = None | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|  | @ -214,11 +145,7 @@ class Language(object): | ||||||
|         self._meta.setdefault('email', '') |         self._meta.setdefault('email', '') | ||||||
|         self._meta.setdefault('url', '') |         self._meta.setdefault('url', '') | ||||||
|         self._meta.setdefault('license', '') |         self._meta.setdefault('license', '') | ||||||
|         pipeline = [] |         self._meta['pipeline'] = self.pipe_names | ||||||
|         for component in self.pipeline: |  | ||||||
|             if hasattr(component, 'name'): |  | ||||||
|                 pipeline.append(component.name) |  | ||||||
|         self._meta['pipeline'] = pipeline |  | ||||||
|         return self._meta |         return self._meta | ||||||
| 
 | 
 | ||||||
|     @meta.setter |     @meta.setter | ||||||
|  | @ -228,34 +155,144 @@ class Language(object): | ||||||
|     # Conveniences to access pipeline components |     # Conveniences to access pipeline components | ||||||
|     @property |     @property | ||||||
|     def tensorizer(self): |     def tensorizer(self): | ||||||
|         return self.get_component('tensorizer') |         return self.get_pipe('tensorizer') | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def tagger(self): |     def tagger(self): | ||||||
|         return self.get_component('tagger') |         return self.get_pipe('tagger') | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def parser(self): |     def parser(self): | ||||||
|         return self.get_component('parser') |         return self.get_pipe('parser') | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def entity(self): |     def entity(self): | ||||||
|         return self.get_component('ner') |         return self.get_pipe('ner') | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def matcher(self): |     def matcher(self): | ||||||
|         return self.get_component('matcher') |         return self.get_pipe('matcher') | ||||||
| 
 | 
 | ||||||
|     def get_component(self, name): |     @property | ||||||
|         if self.pipeline in (True, None): |     def pipe_names(self): | ||||||
|             return None |         """Get names of available pipeline components. | ||||||
|         for proc in self.pipeline: | 
 | ||||||
|             if hasattr(proc, 'name') and proc.name.endswith(name): |         RETURNS (list): List of component name strings, in order. | ||||||
|                 return proc |         """ | ||||||
|         return None |         return [pipe_name for pipe_name, _ in self.pipeline] | ||||||
|  | 
 | ||||||
|  |     def get_pipe(self, name): | ||||||
|  |         """Get a pipeline component for a given component name. | ||||||
|  | 
 | ||||||
|  |         name (unicode): Name of pipeline component to get. | ||||||
|  |         RETURNS (callable): The pipeline component. | ||||||
|  |         """ | ||||||
|  |         for pipe_name, component in self.pipeline: | ||||||
|  |             if pipe_name == name: | ||||||
|  |                 return component | ||||||
|  |         msg = "No component '{}' found in pipeline. Available names: {}" | ||||||
|  |         raise KeyError(msg.format(name, self.pipe_names)) | ||||||
|  | 
 | ||||||
|  |     def create_pipe(self, name, config=dict()): | ||||||
|  |         """Create a pipeline component from a factory. | ||||||
|  | 
 | ||||||
|  |         name (unicode): Factory name to look up in `Language.factories`. | ||||||
|  |         config (dict): Configuration parameters to initialise component. | ||||||
|  |         RETURNS (callable): Pipeline component. | ||||||
|  |         """ | ||||||
|  |         if name not in self.factories: | ||||||
|  |             raise KeyError("Can't find factory for '{}'.".format(name)) | ||||||
|  |         factory = self.factories[name] | ||||||
|  |         return factory(self, **config) | ||||||
|  | 
 | ||||||
|  |     def add_pipe(self, component, name=None, before=None, after=None, | ||||||
|  |                  first=None, last=None): | ||||||
|  |         """Add a component to the processing pipeline. Valid components are | ||||||
|  |         callables that take a `Doc` object, modify it and return it. Only one of | ||||||
|  |         before, after, first or last can be set. Default behaviour is "last". | ||||||
|  | 
 | ||||||
|  |         component (callable): The pipeline component. | ||||||
|  |         name (unicode): Name of pipeline component. Overwrites existing | ||||||
|  |             component.name attribute if available. If no name is set and | ||||||
|  |             the component exposes no name attribute, component.__name__ is | ||||||
|  |             used. An error is raised if the name already exists in the pipeline. | ||||||
|  |         before (unicode): Component name to insert component directly before. | ||||||
|  |         after (unicode): Component name to insert component directly after. | ||||||
|  |         first (bool): Insert component first / not first in the pipeline. | ||||||
|  |         last (bool): Insert component last / not last in the pipeline. | ||||||
|  | 
 | ||||||
|  |         EXAMPLE: | ||||||
|  |             >>> nlp.add_pipe(component, before='ner') | ||||||
|  |             >>> nlp.add_pipe(component, name='custom_name', last=True) | ||||||
|  |         """ | ||||||
|  |         if name is None: | ||||||
|  |             if hasattr(component, 'name'): | ||||||
|  |                 name = component.name | ||||||
|  |             elif hasattr(component, '__name__'): | ||||||
|  |                 name = component.__name__ | ||||||
|  |             elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'): | ||||||
|  |                 name = component.__class__.__name__ | ||||||
|  |             else: | ||||||
|  |                 name = repr(component) | ||||||
|  |         if name in self.pipe_names: | ||||||
|  |             raise ValueError("'{}' already exists in pipeline.".format(name)) | ||||||
|  |         if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2: | ||||||
|  |             msg = ("Invalid constraints. You can only set one of the " | ||||||
|  |                    "following: before, after, first, last.") | ||||||
|  |             raise ValueError(msg) | ||||||
|  |         pipe = (name, component) | ||||||
|  |         if last or not any([first, before, after]): | ||||||
|  |             self.pipeline.append(pipe) | ||||||
|  |         elif first: | ||||||
|  |             self.pipeline.insert(0, pipe) | ||||||
|  |         elif before and before in self.pipe_names: | ||||||
|  |             self.pipeline.insert(self.pipe_names.index(before), pipe) | ||||||
|  |         elif after and after in self.pipe_names: | ||||||
|  |             self.pipeline.insert(self.pipe_names.index(after), pipe) | ||||||
|  |         else: | ||||||
|  |             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||||
|  |             unfound = before or after | ||||||
|  |             raise ValueError(msg.format(unfound, self.pipe_names)) | ||||||
|  | 
 | ||||||
|  |     def replace_pipe(self, name, component): | ||||||
|  |         """Replace a component in the pipeline. | ||||||
|  | 
 | ||||||
|  |         name (unicode): Name of the component to replace. | ||||||
|  |         component (callable): Pipeline component. | ||||||
|  |         """ | ||||||
|  |         if name not in self.pipe_names: | ||||||
|  |             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||||
|  |             raise ValueError(msg.format(name, self.pipe_names)) | ||||||
|  |         self.pipeline[self.pipe_names.index(name)] = (name, component) | ||||||
|  | 
 | ||||||
|  |     def rename_pipe(self, old_name, new_name): | ||||||
|  |         """Rename a pipeline component. | ||||||
|  | 
 | ||||||
|  |         old_name (unicode): Name of the component to rename. | ||||||
|  |         new_name (unicode): New name of the component. | ||||||
|  |         """ | ||||||
|  |         if old_name not in self.pipe_names: | ||||||
|  |             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||||
|  |             raise ValueError(msg.format(old_name, self.pipe_names)) | ||||||
|  |         if new_name in self.pipe_names: | ||||||
|  |             msg = "'{}' already exists in pipeline. Existing names: {}" | ||||||
|  |             raise ValueError(msg.format(new_name, self.pipe_names)) | ||||||
|  |         i = self.pipe_names.index(old_name) | ||||||
|  |         self.pipeline[i] = (new_name, self.pipeline[i][1]) | ||||||
|  | 
 | ||||||
|  |     def remove_pipe(self, name): | ||||||
|  |         """Remove a component from the pipeline. | ||||||
|  | 
 | ||||||
|  |         name (unicode): Name of the component to remove. | ||||||
|  |         RETURNS (tuple): A `(name, component)` tuple of the removed component. | ||||||
|  |         """ | ||||||
|  |         if name not in self.pipe_names: | ||||||
|  |             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||||
|  |             raise ValueError(msg.format(name, self.pipe_names)) | ||||||
|  |         return self.pipeline.pop(self.pipe_names.index(name)) | ||||||
| 
 | 
 | ||||||
|     def __call__(self, text, disable=[]): |     def __call__(self, text, disable=[]): | ||||||
|         """'Apply the pipeline to some text. The text can span multiple sentences, |         """Apply the pipeline to some text. The text can span multiple sentences, | ||||||
|         and can contain arbitrary whitespace. Alignment into the original string |         and can contain arbitrary whitespace. Alignment into the original string | ||||||
|         is preserved. |         is preserved. | ||||||
| 
 | 
 | ||||||
|  | @ -269,8 +306,7 @@ class Language(object): | ||||||
|             ('An', 'NN') |             ('An', 'NN') | ||||||
|         """ |         """ | ||||||
|         doc = self.make_doc(text) |         doc = self.make_doc(text) | ||||||
|         for proc in self.pipeline: |         for name, proc in self.pipeline: | ||||||
|             name = getattr(proc, 'name', None) |  | ||||||
|             if name in disable: |             if name in disable: | ||||||
|                 continue |                 continue | ||||||
|             doc = proc(doc) |             doc = proc(doc) | ||||||
|  | @ -308,7 +344,7 @@ class Language(object): | ||||||
|             grads[key] = (W, dW) |             grads[key] = (W, dW) | ||||||
|         pipes = list(self.pipeline) |         pipes = list(self.pipeline) | ||||||
|         random.shuffle(pipes) |         random.shuffle(pipes) | ||||||
|         for proc in pipes: |         for name, proc in pipes: | ||||||
|             if not hasattr(proc, 'update'): |             if not hasattr(proc, 'update'): | ||||||
|                 continue |                 continue | ||||||
|             proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses) |             proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses) | ||||||
|  | @ -322,7 +358,7 @@ class Language(object): | ||||||
|         docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. |         docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. | ||||||
|         YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects. |         YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects. | ||||||
|         """ |         """ | ||||||
|         for proc in self.pipeline: |         for name, proc in self.pipeline: | ||||||
|             if hasattr(proc, 'preprocess_gold'): |             if hasattr(proc, 'preprocess_gold'): | ||||||
|                 docs_golds = proc.preprocess_gold(docs_golds) |                 docs_golds = proc.preprocess_gold(docs_golds) | ||||||
|         for doc, gold in docs_golds: |         for doc, gold in docs_golds: | ||||||
|  | @ -354,7 +390,7 @@ class Language(object): | ||||||
| 
 | 
 | ||||||
|         get_gold_tuples (function): Function returning gold data |         get_gold_tuples (function): Function returning gold data | ||||||
|         **cfg: Config parameters. |         **cfg: Config parameters. | ||||||
|         returns: An optimizer |         RETURNS: An optimizer | ||||||
|         """ |         """ | ||||||
|         # Populate vocab |         # Populate vocab | ||||||
|         if get_gold_tuples is not None: |         if get_gold_tuples is not None: | ||||||
|  | @ -371,7 +407,7 @@ class Language(object): | ||||||
|         else: |         else: | ||||||
|             device = None |             device = None | ||||||
|         link_vectors_to_models(self.vocab) |         link_vectors_to_models(self.vocab) | ||||||
|         for proc in self.pipeline: |         for name, proc in self.pipeline: | ||||||
|             if hasattr(proc, 'begin_training'): |             if hasattr(proc, 'begin_training'): | ||||||
|                 context = proc.begin_training(get_gold_tuples(), |                 context = proc.begin_training(get_gold_tuples(), | ||||||
|                                               pipeline=self.pipeline) |                                               pipeline=self.pipeline) | ||||||
|  | @ -393,7 +429,7 @@ class Language(object): | ||||||
|         docs, golds = zip(*docs_golds) |         docs, golds = zip(*docs_golds) | ||||||
|         docs = list(docs) |         docs = list(docs) | ||||||
|         golds = list(golds) |         golds = list(golds) | ||||||
|         for pipe in self.pipeline: |         for name, pipe in self.pipeline: | ||||||
|             if not hasattr(pipe, 'pipe'): |             if not hasattr(pipe, 'pipe'): | ||||||
|                 for doc in docs: |                 for doc in docs: | ||||||
|                     pipe(doc) |                     pipe(doc) | ||||||
|  | @ -419,7 +455,7 @@ class Language(object): | ||||||
|             >>> with nlp.use_params(optimizer.averages): |             >>> with nlp.use_params(optimizer.averages): | ||||||
|             >>>     nlp.to_disk('/tmp/checkpoint') |             >>>     nlp.to_disk('/tmp/checkpoint') | ||||||
|         """ |         """ | ||||||
|         contexts = [pipe.use_params(params) for pipe |         contexts = [pipe.use_params(params) for name, pipe | ||||||
|                     in self.pipeline if hasattr(pipe, 'use_params')] |                     in self.pipeline if hasattr(pipe, 'use_params')] | ||||||
|         # TODO: Having trouble with contextlib |         # TODO: Having trouble with contextlib | ||||||
|         # Workaround: these aren't actually context managers atm. |         # Workaround: these aren't actually context managers atm. | ||||||
|  | @ -466,8 +502,7 @@ class Language(object): | ||||||
|                 yield (doc, context) |                 yield (doc, context) | ||||||
|             return |             return | ||||||
|         docs = (self.make_doc(text) for text in texts) |         docs = (self.make_doc(text) for text in texts) | ||||||
|         for proc in self.pipeline: |         for name, proc in self.pipeline: | ||||||
|             name = getattr(proc, 'name', None) |  | ||||||
|             if name in disable: |             if name in disable: | ||||||
|                 continue |                 continue | ||||||
|             if hasattr(proc, 'pipe'): |             if hasattr(proc, 'pipe'): | ||||||
|  | @ -495,14 +530,14 @@ class Language(object): | ||||||
|             ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), |             ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), | ||||||
|             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) |             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) | ||||||
|         )) |         )) | ||||||
|         for proc in self.pipeline: |         for name, proc in self.pipeline: | ||||||
|             if not hasattr(proc, 'name'): |             if not hasattr(proc, 'name'): | ||||||
|                 continue |                 continue | ||||||
|             if proc.name in disable: |             if name in disable: | ||||||
|                 continue |                 continue | ||||||
|             if not hasattr(proc, 'to_disk'): |             if not hasattr(proc, 'to_disk'): | ||||||
|                 continue |                 continue | ||||||
|             serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) |             serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) | ||||||
|         serializers['vocab'] = lambda p: self.vocab.to_disk(p) |         serializers['vocab'] = lambda p: self.vocab.to_disk(p) | ||||||
|         util.to_disk(path, serializers, {p: False for p in disable}) |         util.to_disk(path, serializers, {p: False for p in disable}) | ||||||
| 
 | 
 | ||||||
|  | @ -526,14 +561,12 @@ class Language(object): | ||||||
|             ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)), |             ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)), | ||||||
|             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) |             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) | ||||||
|         )) |         )) | ||||||
|         for proc in self.pipeline: |         for name, proc in self.pipeline: | ||||||
|             if not hasattr(proc, 'name'): |             if name in disable: | ||||||
|                 continue |  | ||||||
|             if proc.name in disable: |  | ||||||
|                 continue |                 continue | ||||||
|             if not hasattr(proc, 'to_disk'): |             if not hasattr(proc, 'to_disk'): | ||||||
|                 continue |                 continue | ||||||
|             deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) |             deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) | ||||||
|         exclude = {p: False for p in disable} |         exclude = {p: False for p in disable} | ||||||
|         if not (path / 'vocab').exists(): |         if not (path / 'vocab').exists(): | ||||||
|             exclude['vocab'] = True |             exclude['vocab'] = True | ||||||
|  | @ -552,8 +585,8 @@ class Language(object): | ||||||
|             ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)), |             ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)), | ||||||
|             ('meta', lambda: ujson.dumps(self.meta)) |             ('meta', lambda: ujson.dumps(self.meta)) | ||||||
|         )) |         )) | ||||||
|         for i, proc in enumerate(self.pipeline): |         for i, (name, proc) in enumerate(self.pipeline): | ||||||
|             if getattr(proc, 'name', None) in disable: |             if name in disable: | ||||||
|                 continue |                 continue | ||||||
|             if not hasattr(proc, 'to_bytes'): |             if not hasattr(proc, 'to_bytes'): | ||||||
|                 continue |                 continue | ||||||
|  | @ -572,8 +605,8 @@ class Language(object): | ||||||
|             ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)), |             ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)), | ||||||
|             ('meta', lambda b: self.meta.update(ujson.loads(b))) |             ('meta', lambda b: self.meta.update(ujson.loads(b))) | ||||||
|         )) |         )) | ||||||
|         for i, proc in enumerate(self.pipeline): |         for i, (name, proc) in enumerate(self.pipeline): | ||||||
|             if getattr(proc, 'name', None) in disable: |             if name in disable: | ||||||
|                 continue |                 continue | ||||||
|             if not hasattr(proc, 'from_bytes'): |             if not hasattr(proc, 'from_bytes'): | ||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|  | @ -10,20 +10,23 @@ class Lemmatizer(object): | ||||||
|     def load(cls, path, index=None, exc=None, rules=None): |     def load(cls, path, index=None, exc=None, rules=None): | ||||||
|         return cls(index or {}, exc or {}, rules or {}) |         return cls(index or {}, exc or {}, rules or {}) | ||||||
| 
 | 
 | ||||||
|     def __init__(self, index, exceptions, rules): |     def __init__(self, index=None, exceptions=None, rules=None, lookup=None): | ||||||
|         self.index = index |         self.index = index if index is not None else {} | ||||||
|         self.exc = exceptions |         self.exc = exceptions if exceptions is not None else {} | ||||||
|         self.rules = rules |         self.rules = rules if rules is not None else {} | ||||||
|  |         self.lookup_table = lookup if lookup is not None else {} | ||||||
| 
 | 
 | ||||||
|     def __call__(self, string, univ_pos, morphology=None): |     def __call__(self, string, univ_pos, morphology=None): | ||||||
|         if univ_pos == NOUN: |         if univ_pos in (NOUN, 'NOUN', 'noun'): | ||||||
|             univ_pos = 'noun' |             univ_pos = 'noun' | ||||||
|         elif univ_pos == VERB: |         elif univ_pos in (VERB, 'VERB', 'verb'): | ||||||
|             univ_pos = 'verb' |             univ_pos = 'verb' | ||||||
|         elif univ_pos == ADJ: |         elif univ_pos in (ADJ, 'ADJ', 'adj'): | ||||||
|             univ_pos = 'adj' |             univ_pos = 'adj' | ||||||
|         elif univ_pos == PUNCT: |         elif univ_pos in (PUNCT, 'PUNCT', 'punct'): | ||||||
|             univ_pos = 'punct' |             univ_pos = 'punct' | ||||||
|  |         else: | ||||||
|  |             return set([string.lower()]) | ||||||
|         # See Issue #435 for example of where this logic is requied. |         # See Issue #435 for example of where this logic is requied. | ||||||
|         if self.is_base_form(univ_pos, morphology): |         if self.is_base_form(univ_pos, morphology): | ||||||
|             return set([string.lower()]) |             return set([string.lower()]) | ||||||
|  | @ -77,6 +80,11 @@ class Lemmatizer(object): | ||||||
|     def punct(self, string, morphology=None): |     def punct(self, string, morphology=None): | ||||||
|         return self(string, 'punct', morphology) |         return self(string, 'punct', morphology) | ||||||
| 
 | 
 | ||||||
|  |     def lookup(self, string): | ||||||
|  |         if string in self.lookup_table: | ||||||
|  |             return self.lookup_table[string] | ||||||
|  |         return string | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def lemmatize(string, index, exceptions, rules): | def lemmatize(string, index, exceptions, rules): | ||||||
|     string = string.lower() |     string = string.lower() | ||||||
|  |  | ||||||
|  | @ -1,19 +0,0 @@ | ||||||
| # coding: utf8 |  | ||||||
| from __future__ import unicode_literals |  | ||||||
| 
 |  | ||||||
| from .lemmatizer import Lemmatizer |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class Lemmatizer(Lemmatizer): |  | ||||||
|     @classmethod |  | ||||||
|     def load(cls, path, lookup): |  | ||||||
|         return cls(lookup or {}) |  | ||||||
| 
 |  | ||||||
|     def __init__(self, lookup): |  | ||||||
|         self.lookup = lookup |  | ||||||
| 
 |  | ||||||
|     def __call__(self, string, univ_pos, morphology=None): |  | ||||||
|         try: |  | ||||||
|             return set([self.lookup[string]]) |  | ||||||
|         except: |  | ||||||
|             return set([string]) |  | ||||||
|  | @ -35,6 +35,8 @@ cdef class Morphology: | ||||||
|     cdef RichTagC* rich_tags |     cdef RichTagC* rich_tags | ||||||
|     cdef PreshMapArray _cache |     cdef PreshMapArray _cache | ||||||
| 
 | 
 | ||||||
|  |     cdef int assign_untagged(self, TokenC* token) except -1 | ||||||
|  | 
 | ||||||
|     cdef int assign_tag(self, TokenC* token, tag) except -1 |     cdef int assign_tag(self, TokenC* token, tag) except -1 | ||||||
| 
 | 
 | ||||||
|     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 |     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 | ||||||
|  |  | ||||||
|  | @ -42,7 +42,7 @@ cdef class Morphology: | ||||||
|         self.tag_names = tuple(sorted(tag_map.keys())) |         self.tag_names = tuple(sorted(tag_map.keys())) | ||||||
|         self.reverse_index = {} |         self.reverse_index = {} | ||||||
| 
 | 
 | ||||||
|         self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC)) |         self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) | ||||||
|         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): |         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): | ||||||
|             self.tag_map[tag_str] = dict(attrs) |             self.tag_map[tag_str] = dict(attrs) | ||||||
|             attrs = _normalize_props(attrs) |             attrs = _normalize_props(attrs) | ||||||
|  | @ -52,6 +52,10 @@ cdef class Morphology: | ||||||
|             self.rich_tags[i].morph = 0 |             self.rich_tags[i].morph = 0 | ||||||
|             self.rich_tags[i].pos = attrs[POS] |             self.rich_tags[i].pos = attrs[POS] | ||||||
|             self.reverse_index[self.rich_tags[i].name] = i |             self.reverse_index[self.rich_tags[i].name] = i | ||||||
|  |         # Add a 'null' tag, which we can reference when assign morphology to | ||||||
|  |         # untagged tokens. | ||||||
|  |         self.rich_tags[self.n_tags].id = self.n_tags | ||||||
|  | 
 | ||||||
|         self._cache = PreshMapArray(self.n_tags) |         self._cache = PreshMapArray(self.n_tags) | ||||||
|         self.exc = {} |         self.exc = {} | ||||||
|         if exc is not None: |         if exc is not None: | ||||||
|  | @ -62,6 +66,15 @@ cdef class Morphology: | ||||||
|         return (Morphology, (self.strings, self.tag_map, self.lemmatizer, |         return (Morphology, (self.strings, self.tag_map, self.lemmatizer, | ||||||
|                              self.exc), None, None) |                              self.exc), None, None) | ||||||
| 
 | 
 | ||||||
|  |     cdef int assign_untagged(self, TokenC* token) except -1: | ||||||
|  |         """Set morphological attributes on a token without a POS tag. Uses | ||||||
|  |         the lemmatizer's lookup() method, which looks up the string in the | ||||||
|  |         table provided by the language data as lemma_lookup (if available).""" | ||||||
|  |         if token.lemma == 0: | ||||||
|  |             orth_str = self.strings[token.lex.orth] | ||||||
|  |             lemma = self.lemmatizer.lookup(orth_str) | ||||||
|  |             token.lemma = self.strings.add(lemma) | ||||||
|  | 
 | ||||||
|     cdef int assign_tag(self, TokenC* token, tag) except -1: |     cdef int assign_tag(self, TokenC* token, tag) except -1: | ||||||
|         if isinstance(tag, basestring): |         if isinstance(tag, basestring): | ||||||
|             tag = self.strings.add(tag) |             tag = self.strings.add(tag) | ||||||
|  | @ -72,7 +85,7 @@ cdef class Morphology: | ||||||
|             token.tag = tag |             token.tag = tag | ||||||
| 
 | 
 | ||||||
|     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: |     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: | ||||||
|         if tag_id >= self.n_tags: |         if tag_id > self.n_tags: | ||||||
|             raise ValueError("Unknown tag ID: %s" % tag_id) |             raise ValueError("Unknown tag ID: %s" % tag_id) | ||||||
|         # TODO: It's pretty arbitrary to put this logic here. I guess the justification |         # TODO: It's pretty arbitrary to put this logic here. I guess the justification | ||||||
|         # is that this is where the specific word and the tag interact. Still, |         # is that this is where the specific word and the tag interact. Still, | ||||||
|  | @ -151,8 +164,6 @@ cdef class Morphology: | ||||||
|         cdef unicode py_string = self.strings[orth] |         cdef unicode py_string = self.strings[orth] | ||||||
|         if self.lemmatizer is None: |         if self.lemmatizer is None: | ||||||
|             return self.strings.add(py_string.lower()) |             return self.strings.add(py_string.lower()) | ||||||
|         if univ_pos not in (NOUN, VERB, ADJ, PUNCT): |  | ||||||
|             return self.strings.add(py_string.lower()) |  | ||||||
|         cdef set lemma_strings |         cdef set lemma_strings | ||||||
|         cdef unicode lemma_string |         cdef unicode lemma_string | ||||||
|         lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) |         lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) | ||||||
|  |  | ||||||
|  | @ -28,6 +28,7 @@ from thinc.neural._classes.difference import Siamese, CauchySimilarity | ||||||
| from .tokens.doc cimport Doc | from .tokens.doc cimport Doc | ||||||
| from .syntax.parser cimport Parser as LinearParser | from .syntax.parser cimport Parser as LinearParser | ||||||
| from .syntax.nn_parser cimport Parser as NeuralParser | from .syntax.nn_parser cimport Parser as NeuralParser | ||||||
|  | from .syntax import nonproj | ||||||
| from .syntax.parser import get_templates as get_feature_templates | from .syntax.parser import get_templates as get_feature_templates | ||||||
| from .syntax.beam_parser cimport BeamParser | from .syntax.beam_parser cimport BeamParser | ||||||
| from .syntax.ner cimport BiluoPushDown | from .syntax.ner cimport BiluoPushDown | ||||||
|  | @ -157,11 +158,13 @@ class BaseThincComponent(object): | ||||||
| 
 | 
 | ||||||
|     def to_bytes(self, **exclude): |     def to_bytes(self, **exclude): | ||||||
|         """Serialize the pipe to a bytestring.""" |         """Serialize the pipe to a bytestring.""" | ||||||
|         serialize = OrderedDict(( |         serialize = OrderedDict() | ||||||
|             ('cfg', lambda: json_dumps(self.cfg)), |         serialize['cfg'] = lambda: json_dumps(self.cfg) | ||||||
|             ('model', lambda: self.model.to_bytes()), |         if self.model in (True, False, None): | ||||||
|             ('vocab', lambda: self.vocab.to_bytes()) |             serialize['model'] = lambda: self.model | ||||||
|         )) |         else: | ||||||
|  |             serialize['model'] = self.model.to_bytes | ||||||
|  |         serialize['vocab'] = self.vocab.to_bytes | ||||||
|         return util.to_bytes(serialize, exclude) |         return util.to_bytes(serialize, exclude) | ||||||
| 
 | 
 | ||||||
|     def from_bytes(self, bytes_data, **exclude): |     def from_bytes(self, bytes_data, **exclude): | ||||||
|  | @ -182,11 +185,11 @@ class BaseThincComponent(object): | ||||||
| 
 | 
 | ||||||
|     def to_disk(self, path, **exclude): |     def to_disk(self, path, **exclude): | ||||||
|         """Serialize the pipe to disk.""" |         """Serialize the pipe to disk.""" | ||||||
|         serialize = OrderedDict(( |         serialize = OrderedDict() | ||||||
|             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))), |         serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg)) | ||||||
|             ('vocab', lambda p: self.vocab.to_disk(p)), |         serialize['vocab'] = lambda p: self.vocab.to_disk(p) | ||||||
|             ('model', lambda p: p.open('wb').write(self.model.to_bytes())), |         if self.model not in (None, True, False): | ||||||
|         )) |             serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes()) | ||||||
|         util.to_disk(path, serialize, exclude) |         util.to_disk(path, serialize, exclude) | ||||||
| 
 | 
 | ||||||
|     def from_disk(self, path, **exclude): |     def from_disk(self, path, **exclude): | ||||||
|  | @ -437,13 +440,16 @@ class NeuralTagger(BaseThincComponent): | ||||||
|             yield |             yield | ||||||
| 
 | 
 | ||||||
|     def to_bytes(self, **exclude): |     def to_bytes(self, **exclude): | ||||||
|         serialize = OrderedDict(( |         serialize = OrderedDict() | ||||||
|             ('model', lambda: self.model.to_bytes()), |         if self.model in (None, True, False): | ||||||
|             ('vocab', lambda: self.vocab.to_bytes()), |             serialize['model'] = lambda: self.model | ||||||
|             ('tag_map', lambda: msgpack.dumps(self.vocab.morphology.tag_map, |         else: | ||||||
|  |             serialize['model'] = self.model.to_bytes | ||||||
|  |         serialize['vocab'] = self.vocab.to_bytes | ||||||
|  | 
 | ||||||
|  |         serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map, | ||||||
|                                                      use_bin_type=True, |                                                      use_bin_type=True, | ||||||
|                                              encoding='utf8')) |                                                      encoding='utf8') | ||||||
|         )) |  | ||||||
|         return util.to_bytes(serialize, exclude) |         return util.to_bytes(serialize, exclude) | ||||||
| 
 | 
 | ||||||
|     def from_bytes(self, bytes_data, **exclude): |     def from_bytes(self, bytes_data, **exclude): | ||||||
|  | @ -778,11 +784,19 @@ cdef class DependencyParser(LinearParser): | ||||||
|         if isinstance(label, basestring): |         if isinstance(label, basestring): | ||||||
|             label = self.vocab.strings[label] |             label = self.vocab.strings[label] | ||||||
| 
 | 
 | ||||||
|  |     @property | ||||||
|  |     def postprocesses(self): | ||||||
|  |         return [nonproj.deprojectivize] | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| cdef class NeuralDependencyParser(NeuralParser): | cdef class NeuralDependencyParser(NeuralParser): | ||||||
|     name = 'parser' |     name = 'parser' | ||||||
|     TransitionSystem = ArcEager |     TransitionSystem = ArcEager | ||||||
| 
 | 
 | ||||||
|  |     @property | ||||||
|  |     def postprocesses(self): | ||||||
|  |         return [nonproj.deprojectivize] | ||||||
|  | 
 | ||||||
|     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): |     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): | ||||||
|         for target in []: |         for target in []: | ||||||
|             labeller = NeuralLabeller(self.vocab, target=target) |             labeller = NeuralLabeller(self.vocab, target=target) | ||||||
|  | @ -823,6 +837,11 @@ cdef class BeamDependencyParser(BeamParser): | ||||||
|         if isinstance(label, basestring): |         if isinstance(label, basestring): | ||||||
|             label = self.vocab.strings[label] |             label = self.vocab.strings[label] | ||||||
| 
 | 
 | ||||||
|  |     @property | ||||||
|  |     def postprocesses(self): | ||||||
|  |         return [nonproj.deprojectivize] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser', | __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser', | ||||||
|            'BeamEntityRecognizer', 'TokenVectorEnoder'] |            'BeamEntityRecognizer', 'TokenVectorEnoder'] | ||||||
|  |  | ||||||
|  | @ -241,8 +241,8 @@ cdef class Parser: | ||||||
|     def Model(cls, nr_class, **cfg): |     def Model(cls, nr_class, **cfg): | ||||||
|         depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1)) |         depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1)) | ||||||
|         token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128)) |         token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128)) | ||||||
|         hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128)) |         hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200)) | ||||||
|         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 1)) |         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2)) | ||||||
|         embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000)) |         embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000)) | ||||||
|         hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0)) |         hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0)) | ||||||
|         hist_width = util.env_opt('history_width', cfg.get('hist_width', 0)) |         hist_width = util.env_opt('history_width', cfg.get('hist_width', 0)) | ||||||
|  | @ -779,6 +779,14 @@ cdef class Parser: | ||||||
|             for i in range(doc.length): |             for i in range(doc.length): | ||||||
|                 doc.c[i] = state.c._sent[i] |                 doc.c[i] = state.c._sent[i] | ||||||
|             self.moves.finalize_doc(doc) |             self.moves.finalize_doc(doc) | ||||||
|  |             for hook in self.postprocesses: | ||||||
|  |                 for doc in docs: | ||||||
|  |                     hook(doc) | ||||||
|  | 
 | ||||||
|  |     @property | ||||||
|  |     def postprocesses(self): | ||||||
|  |         # Available for subclasses, e.g. to deprojectivize | ||||||
|  |         return [] | ||||||
| 
 | 
 | ||||||
|     def add_label(self, label): |     def add_label(self, label): | ||||||
|         resized = False |         resized = False | ||||||
|  | @ -792,6 +800,15 @@ cdef class Parser: | ||||||
|         if self.model not in (True, False, None) and resized: |         if self.model not in (True, False, None) and resized: | ||||||
|             # Weights are stored in (nr_out, nr_in) format, so we're basically |             # Weights are stored in (nr_out, nr_in) format, so we're basically | ||||||
|             # just adding rows here. |             # just adding rows here. | ||||||
|  |             if self.model[-1].is_noop: | ||||||
|  |                 smaller = self.model[1] | ||||||
|  |                 dims = dict(self.model[1]._dims) | ||||||
|  |                 dims['nO'] = self.moves.n_moves | ||||||
|  |                 larger = self.model[1].__class__(**dims) | ||||||
|  |                 copy_array(larger.W[:, :smaller.nO], smaller.W) | ||||||
|  |                 copy_array(larger.b[:smaller.nO], smaller.b) | ||||||
|  |                 self.model = (self.model[0], larger, self.model[2]) | ||||||
|  |             else: | ||||||
|                 smaller = self.model[-1]._layers[-1] |                 smaller = self.model[-1]._layers[-1] | ||||||
|                 larger = Affine(self.moves.n_moves, smaller.nI) |                 larger = Affine(self.moves.n_moves, smaller.nI) | ||||||
|                 copy_array(larger.W[:smaller.nO], smaller.W) |                 copy_array(larger.W[:smaller.nO], smaller.W) | ||||||
|  | @ -801,7 +818,7 @@ cdef class Parser: | ||||||
|     def begin_training(self, gold_tuples, pipeline=None, **cfg): |     def begin_training(self, gold_tuples, pipeline=None, **cfg): | ||||||
|         if 'model' in cfg: |         if 'model' in cfg: | ||||||
|             self.model = cfg['model'] |             self.model = cfg['model'] | ||||||
|         gold_tuples = nonproj.preprocess_training_data(gold_tuples) |         gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100) | ||||||
|         actions = self.moves.get_actions(gold_parses=gold_tuples) |         actions = self.moves.get_actions(gold_parses=gold_tuples) | ||||||
|         for action, labels in actions.items(): |         for action, labels in actions.items(): | ||||||
|             for label in labels: |             for label in labels: | ||||||
|  |  | ||||||
|  | @ -58,8 +58,9 @@ def en_vocab(): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
| def en_parser(): | def en_parser(en_vocab): | ||||||
|     return util.get_lang_class('en').Defaults.create_parser() |     nlp = util.get_lang_class('en')(en_vocab) | ||||||
|  |     return nlp.create_pipe('parser') | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
|  |  | ||||||
							
								
								
									
										37
									
								
								spacy/tests/doc/test_creation.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								spacy/tests/doc/test_creation.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,37 @@ | ||||||
|  | '''Test Doc sets up tokens correctly.''' | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | from ...vocab import Vocab | ||||||
|  | from ...tokens.doc import Doc | ||||||
|  | from ...lemmatizer import Lemmatizer | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.fixture | ||||||
|  | def lemmatizer(): | ||||||
|  |     return Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'}) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.fixture | ||||||
|  | def vocab(lemmatizer): | ||||||
|  |     return Vocab(lemmatizer=lemmatizer) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_empty_doc(vocab): | ||||||
|  |     doc = Doc(vocab) | ||||||
|  |     assert len(doc) == 0 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_single_word(vocab): | ||||||
|  |     doc = Doc(vocab, words=['a']) | ||||||
|  |     assert doc.text == 'a ' | ||||||
|  |     doc = Doc(vocab, words=['a'], spaces=[False]) | ||||||
|  |     assert doc.text == 'a' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_lookup_lemmatization(vocab): | ||||||
|  |     doc = Doc(vocab, words=['dogs', 'dogses']) | ||||||
|  |     assert doc[0].text == 'dogs' | ||||||
|  |     assert doc[0].lemma_ == 'dog' | ||||||
|  |     assert doc[1].text == 'dogses' | ||||||
|  |     assert doc[1].lemma_ == 'dogses' | ||||||
							
								
								
									
										13
									
								
								spacy/tests/lang/de/test_lemma.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								spacy/tests/lang/de/test_lemma.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,13 @@ | ||||||
|  | # coding: utf-8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('string,lemma', [('Abgehängten', 'Abgehängte'), | ||||||
|  |                                           ('engagierte', 'engagieren'), | ||||||
|  |                                           ('schließt', 'schließen'), | ||||||
|  |                                           ('vorgebenden', 'vorgebend')]) | ||||||
|  | def test_lemmatizer_lookup_assigns(de_tokenizer, string, lemma): | ||||||
|  |     tokens = de_tokenizer(string) | ||||||
|  |     assert tokens[0].lemma_ == lemma | ||||||
|  | @ -57,6 +57,5 @@ def test_en_lemmatizer_punct(en_lemmatizer): | ||||||
| def test_en_lemmatizer_lemma_assignment(EN): | def test_en_lemmatizer_lemma_assignment(EN): | ||||||
|     text = "Bananas in pyjamas are geese." |     text = "Bananas in pyjamas are geese." | ||||||
|     doc = EN.make_doc(text) |     doc = EN.make_doc(text) | ||||||
|     assert all(t.lemma_ == '' for t in doc) |  | ||||||
|     EN.tagger(doc) |     EN.tagger(doc) | ||||||
|     assert all(t.lemma_ != '' for t in doc) |     assert all(t.lemma_ != '' for t in doc) | ||||||
|  |  | ||||||
|  | @ -22,14 +22,14 @@ def vocab(): | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
| def parser(vocab): | def parser(vocab): | ||||||
|     parser = NeuralDependencyParser(vocab) |     parser = NeuralDependencyParser(vocab) | ||||||
|     parser.cfg['token_vector_width'] = 4 |     parser.cfg['token_vector_width'] = 8 | ||||||
|     parser.cfg['hidden_width'] = 6 |     parser.cfg['hidden_width'] = 30 | ||||||
|     parser.cfg['hist_size'] = 0 |     parser.cfg['hist_size'] = 0 | ||||||
|     parser.add_label('left') |     parser.add_label('left') | ||||||
|     parser.begin_training([], **parser.cfg) |     parser.begin_training([], **parser.cfg) | ||||||
|     sgd = Adam(NumpyOps(), 0.001) |     sgd = Adam(NumpyOps(), 0.001) | ||||||
| 
 | 
 | ||||||
|     for i in range(30): |     for i in range(10): | ||||||
|         losses = {} |         losses = {} | ||||||
|         doc = Doc(vocab, words=['a', 'b', 'c', 'd']) |         doc = Doc(vocab, words=['a', 'b', 'c', 'd']) | ||||||
|         gold = GoldParse(doc, heads=[1, 1, 3, 3], |         gold = GoldParse(doc, heads=[1, 1, 3, 3], | ||||||
|  | @ -37,6 +37,8 @@ def parser(vocab): | ||||||
|         parser.update([doc], [gold], sgd=sgd, losses=losses) |         parser.update([doc], [gold], sgd=sgd, losses=losses) | ||||||
|     return parser |     return parser | ||||||
| 
 | 
 | ||||||
|  | def test_init_parser(parser): | ||||||
|  |     pass | ||||||
| 
 | 
 | ||||||
| def test_add_label(parser): | def test_add_label(parser): | ||||||
|     doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) |     doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) | ||||||
|  |  | ||||||
|  | @ -1,10 +1,11 @@ | ||||||
| import spacy | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| @pytest.mark.models |  | ||||||
| def test_beam_parse(): |  | ||||||
|     nlp = spacy.load('en_core_web_sm') |  | ||||||
|     doc = nlp(u'Australia is a country', disable=['ner']) |  | ||||||
|     ents = nlp.entity(doc, beam_width=2) |  | ||||||
|     print(ents) |  | ||||||
| 
 | 
 | ||||||
|  | @pytest.mark.models('en') | ||||||
|  | def test_beam_parse(EN): | ||||||
|  |     doc = EN(u'Australia is a country', disable=['ner']) | ||||||
|  |     ents = EN.entity(doc, beam_width=2) | ||||||
|  |     print(ents) | ||||||
|  |  | ||||||
|  | @ -35,7 +35,7 @@ def parser(vocab): | ||||||
| def test_no_sentences(parser): | def test_no_sentences(parser): | ||||||
|     doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) |     doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) | ||||||
|     doc = parser(doc) |     doc = parser(doc) | ||||||
|     assert len(list(doc.sents)) == 2 |     assert len(list(doc.sents)) >= 1 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_sents_1(parser): | def test_sents_1(parser): | ||||||
|  | @ -64,7 +64,7 @@ def test_sents_1_3(parser): | ||||||
|     doc[1].sent_start = True |     doc[1].sent_start = True | ||||||
|     doc[3].sent_start = True |     doc[3].sent_start = True | ||||||
|     doc = parser(doc) |     doc = parser(doc) | ||||||
|     assert len(list(doc.sents)) == 4 |     assert len(list(doc.sents)) >= 3 | ||||||
|     doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) |     doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) | ||||||
|     doc[1].sent_start = True |     doc[1].sent_start = True | ||||||
|     doc[2].sent_start = False |     doc[2].sent_start = False | ||||||
|  |  | ||||||
							
								
								
									
										0
									
								
								spacy/tests/pipeline/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								spacy/tests/pipeline/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										84
									
								
								spacy/tests/pipeline/test_pipe_methods.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										84
									
								
								spacy/tests/pipeline/test_pipe_methods.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,84 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | from ...language import Language | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.fixture | ||||||
|  | def nlp(): | ||||||
|  |     return Language() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def new_pipe(doc): | ||||||
|  |     return doc | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_add_pipe_no_name(nlp): | ||||||
|  |     nlp.add_pipe(new_pipe) | ||||||
|  |     assert 'new_pipe' in nlp.pipe_names | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_add_pipe_duplicate_name(nlp): | ||||||
|  |     nlp.add_pipe(new_pipe, name='duplicate_name') | ||||||
|  |     with pytest.raises(ValueError): | ||||||
|  |         nlp.add_pipe(new_pipe, name='duplicate_name') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('name', ['parser']) | ||||||
|  | def test_add_pipe_first(nlp, name): | ||||||
|  |     nlp.add_pipe(new_pipe, name=name, first=True) | ||||||
|  |     assert nlp.pipeline[0][0] == name | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')]) | ||||||
|  | def test_add_pipe_last(nlp, name1, name2): | ||||||
|  |     nlp.add_pipe(lambda doc: doc, name=name2) | ||||||
|  |     nlp.add_pipe(new_pipe, name=name1, last=True) | ||||||
|  |     assert nlp.pipeline[0][0] != name1 | ||||||
|  |     assert nlp.pipeline[-1][0] == name1 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_cant_add_pipe_first_and_last(nlp): | ||||||
|  |     with pytest.raises(ValueError): | ||||||
|  |         nlp.add_pipe(new_pipe, first=True, last=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('name', ['my_component']) | ||||||
|  | def test_get_pipe(nlp, name): | ||||||
|  |     with pytest.raises(KeyError): | ||||||
|  |         nlp.get_pipe(name) | ||||||
|  |     nlp.add_pipe(new_pipe, name=name) | ||||||
|  |     assert nlp.get_pipe(name) == new_pipe | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)]) | ||||||
|  | def test_replace_pipe(nlp, name, replacement): | ||||||
|  |     with pytest.raises(ValueError): | ||||||
|  |         nlp.replace_pipe(name, new_pipe) | ||||||
|  |     nlp.add_pipe(new_pipe, name=name) | ||||||
|  |     nlp.replace_pipe(name, replacement) | ||||||
|  |     assert nlp.get_pipe(name) != new_pipe | ||||||
|  |     assert nlp.get_pipe(name) == replacement | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')]) | ||||||
|  | def test_rename_pipe(nlp, old_name, new_name): | ||||||
|  |     with pytest.raises(ValueError): | ||||||
|  |         nlp.rename_pipe(old_name, new_name) | ||||||
|  |     nlp.add_pipe(new_pipe, name=old_name) | ||||||
|  |     nlp.rename_pipe(old_name, new_name) | ||||||
|  |     assert nlp.pipeline[0][0] == new_name | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('name', ['my_component']) | ||||||
|  | def test_remove_pipe(nlp, name): | ||||||
|  |     with pytest.raises(ValueError): | ||||||
|  |         nlp.remove_pipe(name) | ||||||
|  |     nlp.add_pipe(new_pipe, name=name) | ||||||
|  |     assert len(nlp.pipeline) == 1 | ||||||
|  |     removed_name, removed_component = nlp.remove_pipe(name) | ||||||
|  |     assert not len(nlp.pipeline) | ||||||
|  |     assert removed_name == name | ||||||
|  |     assert removed_component == new_pipe | ||||||
|  | @ -7,6 +7,7 @@ from ..util import get_doc | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.mark.xfail | ||||||
| def test_issue589(): | def test_issue589(): | ||||||
|     vocab = Vocab() |     vocab = Vocab() | ||||||
|     vocab.strings.set_frozen(True) |     vocab.strings.set_frozen(True) | ||||||
|  |  | ||||||
							
								
								
									
										9
									
								
								spacy/tests/serialize/test_serialize_empty_model.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								spacy/tests/serialize/test_serialize_empty_model.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,9 @@ | ||||||
|  | import spacy | ||||||
|  | import spacy.lang.en | ||||||
|  | from spacy.pipeline import TextCategorizer | ||||||
|  | 
 | ||||||
|  | def test_bytes_serialize_issue_1105(): | ||||||
|  |     nlp = spacy.lang.en.English() | ||||||
|  |     tokenizer = nlp.tokenizer | ||||||
|  |     textcat = TextCategorizer(tokenizer.vocab, labels=['ENTITY', 'ACTION', 'MODIFIER']) | ||||||
|  |     textcat_bytes = textcat.to_bytes() | ||||||
							
								
								
									
										53
									
								
								spacy/tests/test_underscore.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										53
									
								
								spacy/tests/test_underscore.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,53 @@ | ||||||
|  | from mock import Mock | ||||||
|  | from ..tokens.underscore import Underscore | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_create_doc_underscore(): | ||||||
|  |     doc = Mock() | ||||||
|  |     doc.doc = doc | ||||||
|  |     uscore = Underscore(Underscore.doc_extensions, doc) | ||||||
|  |     assert uscore._doc is doc | ||||||
|  |     assert uscore._start is None | ||||||
|  |     assert uscore._end is None | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_doc_underscore_getattr_setattr(): | ||||||
|  |     doc = Mock() | ||||||
|  |     doc.doc = doc | ||||||
|  |     doc.user_data = {} | ||||||
|  |     Underscore.doc_extensions['hello'] = (False, None, None, None) | ||||||
|  |     doc._ = Underscore(Underscore.doc_extensions, doc) | ||||||
|  |     assert doc._.hello == False | ||||||
|  |     doc._.hello = True | ||||||
|  |     assert doc._.hello == True | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_create_span_underscore(): | ||||||
|  |     span = Mock(doc=Mock(), start=0, end=2) | ||||||
|  |     uscore = Underscore(Underscore.span_extensions, span, | ||||||
|  |                         start=span.start, end=span.end) | ||||||
|  |     assert uscore._doc is span.doc | ||||||
|  |     assert uscore._start is span.start | ||||||
|  |     assert uscore._end is span.end | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_span_underscore_getter_setter(): | ||||||
|  |     span = Mock(doc=Mock(), start=0, end=2) | ||||||
|  |     Underscore.span_extensions['hello'] = (None, None, | ||||||
|  |                                            lambda s: (s.start, 'hi'), | ||||||
|  |                                            lambda s, value: setattr(s, 'start', | ||||||
|  |                                                                     value)) | ||||||
|  |     span._ = Underscore(Underscore.span_extensions, span, | ||||||
|  |                         start=span.start, end=span.end) | ||||||
|  | 
 | ||||||
|  |     assert span._.hello == (0, 'hi') | ||||||
|  |     span._.hello = 1 | ||||||
|  |     assert span._.hello == (1, 'hi') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_token_underscore_method(): | ||||||
|  |     token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: 'cheese') | ||||||
|  |     Underscore.token_extensions['hello'] = (None, token.say_cheese, | ||||||
|  |                                             None, None) | ||||||
|  |     token._ = Underscore(Underscore.token_extensions, token, start=token.idx) | ||||||
|  |     assert token._.hello() == 'cheese' | ||||||
|  | @ -30,7 +30,7 @@ from ..util import normalize_slice | ||||||
| from ..compat import is_config | from ..compat import is_config | ||||||
| from .. import about | from .. import about | ||||||
| from .. import util | from .. import util | ||||||
| 
 | from .underscore import Underscore | ||||||
| 
 | 
 | ||||||
| DEF PADDING = 5 | DEF PADDING = 5 | ||||||
| 
 | 
 | ||||||
|  | @ -64,6 +64,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: | ||||||
|     else: |     else: | ||||||
|         return Lexeme.get_struct_attr(token.lex, feat_name) |         return Lexeme.get_struct_attr(token.lex, feat_name) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def _get_chunker(lang): | def _get_chunker(lang): | ||||||
|     try: |     try: | ||||||
|         cls = util.get_lang_class(lang) |         cls = util.get_lang_class(lang) | ||||||
|  | @ -73,6 +74,7 @@ def _get_chunker(lang): | ||||||
|         return None |         return None | ||||||
|     return cls.Defaults.syntax_iterators.get(u'noun_chunks') |     return cls.Defaults.syntax_iterators.get(u'noun_chunks') | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| cdef class Doc: | cdef class Doc: | ||||||
|     """A sequence of Token objects. Access sentences and named entities, export |     """A sequence of Token objects. Access sentences and named entities, export | ||||||
|     annotations to numpy arrays, losslessly serialize to compressed binary strings. |     annotations to numpy arrays, losslessly serialize to compressed binary strings. | ||||||
|  | @ -87,6 +89,21 @@ cdef class Doc: | ||||||
|         >>> from spacy.tokens import Doc |         >>> from spacy.tokens import Doc | ||||||
|         >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False]) |         >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False]) | ||||||
|     """ |     """ | ||||||
|  |     @classmethod | ||||||
|  |     def set_extension(cls, name, default=None, method=None, | ||||||
|  |                       getter=None, setter=None): | ||||||
|  |         nr_defined = sum(t is not None for t in (default, getter, setter, method)) | ||||||
|  |         assert nr_defined == 1 | ||||||
|  |         Underscore.doc_extensions[name] = (default, method, getter, setter)  | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def get_extension(cls, name): | ||||||
|  |         return Underscore.doc_extensions.get(name) | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def has_extension(cls, name): | ||||||
|  |         return name in Underscore.doc_extensions | ||||||
|  | 
 | ||||||
|     def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): |     def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): | ||||||
|         """Create a Doc object. |         """Create a Doc object. | ||||||
| 
 | 
 | ||||||
|  | @ -159,6 +176,10 @@ cdef class Doc: | ||||||
|             self.is_tagged = True |             self.is_tagged = True | ||||||
|             self.is_parsed = True |             self.is_parsed = True | ||||||
| 
 | 
 | ||||||
|  |     @property | ||||||
|  |     def _(self): | ||||||
|  |         return Underscore(Underscore.doc_extensions, self) | ||||||
|  | 
 | ||||||
|     def __getitem__(self, object i): |     def __getitem__(self, object i): | ||||||
|         """Get a `Token` or `Span` object. |         """Get a `Token` or `Span` object. | ||||||
| 
 | 
 | ||||||
|  | @ -512,6 +533,8 @@ cdef class Doc: | ||||||
|         assert t.lex.orth != 0 |         assert t.lex.orth != 0 | ||||||
|         t.spacy = has_space |         t.spacy = has_space | ||||||
|         self.length += 1 |         self.length += 1 | ||||||
|  |         # Set morphological attributes, e.g. by lemma, if possible | ||||||
|  |         self.vocab.morphology.assign_untagged(t) | ||||||
|         self._py_tokens.append(None) |         self._py_tokens.append(None) | ||||||
|         return t.idx + t.lex.length + t.spacy |         return t.idx + t.lex.length + t.spacy | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -17,10 +17,24 @@ from ..attrs cimport IS_PUNCT, IS_SPACE | ||||||
| from ..lexeme cimport Lexeme | from ..lexeme cimport Lexeme | ||||||
| from ..compat import is_config | from ..compat import is_config | ||||||
| from .. import about | from .. import about | ||||||
|  | from .underscore import Underscore | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Span: | cdef class Span: | ||||||
|     """A slice from a Doc object.""" |     """A slice from a Doc object.""" | ||||||
|  |     @classmethod | ||||||
|  |     def set_extension(cls, name, default=None, method=None, | ||||||
|  |                       getter=None, setter=None): | ||||||
|  |         Underscore.span_extensions[name] = (default, method, getter, setter) | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def get_extension(cls, name): | ||||||
|  |         return Underscore.span_extensions.get(name) | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def has_extension(cls, name): | ||||||
|  |         return name in Underscore.span_extensions | ||||||
|  | 
 | ||||||
|     def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None, |     def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None, | ||||||
|                   vector_norm=None): |                   vector_norm=None): | ||||||
|         """Create a `Span` object from the slice `doc[start : end]`. |         """Create a `Span` object from the slice `doc[start : end]`. | ||||||
|  | @ -111,6 +125,10 @@ cdef class Span: | ||||||
|         for i in range(self.start, self.end): |         for i in range(self.start, self.end): | ||||||
|             yield self.doc[i] |             yield self.doc[i] | ||||||
| 
 | 
 | ||||||
|  |     @property | ||||||
|  |     def _(self): | ||||||
|  |         return Underscore(Underscore.span_extensions, self, | ||||||
|  |                           start=self.start_char, end=self.end_char) | ||||||
|     def as_doc(self): |     def as_doc(self): | ||||||
|         '''Create a Doc object view of the Span's data. |         '''Create a Doc object view of the Span's data. | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -20,10 +20,24 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST | ||||||
| from ..attrs cimport LEMMA, POS, TAG, DEP | from ..attrs cimport LEMMA, POS, TAG, DEP | ||||||
| from ..compat import is_config | from ..compat import is_config | ||||||
| from .. import about | from .. import about | ||||||
|  | from .underscore import Underscore | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Token: | cdef class Token: | ||||||
|     """An individual token – i.e. a word, punctuation symbol, whitespace, etc.""" |     """An individual token – i.e. a word, punctuation symbol, whitespace, etc.""" | ||||||
|  |     @classmethod | ||||||
|  |     def set_extension(cls, name, default=None, method=None, | ||||||
|  |                       getter=None, setter=None): | ||||||
|  |         Underscore.token_extensions[name] = (default, method, getter, setter) | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def get_extension(cls, name): | ||||||
|  |         return Underscore.token_extensions.get(name) | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def has_extension(cls, name): | ||||||
|  |         return name in Underscore.token_extensions | ||||||
|  | 
 | ||||||
|     def __cinit__(self, Vocab vocab, Doc doc, int offset): |     def __cinit__(self, Vocab vocab, Doc doc, int offset): | ||||||
|         """Construct a `Token` object. |         """Construct a `Token` object. | ||||||
| 
 | 
 | ||||||
|  | @ -87,6 +101,11 @@ cdef class Token: | ||||||
|         else: |         else: | ||||||
|             raise ValueError(op) |             raise ValueError(op) | ||||||
| 
 | 
 | ||||||
|  |     @property | ||||||
|  |     def _(self): | ||||||
|  |         return Underscore(Underscore.token_extensions, self, | ||||||
|  |                           start=self.idx, end=None) | ||||||
|  | 
 | ||||||
|     cpdef bint check_flag(self, attr_id_t flag_id) except -1: |     cpdef bint check_flag(self, attr_id_t flag_id) except -1: | ||||||
|         """Check the value of a boolean flag. |         """Check the value of a boolean flag. | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
										50
									
								
								spacy/tokens/underscore.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								spacy/tokens/underscore.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,50 @@ | ||||||
|  | import functools | ||||||
|  | 
 | ||||||
|  | class Underscore(object): | ||||||
|  |     doc_extensions = {} | ||||||
|  |     span_extensions = {} | ||||||
|  |     token_extensions = {} | ||||||
|  | 
 | ||||||
|  |     def __init__(self, extensions, obj, start=None, end=None): | ||||||
|  |         object.__setattr__(self, '_extensions', extensions) | ||||||
|  |         object.__setattr__(self, '_obj', obj) | ||||||
|  |         # Assumption is that for doc values, _start and _end will both be None | ||||||
|  |         # Span will set non-None values for _start and _end | ||||||
|  |         # Token will have _start be non-None, _end be None | ||||||
|  |         # This lets us key everything into the doc.user_data dictionary, | ||||||
|  |         # (see _get_key), and lets us use a single Underscore class. | ||||||
|  |         object.__setattr__(self, '_doc', obj.doc) | ||||||
|  |         object.__setattr__(self, '_start', start) | ||||||
|  |         object.__setattr__(self, '_end', end) | ||||||
|  | 
 | ||||||
|  |     def __getattr__(self, name): | ||||||
|  |         if name not in self._extensions: | ||||||
|  |             raise AttributeError(name) | ||||||
|  |         default, method, getter, setter = self._extensions[name] | ||||||
|  |         if getter is not None: | ||||||
|  |             return getter(self._obj) | ||||||
|  |         elif method is not None: | ||||||
|  |             return functools.partial(method, self._obj) | ||||||
|  |         else: | ||||||
|  |             return self._doc.user_data.get(self._get_key(name), default) | ||||||
|  | 
 | ||||||
|  |     def __setattr__(self, name, value): | ||||||
|  |         if name not in self._extensions: | ||||||
|  |             raise AttributeError(name) | ||||||
|  |         default, method, getter, setter = self._extensions[name] | ||||||
|  |         if setter is not None: | ||||||
|  |             return setter(self._obj, value) | ||||||
|  |         else: | ||||||
|  |             self._doc.user_data[self._get_key(name)] = value | ||||||
|  | 
 | ||||||
|  |     def set(self, name, value): | ||||||
|  |         return self.__setattr__(name, value) | ||||||
|  | 
 | ||||||
|  |     def get(self, name): | ||||||
|  |         return self.__getattr__(name) | ||||||
|  | 
 | ||||||
|  |     def has(self, name): | ||||||
|  |         return name in self._extensions | ||||||
|  | 
 | ||||||
|  |     def _get_key(self, name): | ||||||
|  |         return ('._.', name, self._start, self._end) | ||||||
|  | @ -135,7 +135,18 @@ def load_model_from_path(model_path, meta=False, **overrides): | ||||||
|     if not meta: |     if not meta: | ||||||
|         meta = get_model_meta(model_path) |         meta = get_model_meta(model_path) | ||||||
|     cls = get_lang_class(meta['lang']) |     cls = get_lang_class(meta['lang']) | ||||||
|     nlp = cls(pipeline=meta.get('pipeline', True), meta=meta, **overrides) |     nlp = cls(meta=meta, **overrides) | ||||||
|  |     pipeline = meta.get('pipeline', []) | ||||||
|  |     disable = overrides.get('disable', []) | ||||||
|  |     if pipeline is True: | ||||||
|  |         pipeline = nlp.Defaults.pipe_names | ||||||
|  |     elif pipeline in (False, None): | ||||||
|  |         pipeline = [] | ||||||
|  |     for name in pipeline: | ||||||
|  |         if name not in disable: | ||||||
|  |             config = meta.get('pipeline_args', {}).get(name, {}) | ||||||
|  |             component = nlp.create_pipe(name, config=config) | ||||||
|  |             nlp.add_pipe(component, name=name) | ||||||
|     return nlp.from_disk(model_path) |     return nlp.from_disk(model_path) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -149,6 +149,10 @@ mixin code(label, language, prompt, height, icon, wrap) | ||||||
| 
 | 
 | ||||||
| //- Code blocks to display old/new versions | //- Code blocks to display old/new versions | ||||||
| 
 | 
 | ||||||
|  | mixin code-wrapper() | ||||||
|  |     span.u-inline-block.u-padding-top.u-width-full | ||||||
|  |         block | ||||||
|  | 
 | ||||||
| mixin code-old() | mixin code-old() | ||||||
|     +code(false, false, false, false, "reject").o-block-small |     +code(false, false, false, false, "reject").o-block-small | ||||||
|         block |         block | ||||||
|  |  | ||||||
|  | @ -113,6 +113,22 @@ p | ||||||
|         +cell flag |         +cell flag | ||||||
|         +cell Show help message and available arguments. |         +cell Show help message and available arguments. | ||||||
| 
 | 
 | ||||||
|  | +h(3, "validate") Validate | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Find all models installed in the current environment (both packages and | ||||||
|  |     |  shortcut links) and check whether they are compatible with the currently | ||||||
|  |     |  installed version of spaCy. Should be run after upgrading spaCy via | ||||||
|  |     |  #[code pip install -U spacy] to ensure that all installed models | ||||||
|  |     |  can be used with the new version. The command is also useful to detect | ||||||
|  |     |  out-of-sync model links resulting from links created in different virtual | ||||||
|  |     |  environments. Prints a list of models, the installed versions, the latest | ||||||
|  |     |  compatible version (if out of date) and the commands for updating. | ||||||
|  | 
 | ||||||
|  | +code(false, "bash", "$"). | ||||||
|  |     spacy validate | ||||||
|  | 
 | ||||||
| +h(3, "convert") Convert | +h(3, "convert") Convert | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|  |  | ||||||
|  | @ -43,6 +43,20 @@ p | ||||||
|         +cell #[code Language] |         +cell #[code Language] | ||||||
|         +cell A #[code Language] object with the loaded model. |         +cell A #[code Language] object with the loaded model. | ||||||
| 
 | 
 | ||||||
|  | p | ||||||
|  |     |  Essentially, #[code spacy.load()] is a convenience wrapper that reads | ||||||
|  |     |  the language ID and pipeline components from a model's #[code meta.json], | ||||||
|  |     |  initialises the #[code Language] class, loads in the model data and | ||||||
|  |     |  returns it. | ||||||
|  | 
 | ||||||
|  | +code("Abstract example"). | ||||||
|  |     cls = util.get_lang_class(lang)         #  get language for ID, e.g. 'en' | ||||||
|  |     nlp = cls()                             #  initialise the language | ||||||
|  |     for name in pipeline: | ||||||
|  |         component = nlp.create_pipe(name)   #  create each pipeline component | ||||||
|  |         nlp.add_pipe(component)             #  add component to pipeline | ||||||
|  |     nlp.from_disk(model_data_path)          #  load in model data | ||||||
|  | 
 | ||||||
| +infobox("Deprecation note", "⚠️") | +infobox("Deprecation note", "⚠️") | ||||||
|     .o-block |     .o-block | ||||||
|         |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy |         |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy | ||||||
|  | @ -141,37 +155,3 @@ p | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell The explanation, or #[code None] if not found in the glossary. |         +cell The explanation, or #[code None] if not found in the glossary. | ||||||
| 
 |  | ||||||
| +h(3, "spacy.set_factory") spacy.set_factory |  | ||||||
|     +tag function |  | ||||||
|     +tag-new(2) |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  Set a factory that returns a custom |  | ||||||
|     |  #[+a("/usage/processing-pipelines") processing pipeline] |  | ||||||
|     |  component. Factories are useful for creating stateful components, especially ones which depend on shared data. |  | ||||||
| 
 |  | ||||||
| +aside-code("Example"). |  | ||||||
|     def my_factory(vocab): |  | ||||||
|         def my_component(doc): |  | ||||||
|             return doc |  | ||||||
|         return my_component |  | ||||||
| 
 |  | ||||||
|     spacy.set_factory('my_factory', my_factory) |  | ||||||
|     nlp = Language(pipeline=['my_factory']) |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code factory_id] |  | ||||||
|         +cell unicode |  | ||||||
|         +cell |  | ||||||
|             |  Unique name of factory. If added to a new pipeline, spaCy will |  | ||||||
|             |  look up the factory for this ID and use it to create the |  | ||||||
|             |  component. |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code factory] |  | ||||||
|         +cell callable |  | ||||||
|         +cell |  | ||||||
|             |  Callable that takes a #[code Vocab] object and returns a pipeline |  | ||||||
|             |  component. |  | ||||||
|  |  | ||||||
|  | @ -138,6 +138,109 @@ p Get the number of tokens in the document. | ||||||
|         +cell int |         +cell int | ||||||
|         +cell The number of tokens in the document. |         +cell The number of tokens in the document. | ||||||
| 
 | 
 | ||||||
|  | +h(2, "set_extension") Doc.set_extension | ||||||
|  |     +tag classmethod | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Define a custom attribute on the #[code Doc] which becomes available via | ||||||
|  |     |  #[code Doc._]. For details, see the documentation on | ||||||
|  |     |  #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes]. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.tokens import Doc | ||||||
|  |     city_getter = lambda doc: any(city in doc.text for city in ('New York', 'Paris', 'Berlin')) | ||||||
|  |     Doc.set_extension('has_city', getter=city_getter) | ||||||
|  |     doc = nlp(u'I like New York') | ||||||
|  |     assert doc._.has_city | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell | ||||||
|  |             |  Name of the attribute to set by the extension. For example, | ||||||
|  |             |  #[code 'my_attr'] will be available as #[code doc._.my_attr]. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code default] | ||||||
|  |         +cell - | ||||||
|  |         +cell | ||||||
|  |             |  Optional default value of the attribute if no getter or method | ||||||
|  |             |  is defined. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code method] | ||||||
|  |         +cell callable | ||||||
|  |         +cell | ||||||
|  |             |  Set a custom method on the object, for example | ||||||
|  |             |  #[code doc._.compare(other_doc)]. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code getter] | ||||||
|  |         +cell callable | ||||||
|  |         +cell | ||||||
|  |             |  Getter function that takes the object and returns an attribute | ||||||
|  |             |  value. Is called when the user accesses the #[code ._] attribute. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code setter] | ||||||
|  |         +cell callable | ||||||
|  |         +cell | ||||||
|  |             |  Setter function that takes the #[code Doc] and a value, and | ||||||
|  |             |  modifies the object. Is called when the user writes to the | ||||||
|  |             |  #[code Doc._] attribute. | ||||||
|  | 
 | ||||||
|  | +h(2, "get_extension") Doc.get_extension | ||||||
|  |     +tag classmethod | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Look up a previously registered extension by name. Returns a 4-tuple | ||||||
|  |     |  #[code.u-break (default, method, getter, setter)] if the extension is | ||||||
|  |     |  registered. Returns #[code None] otherwise. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.tokens import Doc | ||||||
|  |     Doc.set_extension('is_city', default=False) | ||||||
|  |     extension = Doc.get_extension('is_city') | ||||||
|  |     assert extension == (False, None, None, None) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Name of the extension. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell tuple | ||||||
|  |         +cell | ||||||
|  |             |  A #[code.u-break (default, method, getter, setter)] tuple of the | ||||||
|  |             |  extension. | ||||||
|  | 
 | ||||||
|  | +h(2, "has_extension") Doc.has_extension | ||||||
|  |     +tag classmethod | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p Check whether an extension has been registered on the #[code Doc] class. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.tokens import Doc | ||||||
|  |     Doc.set_extension('is_city', default=False) | ||||||
|  |     assert Doc.has_extension('is_city') | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Name of the extension to check. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell bool | ||||||
|  |         +cell Whether the extension has been registered. | ||||||
|  | 
 | ||||||
| +h(2, "char_span") Doc.char_span | +h(2, "char_span") Doc.char_span | ||||||
|     +tag method |     +tag method | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
|  |  | ||||||
|  | @ -4,7 +4,14 @@ include ../_includes/_mixins | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Usually you'll load this once per process as #[code nlp] and pass the |     |  Usually you'll load this once per process as #[code nlp] and pass the | ||||||
|     |  instance around your application. |     |  instance around your application. The #[code Language] class is created | ||||||
|  |     |  when you call #[+api("spacy#load") #[code spacy.load()]] and contains | ||||||
|  |     |  the shared vocabulary and #[+a("/usage/adding-languages") language data], | ||||||
|  |     |  optional model data loaded from a #[+a("/models") model package] or | ||||||
|  |     |  a path, and a #[+a("/usage/processing-pipelines") processing pipeline] | ||||||
|  |     |  containing components like the tagger or parser that are called on a | ||||||
|  |     |  document in order. You can also add your own processing pipeline | ||||||
|  |     |  components that take a #[code Doc] object, modify it and return it. | ||||||
| 
 | 
 | ||||||
| +h(2, "init") Language.__init__ | +h(2, "init") Language.__init__ | ||||||
|     +tag method |     +tag method | ||||||
|  | @ -12,9 +19,9 @@ p | ||||||
| p Initialise a #[code Language] object. | p Initialise a #[code Language] object. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|  |     from spacy.vocab import Vocab | ||||||
|     from spacy.language import Language |     from spacy.language import Language | ||||||
|     nlp = Language(pipeline=['token_vectors', 'tags', |     nlp = Language(Vocab()) | ||||||
|                              'dependencies']) |  | ||||||
| 
 | 
 | ||||||
|     from spacy.lang.en import English |     from spacy.lang.en import English | ||||||
|     nlp = English() |     nlp = English() | ||||||
|  | @ -34,14 +41,6 @@ p Initialise a #[code Language] object. | ||||||
|             |  A function that takes text and returns a #[code Doc] object. |             |  A function that takes text and returns a #[code Doc] object. | ||||||
|             |  Usually a #[code Tokenizer]. |             |  Usually a #[code Tokenizer]. | ||||||
| 
 | 
 | ||||||
|     +row |  | ||||||
|         +cell #[code pipeline] |  | ||||||
|         +cell list |  | ||||||
|         +cell |  | ||||||
|             |  A list of annotation processes or IDs of annotation, processes, |  | ||||||
|             |  e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked |  | ||||||
|             |  up in #[code Language.Defaults.factories]. |  | ||||||
| 
 |  | ||||||
|     +row |     +row | ||||||
|         +cell #[code meta] |         +cell #[code meta] | ||||||
|         +cell dict |         +cell dict | ||||||
|  | @ -235,7 +234,6 @@ p | ||||||
|     |  Can be called before training to pre-process gold data. By default, it |     |  Can be called before training to pre-process gold data. By default, it | ||||||
|     |  handles nonprojectivity and adds missing tags to the tag map. |     |  handles nonprojectivity and adds missing tags to the tag map. | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| +table(["Name", "Type", "Description"]) | +table(["Name", "Type", "Description"]) | ||||||
|     +row |     +row | ||||||
|         +cell #[code docs_golds] |         +cell #[code docs_golds] | ||||||
|  | @ -247,6 +245,177 @@ p | ||||||
|         +cell tuple |         +cell tuple | ||||||
|         +cell Tuples of #[code Doc] and #[code GoldParse] objects. |         +cell Tuples of #[code Doc] and #[code GoldParse] objects. | ||||||
| 
 | 
 | ||||||
|  | +h(2, "create_pipe") Language.create_pipe | ||||||
|  |     +tag method | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p Create a pipeline component from a factory. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     parser = nlp.create_pipe('parser') | ||||||
|  |     nlp.add_pipe(parser) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell | ||||||
|  |             |  Factory name to look up in | ||||||
|  |             |  #[+api("language#class-attributes") #[code Language.factories]]. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code config] | ||||||
|  |         +cell dict | ||||||
|  |         +cell Configuration parameters to initialise component. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell callable | ||||||
|  |         +cell The pipeline component. | ||||||
|  | 
 | ||||||
|  | +h(2, "add_pipe") Language.add_pipe | ||||||
|  |     +tag method | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Add a component to the processing pipeline. Valid components are | ||||||
|  |     |  callables that take a #[code Doc] object, modify it and return it. Only | ||||||
|  |     |  one of #[code before], #[code after], #[code first] or #[code last] can | ||||||
|  |     |  be set. Default behaviour is #[code last=True]. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     def component(doc): | ||||||
|  |         # modify Doc and return it | ||||||
|  |         return doc | ||||||
|  | 
 | ||||||
|  |     nlp.add_pipe(component, before='ner') | ||||||
|  |     nlp.add_pipe(component, name='custom_name', last=True) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code component] | ||||||
|  |         +cell callable | ||||||
|  |         +cell The pipeline component. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell | ||||||
|  |             |  Name of pipeline component. Overwrites existing | ||||||
|  |             |  #[code component.name] attribute if available. If no #[code name] | ||||||
|  |             |  is set and the component exposes no name attribute, | ||||||
|  |             |  #[code component.__name__] is used. An error is raised if the | ||||||
|  |             |  name already exists in the pipeline. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code before] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Component name to insert component directly before. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code after] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Component name to insert component directly after. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code first] | ||||||
|  |         +cell bool | ||||||
|  |         +cell Insert component first / not first in the pipeline. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code last] | ||||||
|  |         +cell bool | ||||||
|  |         +cell Insert component last / not last in the pipeline. | ||||||
|  | 
 | ||||||
|  | +h(2, "get_pipe") Language.get_pipe | ||||||
|  |     +tag method | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p Get a pipeline component for a given component name. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     parser = nlp.get_pipe('parser') | ||||||
|  |     custom_component = nlp.get_pipe('custom_component') | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Name of the pipeline component to get. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell callable | ||||||
|  |         +cell The pipeline component. | ||||||
|  | 
 | ||||||
|  | +h(2, "replace_pipe") Language.replace_pipe | ||||||
|  |     +tag method | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p Replace a component in the pipeline. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     nlp.replace_pipe('parser', my_custom_parser) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Name of the component to replace. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code component] | ||||||
|  |         +cell callable | ||||||
|  |         +cell The pipeline component to insert. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | +h(2, "rename_pipe") Language.rename_pipe | ||||||
|  |     +tag method | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Rename a component in the pipeline. Useful to create custom names for | ||||||
|  |     |  pre-defined and pre-loaded components. To change the default name of | ||||||
|  |     |  a component added to the pipeline, you can also use the #[code name] | ||||||
|  |     |  argument on #[+api("language#add_pipe") #[code add_pipe]]. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     nlp.rename_pipe('parser', 'spacy_parser') | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code old_name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Name of the component to rename. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code new_name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell New name of the component. | ||||||
|  | 
 | ||||||
|  | +h(2, "remove_pipe") Language.remove_pipe | ||||||
|  |     +tag method | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Remove a component from the pipeline. Returns the removed component name | ||||||
|  |     |  and component function. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     name, component = nlp.remove_pipe('parser') | ||||||
|  |     assert name == 'parser' | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Name of the component to remove. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell tuple | ||||||
|  |         +cell A #[code (name, component)] tuple of the removed component. | ||||||
|  | 
 | ||||||
| +h(2, "to_disk") Language.to_disk | +h(2, "to_disk") Language.to_disk | ||||||
|     +tag method |     +tag method | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
|  | @ -399,7 +568,15 @@ p Load state from a binary string. | ||||||
|     +row |     +row | ||||||
|         +cell #[code pipeline] |         +cell #[code pipeline] | ||||||
|         +cell list |         +cell list | ||||||
|         +cell Sequence of annotation functions. |         +cell | ||||||
|  |             |  List of #[code (name, component)] tuples describing the current | ||||||
|  |             |  processing pipeline, in order. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code pipe_names] | ||||||
|  |             +tag-new(2) | ||||||
|  |         +cell list | ||||||
|  |         +cell List of pipeline component names, in order. | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code meta] |         +cell #[code meta] | ||||||
|  | @ -424,3 +601,12 @@ p Load state from a binary string. | ||||||
|         +cell |         +cell | ||||||
|             |  Two-letter language ID, i.e. |             |  Two-letter language ID, i.e. | ||||||
|             |  #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]. |             |  #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code factories] | ||||||
|  |             +tag-new(2) | ||||||
|  |         +cell dict | ||||||
|  |         +cell | ||||||
|  |             |  Factories that create pre-defined pipeline components, e.g. the | ||||||
|  |             |  tagger, parser or entity recognizer, keyed by their component | ||||||
|  |             |  name. | ||||||
|  |  | ||||||
|  | @ -116,6 +116,109 @@ p Get the number of tokens in the span. | ||||||
|         +cell int |         +cell int | ||||||
|         +cell The number of tokens in the span. |         +cell The number of tokens in the span. | ||||||
| 
 | 
 | ||||||
|  | +h(2, "set_extension") Span.set_extension | ||||||
|  |     +tag classmethod | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Define a custom attribute on the #[code Span] which becomes available via | ||||||
|  |     |  #[code Span._]. For details, see the documentation on | ||||||
|  |     |  #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes]. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.tokens import Span | ||||||
|  |     city_getter = lambda span: any(city in span.text for city in ('New York', 'Paris', 'Berlin')) | ||||||
|  |     Span.set_extension('has_city', getter=city_getter) | ||||||
|  |     doc = nlp(u'I like New York in Autumn') | ||||||
|  |     assert doc[1:4]._.has_city | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell | ||||||
|  |             |  Name of the attribute to set by the extension. For example, | ||||||
|  |             |  #[code 'my_attr'] will be available as #[code span._.my_attr]. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code default] | ||||||
|  |         +cell - | ||||||
|  |         +cell | ||||||
|  |             |  Optional default value of the attribute if no getter or method | ||||||
|  |             |  is defined. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code method] | ||||||
|  |         +cell callable | ||||||
|  |         +cell | ||||||
|  |             |  Set a custom method on the object, for example | ||||||
|  |             |  #[code span._.compare(other_span)]. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code getter] | ||||||
|  |         +cell callable | ||||||
|  |         +cell | ||||||
|  |             |  Getter function that takes the object and returns an attribute | ||||||
|  |             |  value. Is called when the user accesses the #[code ._] attribute. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code setter] | ||||||
|  |         +cell callable | ||||||
|  |         +cell | ||||||
|  |             |  Setter function that takes the #[code Span] and a value, and | ||||||
|  |             |  modifies the object. Is called when the user writes to the | ||||||
|  |             |  #[code Span._] attribute. | ||||||
|  | 
 | ||||||
|  | +h(2, "get_extension") Span.get_extension | ||||||
|  |     +tag classmethod | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Look up a previously registered extension by name. Returns a 4-tuple | ||||||
|  |     |  #[code.u-break (default, method, getter, setter)] if the extension is | ||||||
|  |     |  registered. Raises a #[code KeyError] otherwise. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.tokens import Span | ||||||
|  |     Span.set_extension('is_city', default=False) | ||||||
|  |     extension = Span.get_extension('is_city') | ||||||
|  |     assert extension == (False, None, None, None) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Name of the extension. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell tuple | ||||||
|  |         +cell | ||||||
|  |             |  A #[code.u-break (default, method, getter, setter)] tuple of the | ||||||
|  |             |  extension. | ||||||
|  | 
 | ||||||
|  | +h(2, "has_extension") Span.has_extension | ||||||
|  |     +tag classmethod | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p Check whether an extension has been registered on the #[code Span] class. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.tokens import Span | ||||||
|  |     Span.set_extension('is_city', default=False) | ||||||
|  |     assert Span.has_extension('is_city') | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Name of the extension to check. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell bool | ||||||
|  |         +cell Whether the extension has been registered. | ||||||
|  | 
 | ||||||
| +h(2, "similarity") Span.similarity | +h(2, "similarity") Span.similarity | ||||||
|     +tag method |     +tag method | ||||||
|     +tag-model("vectors") |     +tag-model("vectors") | ||||||
|  |  | ||||||
|  | @ -51,6 +51,109 @@ p The number of unicode characters in the token, i.e. #[code token.text]. | ||||||
|         +cell int |         +cell int | ||||||
|         +cell The number of unicode characters in the token. |         +cell The number of unicode characters in the token. | ||||||
| 
 | 
 | ||||||
|  | +h(2, "set_extension") Token.set_extension | ||||||
|  |     +tag classmethod | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Define a custom attribute on the #[code Token] which becomes available | ||||||
|  |     |  via #[code Token._]. For details, see the documentation on | ||||||
|  |     |  #[+a("/usage/processing-pipelines#custom-components-attributes") custom attributes]. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.tokens import Token | ||||||
|  |     fruit_getter = lambda token: token.text in ('apple', 'pear', 'banana') | ||||||
|  |     Token.set_extension('is_fruit', getter=fruit_getter) | ||||||
|  |     doc = nlp(u'I have an apple') | ||||||
|  |     assert doc[3]._.is_fruit | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell | ||||||
|  |             |  Name of the attribute to set by the extension. For example, | ||||||
|  |             |  #[code 'my_attr'] will be available as #[code token._.my_attr]. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code default] | ||||||
|  |         +cell - | ||||||
|  |         +cell | ||||||
|  |             |  Optional default value of the attribute if no getter or method | ||||||
|  |             |  is defined. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code method] | ||||||
|  |         +cell callable | ||||||
|  |         +cell | ||||||
|  |             |  Set a custom method on the object, for example | ||||||
|  |             |  #[code token._.compare(other_token)]. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code getter] | ||||||
|  |         +cell callable | ||||||
|  |         +cell | ||||||
|  |             |  Getter function that takes the object and returns an attribute | ||||||
|  |             |  value. Is called when the user accesses the #[code ._] attribute. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code setter] | ||||||
|  |         +cell callable | ||||||
|  |         +cell | ||||||
|  |             |  Setter function that takes the #[code Token] and a value, and | ||||||
|  |             |  modifies the object. Is called when the user writes to the | ||||||
|  |             |  #[code Token._] attribute. | ||||||
|  | 
 | ||||||
|  | +h(2, "get_extension") Token.get_extension | ||||||
|  |     +tag classmethod | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Look up a previously registered extension by name. Returns a 4-tuple | ||||||
|  |     |  #[code.u-break (default, method, getter, setter)] if the extension is | ||||||
|  |     |  registered. Raises a #[code KeyError] otherwise. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.tokens import Token | ||||||
|  |     Token.set_extension('is_fruit', default=False) | ||||||
|  |     extension = Token.get_extension('is_fruit') | ||||||
|  |     assert extension == (False, None, None, None) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Name of the extension. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell tuple | ||||||
|  |         +cell | ||||||
|  |             |  A #[code.u-break (default, method, getter, setter)] tuple of the | ||||||
|  |             |  extension. | ||||||
|  | 
 | ||||||
|  | +h(2, "has_extension") Token.has_extension | ||||||
|  |     +tag classmethod | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p Check whether an extension has been registered on the #[code Token] class. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.tokens import Token | ||||||
|  |     Token.set_extension('is_fruit', default=False) | ||||||
|  |     assert Token.has_extension('is_fruit') | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Name of the extension to check. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell bool | ||||||
|  |         +cell Whether the extension has been registered. | ||||||
|  | 
 | ||||||
| +h(2, "check_flag") Token.check_flag | +h(2, "check_flag") Token.check_flag | ||||||
|     +tag method |     +tag method | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -143,6 +143,9 @@ | ||||||
| 
 | 
 | ||||||
| //- Layout | //- Layout | ||||||
| 
 | 
 | ||||||
|  | .u-width-full | ||||||
|  |     width: 100% | ||||||
|  | 
 | ||||||
| .u-float-left | .u-float-left | ||||||
|     float: left |     float: left | ||||||
|     margin-right: 1rem |     margin-right: 1rem | ||||||
|  | @ -166,6 +169,9 @@ | ||||||
| .u-padding-medium | .u-padding-medium | ||||||
|     padding: 1.8rem |     padding: 1.8rem | ||||||
| 
 | 
 | ||||||
|  | .u-padding-top | ||||||
|  |     padding-top: 2rem | ||||||
|  | 
 | ||||||
| .u-inline-block | .u-inline-block | ||||||
|     display: inline-block |     display: inline-block | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -25,7 +25,7 @@ | ||||||
|         display: inline-block |         display: inline-block | ||||||
|         font-size: 0.6em |         font-size: 0.6em | ||||||
|         font-weight: bold |         font-weight: bold | ||||||
|         padding-right: 1.25rem |         padding-right: 1em | ||||||
|         margin-left: -3.75rem |         margin-left: -3.75rem | ||||||
|         text-align: right |         text-align: right | ||||||
|         width: 2.5rem |         width: 2.5rem | ||||||
|  |  | ||||||
|  | @ -456,24 +456,11 @@ p | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  To add a lookup lemmatizer to your language, import the #[code LOOKUP] |     |  To provide a lookup lemmatizer for your language, import the lookup table | ||||||
|     |  table and #[code Lemmatizer], and create a new classmethod: |     |  and add it to the #[code Language] class as #[code lemma_lookup]: | ||||||
| 
 | 
 | ||||||
| 
 | +code. | ||||||
| +code("__init__py (excerpt)"). |     lemma_lookup = dict(LOOKUP) | ||||||
|     # other imports here, plus lookup table and lookup lemmatizer |  | ||||||
|     from .lemmatizer import LOOKUP |  | ||||||
|     from ...lemmatizerlookup import Lemmatizer |  | ||||||
| 
 |  | ||||||
|     class Xxxxx(Language): |  | ||||||
|         lang = 'xx' |  | ||||||
| 
 |  | ||||||
|         class Defaults(Language.Defaults): |  | ||||||
|             # other language defaults here |  | ||||||
| 
 |  | ||||||
|             @classmethod |  | ||||||
|             def create_lemmatizer(cls, nlp=None): |  | ||||||
|                 return Lemmatizer(LOOKUP) |  | ||||||
| 
 | 
 | ||||||
| +h(3, "tag-map") Tag map | +h(3, "tag-map") Tag map | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -103,10 +103,10 @@ | ||||||
|         "title": "Language Processing Pipelines", |         "title": "Language Processing Pipelines", | ||||||
|         "next": "vectors-similarity", |         "next": "vectors-similarity", | ||||||
|         "menu": { |         "menu": { | ||||||
|             "How pipelines work": "pipelines", |             "How Pipelines Work": "pipelines", | ||||||
|             "Examples": "examples", |             "Custom Components": "custom-components", | ||||||
|  |             "Developing Extensions": "extensions", | ||||||
|             "Multi-threading": "multithreading", |             "Multi-threading": "multithreading", | ||||||
|             "User Hooks": "user-hooks", |  | ||||||
|             "Serialization": "serialization" |             "Serialization": "serialization" | ||||||
|         } |         } | ||||||
|     }, |     }, | ||||||
|  | @ -195,6 +195,7 @@ | ||||||
|         "teaser": "Full code examples you can modify and run.", |         "teaser": "Full code examples you can modify and run.", | ||||||
|         "next": "resources", |         "next": "resources", | ||||||
|         "menu": { |         "menu": { | ||||||
|  |             "Pipeline": "pipeline", | ||||||
|             "Matching": "matching", |             "Matching": "matching", | ||||||
|             "Training": "training", |             "Training": "training", | ||||||
|             "Deep Learning": "deep-learning" |             "Deep Learning": "deep-learning" | ||||||
|  |  | ||||||
							
								
								
									
										369
									
								
								website/usage/_processing-pipelines/_custom-components.jade
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										369
									
								
								website/usage/_processing-pipelines/_custom-components.jade
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,369 @@ | ||||||
|  | //- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  A component receives a #[code Doc] object and can modify it – for example, | ||||||
|  |     |  by using the current weights to make a prediction and set some annotation | ||||||
|  |     |  on the document. By adding a component to the pipeline, you'll get access | ||||||
|  |     |  to the #[code Doc] at any point #[strong during processing] – instead of | ||||||
|  |     |  only being able to modify it afterwards. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     def my_component(doc): | ||||||
|  |         # do something to the doc here | ||||||
|  |         return doc | ||||||
|  | 
 | ||||||
|  | +table(["Argument", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code doc] | ||||||
|  |         +cell #[code Doc] | ||||||
|  |         +cell The #[code Doc] object processed by the previous component. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell #[code Doc] | ||||||
|  |         +cell The #[code Doc] object processed by this pipeline component. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Custom components can be added to the pipeline using the | ||||||
|  |     |  #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you | ||||||
|  |     |  can either specify a component to add it #[strong before or after], tell | ||||||
|  |     |  spaCy to add it #[strong first or last] in the pipeline, or define a | ||||||
|  |     |  #[strong custom name]. If no name is set and no #[code name] attribute | ||||||
|  |     |  is present on your component, the function name is used. | ||||||
|  | 
 | ||||||
|  | +code("Adding pipeline components"). | ||||||
|  |     def my_component(doc): | ||||||
|  |         print("After tokenization, this doc has %s tokens." % len(doc)) | ||||||
|  |         if len(doc) < 10: | ||||||
|  |             print("This is a pretty short document.") | ||||||
|  |         return doc | ||||||
|  | 
 | ||||||
|  |     nlp = spacy.load('en') | ||||||
|  |     nlp.add_pipe(my_component, name='print_info', first=True) | ||||||
|  |     print(nlp.pipe_names) # ['print_info', 'tagger', 'parser', 'ner'] | ||||||
|  |     doc = nlp(u"This is a sentence.") | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Of course, you can also wrap your component as a class to allow | ||||||
|  |     |  initialising it with custom settings and hold state within the component. | ||||||
|  |     |  This is useful for #[strong stateful components], especially ones which | ||||||
|  |     |  #[strong depend on shared data]. | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     class MyComponent(object): | ||||||
|  |         name = 'print_info' | ||||||
|  | 
 | ||||||
|  |         def __init__(self, vocab, short_limit=10): | ||||||
|  |             self.vocab = vocab | ||||||
|  |             self.short_limit = short_limit | ||||||
|  | 
 | ||||||
|  |         def __call__(self, doc): | ||||||
|  |             if len(doc) < self.short_limit: | ||||||
|  |                 print("This is a pretty short document.") | ||||||
|  |             return doc | ||||||
|  | 
 | ||||||
|  |     my_component = MyComponent(nlp.vocab, short_limit=25) | ||||||
|  |     nlp.add_pipe(my_component, first=True) | ||||||
|  | 
 | ||||||
|  | +h(3, "custom-components-attributes") | ||||||
|  |     |  Extension attributes on #[code Doc], #[code Span] and #[code Token] | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  As of v2.0, spaCy allows you to set any custom attributes and methods | ||||||
|  |     |  on the #[code Doc], #[code Span] and #[code Token], which become | ||||||
|  |     |  available as #[code Doc._], #[code Span._] and #[code Token._] – for | ||||||
|  |     |  example, #[code Token._.my_attr]. This lets you store additional | ||||||
|  |     |  information relevant to your application, add new features and | ||||||
|  |     |  functionality to spaCy, and implement your own models trained with other | ||||||
|  |     |  machine learning libraries. It also lets you take advantage of spaCy's | ||||||
|  |     |  data structures and the #[code Doc] object as the "single source of | ||||||
|  |     |  truth". | ||||||
|  | 
 | ||||||
|  | +aside("Why ._?") | ||||||
|  |     |  Writing to a #[code ._] attribute instead of to the #[code Doc] directly | ||||||
|  |     |  keeps a clearer separation and makes it easier to ensure backwards | ||||||
|  |     |  compatibility. For example, if you've implemented your own #[code .coref] | ||||||
|  |     |  property and spaCy claims it one day, it'll break your code. Similarly, | ||||||
|  |     |  just by looking at the code, you'll immediately know what's built-in and | ||||||
|  |     |  what's custom – for example, #[code doc.sentiment] is spaCy, while | ||||||
|  |     |  #[code doc._.sent_score] isn't. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  There are three main types of extensions, which can be defined using the | ||||||
|  |     |  #[+api("doc#set_extension") #[code Doc.set_extension]], | ||||||
|  |     |  #[+api("span#set_extension") #[code Span.set_extension]] and | ||||||
|  |     |  #[+api("token#set_extension") #[code Token.set_extension]] methods. | ||||||
|  | 
 | ||||||
|  | +list("numbers") | ||||||
|  |     +item #[strong Attribute extensions]. | ||||||
|  |         |  Set a default value for an attribute, which can be overwritten | ||||||
|  |         |  manually at any time. Attribute extensions work like "normal" | ||||||
|  |         |  variables and are the quickest way to store arbitrary information | ||||||
|  |         |  on a #[code Doc], #[code Span] or #[code Token]. | ||||||
|  | 
 | ||||||
|  |         +code-wrapper | ||||||
|  |             +code. | ||||||
|  |                 Doc.set_extension('hello', default=True) | ||||||
|  |                 assert doc._.hello | ||||||
|  |                 doc._.hello = False | ||||||
|  | 
 | ||||||
|  |     +item #[strong Property extensions]. | ||||||
|  |         |  Define a getter and an optional setter function. If no setter is | ||||||
|  |         |  provided, the extension is immutable. Since the getter and setter | ||||||
|  |         |  functions are only called when you #[em retrieve] the attribute, | ||||||
|  |         |  you can also access values of previously added attribute extensions. | ||||||
|  |         |  For example, a #[code Doc] getter can average over #[code Token] | ||||||
|  |         |   attributes. For #[code Span] extensions, you'll almost always want | ||||||
|  |         |  to use a property – otherwise, you'd have to write to | ||||||
|  |         |  #[em every possible] #[code Span] in the #[code Doc] to set up the | ||||||
|  |         |  values correctly. | ||||||
|  | 
 | ||||||
|  |         +code-wrapper | ||||||
|  |             +code. | ||||||
|  |                 Doc.set_extension('hello', getter=get_hello_value, setter=set_hello_value) | ||||||
|  |                 assert doc._.hello | ||||||
|  |                 doc._.hello = 'Hi!' | ||||||
|  | 
 | ||||||
|  |     +item #[strong Method extensions]. | ||||||
|  |         |  Assign a function that becomes available as an object method. Method | ||||||
|  |         |  extensions are always immutable. For more details and implementation | ||||||
|  |         |  ideas, see | ||||||
|  |         |  #[+a("/usage/examples#custom-components-attr-methods") these examples]. | ||||||
|  | 
 | ||||||
|  |         +code-wrapper | ||||||
|  |             +code.o-no-block. | ||||||
|  |                 Doc.set_extension('hello', method=lambda doc, name: 'Hi {}!'.format(name)) | ||||||
|  |                 assert doc._.hello('Bob') == 'Hi Bob!' | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Before you can access a custom extension, you need to register it using | ||||||
|  |     |  the #[code set_extension] method on the object you want | ||||||
|  |     |  to add it to, e.g. the #[code Doc]. Keep in mind that extensions are | ||||||
|  |     |  always #[strong added globally] and not just on a particular instance. | ||||||
|  |     |  If an attribute of the same name | ||||||
|  |     |  already exists, or if you're trying to access an attribute that hasn't | ||||||
|  |     |  been registered, spaCy will raise an #[code AttributeError]. | ||||||
|  | 
 | ||||||
|  | +code("Example"). | ||||||
|  |     from spacy.tokens import Doc, Span, Token | ||||||
|  | 
 | ||||||
|  |     fruits = ['apple', 'pear', 'banana', 'orange', 'strawberry'] | ||||||
|  |     is_fruit_getter = lambda token: token.text in fruits | ||||||
|  |     has_fruit_getter = lambda obj: any([t.text in fruits for t in obj]) | ||||||
|  | 
 | ||||||
|  |     Token.set_extension('is_fruit', getter=is_fruit_getter) | ||||||
|  |     Doc.set_extension('has_fruit', getter=has_fruit_getter) | ||||||
|  |     Span.set_extension('has_fruit', getter=has_fruit_getter) | ||||||
|  | 
 | ||||||
|  | +aside-code("Usage example"). | ||||||
|  |     doc = nlp(u"I have an apple and a melon") | ||||||
|  |     assert doc[3]._.is_fruit      # get Token attributes | ||||||
|  |     assert not doc[0]._.is_fruit | ||||||
|  |     assert doc._.has_fruit        # get Doc attributes | ||||||
|  |     assert doc[1:4]._.has_fruit   # get Span attributes | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Once you've registered your custom attribute, you can also use the | ||||||
|  |     |  built-in #[code set], #[code get] and #[code has] methods to modify and | ||||||
|  |     |  retrieve the attributes. This is especially useful if you want to pass in | ||||||
|  |     |  a string instead of calling #[code doc._.my_attr]. | ||||||
|  | 
 | ||||||
|  | +table(["Method", "Description", "Valid for", "Example"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code ._.set()] | ||||||
|  |         +cell Set a value for an attribute. | ||||||
|  |         +cell Attributes, mutable properties. | ||||||
|  |         +cell #[code.u-break token._.set('my_attr', True)] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code ._.get()] | ||||||
|  |         +cell Get the value of an attribute. | ||||||
|  |         +cell Attributes, mutable properties, immutable properties, methods. | ||||||
|  |         +cell #[code.u-break my_attr = span._.get('my_attr')] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code ._.has()] | ||||||
|  |         +cell Check if an attribute exists. | ||||||
|  |         +cell Attributes, mutable properties, immutable properties, methods. | ||||||
|  |         +cell #[code.u-break doc._.has('my_attr')] | ||||||
|  | 
 | ||||||
|  | +infobox("How the ._ is implemented") | ||||||
|  |     |  Extension definitions – the defaults, methods, getters and setters you | ||||||
|  |     |  pass in to #[code set_extension] – are stored in class attributes on the | ||||||
|  |     |  #[code Underscore] class. If you write to an extension attribute, e.g. | ||||||
|  |     |  #[code doc._.hello = True], the data is stored within the | ||||||
|  |     |  #[+api("doc#attributes") #[code Doc.user_data]] dictionary. To keep the | ||||||
|  |     |  underscore data separate from your other dictionary entries, the string | ||||||
|  |     |  #[code "._."] is placed before the name, in a tuple. | ||||||
|  | 
 | ||||||
|  | +h(4, "component-example1") Example: Custom sentence segmentation logic | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Let's say you want to implement custom logic to improve spaCy's sentence | ||||||
|  |     |  boundary detection. Currently, sentence segmentation is based on the | ||||||
|  |     |  dependency parse, which doesn't always produce ideal results. The custom | ||||||
|  |     |  logic should therefore be applied #[strong after] tokenization, but | ||||||
|  |     |  #[strong before] the dependency parsing – this way, the parser can also | ||||||
|  |     |  take advantage of the sentence boundaries. | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     def sbd_component(doc): | ||||||
|  |         for i, token in enumerate(doc[:-2]): | ||||||
|  |             # define sentence start if period + titlecase token | ||||||
|  |             if token.text == '.' and doc[i+1].is_title: | ||||||
|  |                 doc[i+1].sent_start = True | ||||||
|  |         return doc | ||||||
|  | 
 | ||||||
|  |     nlp = spacy.load('en') | ||||||
|  |     nlp.add_pipe(sbd_component, before='parser')  # insert before the parser | ||||||
|  | 
 | ||||||
|  | +h(4, "component-example2") | ||||||
|  |     |  Example: Pipeline component for entity matching and tagging with | ||||||
|  |     |  custom attributes | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  This example shows how to create a spaCy extension that takes a | ||||||
|  |     |  terminology list (in this case, single- and multi-word company names), | ||||||
|  |     |  matches the occurrences in a document, labels them as #[code ORG] entities, | ||||||
|  |     |  merges the tokens and sets custom #[code is_tech_org] and | ||||||
|  |     |  #[code has_tech_org] attributes. For efficient matching, the example uses | ||||||
|  |     |  the #[+api("phrasematcher") #[code PhraseMatcher]] which accepts | ||||||
|  |     |  #[code Doc] objects as match patterns and works well for large | ||||||
|  |     |  terminology lists. It also ensures your patterns will always match, even | ||||||
|  |     |  when you customise spaCy's tokenization rules. When you call #[code nlp] | ||||||
|  |     |  on a text, the custom pipeline component is applied to the #[code Doc]. | ||||||
|  | 
 | ||||||
|  | +github("spacy", "examples/pipeline/custom_component_entities.py", false, 500) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Wrapping this functionality in a | ||||||
|  |     |  pipeline component allows you to reuse the module with different | ||||||
|  |     |  settings, and have all pre-processing taken care of when you call | ||||||
|  |     |  #[code nlp] on your text and receive a #[code Doc] object. | ||||||
|  | 
 | ||||||
|  | +h(4, "component-example3") | ||||||
|  |     |  Example: Pipeline component for GPE entities and country meta data via a | ||||||
|  |     |  REST API | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  This example shows the implementation of a pipeline component | ||||||
|  |     |  that fetches country meta data via the | ||||||
|  |     |  #[+a("https://restcountries.eu") REST Countries API], sets entity | ||||||
|  |     |  annotations for countries, merges entities into one token and | ||||||
|  |     |  sets custom attributes on the #[code Doc], #[code Span] and | ||||||
|  |     |  #[code Token] – for example, the capital, latitude/longitude coordinates | ||||||
|  |     |  and even the country flag. | ||||||
|  | 
 | ||||||
|  | +github("spacy", "examples/pipeline/custom_component_countries_api.py", false, 500) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  In this case, all data can be fetched on initialisation in one request. | ||||||
|  |     |  However, if you're working with text that contains incomplete country | ||||||
|  |     |  names, spelling mistakes or foreign-language versions, you could also | ||||||
|  |     |  implement a #[code like_country]-style getter function that makes a | ||||||
|  |     |  request to the search API endpoint and returns the best-matching | ||||||
|  |     |  result. | ||||||
|  | 
 | ||||||
|  | +h(4, "custom-components-usage-ideas") Other usage ideas | ||||||
|  | 
 | ||||||
|  | +list | ||||||
|  |     +item | ||||||
|  |         |  #[strong Adding new features and hooking in models]. For example, | ||||||
|  |         |  a sentiment analysis model, or your preferred solution for | ||||||
|  |         |  lemmatization or sentiment analysis. spaCy's built-in tagger, | ||||||
|  |         |  parser and entity recognizer respect annotations that were already | ||||||
|  |         |  set on the #[code Doc] in a previous step of the pipeline. | ||||||
|  |     +item | ||||||
|  |         |  #[strong Integrating other libraries and APIs]. For example, your | ||||||
|  |         |  pipeline component can write additional information and data | ||||||
|  |         |  directly to the #[code Doc] or #[code Token] as custom attributes, | ||||||
|  |         |  while making sure no information is lost in the process. This can | ||||||
|  |         |  be output generated by other libraries and models, or an external | ||||||
|  |         |  service with a REST API. | ||||||
|  |     +item | ||||||
|  |         |  #[strong Debugging and logging]. For example, a component which | ||||||
|  |         |  stores and/or exports relevant information about the current state | ||||||
|  |         |  of the processed document, and which you can insert at any point | ||||||
|  |         |  of your pipeline. | ||||||
|  | 
 | ||||||
|  | +infobox("Developing third-party extensions") | ||||||
|  |     |  The new pipeline management and custom attributes finally make it easy | ||||||
|  |     |  to develop your own spaCy extensions and plugins and share them with | ||||||
|  |     |  others. Extensions can claim their own #[code ._] namespace and exist as | ||||||
|  |     |  standalone packages. If you're developing a tool or library and want to | ||||||
|  |     |  make it easy for others to use it with spaCy and add it to their | ||||||
|  |     |  pipeline, all you have to do is expose a function that takes a | ||||||
|  |     |  #[code Doc], modifies it and returns it. For more details and | ||||||
|  |     |  #[strong best practices], see the section on | ||||||
|  |     |  #[+a("#extensions") developing spaCy extensions]. | ||||||
|  | 
 | ||||||
|  | +h(3, "custom-components-user-hooks") User hooks | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  While it's generally recommended to use the #[code Doc._], #[code Span._] | ||||||
|  |     |  and #[code Token._] proxies to add your own custom attributes, spaCy | ||||||
|  |     |  offers a few exceptions to allow #[strong customising the built-in methods] | ||||||
|  |     |  like #[+api("doc#similarity") #[code Doc.similarity]] or | ||||||
|  |     |  #[+api("doc#vector") #[code Doc.vector]] with your own hooks, which can | ||||||
|  |     |  rely on statistical models you train yourself. For instance, you can | ||||||
|  |     |  provide your own on-the-fly sentence segmentation algorithm or document | ||||||
|  |     |  similarity method. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Hooks let you customize some of the behaviours of the #[code Doc], | ||||||
|  |     |  #[code Span] or #[code Token] objects by adding a component to the | ||||||
|  |     |  pipeline. For instance, to customize the | ||||||
|  |     |  #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a | ||||||
|  |     |  component that sets a custom function to | ||||||
|  |     |  #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity] | ||||||
|  |     |  method will check the #[code user_hooks] dict, and delegate to your | ||||||
|  |     |  function if you've set one. Similar results can be achieved by setting | ||||||
|  |     |  functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks]. | ||||||
|  | 
 | ||||||
|  | +aside("Implementation note") | ||||||
|  |     |  The hooks live on the #[code Doc] object because the #[code Span] and | ||||||
|  |     |  #[code Token] objects are created lazily, and don't own any data. They | ||||||
|  |     |  just proxy to their parent #[code Doc]. This turns out to be convenient | ||||||
|  |     |  here — we only have to worry about installing hooks in one place. | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Customises"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code user_hooks] | ||||||
|  |         +cell | ||||||
|  |             +api("doc#vector") #[code Doc.vector] | ||||||
|  |             +api("doc#has_vector") #[code Doc.has_vector] | ||||||
|  |             +api("doc#vector_norm") #[code Doc.vector_norm] | ||||||
|  |             +api("doc#sents") #[code Doc.sents] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code user_token_hooks] | ||||||
|  |         +cell | ||||||
|  |             +api("token#similarity") #[code Token.similarity] | ||||||
|  |             +api("token#vector") #[code Token.vector] | ||||||
|  |             +api("token#has_vector") #[code Token.has_vector] | ||||||
|  |             +api("token#vector_norm") #[code Token.vector_norm] | ||||||
|  |             +api("token#conjuncts") #[code Token.conjuncts] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code user_span_hooks] | ||||||
|  |         +cell | ||||||
|  |             +api("span#similarity") #[code Span.similarity] | ||||||
|  |             +api("span#vector") #[code Span.vector] | ||||||
|  |             +api("span#has_vector") #[code Span.has_vector] | ||||||
|  |             +api("span#vector_norm") #[code Span.vector_norm] | ||||||
|  |             +api("span#root") #[code Span.root] | ||||||
|  | 
 | ||||||
|  | +code("Add custom similarity hooks"). | ||||||
|  |     class SimilarityModel(object): | ||||||
|  |         def __init__(self, model): | ||||||
|  |             self._model = model | ||||||
|  | 
 | ||||||
|  |         def __call__(self, doc): | ||||||
|  |             doc.user_hooks['similarity'] = self.similarity | ||||||
|  |             doc.user_span_hooks['similarity'] = self.similarity | ||||||
|  |             doc.user_token_hooks['similarity'] = self.similarity | ||||||
|  |             return doc | ||||||
|  | 
 | ||||||
|  |         def similarity(self, obj1, obj2): | ||||||
|  |             y = self._model([obj1.vector, obj2.vector]) | ||||||
|  |             return float(y[0]) | ||||||
|  | @ -1,126 +0,0 @@ | ||||||
| //- 💫 DOCS > USAGE > PROCESSING PIPELINES > EXAMPLES |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  To see real-world examples of pipeline factories and components in action, |  | ||||||
|     |  you can have a look at the source of spaCy's built-in components, e.g. |  | ||||||
|     |  the #[+api("tagger") #[code Tagger]], #[+api("parser") #[code Parser]] or |  | ||||||
|     |  #[+api("entityrecognizer") #[code EntityRecognizer]]. |  | ||||||
| 
 |  | ||||||
| +h(3, "example1") Example: Custom sentence segmentation logic |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  Let's say you want to implement custom logic to improve spaCy's sentence |  | ||||||
|     |  boundary detection. Currently, sentence segmentation is based on the |  | ||||||
|     |  dependency parse, which doesn't always produce ideal results. The custom |  | ||||||
|     |  logic should therefore be applied #[strong after] tokenization, but |  | ||||||
|     |  #[strong before] the dependency parsing – this way, the parser can also |  | ||||||
|     |  take advantage of the sentence boundaries. |  | ||||||
| 
 |  | ||||||
| +code. |  | ||||||
|     def sbd_component(doc): |  | ||||||
|         for i, token in enumerate(doc[:-2]): |  | ||||||
|             # define sentence start if period + titlecase token |  | ||||||
|             if token.text == '.' and doc[i+1].is_title: |  | ||||||
|                 doc[i+1].sent_start = True |  | ||||||
|         return doc |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  In this case, we simply want to add the component to the existing |  | ||||||
|     |  pipeline of the English model. We can do this by inserting it at index 0 |  | ||||||
|     |  of #[code nlp.pipeline]: |  | ||||||
| 
 |  | ||||||
| +code. |  | ||||||
|     nlp = spacy.load('en') |  | ||||||
|     nlp.pipeline.insert(0, sbd_component) |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  When you call #[code nlp] on some text, spaCy will tokenize it to create |  | ||||||
|     |  a #[code Doc] object, and first call #[code sbd_component] on it, followed |  | ||||||
|     |  by the model's default pipeline. |  | ||||||
| 
 |  | ||||||
| +h(3, "example2") Example: Sentiment model |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  Let's say you have trained your own document sentiment model on English |  | ||||||
|     |  text. After tokenization, you want spaCy to first execute the |  | ||||||
|     |  #[strong default tensorizer], followed by a custom |  | ||||||
|     |  #[strong sentiment component] that adds a #[code .sentiment] |  | ||||||
|     |  property to the #[code Doc], containing your model's sentiment prediction. |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  Your component class will have a #[code from_disk()] method that spaCy |  | ||||||
|     |  calls to load the model data. When called, the component will compute |  | ||||||
|     |  the sentiment score, add it to the #[code Doc] and return the modified |  | ||||||
|     |  document. Optionally, the component can include an #[code update()] method |  | ||||||
|     |  to allow training the model. |  | ||||||
| 
 |  | ||||||
| +code. |  | ||||||
|     import pickle |  | ||||||
|     from pathlib import Path |  | ||||||
| 
 |  | ||||||
|     class SentimentComponent(object): |  | ||||||
|         def __init__(self, vocab): |  | ||||||
|             self.weights = None |  | ||||||
| 
 |  | ||||||
|         def __call__(self, doc): |  | ||||||
|             doc.sentiment = sum(self.weights*doc.vector) # set sentiment property |  | ||||||
|             return doc |  | ||||||
| 
 |  | ||||||
|         def from_disk(self, path): # path = model path + factory ID ('sentiment') |  | ||||||
|             self.weights = pickle.load(Path(path) / 'weights.bin') # load weights |  | ||||||
|             return self |  | ||||||
| 
 |  | ||||||
|         def update(self, doc, gold): # update weights – allows training! |  | ||||||
|             prediction = sum(self.weights*doc.vector) |  | ||||||
|             self.weights -= 0.001*doc.vector*(prediction-gold.sentiment) |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  The factory will initialise the component with the #[code Vocab] object. |  | ||||||
|     |  To be able to add it to your model's pipeline as #[code 'sentiment'], |  | ||||||
|     |  it also needs to be registered via |  | ||||||
|     |  #[+api("spacy#set_factory") #[code set_factory()]]. |  | ||||||
| 
 |  | ||||||
| +code. |  | ||||||
|     def sentiment_factory(vocab): |  | ||||||
|         component = SentimentComponent(vocab) # initialise component |  | ||||||
|         return component |  | ||||||
| 
 |  | ||||||
|     spacy.set_factory('sentiment', sentiment_factory) |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  The above code should be #[strong shipped with your model]. You can use |  | ||||||
|     |  the #[+api("cli#package") #[code package]] command to create all required |  | ||||||
|     |  files and directories. The model package will include an |  | ||||||
|     |  #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]] |  | ||||||
|     |  with a #[code load()] method, that will initialise the language class with |  | ||||||
|     |  the model's pipeline and call the #[code from_disk()] method to load |  | ||||||
|     |  the model data. |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  In the model package's meta.json, specify the language class and pipeline |  | ||||||
|     |  IDs: |  | ||||||
| 
 |  | ||||||
| +code("meta.json (excerpt)", "json"). |  | ||||||
|     { |  | ||||||
|         "name": "sentiment_model", |  | ||||||
|         "lang": "en", |  | ||||||
|         "version": "1.0.0", |  | ||||||
|         "spacy_version": ">=2.0.0,<3.0.0", |  | ||||||
|         "pipeline": ["tensorizer", "sentiment"] |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  When you load your new model, spaCy will call the model's #[code load()] |  | ||||||
|     |  method. This will return a #[code Language] object with a pipeline |  | ||||||
|     |  containing the default tensorizer, and the sentiment component returned |  | ||||||
|     |  by your custom #[code "sentiment"] factory. |  | ||||||
| 
 |  | ||||||
| +code. |  | ||||||
|     nlp = spacy.load('en_sentiment_model') |  | ||||||
|     doc = nlp(u'I love pizza') |  | ||||||
|     assert doc.sentiment |  | ||||||
| 
 |  | ||||||
| +infobox("Saving and loading models") |  | ||||||
|     |  For more information and a detailed guide on how to package your model, |  | ||||||
|     |  see the documentation on |  | ||||||
|     |  #[+a("/usage/training#saving-loading") saving and loading models]. |  | ||||||
							
								
								
									
										110
									
								
								website/usage/_processing-pipelines/_extensions.jade
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										110
									
								
								website/usage/_processing-pipelines/_extensions.jade
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,110 @@ | ||||||
|  | //- 💫 DOCS > USAGE > PROCESSING PIPELINES > DEVELOPING EXTENSIONS | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  We're very excited about all the new possibilities for community | ||||||
|  |     |  extensions and plugins in spaCy v2.0, and we can't wait to see what | ||||||
|  |     |  you build with it! To get you started, here are a few tips, tricks and | ||||||
|  |     |  best practices: | ||||||
|  | 
 | ||||||
|  | +list | ||||||
|  |     +item | ||||||
|  |         |  Make sure to choose a #[strong descriptive and specific name] for | ||||||
|  |         |  your pipeline component class, and set it as its #[code name] | ||||||
|  |         |  attribute. Avoid names that are too common or likely to clash with | ||||||
|  |         |  built-in or a user's other custom components. While it's fine to call | ||||||
|  |         |  your package "spacy_my_extension", avoid component names including | ||||||
|  |         |  "spacy", since this can easily lead to confusion. | ||||||
|  | 
 | ||||||
|  |         +code-wrapper | ||||||
|  |             +code-new name = 'myapp_lemmatizer' | ||||||
|  |             +code-old name = 'lemmatizer' | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  When writing to #[code Doc], #[code Token] or #[code Span] objects, | ||||||
|  |         |  #[strong use getter functions] wherever possible, and avoid setting | ||||||
|  |         |  values explicitly. Tokens and spans don't own any data themselves, | ||||||
|  |         |  so you should provide a function that allows them to compute the | ||||||
|  |         |  values instead of writing static properties to individual objects. | ||||||
|  | 
 | ||||||
|  |         +code-wrapper | ||||||
|  |             +code-new. | ||||||
|  |                 is_fruit = lambda token: token.text in ('apple', 'orange') | ||||||
|  |                 Token.set_extension('is_fruit', getter=is_fruit) | ||||||
|  |             +code-old. | ||||||
|  |                 token._.set_extension('is_fruit', default=False) | ||||||
|  |                 if token.text in ('apple', 'orange'): | ||||||
|  |                     token._.set('is_fruit', True) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  Always add your custom attributes to the #[strong global] #[code Doc], | ||||||
|  |         |  #[code Token] or #[code Span] objects, not a particular instance of | ||||||
|  |         |  them. Add the attributes #[strong as early as possible], e.g. in | ||||||
|  |         |  your extension's #[code __init__] method or in the global scope of | ||||||
|  |         |  your module. This means that in the case of namespace collisions, | ||||||
|  |         |  the user will see an error immediately, not just when they run their | ||||||
|  |         |  pipeline. | ||||||
|  | 
 | ||||||
|  |         +code-wrapper | ||||||
|  |             +code-new. | ||||||
|  |                 from spacy.tokens import Doc | ||||||
|  |                 def __init__(self, attr='my_attr'): | ||||||
|  |                     Doc.set_extension(attr, getter=self.get_doc_attr) | ||||||
|  |             +code-old. | ||||||
|  |                 def __call__(doc): | ||||||
|  |                     doc.set_extension('my_attr', getter=self.get_doc_attr) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  If your extension is setting properties on the #[code Doc], | ||||||
|  |         |  #[code Token] or #[code Span], include an option to | ||||||
|  |         |  #[strong let the user change those attribute names]. This makes | ||||||
|  |         |  it easier to avoid namespace collisions and accommodate users with | ||||||
|  |         |  different naming preferences. We recommend adding an #[code attrs] | ||||||
|  |         |  argument to the #[code __init__] method of your class so you can | ||||||
|  |         |  write the names to class attributes and reuse them across your | ||||||
|  |         |  component. | ||||||
|  | 
 | ||||||
|  |         +code-wrapper | ||||||
|  |             +code-new Doc.set_extension(self.doc_attr, default='some value') | ||||||
|  |             +code-old Doc.set_extension('my_doc_attr', default='some value') | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  Ideally, extensions should be #[strong standalone packages] with | ||||||
|  |         |  spaCy and optionally, other packages specified as a dependency. They | ||||||
|  |         |  can freely assign to their own #[code ._] namespace, but should stick | ||||||
|  |         |  to that. If your extension's only job is to provide a better | ||||||
|  |         |  #[code .similarity] implementation, and your docs state this | ||||||
|  |         |  explicitly, there's no problem with writing to the | ||||||
|  |         |  #[+a("#custom-components-user-hooks") #[code user_hooks]], and | ||||||
|  |         |  overwriting spaCy's built-in method. However, a third-party | ||||||
|  |         |  extension should #[strong never silently overwrite built-ins], or | ||||||
|  |         |  attributes set by other extensions. | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  If you're looking to publish a model that depends on a custom | ||||||
|  |         |  pipeline component, you can either #[strong require it] in the model | ||||||
|  |         |  package's dependencies, or – if the component is specific and | ||||||
|  |         |  lightweight – choose to #[strong ship it with your model package] | ||||||
|  |         |  and add it to the #[code Language] instance returned by the | ||||||
|  |         |  model's #[code load()] method. For examples of this, check out the | ||||||
|  |         |  implementations of spaCy's | ||||||
|  |         |  #[+api("util#load_model_from_init_py") #[code load_model_from_init_py()]] | ||||||
|  |         |  and  #[+api("util#load_model_from_path") #[code load_model_from_path()]] | ||||||
|  |         |  utility functions. | ||||||
|  | 
 | ||||||
|  |         +code-wrapper | ||||||
|  |             +code-new. | ||||||
|  |                 nlp.add_pipe(my_custom_component) | ||||||
|  |                 return nlp.from_disk(model_path) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  Once you're ready to share your extension with others, make sure to | ||||||
|  |         |  #[strong add docs and installation instructions] (you can | ||||||
|  |         |  always link to this page for more info). Make it easy for others to | ||||||
|  |         |  install and use your extension, for example by uploading it to | ||||||
|  |         |  #[+a("https://pypi.python.org") PyPI]. If you're sharing your code on | ||||||
|  |         |  GitHub, don't forget to tag it | ||||||
|  |         |  with #[+a("https://github.com/search?q=topic%3Aspacy") #[code spacy]] | ||||||
|  |         |  and #[+a("https://github.com/search?q=topic%3Aspacy-pipeline") #[code spacy-pipeline]] | ||||||
|  |         |  to help people find it. If you post it on Twitter, feel free to tag | ||||||
|  |         |  #[+a("https://twitter.com/" + SOCIAL.twitter) @#{SOCIAL.twitter}] | ||||||
|  |         |  so we can check it out. | ||||||
|  | @ -11,7 +11,7 @@ p | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  When you load a model, spaCy first consults the model's |     |  When you load a model, spaCy first consults the model's | ||||||
|     |  #[+a("/usage/saving-loading#models-generating") meta.json]. The |     |  #[+a("/usage/saving-loading#models-generating") #[code meta.json]]. The | ||||||
|     |  meta typically includes the model details, the ID of a language class, |     |  meta typically includes the model details, the ID of a language class, | ||||||
|     |  and an optional list of pipeline components. spaCy then does the |     |  and an optional list of pipeline components. spaCy then does the | ||||||
|     |  following: |     |  following: | ||||||
|  | @ -21,24 +21,26 @@ p | ||||||
|         "name": "example_model", |         "name": "example_model", | ||||||
|         "lang": "en", |         "lang": "en", | ||||||
|         "description": "Example model for spaCy", |         "description": "Example model for spaCy", | ||||||
|         "pipeline": ["tensorizer", "tagger"] |         "pipeline": ["tagger", "parser"] | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
| +list("numbers") | +list("numbers") | ||||||
|     +item |  | ||||||
|         |  Look up #[strong pipeline IDs] in the available |  | ||||||
|         |  #[strong pipeline factories]. |  | ||||||
|     +item |  | ||||||
|         |  Initialise the #[strong pipeline components] by calling their |  | ||||||
|         |  factories with the #[code Vocab] as an argument. This gives each |  | ||||||
|         |  factory and component access to the pipeline's shared data, like |  | ||||||
|         |  strings, morphology and annotation scheme. |  | ||||||
|     +item |     +item | ||||||
|         |  Load the #[strong language class and data] for the given ID via |         |  Load the #[strong language class and data] for the given ID via | ||||||
|         |  #[+api("util.get_lang_class") #[code get_lang_class]]. |         |  #[+api("util.get_lang_class") #[code get_lang_class]] and initialise | ||||||
|  |         |  it. The #[code Language] class contains the shared vocabulary, | ||||||
|  |         |  tokenization rules and the language-specific annotation scheme. | ||||||
|     +item |     +item | ||||||
|         |  Pass the path to the #[strong model data] to the #[code Language] |         |  Iterate over the #[strong pipeline names] and create each component | ||||||
|         |  class and return it. |         |  using #[+api("language#create_pipe") #[code create_pipe]], which | ||||||
|  |         |  looks them up in #[code Language.factories]. | ||||||
|  |     +item | ||||||
|  |         |  Add each pipeline component to the pipeline in order, using | ||||||
|  |         |  #[+api("language#add_pipe") #[code add_pipe]]. | ||||||
|  |     +item | ||||||
|  |         |  Make the #[strong model data] available to the #[code Language] class | ||||||
|  |         |  by calling #[+api("language#from_disk") #[code from_disk]] with the | ||||||
|  |         |  path to the model data directory. | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  So when you call this... |     |  So when you call this... | ||||||
|  | @ -47,12 +49,12 @@ p | ||||||
|     nlp = spacy.load('en') |     nlp = spacy.load('en') | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     | ... the model tells spaCy to use the pipeline |     | ... the model tells spaCy to use the language #[code "en"] and the pipeline | ||||||
|     |  #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will |     |  #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will | ||||||
|     |  then look up each string in its internal factories registry and |     |  then initialise #[code spacy.lang.en.English], and create each pipeline | ||||||
|     |  initialise the individual components. It'll then load |     |  component and add it to the processing pipeline. It'll then load in the | ||||||
|     |  #[code spacy.lang.en.English], pass it the path to the model's data |     |  model's data from its data directory and return the modified | ||||||
|     |  directory, and return it for you to use as the #[code nlp] object. |     |  #[code Language] class for you to use as the #[code nlp] object. | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Fundamentally, a #[+a("/models") spaCy model] consists of three |     |  Fundamentally, a #[+a("/models") spaCy model] consists of three | ||||||
|  | @ -74,8 +76,11 @@ p | ||||||
|     data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0' |     data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0' | ||||||
| 
 | 
 | ||||||
|     cls = spacy.util.get_lang_class(lang)   # 1. get Language instance, e.g. English() |     cls = spacy.util.get_lang_class(lang)   # 1. get Language instance, e.g. English() | ||||||
|     nlp = cls(pipeline=pipeline)           # 2. initialise it with the pipeline |     nlp = cls()                             # 2. initialise it | ||||||
|     nlp.from_disk(model_data_path)         # 3. load in the binary data |     for name in pipeline: | ||||||
|  |         component = nlp.create_pipe(name)   # 3. create the pipeline components | ||||||
|  |         nlp.add_pipe(component)             # 4. add the component to the pipeline | ||||||
|  |     nlp.from_disk(model_data_path)          # 5. load in the binary data | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and |     |  When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and | ||||||
|  | @ -87,124 +92,23 @@ p | ||||||
|     |  document, which is then processed by the component next in the pipeline. |     |  document, which is then processed by the component next in the pipeline. | ||||||
| 
 | 
 | ||||||
| +code("The pipeline under the hood"). | +code("The pipeline under the hood"). | ||||||
|     doc = nlp.make_doc(u'This is a sentence') |     doc = nlp.make_doc(u'This is a sentence')   # create a Doc from raw text | ||||||
|     for proc in nlp.pipeline: |     for name, proc in nlp.pipeline:             # iterate over components in order | ||||||
|         doc = proc(doc) |         doc = proc(doc)                         # apply each component | ||||||
| 
 |  | ||||||
| +h(3, "creating") Creating pipeline components and factories |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  spaCy lets you customise the pipeline with your own components. Components |     |  The current processing pipeline is available as #[code nlp.pipeline], | ||||||
|     |  are functions that receive a #[code Doc] object, modify and return it. |     |  which returns a list of #[code (name, component)] tuples, or | ||||||
|     |  If your component is stateful, you'll want to create a new one for each |     |  #[code nlp.pipe_names], which only returns a list of human-readable | ||||||
|     |  pipeline. You can do that by defining and registering a factory which |     |  component names. | ||||||
|     |  receives the shared #[code Vocab] object and returns a component. |  | ||||||
| 
 |  | ||||||
| +h(4, "creating-component") Creating a  component |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  A component receives a #[code Doc] object and |  | ||||||
|     |  #[strong performs the actual processing] – for example, using the current |  | ||||||
|     |  weights to make a prediction and set some annotation on the document. By |  | ||||||
|     |  adding a component to the pipeline, you'll get access to the #[code Doc] |  | ||||||
|     |  at any point #[strong during] processing – instead of only being able to |  | ||||||
|     |  modify it afterwards. |  | ||||||
| 
 |  | ||||||
| +aside-code("Example"). |  | ||||||
|     def my_component(doc): |  | ||||||
|         # do something to the doc here |  | ||||||
|         return doc |  | ||||||
| 
 |  | ||||||
| +table(["Argument", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code doc] |  | ||||||
|         +cell #[code Doc] |  | ||||||
|         +cell The #[code Doc] object processed by the previous component. |  | ||||||
| 
 |  | ||||||
|     +row("foot") |  | ||||||
|         +cell returns |  | ||||||
|         +cell #[code Doc] |  | ||||||
|         +cell The #[code Doc] object processed by this pipeline component. |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  When creating a new #[code Language] class, you can pass it a list of |  | ||||||
|     |  pipeline component functions to execute in that order. You can also |  | ||||||
|     |  add it to an existing pipeline by modifying #[code nlp.pipeline] – just |  | ||||||
|     |  be careful not to overwrite a pipeline or its components by accident! |  | ||||||
| 
 | 
 | ||||||
| +code. | +code. | ||||||
|     # Create a new Language object with a pipeline |     nlp.pipeline | ||||||
|     from spacy.language import Language |     # [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)] | ||||||
|     nlp = Language(pipeline=[my_component]) |     nlp.pipe_names | ||||||
|  |     # ['tagger', 'parser', 'ner'] | ||||||
| 
 | 
 | ||||||
|     # Modify an existing pipeline | +h(3, "disabling") Disabling and modifying pipeline components | ||||||
|     nlp = spacy.load('en') |  | ||||||
|     nlp.pipeline.append(my_component) |  | ||||||
| 
 |  | ||||||
| +h(4, "creating-factory") Creating a factory |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  A factory is a #[strong function that returns a pipeline component]. |  | ||||||
|     |  It's called with the #[code Vocab] object, to give it access to the |  | ||||||
|     |  shared data between components – for example, the strings, morphology, |  | ||||||
|     |  vectors or annotation scheme. Factories are useful for creating |  | ||||||
|     |  #[strong stateful components], especially ones which |  | ||||||
|     |  #[strong depend on shared data]. |  | ||||||
| 
 |  | ||||||
| +aside-code("Example"). |  | ||||||
|     def my_factory(vocab): |  | ||||||
|         # load some state |  | ||||||
|         def my_component(doc): |  | ||||||
|             # process the doc |  | ||||||
|             return doc |  | ||||||
|         return my_component |  | ||||||
| 
 |  | ||||||
| +table(["Argument", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code vocab] |  | ||||||
|         +cell #[code Vocab] |  | ||||||
|         +cell |  | ||||||
|             |  Shared data between components, including strings, morphology, |  | ||||||
|             |  vectors etc. |  | ||||||
| 
 |  | ||||||
|     +row("foot") |  | ||||||
|         +cell returns |  | ||||||
|         +cell callable |  | ||||||
|         +cell The pipeline component. |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  By creating a factory, you're essentially telling spaCy how to get the |  | ||||||
|     |  pipeline component #[strong once the vocab is available]. Factories need to |  | ||||||
|     |  be registered via #[+api("spacy#set_factory") #[code set_factory()]] and |  | ||||||
|     |  by assigning them a unique ID. This ID can be added to the pipeline as a |  | ||||||
|     |  string. When creating a pipeline, you're free to mix strings and |  | ||||||
|     |  callable components: |  | ||||||
| 
 |  | ||||||
| +code. |  | ||||||
|     spacy.set_factory('my_factory', my_factory) |  | ||||||
|     nlp = Language(pipeline=['my_factory', my_other_component]) |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  If spaCy comes across a string in the pipeline, it will try to resolve it |  | ||||||
|     |  by looking it up in the available factories. The factory will then be |  | ||||||
|     |  initialised with the #[code Vocab]. Providing factory names instead of |  | ||||||
|     |  callables also makes it easy to specify them in the model's |  | ||||||
|     |  #[+a("/usage/saving-loading#models-generating") meta.json]. If you're |  | ||||||
|     |  training your own model and want to use one of spaCy's default components, |  | ||||||
|     |  you won't have to worry about finding and implementing it either – to use |  | ||||||
|     |  the default tagger, simply add #[code "tagger"] to the pipeline, and |  | ||||||
|     |  #[strong spaCy will know what to do]. |  | ||||||
| 
 |  | ||||||
| +infobox("Important note") |  | ||||||
|     |  Because factories are #[strong resolved on initialisation] of the |  | ||||||
|     |  #[code Language] class, it's #[strong not possible] to add them to the |  | ||||||
|     |  pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only |  | ||||||
|     |  works with individual component functions. To use factories, you need to |  | ||||||
|     |  create a new #[code Language] object, or generate a |  | ||||||
|     |  #[+a("/usage/training#models-generating") model package] with |  | ||||||
|     |  a custom pipeline. |  | ||||||
| 
 |  | ||||||
| +h(3, "disabling") Disabling pipeline components |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  If you don't need a particular component of the pipeline – for |     |  If you don't need a particular component of the pipeline – for | ||||||
|  | @ -217,16 +121,19 @@ p | ||||||
| +code. | +code. | ||||||
|     nlp = spacy.load('en', disable=['parser', 'tagger']) |     nlp = spacy.load('en', disable=['parser', 'tagger']) | ||||||
|     nlp = English().from_disk('/model', disable=['tensorizer', 'ner']) |     nlp = English().from_disk('/model', disable=['tensorizer', 'ner']) | ||||||
|     doc = nlp(u"I don't want parsed", disable=['parser']) |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Note that you can't write directly to #[code nlp.pipeline], as this list |     |  You can also use the #[+api("language#remove_pipe") #[code remove_pipe]] | ||||||
|     |  holds the #[em actual components], not the IDs. However, if you know the |     |  method to remove pipeline components from an existing pipeline, the | ||||||
|     |  order of the components, you can still slice the list: |     |  #[+api("language#rename_pipe") #[code rename_pipe]] method to rename them, | ||||||
|  |     |  or the #[+api("language#replace_pipe") #[code replace_pipe]] method | ||||||
|  |     |  to replace them with a custom component entirely (more details on this | ||||||
|  |     |  in the section on #[+a("#custom-components") custom components]). | ||||||
| 
 | 
 | ||||||
| +code. | +code. | ||||||
|     nlp = spacy.load('en') |     nlp.remove_pipe('parser') | ||||||
|     nlp.pipeline = nlp.pipeline[:2] # only use the first two components |     nlp.rename_pipe('ner', 'entityrecognizer') | ||||||
|  |     nlp.replace_pipe('tagger', my_custom_tagger) | ||||||
| 
 | 
 | ||||||
| +infobox("Important note: disabling pipeline components") | +infobox("Important note: disabling pipeline components") | ||||||
|     .o-block |     .o-block | ||||||
|  | @ -234,12 +141,14 @@ p | ||||||
|         |  processing pipeline components, the #[code parser], #[code tagger] |         |  processing pipeline components, the #[code parser], #[code tagger] | ||||||
|         |  and #[code entity] keyword arguments have been replaced with |         |  and #[code entity] keyword arguments have been replaced with | ||||||
|         |  #[code disable], which takes a list of pipeline component names. |         |  #[code disable], which takes a list of pipeline component names. | ||||||
|         |  This lets you disable both default and custom components when loading |         |  This lets you disable pre-defined components when loading | ||||||
|         |  a model, or initialising a Language class via |         |  a model, or initialising a Language class via | ||||||
|         |  #[+api("language#from_disk") #[code from_disk]]. |         |  #[+api("language#from_disk") #[code from_disk]]. | ||||||
|  | 
 | ||||||
|     +code-new. |     +code-new. | ||||||
|         nlp = spacy.load('en', disable=['tagger', 'ner']) |         nlp = spacy.load('en', disable=['ner']) | ||||||
|         doc = nlp(u"I don't want parsed", disable=['parser']) |         nlp.remove_pipe('parser') | ||||||
|  |         doc = nlp(u"I don't want parsed") | ||||||
|     +code-old. |     +code-old. | ||||||
|         nlp = spacy.load('en', tagger=False, entity=False) |         nlp = spacy.load('en', tagger=False, entity=False) | ||||||
|         doc = nlp(u"I don't want parsed", parse=False) |         doc = nlp(u"I don't want parsed", parse=False) | ||||||
|  |  | ||||||
|  | @ -1,61 +0,0 @@ | ||||||
| //- 💫 DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  Hooks let you customize some of the behaviours of the #[code Doc], |  | ||||||
|     |  #[code Span] or #[code Token] objects by adding a component to the |  | ||||||
|     |  pipeline. For instance, to customize the |  | ||||||
|     |  #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a |  | ||||||
|     |  component that sets a custom function to |  | ||||||
|     |  #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity] |  | ||||||
|     |  method will check the #[code user_hooks] dict, and delegate to your |  | ||||||
|     |  function if you've set one. Similar results can be achieved by setting |  | ||||||
|     |  functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks]. |  | ||||||
| 
 |  | ||||||
| +code("Polymorphic similarity example"). |  | ||||||
|     span.similarity(doc) |  | ||||||
|     token.similarity(span) |  | ||||||
|     doc1.similarity(doc2) |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  By default, this just averages the vectors for each document, and |  | ||||||
|     |  computes their cosine. Obviously, spaCy should make it easy for you to |  | ||||||
|     |  install your own similarity model. This introduces a tricky design |  | ||||||
|     |  challenge. The current solution is to add three more dicts to the |  | ||||||
|     |  #[code Doc] object: |  | ||||||
| 
 |  | ||||||
| +aside("Implementation note") |  | ||||||
|     |  The hooks live on the #[code Doc] object because the #[code Span] and |  | ||||||
|     |  #[code Token] objects are created lazily, and don't own any data. They |  | ||||||
|     |  just proxy to their parent #[code Doc]. This turns out to be convenient |  | ||||||
|     |  here — we only have to worry about installing hooks in one place. |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code user_hooks] |  | ||||||
|         +cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents] |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code user_token_hooks] |  | ||||||
|         +cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts] |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code user_span_hooks] |  | ||||||
|         +cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root] |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  To sum up, here's an example of hooking in custom #[code .similarity()] |  | ||||||
|     |  methods: |  | ||||||
| 
 |  | ||||||
| +code("Add custom similarity hooks"). |  | ||||||
|     class SimilarityModel(object): |  | ||||||
|         def __init__(self, model): |  | ||||||
|             self._model = model |  | ||||||
| 
 |  | ||||||
|         def __call__(self, doc): |  | ||||||
|             doc.user_hooks['similarity'] = self.similarity |  | ||||||
|             doc.user_span_hooks['similarity'] = self.similarity |  | ||||||
|             doc.user_token_hooks['similarity'] = self.similarity |  | ||||||
| 
 |  | ||||||
|         def similarity(self, obj1, obj2): |  | ||||||
|             y = self._model([obj1.vector, obj2.vector]) |  | ||||||
|             return float(y[0]) |  | ||||||
|  | @ -175,7 +175,7 @@ p | ||||||
| 
 | 
 | ||||||
| +code. | +code. | ||||||
|     import spacy |     import spacy | ||||||
|     from spacy.tokens.doc import Doc |     from spacy.tokens import Doc | ||||||
|     from spacy.vocab import Vocab |     from spacy.vocab import Vocab | ||||||
| 
 | 
 | ||||||
|     nlp = spacy.load('en') |     nlp = spacy.load('en') | ||||||
|  |  | ||||||
|  | @ -61,7 +61,7 @@ p | ||||||
|         output_path.open('w', encoding='utf-8').write(svg) |         output_path.open('w', encoding='utf-8').write(svg) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  The above code will generate the dependency visualizations and them to |     |  The above code will generate the dependency visualizations as | ||||||
|     |  two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg]. |     |  two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg]. | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -2,6 +2,44 @@ | ||||||
| 
 | 
 | ||||||
| include ../_includes/_mixins | include ../_includes/_mixins | ||||||
| 
 | 
 | ||||||
|  | +section("pipeline") | ||||||
|  |     +h(3, "custom-components-entities") Custom pipeline components and attribute extensions | ||||||
|  |         +tag-new(2) | ||||||
|  | 
 | ||||||
|  |     p | ||||||
|  |         |  This example shows the implementation of a pipeline component | ||||||
|  |         |  that sets entity annotations based on a list of single or | ||||||
|  |         |  multiple-word company names, merges entities into one token and | ||||||
|  |         |  sets custom attributes on the #[code Doc], #[code Span] and | ||||||
|  |         |  #[code Token]. | ||||||
|  | 
 | ||||||
|  |     +github("spacy", "examples/pipeline/custom_component_entities.py") | ||||||
|  | 
 | ||||||
|  |     +h(3, "custom-components-api") | ||||||
|  |         |  Custom pipeline components and attribute extensions via a REST API | ||||||
|  |         +tag-new(2) | ||||||
|  | 
 | ||||||
|  |     p | ||||||
|  |         |  This example shows the implementation of a pipeline component | ||||||
|  |         |  that fetches country meta data via the | ||||||
|  |         |  #[+a("https://restcountries.eu") REST Countries API], sets entity | ||||||
|  |         |  annotations for countries, merges entities into one token and | ||||||
|  |         |  sets custom attributes on the #[code Doc], #[code Span] and | ||||||
|  |         |  #[code Token] – for example, the capital, latitude/longitude | ||||||
|  |         |  coordinates and the country flag. | ||||||
|  | 
 | ||||||
|  |     +github("spacy", "examples/pipeline/custom_component_countries_api.py") | ||||||
|  | 
 | ||||||
|  |     +h(3, "custom-components-attr-methods") Custom method extensions | ||||||
|  |         +tag-new(2) | ||||||
|  | 
 | ||||||
|  |     p | ||||||
|  |         |  A collection of snippets showing examples of extensions adding | ||||||
|  |         |  custom methods to the #[code Doc], #[code Token] and | ||||||
|  |         |  #[code Span]. | ||||||
|  | 
 | ||||||
|  |     +github("spacy", "examples/pipeline/custom_attr_methods.py") | ||||||
|  | 
 | ||||||
| +section("matching") | +section("matching") | ||||||
|     +h(3, "matcher") Using spaCy's rule-based matcher |     +h(3, "matcher") Using spaCy's rule-based matcher | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -8,18 +8,18 @@ include _spacy-101/_pipelines | ||||||
|     +h(2, "pipelines") How pipelines work |     +h(2, "pipelines") How pipelines work | ||||||
|     include _processing-pipelines/_pipelines |     include _processing-pipelines/_pipelines | ||||||
| 
 | 
 | ||||||
| +section("examples") | +section("custom-components") | ||||||
|     +h(2, "examples") Examples |     +h(2, "custom-components") Creating custom pipeline components | ||||||
|     include _processing-pipelines/_examples |     include _processing-pipelines/_custom-components | ||||||
|  | 
 | ||||||
|  | +section("extensions") | ||||||
|  |     +h(2, "extensions") Developing spaCy extensions | ||||||
|  |     include _processing-pipelines/_extensions | ||||||
| 
 | 
 | ||||||
| +section("multithreading") | +section("multithreading") | ||||||
|     +h(2, "multithreading") Multi-threading |     +h(2, "multithreading") Multi-threading | ||||||
|     include _processing-pipelines/_multithreading |     include _processing-pipelines/_multithreading | ||||||
| 
 | 
 | ||||||
| +section("user-hooks") |  | ||||||
|     +h(2, "user-hooks") User hooks |  | ||||||
|     include _processing-pipelines/_user-hooks |  | ||||||
| 
 |  | ||||||
| +section("serialization") | +section("serialization") | ||||||
|     +h(2, "serialization") Serialization |     +h(2, "serialization") Serialization | ||||||
|     include _processing-pipelines/_serialization |     include _processing-pipelines/_serialization | ||||||
|  |  | ||||||
|  | @ -102,30 +102,36 @@ p | ||||||
|     +h(3, "features-pipelines") Improved processing pipelines |     +h(3, "features-pipelines") Improved processing pipelines | ||||||
| 
 | 
 | ||||||
|     +aside-code("Example"). |     +aside-code("Example"). | ||||||
|         # Modify an existing pipeline |         # Set custom attributes | ||||||
|         nlp = spacy.load('en') |         Doc.set_extension('my_attr', default=False) | ||||||
|         nlp.pipeline.append(my_component) |         Token.set_extension('my_attr', getter=my_token_getter) | ||||||
|  |         assert doc._.my_attr, token._.my_attr | ||||||
| 
 | 
 | ||||||
|         # Register a factory to create a component |         # Add components to the pipeline | ||||||
|         spacy.set_factory('my_factory', my_factory) |         my_component = lambda doc: doc | ||||||
|         nlp = Language(pipeline=['my_factory', mycomponent]) |         nlp.add_pipe(my_component) | ||||||
| 
 | 
 | ||||||
|     p |     p | ||||||
|         |  It's now much easier to #[strong customise the pipeline] with your own |         |  It's now much easier to #[strong customise the pipeline] with your own | ||||||
|         |  components, functions that receive a #[code Doc] object, modify and |         |  components: functions that receive a #[code Doc] object, modify and | ||||||
|         |  return it. If your component is stateful, you can define and register a |         |  return it. Extensions let you write any | ||||||
|         |  factory which receives the shared #[code Vocab] object and returns a |         |  #[strong attributes, properties and methods] to the #[code Doc], | ||||||
|         |  component. spaCy's default components can be added to your pipeline by |         |  #[code Token] and #[code Span]. You can add data, implement new | ||||||
|         |  using their string IDs. This way, you won't have to worry about finding |         |  features, integrate other libraries with spaCy or plug in your own | ||||||
|         |  and implementing them – simply add #[code "tagger"] to the pipeline, |         |  machine learning models. | ||||||
|         |  and spaCy will know what to do. |  | ||||||
| 
 | 
 | ||||||
|     +image |     +image | ||||||
|         include ../assets/img/pipeline.svg |         include ../assets/img/pipeline.svg | ||||||
| 
 | 
 | ||||||
|     +infobox |     +infobox | ||||||
|         |  #[+label-inline API:] #[+api("language") #[code Language]] |         |  #[+label-inline API:] #[+api("language") #[code Language]], | ||||||
|         |  #[+label-inline Usage:] #[+a("/usage/language-processing-pipeline") Processing text] |         |  #[+api("doc#set_extension") #[code Doc.set_extension]], | ||||||
|  |         |  #[+api("span#set_extension") #[code Span.set_extension]], | ||||||
|  |         |  #[+api("token#set_extension") #[code Token.set_extension]] | ||||||
|  |         |  #[+label-inline Usage:] | ||||||
|  |         |  #[+a("/usage/processing-pipelines") Processing pipelines] | ||||||
|  |         |  #[+label-inline Code:] | ||||||
|  |         |  #[+src("/usage/examples#section-pipeline") Pipeline examples] | ||||||
| 
 | 
 | ||||||
|     +h(3, "features-text-classification") Text classification |     +h(3, "features-text-classification") Text classification | ||||||
| 
 | 
 | ||||||
|  | @ -478,15 +484,16 @@ p | ||||||
|     p |     p | ||||||
|         |  If you've been using custom pipeline components, check out the new |         |  If you've been using custom pipeline components, check out the new | ||||||
|         |  guide on #[+a("/usage/language-processing-pipelines") processing pipelines]. |         |  guide on #[+a("/usage/language-processing-pipelines") processing pipelines]. | ||||||
|         |  Appending functions to the pipeline still works – but you might be able |         |  Appending functions to the pipeline still works – but the | ||||||
|         |  to make this more convenient by registering "component factories". |         |  #[+api("language#add_pipe") #[code add_pipe]] method now makes this | ||||||
|         |  Components of the processing pipeline can now be disabled by passing a |         |  much more convenient. Components of the processing pipeline can now | ||||||
|         |  list of their names to the #[code disable] keyword argument on loading |         |  be disabled by passing a list of their names to the #[code disable] | ||||||
|         |  or processing. |         |  keyword argument on load, or by simply removing them from the | ||||||
|  |         |  pipeline altogether. | ||||||
| 
 | 
 | ||||||
|     +code-new. |     +code-new. | ||||||
|         nlp = spacy.load('en', disable=['tagger', 'ner']) |         nlp = spacy.load('en', disable=['tagger', 'ner']) | ||||||
|         doc = nlp(u"I don't want parsed", disable=['parser']) |         nlp.remove_pipe('parser') | ||||||
|     +code-old. |     +code-old. | ||||||
|         nlp = spacy.load('en', tagger=False, entity=False) |         nlp = spacy.load('en', tagger=False, entity=False) | ||||||
|         doc = nlp(u"I don't want parsed", parse=False) |         doc = nlp(u"I don't want parsed", parse=False) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user