Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)
	Tidy up and improve docs and docstrings (#3370)
## Description

* tidy up and adjust Cython code to code style
* improve docstrings and make calling `help()` nicer
* add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects
* fix various typos and inconsistencies in docs

### Types of change

enhancement, docs

## Checklist

- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
This commit is contained in:
parent daaeeb7a2b
commit 296446a1c8
				|  | @ -1,4 +1,11 @@ | |||
| # coding: utf8 | ||||
| """ | ||||
| Helpers for Python and platform compatibility. To distinguish them from | ||||
| the builtin functions, replacement functions are suffixed with an underscore, | ||||
| e.g. `unicode_`. | ||||
| 
 | ||||
| DOCS: https://spacy.io/api/top-level#compat | ||||
| """ | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import os | ||||
|  | @ -64,19 +71,23 @@ elif is_python3: | |||
| 
 | ||||
| 
 | ||||
| def b_to_str(b_str): | ||||
|     """Convert a bytes object to a string. | ||||
| 
 | ||||
|     b_str (bytes): The object to convert. | ||||
|     RETURNS (unicode): The converted string. | ||||
|     """ | ||||
|     if is_python2: | ||||
|         return b_str | ||||
|     # important: if no encoding is set, string becomes "b'...'" | ||||
|     # Important: if no encoding is set, string becomes "b'...'" | ||||
|     return str(b_str, encoding="utf8") | ||||
| 
 | ||||
| 
 | ||||
| def getattr_(obj, name, *default): | ||||
|     if is_python3 and isinstance(name, bytes): | ||||
|         name = name.decode("utf8") | ||||
|     return getattr(obj, name, *default) | ||||
| 
 | ||||
| 
 | ||||
| def symlink_to(orig, dest): | ||||
|     """Create a symlink. Used for model shortcut links. | ||||
| 
 | ||||
|     orig (unicode / Path): The origin path. | ||||
|     dest (unicode / Path): The destination path of the symlink. | ||||
|     """ | ||||
|     if is_windows: | ||||
|         import subprocess | ||||
| 
 | ||||
|  | @ -86,6 +97,10 @@ def symlink_to(orig, dest): | |||
| 
 | ||||
| 
 | ||||
| def symlink_remove(link): | ||||
|     """Remove a symlink. Used for model shortcut links. | ||||
| 
 | ||||
|     link (unicode / Path): The path to the symlink. | ||||
|     """ | ||||
|     # https://stackoverflow.com/q/26554135/6400719 | ||||
|     if os.path.isdir(path2str(link)) and is_windows: | ||||
|         # this should only be on Py2.7 and windows | ||||
|  | @ -95,6 +110,18 @@ def symlink_remove(link): | |||
| 
 | ||||
| 
 | ||||
| def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): | ||||
|     """Check if a specific configuration of Python version and operating system | ||||
|     matches the user's setup. Mostly used to display targeted error messages. | ||||
| 
 | ||||
|     python2 (bool): spaCy is executed with Python 2.x. | ||||
|     python3 (bool): spaCy is executed with Python 3.x. | ||||
|     windows (bool): spaCy is executed on Windows. | ||||
|     linux (bool): spaCy is executed on Linux. | ||||
|     osx (bool): spaCy is executed on OS X or macOS. | ||||
|     RETURNS (bool): Whether the configuration matches the user's platform. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/top-level#compat.is_config | ||||
|     """ | ||||
|     return ( | ||||
|         python2 in (None, is_python2) | ||||
|         and python3 in (None, is_python3) | ||||
|  | @ -104,19 +131,14 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): | |||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def normalize_string_keys(old): | ||||
|     """Given a dictionary, make sure keys are unicode strings, not bytes.""" | ||||
|     new = {} | ||||
|     for key, value in old.items(): | ||||
|         if isinstance(key, bytes_): | ||||
|             new[key.decode("utf8")] = value | ||||
|         else: | ||||
|             new[key] = value | ||||
|     return new | ||||
| 
 | ||||
| 
 | ||||
| def import_file(name, loc): | ||||
|     loc = str(loc) | ||||
|     """Import module from a file. Used to load models from a directory. | ||||
| 
 | ||||
|     name (unicode): Name of module to load. | ||||
|     loc (unicode / Path): Path to the file. | ||||
|     RETURNS: The loaded module. | ||||
|     """ | ||||
|     loc = path2str(loc) | ||||
|     if is_python_pre_3_5: | ||||
|         import imp | ||||
| 
 | ||||
|  |  | |||
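For reference, the `is_config` helper documented above is typically used to gate platform-specific hints. A minimal sketch (the message text is illustrative):

```python
from spacy.compat import is_config

# Show a targeted hint only on Python 2 + Windows, where creating model
# shortcut symlinks usually requires elevated privileges.
if is_config(python2=True, windows=True):
    print("Run this command from a shell started as Administrator.")
```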
|  | @ -1,4 +1,10 @@ | |||
| # coding: utf8 | ||||
| """ | ||||
| spaCy's built in visualization suite for dependencies and named entities. | ||||
| 
 | ||||
| DOCS: https://spacy.io/api/top-level#displacy | ||||
| USAGE: https://spacy.io/usage/visualizers | ||||
| """ | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .render import DependencyRenderer, EntityRenderer | ||||
|  | @ -25,6 +31,9 @@ def render( | |||
|     options (dict): Visualiser-specific options, e.g. colors. | ||||
|     manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. | ||||
|     RETURNS (unicode): Rendered HTML markup. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/top-level#displacy.render | ||||
|     USAGE: https://spacy.io/usage/visualizers | ||||
|     """ | ||||
|     factories = { | ||||
|         "dep": (DependencyRenderer, parse_deps), | ||||
|  | @ -71,6 +80,9 @@ def serve( | |||
|     manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. | ||||
|     port (int): Port to serve visualisation. | ||||
|     host (unicode): Host to serve visualisation. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/top-level#displacy.serve | ||||
|     USAGE: https://spacy.io/usage/visualizers | ||||
|     """ | ||||
|     from wsgiref import simple_server | ||||
| 
 | ||||
|  |  | |||
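The displacy docstrings above link to the usage guide; a minimal render example follows (assumes the `en_core_web_sm` model is installed; `serve()` takes the same arguments but starts a simple web server instead of returning markup):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying a U.K. startup.")

# render() returns the HTML markup as a string.
html = displacy.render(doc, style="ent")
```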
|  | @ -338,6 +338,17 @@ class Errors(object): | |||
|             "or with a getter AND setter.") | ||||
|     E120 = ("Can't set custom extension attributes during retokenization. " | ||||
|             "Expected dict mapping attribute names to values, but got: {value}") | ||||
|     E121 = ("Can't bulk merge spans. Attribute length {attr_len} should be " | ||||
|             "equal to span length ({span_len}).") | ||||
|     E122 = ("Cannot find token to be split. Did it get merged?") | ||||
|     E123 = ("Cannot find head of token to be split. Did it get merged?") | ||||
|     E124 = ("Cannot read from file: {path}. Supported formats: .json, .msg") | ||||
|     E125 = ("Unexpected value: {value}") | ||||
|     E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. " | ||||
|             "This is likely a bug in spaCy, so feel free to open an issue.") | ||||
|     E127 = ("Cannot create phrase pattern representation for length 0. This " | ||||
|             "is likely a bug in spaCy.") | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| @add_codes | ||||
|  |  | |||
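The new error codes follow the existing convention: each message is a template that gets formatted at the raise site, as in the `gold.pyx` change further down. A minimal sketch (the path value is illustrative):

```python
from spacy.errors import Errors

path = "training_data.csv"
if not path.endswith((".json", ".msg")):
    # E124 takes a `path` placeholder; the raised message carries the code.
    raise ValueError(Errors.E124.format(path=path))
```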
spacy/gold.pyx (190 lines changed)
							|  | @ -14,34 +14,38 @@ from . import _align | |||
| from .syntax import nonproj | ||||
| from .tokens import Doc, Span | ||||
| from .errors import Errors | ||||
| from .compat import path2str | ||||
| from . import util | ||||
| from .util import minibatch, itershuffle | ||||
| 
 | ||||
| from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek | ||||
| 
 | ||||
| 
 | ||||
| punct_re = re.compile(r"\W") | ||||
| 
 | ||||
| 
 | ||||
| def tags_to_entities(tags): | ||||
|     entities = [] | ||||
|     start = None | ||||
|     for i, tag in enumerate(tags): | ||||
|         if tag is None: | ||||
|             continue | ||||
|         if tag.startswith('O'): | ||||
|         if tag.startswith("O"): | ||||
|             # TODO: We shouldn't be getting these malformed inputs. Fix this. | ||||
|             if start is not None: | ||||
|                 start = None | ||||
|             continue | ||||
|         elif tag == '-': | ||||
|         elif tag == "-": | ||||
|             continue | ||||
|         elif tag.startswith('I'): | ||||
|         elif tag.startswith("I"): | ||||
|             if start is None: | ||||
|                 raise ValueError(Errors.E067.format(tags=tags[:i + 1])) | ||||
|             continue | ||||
|         if tag.startswith('U'): | ||||
|         if tag.startswith("U"): | ||||
|             entities.append((tag[2:], i, i)) | ||||
|         elif tag.startswith('B'): | ||||
|         elif tag.startswith("B"): | ||||
|             start = i | ||||
|         elif tag.startswith('L'): | ||||
|         elif tag.startswith("L"): | ||||
|             entities.append((tag[2:], start, i)) | ||||
|             start = None | ||||
|         else: | ||||
|  | @ -60,19 +64,18 @@ def merge_sents(sents): | |||
|         m_deps[3].extend(head + i for head in heads) | ||||
|         m_deps[4].extend(labels) | ||||
|         m_deps[5].extend(ner) | ||||
|         m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) | ||||
|         m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) | ||||
|                           for b in brackets) | ||||
|         i += len(ids) | ||||
|     return [(m_deps, m_brackets)] | ||||
| 
 | ||||
| 
 | ||||
| punct_re = re.compile(r'\W') | ||||
| def align(cand_words, gold_words): | ||||
|     if cand_words == gold_words: | ||||
|         alignment = numpy.arange(len(cand_words)) | ||||
|         return 0, alignment, alignment, {}, {} | ||||
|     cand_words = [w.replace(' ', '').lower() for w in cand_words] | ||||
|     gold_words = [w.replace(' ', '').lower() for w in gold_words] | ||||
|     cand_words = [w.replace(" ", "").lower() for w in cand_words] | ||||
|     gold_words = [w.replace(" ", "").lower() for w in gold_words] | ||||
|     cost, i2j, j2i, matrix = _align.align(cand_words, gold_words) | ||||
|     i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in cand_words], | ||||
|                                 [len(w) for w in gold_words]) | ||||
|  | @ -89,7 +92,10 @@ def align(cand_words, gold_words): | |||
| 
 | ||||
| class GoldCorpus(object): | ||||
|     """An annotated corpus, using the JSON file format. Manages | ||||
|     annotations for tagging, dependency parsing and NER.""" | ||||
|     annotations for tagging, dependency parsing and NER. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/goldcorpus | ||||
|     """ | ||||
|     def __init__(self, train, dev, gold_preproc=False, limit=None): | ||||
|         """Create a GoldCorpus. | ||||
| 
 | ||||
|  | @ -101,12 +107,10 @@ class GoldCorpus(object): | |||
|         if isinstance(train, str) or isinstance(train, Path): | ||||
|             train = self.read_tuples(self.walk_corpus(train)) | ||||
|             dev = self.read_tuples(self.walk_corpus(dev)) | ||||
| 
 | ||||
|         # Write temp directory with one doc per file, so we can shuffle | ||||
|         # and stream | ||||
|         # Write temp directory with one doc per file, so we can shuffle and stream | ||||
|         self.tmp_dir = Path(tempfile.mkdtemp()) | ||||
|         self.write_msgpack(self.tmp_dir / 'train', train, limit=self.limit) | ||||
|         self.write_msgpack(self.tmp_dir / 'dev', dev, limit=self.limit) | ||||
|         self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit) | ||||
|         self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit) | ||||
| 
 | ||||
|     def __del__(self): | ||||
|         shutil.rmtree(self.tmp_dir) | ||||
|  | @ -117,7 +121,7 @@ class GoldCorpus(object): | |||
|             directory.mkdir() | ||||
|         n = 0 | ||||
|         for i, doc_tuple in enumerate(doc_tuples): | ||||
|             srsly.write_msgpack(directory / '{}.msg'.format(i), [doc_tuple]) | ||||
|             srsly.write_msgpack(directory / "{}.msg".format(i), [doc_tuple]) | ||||
|             n += len(doc_tuple[1]) | ||||
|             if limit and n >= limit: | ||||
|                 break | ||||
|  | @ -134,11 +138,11 @@ class GoldCorpus(object): | |||
|             if str(path) in seen: | ||||
|                 continue | ||||
|             seen.add(str(path)) | ||||
|             if path.parts[-1].startswith('.'): | ||||
|             if path.parts[-1].startswith("."): | ||||
|                 continue | ||||
|             elif path.is_dir(): | ||||
|                 paths.extend(path.iterdir()) | ||||
|             elif path.parts[-1].endswith('.json'): | ||||
|             elif path.parts[-1].endswith(".json"): | ||||
|                 locs.append(path) | ||||
|         return locs | ||||
| 
 | ||||
|  | @ -147,13 +151,12 @@ class GoldCorpus(object): | |||
|         i = 0 | ||||
|         for loc in locs: | ||||
|             loc = util.ensure_path(loc) | ||||
|             if loc.parts[-1].endswith('json'): | ||||
|             if loc.parts[-1].endswith("json"): | ||||
|                 gold_tuples = read_json_file(loc) | ||||
|             elif loc.parts[-1].endswith('msg'): | ||||
|             elif loc.parts[-1].endswith("msg"): | ||||
|                 gold_tuples = srsly.read_msgpack(loc) | ||||
|             else: | ||||
|                 msg = "Cannot read from file: %s. Supported formats: .json, .msg" | ||||
|                 raise ValueError(msg % loc) | ||||
|                 raise ValueError(Errors.E124.format(path=path2str(loc))) | ||||
|             for item in gold_tuples: | ||||
|                 yield item | ||||
|                 i += len(item[1]) | ||||
|  | @ -162,12 +165,12 @@ class GoldCorpus(object): | |||
| 
 | ||||
|     @property | ||||
|     def dev_tuples(self): | ||||
|         locs = (self.tmp_dir / 'dev').iterdir() | ||||
|         locs = (self.tmp_dir / "dev").iterdir() | ||||
|         yield from self.read_tuples(locs, limit=self.limit) | ||||
| 
 | ||||
|     @property | ||||
|     def train_tuples(self): | ||||
|         locs = (self.tmp_dir / 'train').iterdir() | ||||
|         locs = (self.tmp_dir / "train").iterdir() | ||||
|         yield from self.read_tuples(locs, limit=self.limit) | ||||
| 
 | ||||
|     def count_train(self): | ||||
|  | @ -193,8 +196,7 @@ class GoldCorpus(object): | |||
|         yield from gold_docs | ||||
| 
 | ||||
|     def dev_docs(self, nlp, gold_preproc=False): | ||||
|         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, | ||||
|                                         gold_preproc=gold_preproc) | ||||
|         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc=gold_preproc) | ||||
|         yield from gold_docs | ||||
| 
 | ||||
|     @classmethod | ||||
|  | @ -205,32 +207,29 @@ class GoldCorpus(object): | |||
|                 raw_text = None | ||||
|             else: | ||||
|                 paragraph_tuples = merge_sents(paragraph_tuples) | ||||
|             docs = cls._make_docs(nlp, raw_text, paragraph_tuples, | ||||
|                                   gold_preproc, noise_level=noise_level) | ||||
|             docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc, | ||||
|                                   noise_level=noise_level) | ||||
|             golds = cls._make_golds(docs, paragraph_tuples, make_projective) | ||||
|             for doc, gold in zip(docs, golds): | ||||
|                 if (not max_length) or len(doc) < max_length: | ||||
|                     yield doc, gold | ||||
| 
 | ||||
|     @classmethod | ||||
|     def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, | ||||
|                    noise_level=0.0): | ||||
|     def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0): | ||||
|         if raw_text is not None: | ||||
|             raw_text = add_noise(raw_text, noise_level) | ||||
|             return [nlp.make_doc(raw_text)] | ||||
|         else: | ||||
|             return [Doc(nlp.vocab, | ||||
|                         words=add_noise(sent_tuples[1], noise_level)) | ||||
|             return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level)) | ||||
|                     for (sent_tuples, brackets) in paragraph_tuples] | ||||
| 
 | ||||
|     @classmethod | ||||
|     def _make_golds(cls, docs, paragraph_tuples, make_projective): | ||||
|         if len(docs) != len(paragraph_tuples): | ||||
|             raise ValueError(Errors.E070.format(n_docs=len(docs), | ||||
|                                                 n_annots=len(paragraph_tuples))) | ||||
|             n_annots = len(paragraph_tuples) | ||||
|             raise ValueError(Errors.E070.format(n_docs=len(docs), n_annots=n_annots)) | ||||
|         if len(docs) == 1: | ||||
|             return [GoldParse.from_annot_tuples(docs[0], | ||||
|                                                 paragraph_tuples[0][0], | ||||
|             return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0], | ||||
|                                                 make_projective=make_projective)] | ||||
|         else: | ||||
|             return [GoldParse.from_annot_tuples(doc, sent_tuples, | ||||
|  | @ -247,18 +246,18 @@ def add_noise(orig, noise_level): | |||
|         corrupted = [w for w in corrupted if w] | ||||
|         return corrupted | ||||
|     else: | ||||
|         return ''.join(_corrupt(c, noise_level) for c in orig) | ||||
|         return "".join(_corrupt(c, noise_level) for c in orig) | ||||
| 
 | ||||
| 
 | ||||
| def _corrupt(c, noise_level): | ||||
|     if random.random() >= noise_level: | ||||
|         return c | ||||
|     elif c == ' ': | ||||
|         return '\n' | ||||
|     elif c == '\n': | ||||
|         return ' ' | ||||
|     elif c in ['.', "'", "!", "?", ',']: | ||||
|         return '' | ||||
|     elif c == " ": | ||||
|         return "\n" | ||||
|     elif c == "\n": | ||||
|         return " " | ||||
|     elif c in [".", "'", "!", "?", ","]: | ||||
|         return "" | ||||
|     else: | ||||
|         return c.lower() | ||||
| 
 | ||||
|  | @ -284,30 +283,30 @@ def json_to_tuple(doc): | |||
|     YIELDS (tuple): The reformatted data. | ||||
|     """ | ||||
|     paragraphs = [] | ||||
|     for paragraph in doc['paragraphs']: | ||||
|     for paragraph in doc["paragraphs"]: | ||||
|         sents = [] | ||||
|         for sent in paragraph['sentences']: | ||||
|         for sent in paragraph["sentences"]: | ||||
|             words = [] | ||||
|             ids = [] | ||||
|             tags = [] | ||||
|             heads = [] | ||||
|             labels = [] | ||||
|             ner = [] | ||||
|             for i, token in enumerate(sent['tokens']): | ||||
|                 words.append(token['orth']) | ||||
|             for i, token in enumerate(sent["tokens"]): | ||||
|                 words.append(token["orth"]) | ||||
|                 ids.append(i) | ||||
|                 tags.append(token.get('tag', '-')) | ||||
|                 heads.append(token.get('head', 0) + i) | ||||
|                 labels.append(token.get('dep', '')) | ||||
|                 tags.append(token.get('tag', "-")) | ||||
|                 heads.append(token.get("head", 0) + i) | ||||
|                 labels.append(token.get("dep", "")) | ||||
|                 # Ensure ROOT label is case-insensitive | ||||
|                 if labels[-1].lower() == 'root': | ||||
|                     labels[-1] = 'ROOT' | ||||
|                 ner.append(token.get('ner', '-')) | ||||
|                 if labels[-1].lower() == "root": | ||||
|                     labels[-1] = "ROOT" | ||||
|                 ner.append(token.get("ner", "-")) | ||||
|             sents.append([ | ||||
|                 [ids, words, tags, heads, labels, ner], | ||||
|                 sent.get('brackets', [])]) | ||||
|                 sent.get("brackets", [])]) | ||||
|         if sents: | ||||
|             yield [paragraph.get('raw', None), sents] | ||||
|             yield [paragraph.get("raw", None), sents] | ||||
| 
 | ||||
| 
 | ||||
| def read_json_file(loc, docs_filter=None, limit=None): | ||||
|  | @ -329,7 +328,7 @@ def _json_iterate(loc): | |||
|     # It's okay to read in the whole file -- just don't parse it into JSON. | ||||
|     cdef bytes py_raw | ||||
|     loc = util.ensure_path(loc) | ||||
|     with loc.open('rb') as file_: | ||||
|     with loc.open("rb") as file_: | ||||
|         py_raw = file_.read() | ||||
|     raw = <char*>py_raw | ||||
|     cdef int square_depth = 0 | ||||
|  | @ -339,11 +338,11 @@ def _json_iterate(loc): | |||
|     cdef int start = -1 | ||||
|     cdef char c | ||||
|     cdef char quote = ord('"') | ||||
|     cdef char backslash = ord('\\') | ||||
|     cdef char open_square = ord('[') | ||||
|     cdef char close_square = ord(']') | ||||
|     cdef char open_curly = ord('{') | ||||
|     cdef char close_curly = ord('}') | ||||
|     cdef char backslash = ord("\\") | ||||
|     cdef char open_square = ord("[") | ||||
|     cdef char close_square = ord("]") | ||||
|     cdef char open_curly = ord("{") | ||||
|     cdef char close_curly = ord("}") | ||||
|     for i in range(len(py_raw)): | ||||
|         c = raw[i] | ||||
|         if escape: | ||||
|  | @ -368,7 +367,7 @@ def _json_iterate(loc): | |||
|         elif c == close_curly: | ||||
|             curly_depth -= 1 | ||||
|             if square_depth == 1 and curly_depth == 0: | ||||
|                 py_str = py_raw[start : i+1].decode('utf8') | ||||
|                 py_str = py_raw[start : i + 1].decode("utf8") | ||||
|                 try: | ||||
|                     yield srsly.json_loads(py_str) | ||||
|                 except Exception: | ||||
|  | @ -388,7 +387,7 @@ def iob_to_biluo(tags): | |||
| 
 | ||||
| 
 | ||||
| def _consume_os(tags): | ||||
|     while tags and tags[0] == 'O': | ||||
|     while tags and tags[0] == "O": | ||||
|         yield tags.pop(0) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -396,24 +395,27 @@ def _consume_ent(tags): | |||
|     if not tags: | ||||
|         return [] | ||||
|     tag = tags.pop(0) | ||||
|     target_in = 'I' + tag[1:] | ||||
|     target_last = 'L' + tag[1:] | ||||
|     target_in = "I" + tag[1:] | ||||
|     target_last = "L" + tag[1:] | ||||
|     length = 1 | ||||
|     while tags and tags[0] in {target_in, target_last}: | ||||
|         length += 1 | ||||
|         tags.pop(0) | ||||
|     label = tag[2:] | ||||
|     if length == 1: | ||||
|         return ['U-' + label] | ||||
|         return ["U-" + label] | ||||
|     else: | ||||
|         start = 'B-' + label | ||||
|         end = 'L-' + label | ||||
|         middle = ['I-%s' % label for _ in range(1, length - 1)] | ||||
|         start = "B-" + label | ||||
|         end = "L-" + label | ||||
|         middle = ["I-%s" % label for _ in range(1, length - 1)] | ||||
|         return [start] + middle + [end] | ||||
| 
 | ||||
| 
 | ||||
| cdef class GoldParse: | ||||
|     """Collection for training annotations.""" | ||||
|     """Collection for training annotations. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/goldparse | ||||
|     """ | ||||
|     @classmethod | ||||
|     def from_annot_tuples(cls, doc, annot_tuples, make_projective=False): | ||||
|         _, words, tags, heads, deps, entities = annot_tuples | ||||
|  | @ -456,13 +458,13 @@ cdef class GoldParse: | |||
|         if deps is None: | ||||
|             deps = [None for _ in doc] | ||||
|         if entities is None: | ||||
|             entities = ['-' for _ in doc] | ||||
|             entities = ["-" for _ in doc] | ||||
|         elif len(entities) == 0: | ||||
|             entities = ['O' for _ in doc] | ||||
|             entities = ["O" for _ in doc] | ||||
|         else: | ||||
|             # Translate the None values to '-', to make processing easier. | ||||
|             # See Issue #2603 | ||||
|             entities = [(ent if ent is not None else '-') for ent in entities] | ||||
|             entities = [(ent if ent is not None else "-") for ent in entities] | ||||
|             if not isinstance(entities[0], basestring): | ||||
|                 # Assume we have entities specified by character offset. | ||||
|                 entities = biluo_tags_from_offsets(doc, entities) | ||||
|  | @ -508,10 +510,10 @@ cdef class GoldParse: | |||
|         for i, gold_i in enumerate(self.cand_to_gold): | ||||
|             if doc[i].text.isspace(): | ||||
|                 self.words[i] = doc[i].text | ||||
|                 self.tags[i] = '_SP' | ||||
|                 self.tags[i] = "_SP" | ||||
|                 self.heads[i] = None | ||||
|                 self.labels[i] = None | ||||
|                 self.ner[i] = 'O' | ||||
|                 self.ner[i] = "O" | ||||
|             if gold_i is None: | ||||
|                 if i in i2j_multi: | ||||
|                     self.words[i] = words[i2j_multi[i]] | ||||
|  | @ -521,7 +523,7 @@ cdef class GoldParse: | |||
|                     # Set next word in multi-token span as head, until last | ||||
|                     if not is_last: | ||||
|                         self.heads[i] = i+1 | ||||
|                         self.labels[i] = 'subtok' | ||||
|                         self.labels[i] = "subtok" | ||||
|                     else: | ||||
|                         self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]] | ||||
|                         self.labels[i] = deps[i2j_multi[i]] | ||||
|  | @ -530,24 +532,24 @@ cdef class GoldParse: | |||
|                     # BILOU tags. We can't have BB or LL etc. | ||||
|                     # Case 1: O -- easy. | ||||
|                     ner_tag = entities[i2j_multi[i]] | ||||
|                     if ner_tag == 'O': | ||||
|                         self.ner[i] = 'O' | ||||
|                     if ner_tag == "O": | ||||
|                         self.ner[i] = "O" | ||||
|                     # Case 2: U. This has to become a B I* L sequence. | ||||
|                     elif ner_tag.startswith('U-'): | ||||
|                     elif ner_tag.startswith("U-"): | ||||
|                         if is_first: | ||||
|                             self.ner[i] = ner_tag.replace('U-', 'B-', 1) | ||||
|                             self.ner[i] = ner_tag.replace("U-", "B-", 1) | ||||
|                         elif is_last: | ||||
|                             self.ner[i] = ner_tag.replace('U-', 'L-', 1) | ||||
|                             self.ner[i] = ner_tag.replace("U-", "L-", 1) | ||||
|                         else: | ||||
|                             self.ner[i] = ner_tag.replace('U-', 'I-', 1) | ||||
|                             self.ner[i] = ner_tag.replace("U-", "I-", 1) | ||||
|                     # Case 3: L. If not last, change to I. | ||||
|                     elif ner_tag.startswith('L-'): | ||||
|                     elif ner_tag.startswith("L-"): | ||||
|                         if is_last: | ||||
|                             self.ner[i] = ner_tag | ||||
|                         else: | ||||
|                             self.ner[i] = ner_tag.replace('L-', 'I-', 1) | ||||
|                             self.ner[i] = ner_tag.replace("L-", "I-", 1) | ||||
|                     # Case 4: I. Stays correct | ||||
|                     elif ner_tag.startswith('I-'): | ||||
|                     elif ner_tag.startswith("I-"): | ||||
|                         self.ner[i] = ner_tag | ||||
|             else: | ||||
|                 self.words[i] = words[gold_i] | ||||
|  | @ -608,7 +610,7 @@ def docs_to_json(docs, underscore=None): | |||
|     return [doc.to_json(underscore=underscore) for doc in docs] | ||||
| 
 | ||||
| 
 | ||||
| def biluo_tags_from_offsets(doc, entities, missing='O'): | ||||
| def biluo_tags_from_offsets(doc, entities, missing="O"): | ||||
|     """Encode labelled spans into per-token tags, using the | ||||
|     Begin/In/Last/Unit/Out scheme (BILUO). | ||||
| 
 | ||||
|  | @ -631,11 +633,11 @@ def biluo_tags_from_offsets(doc, entities, missing='O'): | |||
|         >>> entities = [(len('I like '), len('I like London'), 'LOC')] | ||||
|         >>> doc = nlp.tokenizer(text) | ||||
|         >>> tags = biluo_tags_from_offsets(doc, entities) | ||||
|         >>> assert tags == ['O', 'O', 'U-LOC', 'O'] | ||||
|         >>> assert tags == ["O", "O", 'U-LOC', "O"] | ||||
|     """ | ||||
|     starts = {token.idx: token.i for token in doc} | ||||
|     ends = {token.idx + len(token): token.i for token in doc} | ||||
|     biluo = ['-' for _ in doc] | ||||
|     biluo = ["-" for _ in doc] | ||||
|     # Handle entity cases | ||||
|     for start_char, end_char, label in entities: | ||||
|         start_token = starts.get(start_char) | ||||
|  | @ -643,12 +645,12 @@ def biluo_tags_from_offsets(doc, entities, missing='O'): | |||
|         # Only interested if the tokenization is correct | ||||
|         if start_token is not None and end_token is not None: | ||||
|             if start_token == end_token: | ||||
|                 biluo[start_token] = 'U-%s' % label | ||||
|                 biluo[start_token] = "U-%s" % label | ||||
|             else: | ||||
|                 biluo[start_token] = 'B-%s' % label | ||||
|                 biluo[start_token] = "B-%s" % label | ||||
|                 for i in range(start_token+1, end_token): | ||||
|                     biluo[i] = 'I-%s' % label | ||||
|                 biluo[end_token] = 'L-%s' % label | ||||
|                     biluo[i] = "I-%s" % label | ||||
|                 biluo[end_token] = "L-%s" % label | ||||
|     # Now distinguish the O cases from ones where we miss the tokenization | ||||
|     entity_chars = set() | ||||
|     for start_char, end_char, label in entities: | ||||
|  | @ -697,4 +699,4 @@ def offsets_from_biluo_tags(doc, tags): | |||
| 
 | ||||
| 
 | ||||
| def is_punct_label(label): | ||||
|     return label == 'P' or label.lower() == 'punct' | ||||
|     return label == "P" or label.lower() == "punct" | ||||
|  |  | |||
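The `biluo_tags_from_offsets` docstring already carries the canonical example; for completeness, a small round trip through the helpers touched in this file (assumes a blank English tokenizer and a simple sentence):

```python
import spacy
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, iob_to_biluo

nlp = spacy.blank("en")
doc = nlp("I like London")
entities = [(7, 13, "LOC")]

tags = biluo_tags_from_offsets(doc, entities)   # ['O', 'O', 'U-LOC']
offsets = offsets_from_biluo_tags(doc, tags)    # [(7, 13, 'LOC')]
biluo = iob_to_biluo(["O", "B-LOC", "I-LOC"])   # ['O', 'B-LOC', 'L-LOC']
```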
|  | @ -103,8 +103,9 @@ class Language(object): | |||
|     Defaults (class): Settings, data and factory methods for creating the `nlp` | ||||
|         object and processing pipeline. | ||||
|     lang (unicode): Two-letter language ID, i.e. ISO code. | ||||
|     """ | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/language | ||||
|     """ | ||||
|     Defaults = BaseDefaults | ||||
|     lang = None | ||||
| 
 | ||||
|  |  | |||
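The `Language` docstring now links to the API page; the attributes it lists can be inspected on any subclass, e.g. (a trivial check, no model required):

```python
from spacy.lang.en import English

nlp = English()   # a Language subclass; English.Defaults holds the shared settings
print(nlp.lang)   # 'en', the two-letter language ID mentioned in the docstring
```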
|  | @ -6,6 +6,13 @@ from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos | |||
| 
 | ||||
| 
 | ||||
| class Lemmatizer(object): | ||||
|     """ | ||||
|     The Lemmatizer supports simple part-of-speech-sensitive suffix rules and | ||||
|     lookup tables. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/lemmatizer | ||||
|     """ | ||||
| 
 | ||||
|     @classmethod | ||||
|     def load(cls, path, index=None, exc=None, rules=None, lookup=None): | ||||
|         return cls(index, exc, rules, lookup) | ||||
|  |  | |||
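A short Lemmatizer example, mirroring the usage shown in the spaCy 2.x docs of the time (assumes the English lemma tables are importable from `spacy.lang.en`, as they were in that release):

```python
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

# Rule-based lemmatization: suffix rules plus exception and lookup tables.
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
print(lemmatizer("ducks", "NOUN"))  # e.g. ['duck']
```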
|  | @ -4,17 +4,19 @@ from __future__ import unicode_literals, print_function | |||
| 
 | ||||
| # Compiler crashes on memory view coercion without this. Should report bug. | ||||
| from cython.view cimport array as cvarray | ||||
| from libc.string cimport memset | ||||
| cimport numpy as np | ||||
| np.import_array() | ||||
| from libc.string cimport memset | ||||
| 
 | ||||
| import numpy | ||||
| from thinc.neural.util import get_array_module | ||||
| 
 | ||||
| from .typedefs cimport attr_t, flags_t | ||||
| from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE | ||||
| from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP | ||||
| from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV | ||||
| from .attrs cimport PROB | ||||
| from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT | ||||
| from .attrs cimport IS_CURRENCY, IS_OOV, PROB | ||||
| 
 | ||||
| from .attrs import intify_attrs | ||||
| from .errors import Errors, Warnings, user_warning | ||||
| 
 | ||||
|  | @ -27,6 +29,8 @@ cdef class Lexeme: | |||
|     word-type, as opposed to a word token.  It therefore has no part-of-speech | ||||
|     tag, dependency parse, or lemma (lemmatization depends on the | ||||
|     part-of-speech tag). | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/lexeme | ||||
|     """ | ||||
|     def __init__(self, Vocab vocab, attr_t orth): | ||||
|         """Create a Lexeme object. | ||||
|  | @ -115,15 +119,15 @@ cdef class Lexeme: | |||
|         RETURNS (float): A scalar similarity score. Higher is more similar. | ||||
|         """ | ||||
|         # Return 1.0 similarity for matches | ||||
|         if hasattr(other, 'orth'): | ||||
|         if hasattr(other, "orth"): | ||||
|             if self.c.orth == other.orth: | ||||
|                 return 1.0 | ||||
|         elif hasattr(other, '__len__') and len(other) == 1 \ | ||||
|         and hasattr(other[0], 'orth'): | ||||
|         elif hasattr(other, "__len__") and len(other) == 1 \ | ||||
|         and hasattr(other[0], "orth"): | ||||
|             if self.c.orth == other[0].orth: | ||||
|                 return 1.0 | ||||
|         if self.vector_norm == 0 or other.vector_norm == 0: | ||||
|             user_warning(Warnings.W008.format(obj='Lexeme')) | ||||
|             user_warning(Warnings.W008.format(obj="Lexeme")) | ||||
|             return 0.0 | ||||
|         vector = self.vector | ||||
|         xp = get_array_module(vector) | ||||
|  | @ -136,7 +140,7 @@ cdef class Lexeme: | |||
|         if (end-start) != sizeof(lex_data.data): | ||||
|             raise ValueError(Errors.E072.format(length=end-start, | ||||
|                                                 bad_length=sizeof(lex_data.data))) | ||||
|         byte_string = b'\0' * sizeof(lex_data.data) | ||||
|         byte_string = b"\0" * sizeof(lex_data.data) | ||||
|         byte_chars = <char*>byte_string | ||||
|         for i in range(sizeof(lex_data.data)): | ||||
|             byte_chars[i] = lex_data.data[i] | ||||
|  |  | |||
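The `Lexeme.similarity` path shown above compares context-independent vocabulary entries; a minimal sketch (assumes a model with word vectors, e.g. `en_core_web_md`):

```python
import spacy

nlp = spacy.load("en_core_web_md")
apple = nlp.vocab["apple"]    # Lexeme: a vocab entry, not a token in context
orange = nlp.vocab["orange"]
print(apple.similarity(orange))  # cosine similarity of the two vectors
```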
|  | @ -1,6 +1,8 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .matcher import Matcher  # noqa: F401 | ||||
| from .phrasematcher import PhraseMatcher  # noqa: F401 | ||||
| from .dependencymatcher import DependencyTreeMatcher  # noqa: F401 | ||||
| from .matcher import Matcher | ||||
| from .phrasematcher import PhraseMatcher | ||||
| from .dependencymatcher import DependencyTreeMatcher | ||||
| 
 | ||||
| __all__ = ["Matcher", "PhraseMatcher", "DependencyTreeMatcher"] | ||||
|  |  | |||
|  | @ -13,7 +13,7 @@ from .matcher import unpickle_matcher | |||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| DELIMITER = '||' | ||||
| DELIMITER = "||" | ||||
| INDEX_HEAD = 1 | ||||
| INDEX_RELOP = 0 | ||||
| 
 | ||||
|  | @ -55,7 +55,8 @@ cdef class DependencyTreeMatcher: | |||
|         return (unpickle_matcher, data, None, None) | ||||
| 
 | ||||
|     def __len__(self): | ||||
|         """Get the number of rules, which are edges ,added to the dependency tree matcher. | ||||
|         """Get the number of rules, which are edges, added to the dependency | ||||
|         tree matcher. | ||||
| 
 | ||||
|         RETURNS (int): The number of rules. | ||||
|         """ | ||||
|  | @ -73,19 +74,30 @@ cdef class DependencyTreeMatcher: | |||
|         idx = 0 | ||||
|         visitedNodes = {} | ||||
|         for relation in pattern: | ||||
|             if 'PATTERN' not in relation or 'SPEC' not in relation: | ||||
|             if "PATTERN" not in relation or "SPEC" not in relation: | ||||
|                 raise ValueError(Errors.E098.format(key=key)) | ||||
|             if idx == 0: | ||||
|                 if not('NODE_NAME' in relation['SPEC'] and 'NBOR_RELOP' not in relation['SPEC'] and 'NBOR_NAME' not in relation['SPEC']): | ||||
|                 if not( | ||||
|                     "NODE_NAME" in relation["SPEC"] | ||||
|                     and "NBOR_RELOP" not in relation["SPEC"] | ||||
|                     and "NBOR_NAME" not in relation["SPEC"] | ||||
|                 ): | ||||
|                     raise ValueError(Errors.E099.format(key=key)) | ||||
|                 visitedNodes[relation['SPEC']['NODE_NAME']] = True | ||||
|                 visitedNodes[relation["SPEC"]["NODE_NAME"]] = True | ||||
|             else: | ||||
|                 if not('NODE_NAME' in relation['SPEC'] and 'NBOR_RELOP' in relation['SPEC'] and 'NBOR_NAME' in relation['SPEC']): | ||||
|                 if not( | ||||
|                     "NODE_NAME" in relation["SPEC"] | ||||
|                     and "NBOR_RELOP" in relation["SPEC"] | ||||
|                     and "NBOR_NAME" in relation["SPEC"] | ||||
|                 ): | ||||
|                     raise ValueError(Errors.E100.format(key=key)) | ||||
|                 if relation['SPEC']['NODE_NAME'] in visitedNodes or relation['SPEC']['NBOR_NAME'] not in visitedNodes: | ||||
|                 if ( | ||||
|                     relation["SPEC"]["NODE_NAME"] in visitedNodes | ||||
|                     or relation["SPEC"]["NBOR_NAME"] not in visitedNodes | ||||
|                 ): | ||||
|                     raise ValueError(Errors.E101.format(key=key)) | ||||
|                 visitedNodes[relation['SPEC']['NODE_NAME']] = True | ||||
|                 visitedNodes[relation['SPEC']['NBOR_NAME']] = True | ||||
|                 visitedNodes[relation["SPEC"]["NODE_NAME"]] = True | ||||
|                 visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True | ||||
|             idx = idx + 1 | ||||
| 
 | ||||
|     def add(self, key, on_match, *patterns): | ||||
|  | @ -93,24 +105,20 @@ cdef class DependencyTreeMatcher: | |||
|             if len(pattern) == 0: | ||||
|                 raise ValueError(Errors.E012.format(key=key)) | ||||
|             self.validateInput(pattern,key) | ||||
| 
 | ||||
|         key = self._normalize_key(key) | ||||
| 
 | ||||
|         _patterns = [] | ||||
|         for pattern in patterns: | ||||
|             token_patterns = [] | ||||
|             for i in range(len(pattern)): | ||||
|                 token_pattern = [pattern[i]['PATTERN']] | ||||
|                 token_pattern = [pattern[i]["PATTERN"]] | ||||
|                 token_patterns.append(token_pattern) | ||||
|             # self.patterns.append(token_patterns) | ||||
|             _patterns.append(token_patterns) | ||||
| 
 | ||||
|         self._patterns.setdefault(key, []) | ||||
|         self._callbacks[key] = on_match | ||||
|         self._patterns[key].extend(_patterns) | ||||
| 
 | ||||
|         # Add each node pattern of all the input patterns individually to the matcher. | ||||
|         # This enables only a single instance of Matcher to be used. | ||||
|         # Add each node pattern of all the input patterns individually to the | ||||
|         # matcher. This enables only a single instance of Matcher to be used. | ||||
|         # Multiple adds are required to track each node pattern. | ||||
|         _keys_to_token_list = [] | ||||
|         for i in range(len(_patterns)): | ||||
|  | @ -121,24 +129,19 @@ cdef class DependencyTreeMatcher: | |||
|                 self.token_matcher.add(k, None, _patterns[i][j]) | ||||
|                 _keys_to_token[k] = j | ||||
|             _keys_to_token_list.append(_keys_to_token) | ||||
| 
 | ||||
|         self._keys_to_token.setdefault(key, []) | ||||
|         self._keys_to_token[key].extend(_keys_to_token_list) | ||||
| 
 | ||||
|         _nodes_list = [] | ||||
|         for pattern in patterns: | ||||
|             nodes = {} | ||||
|             for i in range(len(pattern)): | ||||
|                 nodes[pattern[i]['SPEC']['NODE_NAME']]=i | ||||
|                 nodes[pattern[i]["SPEC"]["NODE_NAME"]] = i | ||||
|             _nodes_list.append(nodes) | ||||
| 
 | ||||
|         self._nodes.setdefault(key, []) | ||||
|         self._nodes[key].extend(_nodes_list) | ||||
| 
 | ||||
|         # Create an object tree to traverse later on. | ||||
|         # This datastructure enable easy tree pattern match. | ||||
|         # Doc-Token based tree cannot be reused since it is memory heavy and | ||||
|         # tightly coupled with doc | ||||
|         # Create an object tree to traverse later on. This data structure | ||||
|         # enables easy tree pattern match. Doc-Token based tree cannot be | ||||
|         # reused since it is memory-heavy and tightly coupled with the Doc. | ||||
|         self.retrieve_tree(patterns, _nodes_list,key) | ||||
| 
 | ||||
|     def retrieve_tree(self, patterns, _nodes_list, key): | ||||
|  | @ -149,31 +152,29 @@ cdef class DependencyTreeMatcher: | |||
|             root = -1 | ||||
|             for j in range(len(patterns[i])): | ||||
|                 token_pattern = patterns[i][j] | ||||
|                 if('NBOR_RELOP' not in token_pattern['SPEC']): | ||||
|                 if ("NBOR_RELOP" not in token_pattern["SPEC"]): | ||||
|                     heads[j] = ('root', j) | ||||
|                     root = j | ||||
|                 else: | ||||
|                     heads[j] = (token_pattern['SPEC']['NBOR_RELOP'],_nodes_list[i][token_pattern['SPEC']['NBOR_NAME']]) | ||||
| 
 | ||||
|                     heads[j] = ( | ||||
|                         token_pattern["SPEC"]["NBOR_RELOP"], | ||||
|                         _nodes_list[i][token_pattern["SPEC"]["NBOR_NAME"]] | ||||
|                     ) | ||||
|             _heads_list.append(heads) | ||||
|             _root_list.append(root) | ||||
| 
 | ||||
|         _tree_list = [] | ||||
|         for i in range(len(patterns)): | ||||
|             tree = {} | ||||
|             for j in range(len(patterns[i])): | ||||
|                 if(_heads_list[i][j][INDEX_HEAD] == j): | ||||
|                     continue | ||||
| 
 | ||||
|                 head = _heads_list[i][j][INDEX_HEAD] | ||||
|                 if(head not in tree): | ||||
|                     tree[head] = [] | ||||
|                 tree[head].append((_heads_list[i][j][INDEX_RELOP], j)) | ||||
|             _tree_list.append(tree) | ||||
| 
 | ||||
|         self._tree.setdefault(key, []) | ||||
|         self._tree[key].extend(_tree_list) | ||||
| 
 | ||||
|         self._root.setdefault(key, []) | ||||
|         self._root[key].extend(_root_list) | ||||
| 
 | ||||
|  | @ -199,7 +200,6 @@ cdef class DependencyTreeMatcher: | |||
| 
 | ||||
|     def __call__(self, Doc doc): | ||||
|         matched_trees = [] | ||||
| 
 | ||||
|         matches = self.token_matcher(doc) | ||||
|         for key in list(self._patterns.keys()): | ||||
|             _patterns_list = self._patterns[key] | ||||
|  | @ -216,39 +216,51 @@ cdef class DependencyTreeMatcher: | |||
|                 id_to_position = {} | ||||
|                 for i in range(len(_nodes)): | ||||
|                     id_to_position[i]=[] | ||||
| 
 | ||||
|                 # This could be taken outside to improve running time..? | ||||
|                 # TODO: This could be taken outside to improve running time..? | ||||
|                 for match_id, start, end in matches: | ||||
|                     if match_id in _keys_to_token: | ||||
|                         id_to_position[_keys_to_token[match_id]].append(start) | ||||
| 
 | ||||
|                 _node_operator_map = self.get_node_operator_map(doc,_tree,id_to_position,_nodes,_root) | ||||
|                 _node_operator_map = self.get_node_operator_map( | ||||
|                     doc, | ||||
|                     _tree, | ||||
|                     id_to_position, | ||||
|                     _nodes,_root | ||||
|                 ) | ||||
|                 length = len(_nodes) | ||||
|                 if _root in id_to_position: | ||||
|                     candidates = id_to_position[_root] | ||||
|                     for candidate in candidates: | ||||
|                         isVisited = {} | ||||
|                         self.dfs(candidate,_root,_tree,id_to_position,doc,isVisited,_node_operator_map) | ||||
|                         # To check if the subtree pattern is completely identified. This is a heuristic. | ||||
|                         # This is done to reduce the complexity of exponential unordered subtree matching. | ||||
|                         # Will give approximate matches in some cases. | ||||
|                         self.dfs( | ||||
|                             candidate, | ||||
|                             _root,_tree, | ||||
|                             id_to_position, | ||||
|                             doc, | ||||
|                             isVisited, | ||||
|                             _node_operator_map | ||||
|                         ) | ||||
|                         # To check if the subtree pattern is completely | ||||
|                         # identified. This is a heuristic. This is done to | ||||
|                         # reduce the complexity of exponential unordered subtree | ||||
|                         # matching. Will give approximate matches in some cases. | ||||
|                         if(len(isVisited) == length): | ||||
|                             matched_trees.append((key,list(isVisited))) | ||||
| 
 | ||||
|             for i, (ent_id, nodes) in enumerate(matched_trees): | ||||
|                 on_match = self._callbacks.get(ent_id) | ||||
|                 if on_match is not None: | ||||
|                     on_match(self, doc, i, matches) | ||||
| 
 | ||||
|         return matched_trees | ||||
| 
 | ||||
|     def dfs(self,candidate,root,tree,id_to_position,doc,isVisited,_node_operator_map): | ||||
|         if (root in id_to_position and candidate in id_to_position[root]): | ||||
|             # color the node since it is valid | ||||
|             # Color the node since it is valid | ||||
|             isVisited[candidate] = True | ||||
|             if root in tree: | ||||
|                 for root_child in tree[root]: | ||||
|                     if candidate in _node_operator_map and root_child[INDEX_RELOP] in _node_operator_map[candidate]: | ||||
|                     if ( | ||||
|                         candidate in _node_operator_map | ||||
|                         and root_child[INDEX_RELOP] in _node_operator_map[candidate] | ||||
|                     ): | ||||
|                         candidate_children = _node_operator_map[candidate][root_child[INDEX_RELOP]] | ||||
|                         for candidate_child in candidate_children: | ||||
|                             result = self.dfs( | ||||
|  | @ -275,33 +287,29 @@ cdef class DependencyTreeMatcher: | |||
|                 for child in tree[node]: | ||||
|                     all_operators.append(child[INDEX_RELOP]) | ||||
|         all_operators = list(set(all_operators)) | ||||
| 
 | ||||
|         all_nodes = [] | ||||
|         for node in all_node_indices: | ||||
|             all_nodes = all_nodes + id_to_position[node] | ||||
|         all_nodes = list(set(all_nodes)) | ||||
| 
 | ||||
|         for node in all_nodes: | ||||
|             _node_operator_map[node] = {} | ||||
|             for operator in all_operators: | ||||
|                 _node_operator_map[node][operator] = [] | ||||
| 
 | ||||
|         # Used to invoke methods for each operator | ||||
|         switcher = { | ||||
|             '<':self.dep, | ||||
|             '>':self.gov, | ||||
|             '>>':self.dep_chain, | ||||
|             '<<':self.gov_chain, | ||||
|             '.':self.imm_precede, | ||||
|             '$+':self.imm_right_sib, | ||||
|             '$-':self.imm_left_sib, | ||||
|             '$++':self.right_sib, | ||||
|             '$--':self.left_sib | ||||
|             "<": self.dep, | ||||
|             ">": self.gov, | ||||
|             ">>": self.dep_chain, | ||||
|             "<<": self.gov_chain, | ||||
|             ".": self.imm_precede, | ||||
|             "$+": self.imm_right_sib, | ||||
|             "$-": self.imm_left_sib, | ||||
|             "$++": self.right_sib, | ||||
|             "$--": self.left_sib | ||||
|         } | ||||
|         for operator in all_operators: | ||||
|             for node in all_nodes: | ||||
|                 _node_operator_map[node][operator] = switcher.get(operator)(doc,node) | ||||
| 
 | ||||
|         return _node_operator_map | ||||
| 
 | ||||
|     def dep(self, doc, node): | ||||
|  |  | |||
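For orientation, the pattern format validated above pairs a token `PATTERN` with a `SPEC` that names the node and, for all but the first node, points at an already-declared neighbour via `NBOR_NAME` plus a Semgrex-style operator in `NBOR_RELOP`. A rough sketch (token texts, labels and the chosen operator are illustrative; match results depend on the parse):

```python
import spacy
from spacy.matcher import DependencyTreeMatcher

nlp = spacy.load("en_core_web_sm")  # needs a dependency parser
matcher = DependencyTreeMatcher(nlp.vocab)

pattern = [
    # First node: anchor of the tree, no neighbour reference allowed.
    {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
    # Later nodes must reference a previously declared node.
    {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"},
     "PATTERN": {"DEP": "nsubj"}},
]
matcher.add("FOUNDED", None, pattern)
matches = matcher(nlp("Smith founded a healthcare company."))
```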
File diff suppressed because it is too large.
							|  | @ -12,7 +12,7 @@ from ..vocab cimport Vocab | |||
| from ..tokens.doc cimport Doc, get_token_attr | ||||
| from ..typedefs cimport attr_t, hash_t | ||||
| 
 | ||||
| from ..errors import Warnings, deprecation_warning, user_warning | ||||
| from ..errors import Errors, Warnings, deprecation_warning, user_warning | ||||
| from ..attrs import FLAG61 as U_ENT | ||||
| from ..attrs import FLAG60 as B2_ENT | ||||
| from ..attrs import FLAG59 as B3_ENT | ||||
|  | @ -25,6 +25,13 @@ from ..attrs import FLAG41 as I4_ENT | |||
| 
 | ||||
| 
 | ||||
| cdef class PhraseMatcher: | ||||
|     """Efficiently match large terminology lists. While the `Matcher` matches | ||||
|     sequences based on lists of token descriptions, the `PhraseMatcher` accepts | ||||
|     match patterns in the form of `Doc` objects. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/phrasematcher | ||||
|     USAGE: https://spacy.io/usage/rule-based-matching#phrasematcher | ||||
|     """ | ||||
|     cdef Pool mem | ||||
|     cdef Vocab vocab | ||||
|     cdef Matcher matcher | ||||
|  | @ -36,7 +43,16 @@ cdef class PhraseMatcher: | |||
|     cdef public object _docs | ||||
|     cdef public object _validate | ||||
| 
 | ||||
|     def __init__(self, Vocab vocab, max_length=0, attr='ORTH', validate=False): | ||||
|     def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False): | ||||
|         """Initialize the PhraseMatcher. | ||||
| 
 | ||||
|         vocab (Vocab): The shared vocabulary. | ||||
|         attr (int / unicode): Token attribute to match on. | ||||
|         validate (bool): Perform additional validation when patterns are added. | ||||
|         RETURNS (PhraseMatcher): The newly constructed object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/phrasematcher#init | ||||
|         """ | ||||
|         if max_length != 0: | ||||
|             deprecation_warning(Warnings.W010) | ||||
|         self.mem = Pool() | ||||
|  | @ -54,7 +70,7 @@ cdef class PhraseMatcher: | |||
|             [{B3_ENT: True}, {I3_ENT: True}, {L3_ENT: True}], | ||||
|             [{B4_ENT: True}, {I4_ENT: True}, {I4_ENT: True, "OP": "+"}, {L4_ENT: True}], | ||||
|         ] | ||||
|         self.matcher.add('Candidate', None, *abstract_patterns) | ||||
|         self.matcher.add("Candidate", None, *abstract_patterns) | ||||
|         self._callbacks = {} | ||||
|         self._docs = {} | ||||
|         self._validate = validate | ||||
|  | @ -65,6 +81,8 @@ cdef class PhraseMatcher: | |||
|         number of individual patterns. | ||||
| 
 | ||||
|         RETURNS (int): The number of rules. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/phrasematcher#len | ||||
|         """ | ||||
|         return len(self._docs) | ||||
| 
 | ||||
|  | @ -73,6 +91,8 @@ cdef class PhraseMatcher: | |||
| 
 | ||||
|         key (unicode): The match ID. | ||||
|         RETURNS (bool): Whether the matcher contains rules for this match ID. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/phrasematcher#contains | ||||
|         """ | ||||
|         cdef hash_t ent_id = self.matcher._normalize_key(key) | ||||
|         return ent_id in self._callbacks | ||||
|  | @ -88,6 +108,8 @@ cdef class PhraseMatcher: | |||
|         key (unicode): The match ID. | ||||
|         on_match (callable): Callback executed on match. | ||||
|         *docs (Doc): `Doc` objects representing match patterns. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/phrasematcher#add | ||||
|         """ | ||||
|         cdef Doc doc | ||||
|         cdef hash_t ent_id = self.matcher._normalize_key(key) | ||||
|  | @ -112,8 +134,7 @@ cdef class PhraseMatcher: | |||
|                 lexeme = self.vocab[attr_value] | ||||
|                 lexeme.set_flag(tag, True) | ||||
|                 phrase_key[i] = lexeme.orth | ||||
|             phrase_hash = hash64(phrase_key, | ||||
|                                  length * sizeof(attr_t), 0) | ||||
|             phrase_hash = hash64(phrase_key, length * sizeof(attr_t), 0) | ||||
|             self.phrase_ids.set(phrase_hash, <void*>ent_id) | ||||
| 
 | ||||
|     def __call__(self, Doc doc): | ||||
|  | @ -123,6 +144,8 @@ cdef class PhraseMatcher: | |||
|         RETURNS (list): A list of `(key, start, end)` tuples, | ||||
|             describing the matches. A match tuple describes a span | ||||
|             `doc[start:end]`. The `label_id` and `key` are both integers. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/phrasematcher#call | ||||
|         """ | ||||
|         matches = [] | ||||
|         if self.attr == ORTH: | ||||
|  | @ -158,6 +181,8 @@ cdef class PhraseMatcher: | |||
|             If both return_matches and as_tuples are True, the output will | ||||
|             be a sequence of ((doc, matches), context) tuples. | ||||
|         YIELDS (Doc): Documents, in order. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/phrasematcher#pipe | ||||
|         """ | ||||
|         if as_tuples: | ||||
|             for doc, context in stream: | ||||
|  | @ -180,8 +205,7 @@ cdef class PhraseMatcher: | |||
|         phrase_key = <attr_t*>mem.alloc(end-start, sizeof(attr_t)) | ||||
|         for i, j in enumerate(range(start, end)): | ||||
|             phrase_key[i] = doc.c[j].lex.orth | ||||
|         cdef hash_t key = hash64(phrase_key, | ||||
|                                  (end-start) * sizeof(attr_t), 0) | ||||
|         cdef hash_t key = hash64(phrase_key, (end-start) * sizeof(attr_t), 0) | ||||
|         ent_id = <hash_t>self.phrase_ids.get(key) | ||||
|         if ent_id == 0: | ||||
|             return None | ||||
|  | @ -203,12 +227,12 @@ cdef class PhraseMatcher: | |||
|         # Concatenate the attr name and value to not pollute lexeme space | ||||
|         # e.g. 'POS-VERB' instead of just 'VERB', which could otherwise | ||||
|         # create false positive matches | ||||
|         return 'matcher:{}-{}'.format(string_attr_name, string_attr_value) | ||||
|         return "matcher:{}-{}".format(string_attr_name, string_attr_value) | ||||
| 
 | ||||
| 
 | ||||
| def get_bilou(length): | ||||
|     if length == 0: | ||||
|         raise ValueError("Length must be >= 1") | ||||
|         raise ValueError(Errors.E127) | ||||
|     elif length == 1: | ||||
|         return [U_ENT] | ||||
|     elif length == 2: | ||||
|  |  | |||
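A minimal PhraseMatcher example for the API documented above (uses a blank English tokenizer; `nlp.make_doc` keeps pattern creation cheap):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab, attr="ORTH")

# Patterns are Doc objects, one per terminology entry.
patterns = [nlp.make_doc(text) for text in ["Barack Obama", "Angela Merkel"]]
matcher.add("PERSON", None, *patterns)

doc = nlp("Barack Obama visited Berlin.")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
```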
|  | @ -1,8 +1,23 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .pipes import Tagger, DependencyParser, EntityRecognizer  # noqa | ||||
| from .pipes import TextCategorizer, Tensorizer, Pipe  # noqa | ||||
| from .entityruler import EntityRuler  # noqa | ||||
| from .hooks import SentenceSegmenter, SimilarityHook  # noqa | ||||
| from .functions import merge_entities, merge_noun_chunks, merge_subtokens  # noqa | ||||
| from .pipes import Tagger, DependencyParser, EntityRecognizer | ||||
| from .pipes import TextCategorizer, Tensorizer, Pipe | ||||
| from .entityruler import EntityRuler | ||||
| from .hooks import SentenceSegmenter, SimilarityHook | ||||
| from .functions import merge_entities, merge_noun_chunks, merge_subtokens | ||||
| 
 | ||||
| __all__ = [ | ||||
|     "Tagger", | ||||
|     "DependencyParser", | ||||
|     "EntityRecognizer", | ||||
|     "TextCategorizer", | ||||
|     "Tensorizer", | ||||
|     "Pipe", | ||||
|     "EntityRuler", | ||||
|     "SentenceSegmenter", | ||||
|     "SimilarityHook", | ||||
|     "merge_entities", | ||||
|     "merge_noun_chunks", | ||||
|     "merge_subtokens", | ||||
| ] | ||||
|  |  | |||
|  | @ -12,10 +12,20 @@ from ..matcher import Matcher, PhraseMatcher | |||
| 
 | ||||
| 
 | ||||
| class EntityRuler(object): | ||||
|     """The EntityRuler lets you add spans to the `Doc.ents` using token-based | ||||
|     rules or exact phrase matches. It can be combined with the statistical | ||||
|     `EntityRecognizer` to boost accuracy, or used on its own to implement a | ||||
|     purely rule-based entity recognition system. After initialization, the | ||||
|     component is typically added to the pipeline using `nlp.add_pipe`. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/entityruler | ||||
|     USAGE: https://spacy.io/usage/rule-based-matching#entityruler | ||||
|     """ | ||||
| 
 | ||||
|     name = "entity_ruler" | ||||
| 
 | ||||
|     def __init__(self, nlp, **cfg): | ||||
|         """Initialise the entitiy ruler. If patterns are supplied here, they | ||||
|         """Initialize the entitiy ruler. If patterns are supplied here, they | ||||
|         need to be a list of dictionaries with a `"label"` and `"pattern"` | ||||
|         key. A pattern can either be a token pattern (list) or a phrase pattern | ||||
|         (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`. | ||||
|  | @ -29,6 +39,8 @@ class EntityRuler(object): | |||
|             of a model pipeline, this will include all keyword arguments passed | ||||
|             to `spacy.load`. | ||||
|         RETURNS (EntityRuler): The newly constructed object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/entityruler#init | ||||
|         """ | ||||
|         self.nlp = nlp | ||||
|         self.overwrite = cfg.get("overwrite_ents", False) | ||||
|  | @ -55,6 +67,8 @@ class EntityRuler(object): | |||
| 
 | ||||
|         doc (Doc): The Doc object in the pipeline. | ||||
|         RETURNS (Doc): The Doc with added entities, if available. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/entityruler#call | ||||
|         """ | ||||
|         matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) | ||||
|         matches = set( | ||||
|  | @ -83,6 +97,8 @@ class EntityRuler(object): | |||
|         """All labels present in the match patterns. | ||||
| 
 | ||||
|         RETURNS (set): The string labels. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/entityruler#labels | ||||
|         """ | ||||
|         all_labels = set(self.token_patterns.keys()) | ||||
|         all_labels.update(self.phrase_patterns.keys()) | ||||
|  | @ -93,6 +109,8 @@ class EntityRuler(object): | |||
|         """Get all patterns that were added to the entity ruler. | ||||
| 
 | ||||
|         RETURNS (list): The original patterns, one dictionary per pattern. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/entityruler#patterns | ||||
|         """ | ||||
|         all_patterns = [] | ||||
|         for label, patterns in self.token_patterns.items(): | ||||
|  | @ -110,6 +128,8 @@ class EntityRuler(object): | |||
|         {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]} | ||||
| 
 | ||||
|         patterns (list): The patterns to add. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/entityruler#add_patterns | ||||
|         """ | ||||
|         for entry in patterns: | ||||
|             label = entry["label"] | ||||
|  | @ -131,6 +151,8 @@ class EntityRuler(object): | |||
|         patterns_bytes (bytes): The bytestring to load. | ||||
|         **kwargs: Other config parameters, mostly for consistency. | ||||
|         RETURNS (EntityRuler): The loaded entity ruler. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/entityruler#from_bytes | ||||
|         """ | ||||
|         patterns = srsly.msgpack_loads(patterns_bytes) | ||||
|         self.add_patterns(patterns) | ||||
|  | @ -140,6 +162,8 @@ class EntityRuler(object): | |||
|         """Serialize the entity ruler patterns to a bytestring. | ||||
| 
 | ||||
|         RETURNS (bytes): The serialized patterns. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/entityruler#to_bytes | ||||
|         """ | ||||
|         return srsly.msgpack_dumps(self.patterns) | ||||
| 
 | ||||
|  | @ -150,6 +174,8 @@ class EntityRuler(object): | |||
|         path (unicode / Path): The JSONL file to load. | ||||
|         **kwargs: Other config parameters, mostly for consistency. | ||||
|         RETURNS (EntityRuler): The loaded entity ruler. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/entityruler#from_disk | ||||
|         """ | ||||
|         path = ensure_path(path) | ||||
|         path = path.with_suffix(".jsonl") | ||||
|  | @ -164,6 +190,8 @@ class EntityRuler(object): | |||
|         path (unicode / Path): The JSONL file to load. | ||||
|         **kwargs: Other config parameters, mostly for consistency. | ||||
|         RETURNS (EntityRuler): The loaded entity ruler. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/entityruler | ||||
|         """ | ||||
|         path = ensure_path(path) | ||||
|         path = path.with_suffix(".jsonl") | ||||
|  |  | |||
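To make the `EntityRuler` docstrings above concrete, a small sketch (not part of this diff) using a blank English pipeline; the example patterns mirror the ones quoted in the docstrings:

```python
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
ruler = EntityRuler(nlp)
ruler.add_patterns([
    {"label": "ORG", "pattern": "Apple"},  # phrase pattern (string)
    {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},  # token pattern (list)
])
nlp.add_pipe(ruler)

doc = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_) for ent in doc.ents])
```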
|  | @ -9,6 +9,8 @@ def merge_noun_chunks(doc): | |||
| 
 | ||||
|     doc (Doc): The Doc object. | ||||
|     RETURNS (Doc): The Doc object with merged noun chunks. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/pipeline-functions#merge_noun_chunks | ||||
|     """ | ||||
|     if not doc.is_parsed: | ||||
|         return doc | ||||
|  | @ -23,7 +25,9 @@ def merge_entities(doc): | |||
|     """Merge entities into a single token. | ||||
| 
 | ||||
|     doc (Doc): The Doc object. | ||||
|     RETURNS (Doc): The Doc object with merged noun entities. | ||||
|     RETURNS (Doc): The Doc object with merged entities. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/pipeline-functions#merge_entities | ||||
|     """ | ||||
|     with doc.retokenize() as retokenizer: | ||||
|         for ent in doc.ents: | ||||
|  | @ -33,6 +37,14 @@ def merge_entities(doc): | |||
| 
 | ||||
| 
 | ||||
| def merge_subtokens(doc, label="subtok"): | ||||
|     """Merge subtokens into a single token. | ||||
| 
 | ||||
|     doc (Doc): The Doc object. | ||||
|     label (unicode): The subtoken dependency label. | ||||
|     RETURNS (Doc): The Doc object with merged subtokens. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens | ||||
|     """ | ||||
|     merger = Matcher(doc.vocab) | ||||
|     merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}]) | ||||
|     matches = merger(doc) | ||||
|  |  | |||
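As a usage note for the pipeline functions documented above, a rough sketch (assuming `en_core_web_sm` is installed; any model with an entity recognizer would do):

```python
import spacy
from spacy.pipeline import merge_entities

nlp = spacy.load("en_core_web_sm")  # assumed model name
nlp.add_pipe(merge_entities)

doc = nlp("Barack Obama visited San Francisco.")
# Multi-token entities such as "Barack Obama" are now single tokens
print([token.text for token in doc])
```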
|  | @ -15,6 +15,8 @@ class SentenceSegmenter(object): | |||
|     initialization, or assign a new strategy to the .strategy attribute. | ||||
|     Sentence detection strategies should be generators that take `Doc` objects | ||||
|     and yield `Span` objects for each sentence. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/sentencesegmenter | ||||
|     """ | ||||
| 
 | ||||
|     name = "sentencizer" | ||||
|  |  | |||
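The `SentenceSegmenter` docstring above says strategies are generators that take a `Doc` and yield `Span` objects. A minimal sketch of a custom strategy (the `split_on_newlines` helper is hypothetical, written for illustration):

```python
from spacy.lang.en import English
from spacy.pipeline import SentenceSegmenter

def split_on_newlines(doc):
    # Yield one sentence Span per newline-delimited line
    start = 0
    for token in doc:
        if "\n" in token.text:
            yield doc[start:token.i + 1]
            start = token.i + 1
    if start < len(doc):
        yield doc[start:len(doc)]

nlp = English()
nlp.add_pipe(SentenceSegmenter(nlp.vocab, strategy=split_on_newlines))
doc = nlp("This is one line.\nThis is another.")
print([sent.text for sent in doc.sents])
```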
|  | @ -6,9 +6,8 @@ from __future__ import unicode_literals | |||
| cimport numpy as np | ||||
| 
 | ||||
| import numpy | ||||
| from collections import OrderedDict | ||||
| import srsly | ||||
| 
 | ||||
| from collections import OrderedDict | ||||
| from thinc.api import chain | ||||
| from thinc.v2v import Affine, Maxout, Softmax | ||||
| from thinc.misc import LayerNorm | ||||
|  | @ -284,9 +283,7 @@ class Tensorizer(Pipe): | |||
|         """ | ||||
|         for doc, tensor in zip(docs, tensors): | ||||
|             if tensor.shape[0] != len(doc): | ||||
|                 raise ValueError( | ||||
|                     Errors.E076.format(rows=tensor.shape[0], words=len(doc)) | ||||
|                 ) | ||||
|                 raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc))) | ||||
|             doc.tensor = tensor | ||||
| 
 | ||||
|     def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): | ||||
|  | @ -346,14 +343,19 @@ class Tensorizer(Pipe): | |||
| 
 | ||||
| 
 | ||||
| class Tagger(Pipe): | ||||
|     name = 'tagger' | ||||
|     """Pipeline component for part-of-speech tagging. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/tagger | ||||
|     """ | ||||
| 
 | ||||
|     name = "tagger" | ||||
| 
 | ||||
|     def __init__(self, vocab, model=True, **cfg): | ||||
|         self.vocab = vocab | ||||
|         self.model = model | ||||
|         self._rehearsal_model = None | ||||
|         self.cfg = OrderedDict(sorted(cfg.items())) | ||||
|         self.cfg.setdefault('cnn_maxout_pieces', 2) | ||||
|         self.cfg.setdefault("cnn_maxout_pieces", 2) | ||||
| 
 | ||||
|     @property | ||||
|     def labels(self): | ||||
|  | @ -404,7 +406,7 @@ class Tagger(Pipe): | |||
|         cdef Vocab vocab = self.vocab | ||||
|         for i, doc in enumerate(docs): | ||||
|             doc_tag_ids = batch_tag_ids[i] | ||||
|             if hasattr(doc_tag_ids, 'get'): | ||||
|             if hasattr(doc_tag_ids, "get"): | ||||
|                 doc_tag_ids = doc_tag_ids.get() | ||||
|             for j, tag_id in enumerate(doc_tag_ids): | ||||
|                 # Don't clobber preset POS tags | ||||
|  | @ -453,9 +455,9 @@ class Tagger(Pipe): | |||
|         scores = self.model.ops.flatten(scores) | ||||
|         tag_index = {tag: i for i, tag in enumerate(self.labels)} | ||||
|         cdef int idx = 0 | ||||
|         correct = numpy.zeros((scores.shape[0],), dtype='i') | ||||
|         correct = numpy.zeros((scores.shape[0],), dtype="i") | ||||
|         guesses = scores.argmax(axis=1) | ||||
|         known_labels = numpy.ones((scores.shape[0], 1), dtype='f') | ||||
|         known_labels = numpy.ones((scores.shape[0], 1), dtype="f") | ||||
|         for gold in golds: | ||||
|             for tag in gold.tags: | ||||
|                 if tag is None: | ||||
|  | @ -466,7 +468,7 @@ class Tagger(Pipe): | |||
|                     correct[idx] = 0 | ||||
|                     known_labels[idx] = 0. | ||||
|                 idx += 1 | ||||
|         correct = self.model.ops.xp.array(correct, dtype='i') | ||||
|         correct = self.model.ops.xp.array(correct, dtype="i") | ||||
|         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) | ||||
|         d_scores *= self.model.ops.asarray(known_labels) | ||||
|         loss = (d_scores**2).sum() | ||||
|  | @ -490,9 +492,9 @@ class Tagger(Pipe): | |||
|             vocab.morphology = Morphology(vocab.strings, new_tag_map, | ||||
|                                           vocab.morphology.lemmatizer, | ||||
|                                           exc=vocab.morphology.exc) | ||||
|         self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors') | ||||
|         self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors") | ||||
|         if self.model is True: | ||||
|             for hp in ['token_vector_width', 'conv_depth']: | ||||
|             for hp in ["token_vector_width", "conv_depth"]: | ||||
|                 if hp in kwargs: | ||||
|                     self.cfg[hp] = kwargs[hp] | ||||
|             self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) | ||||
|  | @ -503,7 +505,7 @@ class Tagger(Pipe): | |||
| 
 | ||||
|     @classmethod | ||||
|     def Model(cls, n_tags, **cfg): | ||||
|         if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'): | ||||
|         if cfg.get("pretrained_dims") and not cfg.get("pretrained_vectors"): | ||||
|             raise ValueError(TempErrors.T008) | ||||
|         return build_tagger_model(n_tags, **cfg) | ||||
| 
 | ||||
|  | @ -538,25 +540,23 @@ class Tagger(Pipe): | |||
|     def to_bytes(self, **exclude): | ||||
|         serialize = OrderedDict() | ||||
|         if self.model not in (None, True, False): | ||||
|             serialize['model'] = self.model.to_bytes | ||||
|         serialize['vocab'] = self.vocab.to_bytes | ||||
|         serialize['cfg'] = lambda: srsly.json_dumps(self.cfg) | ||||
|             serialize["model"] = self.model.to_bytes | ||||
|         serialize["vocab"] = self.vocab.to_bytes | ||||
|         serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) | ||||
|         tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) | ||||
|         serialize['tag_map'] = lambda: srsly.msgpack_dumps(tag_map) | ||||
|         serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map) | ||||
|         return util.to_bytes(serialize, exclude) | ||||
| 
 | ||||
|     def from_bytes(self, bytes_data, **exclude): | ||||
|         def load_model(b): | ||||
|             # TODO: Remove this once we don't have to handle previous models | ||||
|             if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg: | ||||
|                 self.cfg['pretrained_vectors'] = self.vocab.vectors.name | ||||
| 
 | ||||
|             if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: | ||||
|                 self.cfg["pretrained_vectors"] = self.vocab.vectors.name | ||||
|             if self.model is True: | ||||
|                 token_vector_width = util.env_opt( | ||||
|                     'token_vector_width', | ||||
|                     self.cfg.get('token_vector_width', 96)) | ||||
|                 self.model = self.Model(self.vocab.morphology.n_tags, | ||||
|                                         **self.cfg) | ||||
|                     "token_vector_width", | ||||
|                     self.cfg.get("token_vector_width", 96)) | ||||
|                 self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) | ||||
|             self.model.from_bytes(b) | ||||
| 
 | ||||
|         def load_tag_map(b): | ||||
|  | @ -567,10 +567,10 @@ class Tagger(Pipe): | |||
|                 exc=self.vocab.morphology.exc) | ||||
| 
 | ||||
|         deserialize = OrderedDict(( | ||||
|             ('vocab', lambda b: self.vocab.from_bytes(b)), | ||||
|             ('tag_map', load_tag_map), | ||||
|             ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))), | ||||
|             ('model', lambda b: load_model(b)), | ||||
|             ("vocab", lambda b: self.vocab.from_bytes(b)), | ||||
|             ("tag_map", load_tag_map), | ||||
|             ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))), | ||||
|             ("model", lambda b: load_model(b)), | ||||
|         )) | ||||
|         util.from_bytes(bytes_data, deserialize, exclude) | ||||
|         return self | ||||
|  | @ -580,7 +580,7 @@ class Tagger(Pipe): | |||
|         serialize = OrderedDict(( | ||||
|             ('vocab', lambda p: self.vocab.to_disk(p)), | ||||
|             ('tag_map', lambda p: srsly.write_msgpack(p, tag_map)), | ||||
|             ('model', lambda p: p.open('wb').write(self.model.to_bytes())), | ||||
|             ('model', lambda p: p.open("wb").write(self.model.to_bytes())), | ||||
|             ('cfg', lambda p: srsly.write_json(p, self.cfg)) | ||||
|         )) | ||||
|         util.to_disk(path, serialize, exclude) | ||||
|  | @ -588,11 +588,11 @@ class Tagger(Pipe): | |||
|     def from_disk(self, path, **exclude): | ||||
|         def load_model(p): | ||||
|             # TODO: Remove this once we don't have to handle previous models | ||||
|             if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg: | ||||
|                 self.cfg['pretrained_vectors'] = self.vocab.vectors.name | ||||
|             if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: | ||||
|                 self.cfg["pretrained_vectors"] = self.vocab.vectors.name | ||||
|             if self.model is True: | ||||
|                 self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) | ||||
|             with p.open('rb') as file_: | ||||
|             with p.open("rb") as file_: | ||||
|                 self.model.from_bytes(file_.read()) | ||||
| 
 | ||||
|         def load_tag_map(p): | ||||
|  | @ -603,10 +603,10 @@ class Tagger(Pipe): | |||
|                 exc=self.vocab.morphology.exc) | ||||
| 
 | ||||
|         deserialize = OrderedDict(( | ||||
|             ('cfg', lambda p: self.cfg.update(_load_cfg(p))), | ||||
|             ('vocab', lambda p: self.vocab.from_disk(p)), | ||||
|             ('tag_map', load_tag_map), | ||||
|             ('model', load_model), | ||||
|             ("cfg", lambda p: self.cfg.update(_load_cfg(p))), | ||||
|             ("vocab", lambda p: self.vocab.from_disk(p)), | ||||
|             ("tag_map", load_tag_map), | ||||
|             ("model", load_model), | ||||
|         )) | ||||
|         util.from_disk(path, deserialize, exclude) | ||||
|         return self | ||||
|  | @ -616,37 +616,38 @@ class MultitaskObjective(Tagger): | |||
|     """Experimental: Assist training of a parser or tagger, by training a | ||||
|     side-objective. | ||||
|     """ | ||||
|     name = 'nn_labeller' | ||||
| 
 | ||||
|     name = "nn_labeller" | ||||
| 
 | ||||
|     def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg): | ||||
|         self.vocab = vocab | ||||
|         self.model = model | ||||
|         if target == 'dep': | ||||
|         if target == "dep": | ||||
|             self.make_label = self.make_dep | ||||
|         elif target == 'tag': | ||||
|         elif target == "tag": | ||||
|             self.make_label = self.make_tag | ||||
|         elif target == 'ent': | ||||
|         elif target == "ent": | ||||
|             self.make_label = self.make_ent | ||||
|         elif target == 'dep_tag_offset': | ||||
|         elif target == "dep_tag_offset": | ||||
|             self.make_label = self.make_dep_tag_offset | ||||
|         elif target == 'ent_tag': | ||||
|         elif target == "ent_tag": | ||||
|             self.make_label = self.make_ent_tag | ||||
|         elif target == 'sent_start': | ||||
|         elif target == "sent_start": | ||||
|             self.make_label = self.make_sent_start | ||||
|         elif hasattr(target, '__call__'): | ||||
|         elif hasattr(target, "__call__"): | ||||
|             self.make_label = target | ||||
|         else: | ||||
|             raise ValueError(Errors.E016) | ||||
|         self.cfg = dict(cfg) | ||||
|         self.cfg.setdefault('cnn_maxout_pieces', 2) | ||||
|         self.cfg.setdefault("cnn_maxout_pieces", 2) | ||||
| 
 | ||||
|     @property | ||||
|     def labels(self): | ||||
|         return self.cfg.setdefault('labels', {}) | ||||
|         return self.cfg.setdefault("labels", {}) | ||||
| 
 | ||||
|     @labels.setter | ||||
|     def labels(self, value): | ||||
|         self.cfg['labels'] = value | ||||
|         self.cfg["labels"] = value | ||||
| 
 | ||||
|     def set_annotations(self, docs, dep_ids, tensors=None): | ||||
|         pass | ||||
|  | @ -662,7 +663,7 @@ class MultitaskObjective(Tagger): | |||
|                     if label is not None and label not in self.labels: | ||||
|                         self.labels[label] = len(self.labels) | ||||
|         if self.model is True: | ||||
|             token_vector_width = util.env_opt('token_vector_width') | ||||
|             token_vector_width = util.env_opt("token_vector_width") | ||||
|             self.model = self.Model(len(self.labels), tok2vec=tok2vec) | ||||
|         link_vectors_to_models(self.vocab) | ||||
|         if sgd is None: | ||||
|  | @ -671,7 +672,7 @@ class MultitaskObjective(Tagger): | |||
| 
 | ||||
|     @classmethod | ||||
|     def Model(cls, n_tags, tok2vec=None, **cfg): | ||||
|         token_vector_width = util.env_opt('token_vector_width', 96) | ||||
|         token_vector_width = util.env_opt("token_vector_width", 96) | ||||
|         softmax = Softmax(n_tags, token_vector_width*2) | ||||
|         model = chain( | ||||
|             tok2vec, | ||||
|  | @ -690,10 +691,10 @@ class MultitaskObjective(Tagger): | |||
| 
 | ||||
|     def get_loss(self, docs, golds, scores): | ||||
|         if len(docs) != len(golds): | ||||
|             raise ValueError(Errors.E077.format(value='loss', n_docs=len(docs), | ||||
|             raise ValueError(Errors.E077.format(value="loss", n_docs=len(docs), | ||||
|                                                 n_golds=len(golds))) | ||||
|         cdef int idx = 0 | ||||
|         correct = numpy.zeros((scores.shape[0],), dtype='i') | ||||
|         correct = numpy.zeros((scores.shape[0],), dtype="i") | ||||
|         guesses = scores.argmax(axis=1) | ||||
|         for i, gold in enumerate(golds): | ||||
|             for j in range(len(docs[i])): | ||||
|  | @ -705,7 +706,7 @@ class MultitaskObjective(Tagger): | |||
|                 else: | ||||
|                     correct[idx] = self.labels[label] | ||||
|                 idx += 1 | ||||
|         correct = self.model.ops.xp.array(correct, dtype='i') | ||||
|         correct = self.model.ops.xp.array(correct, dtype="i") | ||||
|         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) | ||||
|         loss = (d_scores**2).sum() | ||||
|         return float(loss), d_scores | ||||
|  | @ -733,25 +734,25 @@ class MultitaskObjective(Tagger): | |||
|         offset = heads[i] - i | ||||
|         offset = min(offset, 2) | ||||
|         offset = max(offset, -2) | ||||
|         return '%s-%s:%d' % (deps[i], tags[i], offset) | ||||
|         return "%s-%s:%d" % (deps[i], tags[i], offset) | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def make_ent_tag(i, words, tags, heads, deps, ents): | ||||
|         if ents is None or ents[i] is None: | ||||
|             return None | ||||
|         else: | ||||
|             return '%s-%s' % (tags[i], ents[i]) | ||||
|             return "%s-%s" % (tags[i], ents[i]) | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def make_sent_start(target, words, tags, heads, deps, ents, cache=True, _cache={}): | ||||
|         '''A multi-task objective for representing sentence boundaries, | ||||
|         """A multi-task objective for representing sentence boundaries, | ||||
|         using BILU scheme. (O is impossible) | ||||
| 
 | ||||
|         The implementation of this method uses an internal cache that relies | ||||
|         on the identity of the heads array, to avoid requiring a new piece | ||||
|         of gold data. You can pass cache=False if you know the cache will | ||||
|         do the wrong thing. | ||||
|         ''' | ||||
|         """ | ||||
|         assert len(words) == len(heads) | ||||
|         assert target < len(words), (target, len(words)) | ||||
|         if cache: | ||||
|  | @ -760,10 +761,10 @@ class MultitaskObjective(Tagger): | |||
|             else: | ||||
|                 for key in list(_cache.keys()): | ||||
|                     _cache.pop(key) | ||||
|             sent_tags = ['I-SENT'] * len(words) | ||||
|             sent_tags = ["I-SENT"] * len(words) | ||||
|             _cache[id(heads)] = sent_tags | ||||
|         else: | ||||
|             sent_tags = ['I-SENT'] * len(words) | ||||
|             sent_tags = ["I-SENT"] * len(words) | ||||
| 
 | ||||
|         def _find_root(child): | ||||
|             seen = set([child]) | ||||
|  | @ -781,10 +782,10 @@ class MultitaskObjective(Tagger): | |||
|                 sentences.setdefault(root, []).append(i) | ||||
|         for root, span in sorted(sentences.items()): | ||||
|             if len(span) == 1: | ||||
|                 sent_tags[span[0]] = 'U-SENT' | ||||
|                 sent_tags[span[0]] = "U-SENT" | ||||
|             else: | ||||
|                 sent_tags[span[0]] = 'B-SENT' | ||||
|                 sent_tags[span[-1]] = 'L-SENT' | ||||
|                 sent_tags[span[0]] = "B-SENT" | ||||
|                 sent_tags[span[-1]] = "L-SENT" | ||||
|         return sent_tags[target] | ||||
| 
 | ||||
| 
 | ||||
|  | @ -854,6 +855,10 @@ class ClozeMultitask(Pipe): | |||
| 
 | ||||
| 
 | ||||
| class TextCategorizer(Pipe): | ||||
|     """Pipeline component for text classification. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/textcategorizer | ||||
|     """ | ||||
|     name = 'textcat' | ||||
| 
 | ||||
|     @classmethod | ||||
|  | @ -863,7 +868,7 @@ class TextCategorizer(Pipe): | |||
|             token_vector_width = cfg["token_vector_width"] | ||||
|         else: | ||||
|             token_vector_width = util.env_opt("token_vector_width", 96) | ||||
|         if cfg.get('architecture') == 'simple_cnn': | ||||
|         if cfg.get("architecture") == "simple_cnn": | ||||
|             tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg) | ||||
|             return build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg) | ||||
|         else: | ||||
|  | @ -884,11 +889,11 @@ class TextCategorizer(Pipe): | |||
| 
 | ||||
|     @property | ||||
|     def labels(self): | ||||
|         return tuple(self.cfg.setdefault('labels', [])) | ||||
|         return tuple(self.cfg.setdefault("labels", [])) | ||||
| 
 | ||||
|     @labels.setter | ||||
|     def labels(self, value): | ||||
|         self.cfg['labels'] = tuple(value) | ||||
|         self.cfg["labels"] = tuple(value) | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         scores, tensors = self.predict([doc]) | ||||
|  | @ -934,8 +939,8 @@ class TextCategorizer(Pipe): | |||
|             losses[self.name] += (gradient**2).sum() | ||||
| 
 | ||||
|     def get_loss(self, docs, golds, scores): | ||||
|         truths = numpy.zeros((len(golds), len(self.labels)), dtype='f') | ||||
|         not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f') | ||||
|         truths = numpy.zeros((len(golds), len(self.labels)), dtype="f") | ||||
|         not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f") | ||||
|         for i, gold in enumerate(golds): | ||||
|             for j, label in enumerate(self.labels): | ||||
|                 if label in gold.cats: | ||||
|  | @ -956,7 +961,7 @@ class TextCategorizer(Pipe): | |||
|             # This functionality was available previously, but was broken. | ||||
|             # The problem is that we resize the last layer, but the last layer | ||||
|             # is actually just an ensemble. We're not resizing the child layers | ||||
|             # -- a huge problem. | ||||
|             # - a huge problem. | ||||
|             raise ValueError(Errors.E116) | ||||
|             # smaller = self.model._layers[-1] | ||||
|             # larger = Affine(len(self.labels)+1, smaller.nI) | ||||
|  | @ -966,10 +971,9 @@ class TextCategorizer(Pipe): | |||
|         self.labels = tuple(list(self.labels) + [label]) | ||||
|         return 1 | ||||
| 
 | ||||
|     def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, | ||||
|                        **kwargs): | ||||
|     def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): | ||||
|         if self.model is True: | ||||
|             self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors') | ||||
|             self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors") | ||||
|             self.model = self.Model(len(self.labels), **self.cfg) | ||||
|             link_vectors_to_models(self.vocab) | ||||
|         if sgd is None: | ||||
|  | @ -978,7 +982,12 @@ class TextCategorizer(Pipe): | |||
| 
 | ||||
| 
 | ||||
| cdef class DependencyParser(Parser): | ||||
|     name = 'parser' | ||||
|     """Pipeline component for dependency parsing. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/dependencyparser | ||||
|     """ | ||||
| 
 | ||||
|     name = "parser" | ||||
|     TransitionSystem = ArcEager | ||||
| 
 | ||||
|     @property | ||||
|  | @ -986,7 +995,7 @@ cdef class DependencyParser(Parser): | |||
|         return [nonproj.deprojectivize] | ||||
| 
 | ||||
|     def add_multitask_objective(self, target): | ||||
|         if target == 'cloze': | ||||
|         if target == "cloze": | ||||
|             cloze = ClozeMultitask(self.vocab) | ||||
|             self._multitasks.append(cloze) | ||||
|         else: | ||||
|  | @ -1000,8 +1009,7 @@ cdef class DependencyParser(Parser): | |||
|                                     tok2vec=tok2vec, sgd=sgd) | ||||
| 
 | ||||
|     def __reduce__(self): | ||||
|         return (DependencyParser, (self.vocab, self.moves, self.model), | ||||
|                 None, None) | ||||
|         return (DependencyParser, (self.vocab, self.moves, self.model), None, None) | ||||
| 
 | ||||
|     @property | ||||
|     def labels(self): | ||||
|  | @ -1010,6 +1018,11 @@ cdef class DependencyParser(Parser): | |||
| 
 | ||||
| 
 | ||||
| cdef class EntityRecognizer(Parser): | ||||
|     """Pipeline component for named entity recognition. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/entityrecognizer | ||||
|     """ | ||||
| 
 | ||||
|     name = "ner" | ||||
|     TransitionSystem = BiluoPushDown | ||||
|     nr_feature = 6 | ||||
|  | @ -1040,4 +1053,4 @@ cdef class EntityRecognizer(Parser): | |||
|                 if move[0] in ("B", "I", "L", "U"))) | ||||
| 
 | ||||
| 
 | ||||
| __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer'] | ||||
| __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer"] | ||||
|  |  | |||
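Since the new class docstrings above only link to the API pages, here is a rough sketch of how a `TextCategorizer` is typically created and given labels before training (assuming a blank English pipeline and the spaCy 2.x factory API):

```python
from spacy.lang.en import English

nlp = English()
textcat = nlp.create_pipe("textcat")  # looked up via the component's `name`
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.add_pipe(textcat, last=True)
print(textcat.labels)  # ('POSITIVE', 'NEGATIVE')
```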
|  | @ -31,12 +31,12 @@ def get_string_id(key): | |||
|     elif not key: | ||||
|         return 0 | ||||
|     else: | ||||
|         chars = key.encode('utf8') | ||||
|         chars = key.encode("utf8") | ||||
|         return hash_utf8(chars, len(chars)) | ||||
| 
 | ||||
| 
 | ||||
| cpdef hash_t hash_string(unicode string) except 0: | ||||
|     chars = string.encode('utf8') | ||||
|     chars = string.encode("utf8") | ||||
|     return hash_utf8(chars, len(chars)) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -51,9 +51,9 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil: | |||
| cdef unicode decode_Utf8Str(const Utf8Str* string): | ||||
|     cdef int i, length | ||||
|     if string.s[0] < sizeof(string.s) and string.s[0] != 0: | ||||
|         return string.s[1:string.s[0]+1].decode('utf8') | ||||
|         return string.s[1:string.s[0]+1].decode("utf8") | ||||
|     elif string.p[0] < 255: | ||||
|         return string.p[1:string.p[0]+1].decode('utf8') | ||||
|         return string.p[1:string.p[0]+1].decode("utf8") | ||||
|     else: | ||||
|         i = 0 | ||||
|         length = 0 | ||||
|  | @ -62,7 +62,7 @@ cdef unicode decode_Utf8Str(const Utf8Str* string): | |||
|             length += 255 | ||||
|         length += string.p[i] | ||||
|         i += 1 | ||||
|         return string.p[i:length + i].decode('utf8') | ||||
|         return string.p[i:length + i].decode("utf8") | ||||
| 
 | ||||
| 
 | ||||
| cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *: | ||||
|  | @ -91,7 +91,10 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e | |||
| 
 | ||||
| 
 | ||||
| cdef class StringStore: | ||||
|     """Look up strings by 64-bit hashes.""" | ||||
|     """Look up strings by 64-bit hashes. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/stringstore | ||||
|     """ | ||||
|     def __init__(self, strings=None, freeze=False): | ||||
|         """Create the StringStore. | ||||
| 
 | ||||
|  | @ -113,7 +116,7 @@ cdef class StringStore: | |||
|         if isinstance(string_or_id, basestring) and len(string_or_id) == 0: | ||||
|             return 0 | ||||
|         elif string_or_id == 0: | ||||
|             return u'' | ||||
|             return "" | ||||
|         elif string_or_id in SYMBOLS_BY_STR: | ||||
|             return SYMBOLS_BY_STR[string_or_id] | ||||
| 
 | ||||
|  | @ -181,7 +184,7 @@ cdef class StringStore: | |||
|         elif isinstance(string, unicode): | ||||
|             key = hash_string(string) | ||||
|         else: | ||||
|             string = string.encode('utf8') | ||||
|             string = string.encode("utf8") | ||||
|             key = hash_utf8(string, len(string)) | ||||
|         if key < len(SYMBOLS_BY_INT): | ||||
|             return True | ||||
|  | @ -296,7 +299,7 @@ cdef class StringStore: | |||
| 
 | ||||
|     cdef const Utf8Str* intern_unicode(self, unicode py_string): | ||||
|         # 0 means missing, but we don't bother offsetting the index. | ||||
|         cdef bytes byte_string = py_string.encode('utf8') | ||||
|         cdef bytes byte_string = py_string.encode("utf8") | ||||
|         return self._intern_utf8(byte_string, len(byte_string)) | ||||
| 
 | ||||
|     @cython.final | ||||
|  |  | |||
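A quick sketch of the hash-based lookup the `StringStore` docstring refers to (not part of this diff):

```python
from spacy.strings import StringStore

stringstore = StringStore(["apple", "orange"])
apple_hash = stringstore.add("apple")      # adding an existing string returns its 64-bit hash
assert stringstore[apple_hash] == "apple"  # hash -> string
assert stringstore["apple"] == apple_hash  # string -> hash
assert "apple" in stringstore
```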
|  | @ -3,16 +3,18 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from collections import OrderedDict | ||||
| from cython.operator cimport dereference as deref | ||||
| from cython.operator cimport preincrement as preinc | ||||
| from cymem.cymem cimport Pool | ||||
| from preshed.maps cimport PreshMap | ||||
| import re | ||||
| cimport cython | ||||
| 
 | ||||
| from collections import OrderedDict | ||||
| import re | ||||
| 
 | ||||
| from .tokens.doc cimport Doc | ||||
| from .strings cimport hash_string | ||||
| 
 | ||||
| from .errors import Errors, Warnings, deprecation_warning | ||||
| from . import util | ||||
| 
 | ||||
|  | @ -20,6 +22,8 @@ from . import util | |||
| cdef class Tokenizer: | ||||
|     """Segment text, and create Doc objects with the discovered segment | ||||
|     boundaries. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/tokenizer | ||||
|     """ | ||||
|     def __init__(self, Vocab vocab, rules=None, prefix_search=None, | ||||
|                  suffix_search=None, infix_finditer=None, token_match=None): | ||||
|  | @ -40,6 +44,8 @@ cdef class Tokenizer: | |||
|         EXAMPLE: | ||||
|             >>> tokenizer = Tokenizer(nlp.vocab) | ||||
|             >>> tokenizer = English().Defaults.create_tokenizer(nlp) | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/tokenizer#init | ||||
|         """ | ||||
|         self.mem = Pool() | ||||
|         self._cache = PreshMap() | ||||
|  | @ -73,6 +79,8 @@ cdef class Tokenizer: | |||
| 
 | ||||
|         string (unicode): The string to tokenize. | ||||
|         RETURNS (Doc): A container for linguistic annotations. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/tokenizer#call | ||||
|         """ | ||||
|         if len(string) >= (2 ** 30): | ||||
|             raise ValueError(Errors.E025.format(length=len(string))) | ||||
|  | @ -114,7 +122,7 @@ cdef class Tokenizer: | |||
|             cache_hit = self._try_cache(key, doc) | ||||
|             if not cache_hit: | ||||
|                 self._tokenize(doc, span, key) | ||||
|             doc.c[doc.length - 1].spacy = string[-1] == ' ' and not in_ws | ||||
|             doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws | ||||
|         return doc | ||||
| 
 | ||||
|     def pipe(self, texts, batch_size=1000, n_threads=2): | ||||
|  | @ -122,9 +130,9 @@ cdef class Tokenizer: | |||
| 
 | ||||
|         texts: A sequence of unicode texts. | ||||
|         batch_size (int): Number of texts to accumulate in an internal buffer. | ||||
|         n_threads (int): Number of threads to use, if the implementation | ||||
|             supports multi-threading. The default tokenizer is single-threaded. | ||||
|         YIELDS (Doc): A sequence of Doc objects, in order. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/tokenizer#pipe | ||||
|         """ | ||||
|         for text in texts: | ||||
|             yield self(text) | ||||
|  | @ -235,7 +243,7 @@ cdef class Tokenizer: | |||
|                 if not matches: | ||||
|                     tokens.push_back(self.vocab.get(tokens.mem, string), False) | ||||
|                 else: | ||||
|                     # let's say we have dyn-o-mite-dave - the regex finds the | ||||
|                     # Let's say we have dyn-o-mite-dave - the regex finds the | ||||
|                     # start and end positions of the hyphens | ||||
|                     start = 0 | ||||
|                     start_before_infixes = start | ||||
|  | @ -257,7 +265,6 @@ cdef class Tokenizer: | |||
|                             # https://github.com/explosion/spaCy/issues/768) | ||||
|                             infix_span = string[infix_start:infix_end] | ||||
|                             tokens.push_back(self.vocab.get(tokens.mem, infix_span), False) | ||||
| 
 | ||||
|                         start = infix_end | ||||
|                     span = string[start:] | ||||
|                     if span: | ||||
|  | @ -274,7 +281,7 @@ cdef class Tokenizer: | |||
|         for i in range(n): | ||||
|             if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL: | ||||
|                 return 0 | ||||
|         # See https://github.com/explosion/spaCy/issues/1250 | ||||
|         # See #1250 | ||||
|         if has_special: | ||||
|             return 0 | ||||
|         cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) | ||||
|  | @ -293,6 +300,8 @@ cdef class Tokenizer: | |||
|         RETURNS (list): A list of `re.MatchObject` objects that have `.start()` | ||||
|             and `.end()` methods, denoting the placement of internal segment | ||||
|             separators, e.g. hyphens. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/tokenizer#find_infix | ||||
|         """ | ||||
|         if self.infix_finditer is None: | ||||
|             return 0 | ||||
|  | @ -304,6 +313,8 @@ cdef class Tokenizer: | |||
| 
 | ||||
|         string (unicode): The string to segment. | ||||
|         RETURNS (int): The length of the prefix if present, otherwise `None`. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/tokenizer#find_prefix | ||||
|         """ | ||||
|         if self.prefix_search is None: | ||||
|             return 0 | ||||
|  | @ -316,6 +327,8 @@ cdef class Tokenizer: | |||
| 
 | ||||
|         string (unicode): The string to segment. | ||||
|         RETURNS (int): The length of the suffix if present, otherwise `None`. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/tokenizer#find_suffix | ||||
|         """ | ||||
|         if self.suffix_search is None: | ||||
|             return 0 | ||||
|  | @ -334,6 +347,8 @@ cdef class Tokenizer: | |||
|         token_attrs (iterable): A sequence of dicts, where each dict describes | ||||
|             a token and its attributes. The `ORTH` fields of the attributes | ||||
|             must exactly match the string when they are concatenated. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/tokenizer#add_special_case | ||||
|         """ | ||||
|         substrings = list(substrings) | ||||
|         cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) | ||||
|  | @ -350,8 +365,10 @@ cdef class Tokenizer: | |||
| 
 | ||||
|         path (unicode or Path): A path to a directory, which will be created if | ||||
|             it doesn't exist. Paths may be either strings or Path-like objects. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/tokenizer#to_disk | ||||
|         """ | ||||
|         with path.open('wb') as file_: | ||||
|         with path.open("wb") as file_: | ||||
|             file_.write(self.to_bytes(**exclude)) | ||||
| 
 | ||||
|     def from_disk(self, path, **exclude): | ||||
|  | @ -361,8 +378,10 @@ cdef class Tokenizer: | |||
|         path (unicode or Path): A path to a directory. Paths may be either | ||||
|             strings or `Path`-like objects. | ||||
|         RETURNS (Tokenizer): The modified `Tokenizer` object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/tokenizer#from_disk | ||||
|         """ | ||||
|         with path.open('rb') as file_: | ||||
|         with path.open("rb") as file_: | ||||
|             bytes_data = file_.read() | ||||
|         self.from_bytes(bytes_data, **exclude) | ||||
|         return self | ||||
|  | @ -372,14 +391,16 @@ cdef class Tokenizer: | |||
| 
 | ||||
|         **exclude: Named attributes to prevent from being serialized. | ||||
|         RETURNS (bytes): The serialized form of the `Tokenizer` object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/tokenizer#to_bytes | ||||
|         """ | ||||
|         serializers = OrderedDict(( | ||||
|             ('vocab', lambda: self.vocab.to_bytes()), | ||||
|             ('prefix_search', lambda: _get_regex_pattern(self.prefix_search)), | ||||
|             ('suffix_search', lambda: _get_regex_pattern(self.suffix_search)), | ||||
|             ('infix_finditer', lambda: _get_regex_pattern(self.infix_finditer)), | ||||
|             ('token_match', lambda: _get_regex_pattern(self.token_match)), | ||||
|             ('exceptions', lambda: OrderedDict(sorted(self._rules.items()))) | ||||
|             ("vocab", lambda: self.vocab.to_bytes()), | ||||
|             ("prefix_search", lambda: _get_regex_pattern(self.prefix_search)), | ||||
|             ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)), | ||||
|             ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), | ||||
|             ("token_match", lambda: _get_regex_pattern(self.token_match)), | ||||
|             ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) | ||||
|         )) | ||||
|         return util.to_bytes(serializers, exclude) | ||||
| 
 | ||||
|  | @ -389,26 +410,28 @@ cdef class Tokenizer: | |||
|         bytes_data (bytes): The data to load from. | ||||
|         **exclude: Named attributes to prevent from being loaded. | ||||
|         RETURNS (Tokenizer): The `Tokenizer` object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/tokenizer#from_bytes | ||||
|         """ | ||||
|         data = OrderedDict() | ||||
|         deserializers = OrderedDict(( | ||||
|             ('vocab', lambda b: self.vocab.from_bytes(b)), | ||||
|             ('prefix_search', lambda b: data.setdefault('prefix_search', b)), | ||||
|             ('suffix_search', lambda b: data.setdefault('suffix_search', b)), | ||||
|             ('infix_finditer', lambda b: data.setdefault('infix_finditer', b)), | ||||
|             ('token_match', lambda b: data.setdefault('token_match', b)), | ||||
|             ('exceptions', lambda b: data.setdefault('rules', b)) | ||||
|             ("vocab", lambda b: self.vocab.from_bytes(b)), | ||||
|             ("prefix_search", lambda b: data.setdefault("prefix_search", b)), | ||||
|             ("suffix_search", lambda b: data.setdefault("suffix_search", b)), | ||||
|             ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), | ||||
|             ("token_match", lambda b: data.setdefault("token_match", b)), | ||||
|             ("exceptions", lambda b: data.setdefault("rules", b)) | ||||
|         )) | ||||
|         msg = util.from_bytes(bytes_data, deserializers, exclude) | ||||
|         if data.get('prefix_search'): | ||||
|             self.prefix_search = re.compile(data['prefix_search']).search | ||||
|         if data.get('suffix_search'): | ||||
|             self.suffix_search = re.compile(data['suffix_search']).search | ||||
|         if data.get('infix_finditer'): | ||||
|             self.infix_finditer = re.compile(data['infix_finditer']).finditer | ||||
|         if data.get('token_match'): | ||||
|             self.token_match = re.compile(data['token_match']).match | ||||
|         for string, substrings in data.get('rules', {}).items(): | ||||
|         if data.get("prefix_search"): | ||||
|             self.prefix_search = re.compile(data["prefix_search"]).search | ||||
|         if data.get("suffix_search"): | ||||
|             self.suffix_search = re.compile(data["suffix_search"]).search | ||||
|         if data.get("infix_finditer"): | ||||
|             self.infix_finditer = re.compile(data["infix_finditer"]).finditer | ||||
|         if data.get("token_match"): | ||||
|             self.token_match = re.compile(data["token_match"]).match | ||||
|         for string, substrings in data.get("rules", {}).items(): | ||||
|             self.add_special_case(string, substrings) | ||||
|         return self | ||||
| 
 | ||||
|  |  | |||
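To illustrate the `add_special_case` docstring above, a short sketch on a blank English pipeline; note that the `ORTH` values of the attributes must concatenate back to the original string:

```python
from spacy.attrs import ORTH
from spacy.lang.en import English

nlp = English()
assert [t.text for t in nlp("gimme that")] == ["gimme", "that"]

# Register a special case so "gimme" is split into two tokens
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
assert [t.text for t in nlp("gimme that")] == ["gim", "me", "that"]
```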
|  | @ -1,5 +1,8 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .doc import Doc | ||||
| from .token import Token | ||||
| from .span import Span | ||||
| 
 | ||||
| __all__ = ['Doc', 'Token', 'Span'] | ||||
| __all__ = ["Doc", "Token", "Span"] | ||||
|  |  | |||
|  | @ -6,11 +6,11 @@ from __future__ import unicode_literals | |||
| 
 | ||||
| from libc.string cimport memcpy, memset | ||||
| from libc.stdlib cimport malloc, free | ||||
| 
 | ||||
| import numpy | ||||
| from cymem.cymem cimport Pool | ||||
| from thinc.neural.util import get_array_module | ||||
| 
 | ||||
| import numpy | ||||
| 
 | ||||
| from .doc cimport Doc, set_children_from_heads, token_by_start, token_by_end | ||||
| from .span cimport Span | ||||
| from .token cimport Token | ||||
|  | @ -26,11 +26,16 @@ from ..strings import get_string_id | |||
| 
 | ||||
| 
 | ||||
| cdef class Retokenizer: | ||||
|     """Helper class for doc.retokenize() context manager.""" | ||||
|     """Helper class for doc.retokenize() context manager. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/doc#retokenize | ||||
|     USAGE: https://spacy.io/usage/linguistic-features#retokenization | ||||
|     """ | ||||
|     cdef Doc doc | ||||
|     cdef list merges | ||||
|     cdef list splits | ||||
|     cdef set tokens_to_merge | ||||
| 
 | ||||
|     def __init__(self, doc): | ||||
|         self.doc = doc | ||||
|         self.merges = [] | ||||
|  | @ -40,6 +45,11 @@ cdef class Retokenizer: | |||
|     def merge(self, Span span, attrs=SimpleFrozenDict()): | ||||
|         """Mark a span for merging. The attrs will be applied to the resulting | ||||
|         token. | ||||
| 
 | ||||
|         span (Span): The span to merge. | ||||
|         attrs (dict): Attributes to set on the merged token. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#retokenizer.merge | ||||
|         """ | ||||
|         for token in span: | ||||
|             if token.i in self.tokens_to_merge: | ||||
|  | @ -58,6 +68,16 @@ cdef class Retokenizer: | |||
|     def split(self, Token token, orths, heads, attrs=SimpleFrozenDict()): | ||||
|         """Mark a Token for splitting, into the specified orths. The attrs | ||||
|         will be applied to each subtoken. | ||||
| 
 | ||||
|         token (Token): The token to split. | ||||
|         orths (list): The verbatim text of the split tokens. Needs to match the | ||||
|             text of the original token. | ||||
|         heads (list): List of token or `(token, subtoken)` tuples specifying the | ||||
|             tokens to attach the newly split subtokens to. | ||||
|         attrs (dict): Attributes to set on all split tokens. Attribute names | ||||
|             mapped to list of per-token attribute values. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#retokenizer.split | ||||
|         """ | ||||
|         if ''.join(orths) != token.text: | ||||
|             raise ValueError(Errors.E117.format(new=''.join(orths), old=token.text)) | ||||
|  | @ -104,14 +124,12 @@ cdef class Retokenizer: | |||
|             # referred to in the splits. If we merged these tokens previously, we | ||||
|             # have to raise an error | ||||
|             if token_index == -1: | ||||
|                 raise IndexError( | ||||
|                     "Cannot find token to be split. Did it get merged?") | ||||
|                 raise IndexError(Errors.E122) | ||||
|             head_indices = [] | ||||
|             for head_char, subtoken in heads: | ||||
|                 head_index = token_by_start(self.doc.c, self.doc.length, head_char) | ||||
|                 if head_index == -1: | ||||
|                     raise IndexError( | ||||
|                         "Cannot find head of token to be split. Did it get merged?") | ||||
|                     raise IndexError(Errors.E123) | ||||
|                 # We want to refer to the token index of the head *after* the | ||||
|                 # mergery. We need to account for the extra tokens introduced. | ||||
|                 # e.g., let's say we have [ab, c] and we want a and b to depend | ||||
|  | @ -206,7 +224,6 @@ def _merge(Doc doc, int start, int end, attributes): | |||
|         doc.c[i].head -= i | ||||
|     # Set the left/right children, left/right edges | ||||
|     set_children_from_heads(doc.c, doc.length) | ||||
|     # Clear the cached Python objects | ||||
|     # Return the merged Python object | ||||
|     return doc[start] | ||||
| 
 | ||||
|  | @ -336,7 +353,7 @@ def _bulk_merge(Doc doc, merges): | |||
|     # Make sure ent_iob remains consistent | ||||
|     for (span, _) in merges: | ||||
|         if(span.end < len(offsets)): | ||||
|         #if it's not the last span | ||||
|         # If it's not the last span | ||||
|             token_after_span_position = offsets[span.end] | ||||
|             if doc.c[token_after_span_position].ent_iob == 1\ | ||||
|                     and doc.c[token_after_span_position - 1].ent_iob in (0, 2): | ||||
|  |  | |||
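For the `retokenizer.merge` docstring above, a minimal sketch (assuming a blank English pipeline; attributes for the merged token are passed via `attrs`):

```python
from spacy.lang.en import English

nlp = English()
doc = nlp("I live in New York")
with doc.retokenize() as retokenizer:
    # Merge "New York" into a single token and set its lemma
    retokenizer.merge(doc[3:5], attrs={"LEMMA": "New York"})
print([token.text for token in doc])  # ['I', 'live', 'in', 'New York']
```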
|  | @ -1,3 +1,4 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import numpy | ||||
|  | @ -16,9 +17,8 @@ class Binder(object): | |||
|     def __init__(self, attrs=None): | ||||
|         """Create a Binder object, to hold serialized annotations. | ||||
| 
 | ||||
|         attrs (list): | ||||
|             List of attributes to serialize. 'orth' and 'spacy' are always | ||||
|             serialized, so they're not required. Defaults to None. | ||||
|         attrs (list): List of attributes to serialize. 'orth' and 'spacy' are | ||||
|             always serialized, so they're not required. Defaults to None. | ||||
|         """ | ||||
|         attrs = attrs or [] | ||||
|         self.attrs = list(attrs) | ||||
|  |  | |||
|  | @ -7,28 +7,25 @@ from __future__ import unicode_literals | |||
| 
 | ||||
| cimport cython | ||||
| cimport numpy as np | ||||
| from libc.string cimport memcpy, memset | ||||
| from libc.math cimport sqrt | ||||
| 
 | ||||
| import numpy | ||||
| import numpy.linalg | ||||
| import struct | ||||
| import srsly | ||||
| from thinc.neural.util import get_array_module, copy_array | ||||
| import srsly | ||||
| 
 | ||||
| from libc.string cimport memcpy, memset | ||||
| from libc.math cimport sqrt | ||||
| 
 | ||||
| from .span cimport Span | ||||
| from .token cimport Token | ||||
| from .span cimport Span | ||||
| from .token cimport Token | ||||
| from ..lexeme cimport Lexeme, EMPTY_LEXEME | ||||
| from ..typedefs cimport attr_t, flags_t | ||||
| from ..attrs import intify_attrs, IDS | ||||
| from ..attrs cimport attr_id_t | ||||
| from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER | ||||
| from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB | ||||
| from ..attrs cimport ENT_TYPE, SENT_START | ||||
| from ..attrs cimport ENT_TYPE, SENT_START, attr_id_t | ||||
| from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t | ||||
| 
 | ||||
| from ..attrs import intify_attrs, IDS | ||||
| from ..util import normalize_slice | ||||
| from ..compat import is_config, copy_reg, pickle, basestring_ | ||||
| from ..errors import deprecation_warning, models_warning, user_warning | ||||
|  | @ -37,6 +34,7 @@ from .. import util | |||
| from .underscore import Underscore, get_ext_args | ||||
| from ._retokenize import Retokenizer | ||||
| 
 | ||||
| 
 | ||||
| DEF PADDING = 5 | ||||
| 
 | ||||
| 
 | ||||
|  | @ -77,7 +75,7 @@ def _get_chunker(lang): | |||
|         return None | ||||
|     except KeyError: | ||||
|         return None | ||||
|     return cls.Defaults.syntax_iterators.get(u'noun_chunks') | ||||
|     return cls.Defaults.syntax_iterators.get("noun_chunks") | ||||
| 
 | ||||
| 
 | ||||
| cdef class Doc: | ||||
|  | @ -94,23 +92,60 @@ cdef class Doc: | |||
|         >>> from spacy.tokens import Doc | ||||
|         >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], | ||||
|                       spaces=[True, False, False]) | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/doc | ||||
|     """ | ||||
| 
 | ||||
|     @classmethod | ||||
|     def set_extension(cls, name, **kwargs): | ||||
|         if cls.has_extension(name) and not kwargs.get('force', False): | ||||
|             raise ValueError(Errors.E090.format(name=name, obj='Doc')) | ||||
|         """Define a custom attribute which becomes available as `Doc._`. | ||||
| 
 | ||||
|         name (unicode): Name of the attribute to set. | ||||
|         default: Optional default value of the attribute. | ||||
|         getter (callable): Optional getter function. | ||||
|         setter (callable): Optional setter function. | ||||
|         method (callable): Optional method for method extension. | ||||
|         force (bool): Force overwriting existing attribute. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#set_extension | ||||
|         USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes | ||||
|         """ | ||||
|         if cls.has_extension(name) and not kwargs.get("force", False): | ||||
|             raise ValueError(Errors.E090.format(name=name, obj="Doc")) | ||||
|         Underscore.doc_extensions[name] = get_ext_args(**kwargs) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def get_extension(cls, name): | ||||
|         """Look up a previously registered extension by name. | ||||
| 
 | ||||
|         name (unicode): Name of the extension. | ||||
|         RETURNS (tuple): A `(default, method, getter, setter)` tuple. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#get_extension | ||||
|         """ | ||||
|         return Underscore.doc_extensions.get(name) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def has_extension(cls, name): | ||||
|         """Check whether an extension has been registered. | ||||
| 
 | ||||
|         name (unicode): Name of the extension. | ||||
|         RETURNS (bool): Whether the extension has been registered. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#has_extension | ||||
|         """ | ||||
|         return name in Underscore.doc_extensions | ||||
| 
 | ||||
|     @classmethod | ||||
|     def remove_extension(cls, name): | ||||
|         """Remove a previously registered extension. | ||||
| 
 | ||||
|         name (unicode): Name of the extension. | ||||
|         RETURNS (tuple): A `(default, method, getter, setter)` tuple of the | ||||
|             removed extension. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#remove_extension | ||||
|         """ | ||||
|         if not cls.has_extension(name): | ||||
|             raise ValueError(Errors.E046.format(name=name)) | ||||
|         return Underscore.doc_extensions.pop(name) | ||||
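The extension docstrings above describe the `Doc._` mechanism; a rough sketch of how it is used (the extension names here are made up for illustration):

```python
from spacy.lang.en import English
from spacy.tokens import Doc

# A default-valued flag and a computed attribute, available on every Doc as `doc._.*`
Doc.set_extension("is_greeting", default=False)
Doc.set_extension("n_exclamations", getter=lambda doc: doc.text.count("!"))

nlp = English()
doc = nlp("Hello world!")
doc._.is_greeting = True
print(doc._.is_greeting, doc._.n_exclamations)  # True 1

assert Doc.has_extension("n_exclamations")
Doc.remove_extension("n_exclamations")
```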
|  | @ -128,6 +163,8 @@ cdef class Doc: | |||
|             it is not. If `None`, defaults to `[True]*len(words)` | ||||
|         user_data (dict or None): Optional extra data to attach to the Doc. | ||||
|         RETURNS (Doc): The newly constructed object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#init | ||||
|         """ | ||||
|         self.vocab = vocab | ||||
|         size = 20 | ||||
|  | @ -151,7 +188,7 @@ cdef class Doc: | |||
|         self.user_hooks = {} | ||||
|         self.user_token_hooks = {} | ||||
|         self.user_span_hooks = {} | ||||
|         self.tensor = numpy.zeros((0,), dtype='float32') | ||||
|         self.tensor = numpy.zeros((0,), dtype="float32") | ||||
|         self.user_data = {} if user_data is None else user_data | ||||
|         self._vector = None | ||||
|         self.noun_chunks_iterator = _get_chunker(self.vocab.lang) | ||||
|  | @ -184,6 +221,7 @@ cdef class Doc: | |||
| 
 | ||||
|     @property | ||||
|     def _(self): | ||||
|         """Custom extension attributes registered via `set_extension`.""" | ||||
|         return Underscore(Underscore.doc_extensions, self) | ||||
| 
 | ||||
|     @property | ||||
|  | @ -195,7 +233,7 @@ cdef class Doc: | |||
|         b) sent.is_parsed is set to True; | ||||
|         c) At least one token other than the first where sent_start is not None. | ||||
|         """ | ||||
|         if 'sents' in self.user_hooks: | ||||
|         if "sents" in self.user_hooks: | ||||
|             return True | ||||
|         if self.is_parsed: | ||||
|             return True | ||||
|  | @ -227,11 +265,12 @@ cdef class Doc: | |||
|             supported, as `Span` objects must be contiguous (cannot have gaps). | ||||
|             You can use negative indices and open-ended ranges, which have | ||||
|             their normal Python semantics. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#getitem | ||||
|         """ | ||||
|         if isinstance(i, slice): | ||||
|             start, stop = normalize_slice(len(self), i.start, i.stop, i.step) | ||||
|             return Span(self, start, stop, label=0) | ||||
| 
 | ||||
|         if i < 0: | ||||
|             i = self.length + i | ||||
|         bounds_check(i, self.length, PADDING) | ||||
|  | @ -244,8 +283,7 @@ cdef class Doc: | |||
|         than-Python speeds are required, you can instead access the annotations | ||||
|         as a numpy array, or access the underlying C data directly from Cython. | ||||
| 
 | ||||
|         EXAMPLE: | ||||
|             >>> for token in doc | ||||
|         DOCS: https://spacy.io/api/doc#iter | ||||
|         """ | ||||
|         cdef int i | ||||
|         for i in range(self.length): | ||||
|  | @ -256,16 +294,15 @@ cdef class Doc: | |||
| 
 | ||||
|         RETURNS (int): The number of tokens in the document. | ||||
| 
 | ||||
|         EXAMPLE: | ||||
|             >>> len(doc) | ||||
|         DOCS: https://spacy.io/api/doc#len | ||||
|         """ | ||||
|         return self.length | ||||
| 
 | ||||
|     def __unicode__(self): | ||||
|         return u''.join([t.text_with_ws for t in self]) | ||||
|         return "".join([t.text_with_ws for t in self]) | ||||
| 
 | ||||
|     def __bytes__(self): | ||||
|         return u''.join([t.text_with_ws for t in self]).encode('utf-8') | ||||
|         return "".join([t.text_with_ws for t in self]).encode("utf-8") | ||||
| 
 | ||||
|     def __str__(self): | ||||
|         if is_config(python3=True): | ||||
|  | @ -290,6 +327,8 @@ cdef class Doc: | |||
|         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of | ||||
|             the span. | ||||
|         RETURNS (Span): The newly constructed object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#char_span | ||||
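|         EXAMPLE: An illustrative sketch (assumes a loaded `nlp` pipeline; | ||||
|             the offsets and label are arbitrary and must align with token | ||||
|             boundaries): | ||||
|             >>> doc = nlp(u"I like New York") | ||||
|             >>> span = doc.char_span(7, 15, label=u"GPE") | ||||
|             >>> assert span.text == u"New York" | ||||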
|         """ | ||||
|         if not isinstance(label, int): | ||||
|             label = self.vocab.strings.add(label) | ||||
|  | @ -311,9 +350,11 @@ cdef class Doc: | |||
|         other (object): The object to compare with. By default, accepts `Doc`, | ||||
|             `Span`, `Token` and `Lexeme` objects. | ||||
|         RETURNS (float): A scalar similarity score. Higher is more similar. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#similarity | ||||
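|         EXAMPLE: An illustrative sketch (assumes a pipeline with word | ||||
|             vectors, e.g. a medium or large model): | ||||
|             >>> apples = nlp(u"I like apples") | ||||
|             >>> oranges = nlp(u"I like oranges") | ||||
|             >>> apples_oranges = apples.similarity(oranges) | ||||
|             >>> oranges_apples = oranges.similarity(apples) | ||||
|             >>> assert apples_oranges == oranges_apples | ||||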
|         """ | ||||
|         if 'similarity' in self.user_hooks: | ||||
|             return self.user_hooks['similarity'](self, other) | ||||
|         if "similarity" in self.user_hooks: | ||||
|             return self.user_hooks["similarity"](self, other) | ||||
|         if isinstance(other, (Lexeme, Token)) and self.length == 1: | ||||
|             if self.c[0].lex.orth == other.orth: | ||||
|                 return 1.0 | ||||
|  | @ -325,9 +366,9 @@ cdef class Doc: | |||
|                 else: | ||||
|                     return 1.0 | ||||
|         if self.vocab.vectors.n_keys == 0: | ||||
|             models_warning(Warnings.W007.format(obj='Doc')) | ||||
|             models_warning(Warnings.W007.format(obj="Doc")) | ||||
|         if self.vector_norm == 0 or other.vector_norm == 0: | ||||
|             user_warning(Warnings.W008.format(obj='Doc')) | ||||
|             user_warning(Warnings.W008.format(obj="Doc")) | ||||
|             return 0.0 | ||||
|         vector = self.vector | ||||
|         xp = get_array_module(vector) | ||||
|  | @ -338,10 +379,12 @@ cdef class Doc: | |||
|         the object. | ||||
| 
 | ||||
|         RETURNS (bool): Whether a word vector is associated with the object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#has_vector | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             if 'has_vector' in self.user_hooks: | ||||
|                 return self.user_hooks['has_vector'](self) | ||||
|             if "has_vector" in self.user_hooks: | ||||
|                 return self.user_hooks["has_vector"](self) | ||||
|             elif self.vocab.vectors.data.size: | ||||
|                 return True | ||||
|             elif self.tensor.size: | ||||
|  | @ -355,15 +398,16 @@ cdef class Doc: | |||
| 
 | ||||
|         RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array | ||||
|             representing the document's semantics. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#vector | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             if 'vector' in self.user_hooks: | ||||
|                 return self.user_hooks['vector'](self) | ||||
|             if "vector" in self.user_hooks: | ||||
|                 return self.user_hooks["vector"](self) | ||||
|             if self._vector is not None: | ||||
|                 return self._vector | ||||
|             elif not len(self): | ||||
|                 self._vector = numpy.zeros((self.vocab.vectors_length,), | ||||
|                                            dtype='f') | ||||
|                 self._vector = numpy.zeros((self.vocab.vectors_length,), dtype="f") | ||||
|                 return self._vector | ||||
|             elif self.vocab.vectors.data.size > 0: | ||||
|                 self._vector = sum(t.vector for t in self) / len(self) | ||||
|  | @ -372,8 +416,7 @@ cdef class Doc: | |||
|                 self._vector = self.tensor.mean(axis=0) | ||||
|                 return self._vector | ||||
|             else: | ||||
|                 return numpy.zeros((self.vocab.vectors_length,), | ||||
|                                    dtype='float32') | ||||
|                 return numpy.zeros((self.vocab.vectors_length,), dtype="float32") | ||||
| 
 | ||||
|         def __set__(self, value): | ||||
|             self._vector = value | ||||
|  | @ -382,10 +425,12 @@ cdef class Doc: | |||
|         """The L2 norm of the document's vector representation. | ||||
| 
 | ||||
|         RETURNS (float): The L2 norm of the vector representation. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#vector_norm | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             if 'vector_norm' in self.user_hooks: | ||||
|                 return self.user_hooks['vector_norm'](self) | ||||
|             if "vector_norm" in self.user_hooks: | ||||
|                 return self.user_hooks["vector_norm"](self) | ||||
|             cdef float value | ||||
|             cdef double norm = 0 | ||||
|             if self._vector_norm is None: | ||||
|  | @ -404,7 +449,7 @@ cdef class Doc: | |||
|         RETURNS (unicode): The original verbatim text of the document. | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             return u''.join(t.text_with_ws for t in self) | ||||
|             return "".join(t.text_with_ws for t in self) | ||||
| 
 | ||||
|     property text_with_ws: | ||||
|         """An alias of `Doc.text`, provided for duck-type compatibility with | ||||
|  | @ -416,21 +461,12 @@ cdef class Doc: | |||
|             return self.text | ||||
| 
 | ||||
|     property ents: | ||||
|         """Iterate over the entities in the document. Yields named-entity | ||||
|         `Span` objects, if the entity recognizer has been applied to the | ||||
|         document. | ||||
|         """The named entities in the document. Returns a tuple of named entity | ||||
|         `Span` objects, if the entity recognizer has been applied. | ||||
| 
 | ||||
|         YIELDS (Span): Entities in the document. | ||||
|         RETURNS (tuple): Entities in the document, one `Span` per entity. | ||||
| 
 | ||||
|         EXAMPLE: Iterate over the span to get individual Token objects, | ||||
|             or access the label: | ||||
| 
 | ||||
|             >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') | ||||
|             >>> ents = list(tokens.ents) | ||||
|             >>> assert ents[0].label == 346 | ||||
|             >>> assert ents[0].label_ == 'PERSON' | ||||
|             >>> assert ents[0].orth_ == 'Best' | ||||
|             >>> assert ents[0].text == 'Mr. Best' | ||||
|         DOCS: https://spacy.io/api/doc#ents | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             cdef int i | ||||
|  | @ -442,8 +478,8 @@ cdef class Doc: | |||
|                 token = &self.c[i] | ||||
|                 if token.ent_iob == 1: | ||||
|                     if start == -1: | ||||
|                         seq = ['%s|%s' % (t.text, t.ent_iob_) for t in self[i-5:i+5]] | ||||
|                         raise ValueError(Errors.E093.format(seq=' '.join(seq))) | ||||
|                         seq = ["%s|%s" % (t.text, t.ent_iob_) for t in self[i-5:i+5]] | ||||
|                         raise ValueError(Errors.E093.format(seq=" ".join(seq))) | ||||
|                 elif token.ent_iob == 2 or token.ent_iob == 0: | ||||
|                     if start != -1: | ||||
|                         output.append(Span(self, start, i, label=label)) | ||||
|  | @ -465,7 +501,6 @@ cdef class Doc: | |||
|             #    prediction | ||||
|             # 3. Test basic data-driven ORTH gazetteer | ||||
|             # 4. Test more nuanced date and currency regex | ||||
| 
 | ||||
|             tokens_in_ents = {} | ||||
|             cdef attr_t entity_type | ||||
|             cdef int ent_start, ent_end | ||||
|  | @ -479,7 +514,6 @@ cdef class Doc: | |||
|                                    self.vocab.strings[tokens_in_ents[token_index][2]]), | ||||
|                             span2=(ent_start, ent_end, self.vocab.strings[entity_type]))) | ||||
|                     tokens_in_ents[token_index] = (ent_start, ent_end, entity_type) | ||||
| 
 | ||||
|             cdef int i | ||||
|             for i in range(self.length): | ||||
|                 self.c[i].ent_type = 0 | ||||
|  | @ -510,6 +544,8 @@ cdef class Doc: | |||
|         clauses. | ||||
| 
 | ||||
|         YIELDS (Span): Noun chunks in the document. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#noun_chunks | ||||
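|         EXAMPLE: An illustrative sketch (assumes a pipeline with a parser; | ||||
|             the exact chunks depend on the model): | ||||
|             >>> doc = nlp(u"A phrase with another phrase occurs.") | ||||
|             >>> chunks = list(doc.noun_chunks) | ||||
|             >>> assert chunks[0].text == u"A phrase" | ||||
|             >>> assert chunks[1].text == u"another phrase" | ||||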
|         """ | ||||
|         def __get__(self): | ||||
|             if not self.is_parsed: | ||||
|  | @ -533,15 +569,15 @@ cdef class Doc: | |||
|         dependency parse. If the parser is disabled, the `sents` iterator will | ||||
|         be unavailable. | ||||
| 
 | ||||
|         EXAMPLE: | ||||
|             >>> doc = nlp("This is a sentence. Here's another...") | ||||
|             >>> assert [s.root.text for s in doc.sents] == ["is", "'s"] | ||||
|         YIELDS (Span): Sentences in the document. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#sents | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             if not self.is_sentenced: | ||||
|                 raise ValueError(Errors.E030) | ||||
|             if 'sents' in self.user_hooks: | ||||
|                 yield from self.user_hooks['sents'](self) | ||||
|             if "sents" in self.user_hooks: | ||||
|                 yield from self.user_hooks["sents"](self) | ||||
|             else: | ||||
|                 start = 0 | ||||
|                 for i in range(1, self.length): | ||||
|  | @ -606,17 +642,16 @@ cdef class Doc: | |||
|         if isinstance(py_attr_ids, basestring_): | ||||
|             # Handle inputs like doc.to_array('ORTH') | ||||
|             py_attr_ids = [py_attr_ids] | ||||
|         elif not hasattr(py_attr_ids, '__iter__'): | ||||
|         elif not hasattr(py_attr_ids, "__iter__"): | ||||
|             # Handle inputs like doc.to_array(ORTH) | ||||
|             py_attr_ids = [py_attr_ids] | ||||
|         # Allow strings, e.g. 'lemma' or 'LEMMA' | ||||
|         py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_) | ||||
|         py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) | ||||
|                        for id_ in py_attr_ids] | ||||
|         # Make an array from the attributes --- otherwise our inner loop is | ||||
|         # Python dict iteration. | ||||
|         cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype='i') | ||||
|         output = numpy.ndarray(shape=(self.length, len(attr_ids)), | ||||
|                                dtype=numpy.uint64) | ||||
|         cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i") | ||||
|         output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64) | ||||
|         c_output = <attr_t*>output.data | ||||
|         c_attr_ids = <attr_id_t*>attr_ids.data | ||||
|         cdef TokenC* token | ||||
|  | @ -628,8 +663,7 @@ cdef class Doc: | |||
|         # Handle 1d case | ||||
|         return output if len(attr_ids) >= 2 else output.reshape((self.length,)) | ||||
| 
 | ||||
|     def count_by(self, attr_id_t attr_id, exclude=None, | ||||
|                  PreshCounter counts=None): | ||||
|     def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): | ||||
|         """Count the frequencies of a given attribute. Produces a dict of | ||||
|         `{attribute (int): count (ints)}` frequencies, keyed by the values of | ||||
|         the given attribute ID. | ||||
|  | @ -637,13 +671,7 @@ cdef class Doc: | |||
|         attr_id (int): The attribute ID to key the counts. | ||||
|         RETURNS (dict): A dictionary mapping attributes to integer counts. | ||||
| 
 | ||||
|         EXAMPLE: | ||||
|             >>> from spacy import attrs | ||||
|             >>> doc = nlp(u'apple apple orange banana') | ||||
|             >>> tokens.count_by(attrs.ORTH) | ||||
|             {12800L: 1, 11880L: 2, 7561L: 1} | ||||
|             >>> tokens.to_array([attrs.ORTH]) | ||||
|             array([[11880], [11880], [7561], [12800]]) | ||||
|         DOCS: https://spacy.io/api/doc#count_by | ||||
|         """ | ||||
|         cdef int i | ||||
|         cdef attr_t attr | ||||
|  | @ -684,13 +712,21 @@ cdef class Doc: | |||
|     cdef void set_parse(self, const TokenC* parsed) nogil: | ||||
|         # TODO: This method is fairly misleading atm. It's used by Parser | ||||
|         # to actually apply the parse calculated. Need to rethink this. | ||||
| 
 | ||||
|         # Probably we should use from_array? | ||||
|         self.is_parsed = True | ||||
|         for i in range(self.length): | ||||
|             self.c[i] = parsed[i] | ||||
| 
 | ||||
|     def from_array(self, attrs, array): | ||||
|         """Load attributes from a numpy array. Write to a `Doc` object, from an | ||||
|         `(M, N)` array of attributes. | ||||
| 
 | ||||
|         attrs (list): A list of attribute ID ints. | ||||
|         array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values. | ||||
|         RETURNS (Doc): Itself. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#from_array | ||||
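|         EXAMPLE: A round-trip sketch (assumes a loaded `nlp` pipeline; the | ||||
|             chosen attributes are arbitrary): | ||||
|             >>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA | ||||
|             >>> from spacy.tokens import Doc | ||||
|             >>> doc = nlp(u"Hello world!") | ||||
|             >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) | ||||
|             >>> doc2 = Doc(doc.vocab, words=[t.text for t in doc]) | ||||
|             >>> doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array) | ||||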
|         """ | ||||
|         if SENT_START in attrs and HEAD in attrs: | ||||
|             raise ValueError(Errors.E032) | ||||
|         cdef int i, col | ||||
|  | @ -714,10 +750,10 @@ cdef class Doc: | |||
|                 for i in range(length): | ||||
|                     if array[i, col] != 0: | ||||
|                         self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) | ||||
|         # set flags | ||||
|         # Set flags | ||||
|         self.is_parsed = bool(self.is_parsed or HEAD in attrs or DEP in attrs) | ||||
|         self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) | ||||
|         # if document is parsed, set children | ||||
|         # If document is parsed, set children | ||||
|         if self.is_parsed: | ||||
|             set_children_from_heads(self.c, self.length) | ||||
|         return self | ||||
|  | @ -729,6 +765,8 @@ cdef class Doc: | |||
| 
 | ||||
|         RETURNS (np.array[ndim=2, dtype=numpy.int32]): LCA matrix with shape | ||||
|             (n, n), where n = len(self). | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#get_lca_matrix | ||||
|         """ | ||||
|         return numpy.asarray(_get_lca_matrix(self, 0, len(self))) | ||||
| 
 | ||||
|  | @ -737,9 +775,11 @@ cdef class Doc: | |||
| 
 | ||||
|         path (unicode or Path): A path to a directory, which will be created if | ||||
|             it doesn't exist. Paths may be either strings or Path-like objects. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#to_disk | ||||
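|         EXAMPLE: An illustrative sketch (the path is arbitrary): | ||||
|             >>> doc.to_disk("/path/to/doc") | ||||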
|         """ | ||||
|         path = util.ensure_path(path) | ||||
|         with path.open('wb') as file_: | ||||
|         with path.open("wb") as file_: | ||||
|             file_.write(self.to_bytes(**exclude)) | ||||
| 
 | ||||
|     def from_disk(self, path, **exclude): | ||||
|  | @ -749,9 +789,11 @@ cdef class Doc: | |||
|         path (unicode or Path): A path to a directory. Paths may be either | ||||
|             strings or `Path`-like objects. | ||||
|         RETURNS (Doc): The modified `Doc` object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#from_disk | ||||
|         """ | ||||
|         path = util.ensure_path(path) | ||||
|         with path.open('rb') as file_: | ||||
|         with path.open("rb") as file_: | ||||
|             bytes_data = file_.read() | ||||
|         return self.from_bytes(bytes_data, **exclude) | ||||
| 
 | ||||
|  | @ -760,15 +802,16 @@ cdef class Doc: | |||
| 
 | ||||
|         RETURNS (bytes): A losslessly serialized copy of the `Doc`, including | ||||
|             all annotations. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#to_bytes | ||||
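|         EXAMPLE: A serialization round-trip sketch (assumes a loaded `nlp` | ||||
|             pipeline; the text is arbitrary): | ||||
|             >>> from spacy.tokens import Doc | ||||
|             >>> doc = nlp(u"Give it back! He pleaded.") | ||||
|             >>> doc_bytes = doc.to_bytes() | ||||
|             >>> doc2 = Doc(doc.vocab).from_bytes(doc_bytes) | ||||
|             >>> assert doc.text == doc2.text | ||||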
|         """ | ||||
|         array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE] | ||||
| 
 | ||||
|         if self.is_tagged: | ||||
|             array_head.append(TAG) | ||||
|         # if doc parsed add head and dep attribute | ||||
|         # If doc parsed add head and dep attribute | ||||
|         if self.is_parsed: | ||||
|             array_head.extend([HEAD, DEP]) | ||||
|         # otherwise add sent_start | ||||
|         # Otherwise add sent_start | ||||
|         else: | ||||
|             array_head.append(SENT_START) | ||||
|         # Msgpack doesn't distinguish between lists and tuples, which is | ||||
|  | @ -776,17 +819,16 @@ cdef class Doc: | |||
|         # keys, we must have tuples. In values we just have to hope | ||||
|         # users don't mind getting a list instead of a tuple. | ||||
|         serializers = { | ||||
|             'text': lambda: self.text, | ||||
|             'array_head': lambda: array_head, | ||||
|             'array_body': lambda: self.to_array(array_head), | ||||
|             'sentiment': lambda: self.sentiment, | ||||
|             'tensor': lambda: self.tensor, | ||||
|             "text": lambda: self.text, | ||||
|             "array_head": lambda: array_head, | ||||
|             "array_body": lambda: self.to_array(array_head), | ||||
|             "sentiment": lambda: self.sentiment, | ||||
|             "tensor": lambda: self.tensor, | ||||
|         } | ||||
|         if 'user_data' not in exclude and self.user_data: | ||||
|         if "user_data" not in exclude and self.user_data: | ||||
|             user_data_keys, user_data_values = list(zip(*self.user_data.items())) | ||||
|             serializers['user_data_keys'] = lambda: srsly.msgpack_dumps(user_data_keys) | ||||
|             serializers['user_data_values'] = lambda: srsly.msgpack_dumps(user_data_values) | ||||
| 
 | ||||
|             serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys) | ||||
|             serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) | ||||
|         return util.to_bytes(serializers, exclude) | ||||
| 
 | ||||
|     def from_bytes(self, bytes_data, **exclude): | ||||
|  | @ -794,42 +836,40 @@ cdef class Doc: | |||
| 
 | ||||
|         data (bytes): The string to load from. | ||||
|         RETURNS (Doc): Itself. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#from_bytes | ||||
|         """ | ||||
|         if self.length != 0: | ||||
|             raise ValueError(Errors.E033.format(length=self.length)) | ||||
|         deserializers = { | ||||
|             'text': lambda b: None, | ||||
|             'array_head': lambda b: None, | ||||
|             'array_body': lambda b: None, | ||||
|             'sentiment': lambda b: None, | ||||
|             'tensor': lambda b: None, | ||||
|             'user_data_keys': lambda b: None, | ||||
|             'user_data_values': lambda b: None, | ||||
|             "text": lambda b: None, | ||||
|             "array_head": lambda b: None, | ||||
|             "array_body": lambda b: None, | ||||
|             "sentiment": lambda b: None, | ||||
|             "tensor": lambda b: None, | ||||
|             "user_data_keys": lambda b: None, | ||||
|             "user_data_values": lambda b: None, | ||||
|         } | ||||
| 
 | ||||
|         msg = util.from_bytes(bytes_data, deserializers, exclude) | ||||
|         # Msgpack doesn't distinguish between lists and tuples, which is | ||||
|         # vexing for user data. As a best guess, we *know* that within | ||||
|         # keys, we must have tuples. In values we just have to hope | ||||
|         # users don't mind getting a list instead of a tuple. | ||||
|         if 'user_data' not in exclude and 'user_data_keys' in msg: | ||||
|             user_data_keys = srsly.msgpack_loads(msg['user_data_keys'], use_list=False) | ||||
|             user_data_values = srsly.msgpack_loads(msg['user_data_values']) | ||||
|         if "user_data" not in exclude and "user_data_keys" in msg: | ||||
|             user_data_keys = srsly.msgpack_loads(msg["user_data_keys"], use_list=False) | ||||
|             user_data_values = srsly.msgpack_loads(msg["user_data_values"]) | ||||
|             for key, value in zip(user_data_keys, user_data_values): | ||||
|                 self.user_data[key] = value | ||||
| 
 | ||||
|         cdef int i, start, end, has_space | ||||
| 
 | ||||
|         if 'sentiment' not in exclude and 'sentiment' in msg: | ||||
|             self.sentiment = msg['sentiment'] | ||||
|         if 'tensor' not in exclude and 'tensor' in msg: | ||||
|             self.tensor = msg['tensor'] | ||||
| 
 | ||||
|         if "sentiment" not in exclude and "sentiment" in msg: | ||||
|             self.sentiment = msg["sentiment"] | ||||
|         if "tensor" not in exclude and "tensor" in msg: | ||||
|             self.tensor = msg["tensor"] | ||||
|         start = 0 | ||||
|         cdef const LexemeC* lex | ||||
|         cdef unicode orth_ | ||||
|         text = msg['text'] | ||||
|         attrs = msg['array_body'] | ||||
|         text = msg["text"] | ||||
|         attrs = msg["array_body"] | ||||
|         for i in range(attrs.shape[0]): | ||||
|             end = start + attrs[i, 0] | ||||
|             has_space = attrs[i, 1] | ||||
|  | @ -837,11 +877,11 @@ cdef class Doc: | |||
|             lex = self.vocab.get(self.mem, orth_) | ||||
|             self.push_back(lex, has_space) | ||||
|             start = end + has_space | ||||
|         self.from_array(msg['array_head'][2:], attrs[:, 2:]) | ||||
|         self.from_array(msg["array_head"][2:], attrs[:, 2:]) | ||||
|         return self | ||||
| 
 | ||||
|     def extend_tensor(self, tensor): | ||||
|         '''Concatenate a new tensor onto the doc.tensor object. | ||||
|         """Concatenate a new tensor onto the doc.tensor object. | ||||
| 
 | ||||
|         The doc.tensor attribute holds dense feature vectors | ||||
|         computed by the models in the pipeline. Let's say a | ||||
|  | @ -849,7 +889,7 @@ cdef class Doc: | |||
|         per word. doc.tensor.shape will be (30, 128). After | ||||
|         calling doc.extend_tensor with an array of shape (30, 64), | ||||
|         doc.tensor == (30, 192). | ||||
|         ''' | ||||
|         """ | ||||
|         xp = get_array_module(self.tensor) | ||||
|         if self.tensor.size == 0: | ||||
|             self.tensor.resize(tensor.shape, refcheck=False) | ||||
|  | @ -858,7 +898,7 @@ cdef class Doc: | |||
|             self.tensor = xp.hstack((self.tensor, tensor)) | ||||
| 
 | ||||
|     def retokenize(self): | ||||
|         '''Context manager to handle retokenization of the Doc. | ||||
|         """Context manager to handle retokenization of the Doc. | ||||
|         Modifications to the Doc's tokenization are stored, and then | ||||
|         made all at once when the context manager exits. This is | ||||
|         much more efficient, and less error-prone. | ||||
|  | @ -866,7 +906,10 @@ cdef class Doc: | |||
|         All views of the Doc (Span and Token) created before the | ||||
|         retokenization are invalidated, although they may accidentally | ||||
|         continue to work. | ||||
|         ''' | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#retokenize | ||||
|         USAGE: https://spacy.io/usage/linguistic-features#retokenization | ||||
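|         EXAMPLE: A merge sketch (assumes a loaded `nlp` pipeline; the text | ||||
|             and attributes are arbitrary): | ||||
|             >>> doc = nlp(u"I like New York") | ||||
|             >>> with doc.retokenize() as retokenizer: | ||||
|             ...     retokenizer.merge(doc[2:4], attrs={"LEMMA": u"new york"}) | ||||
|             >>> assert len(doc) == 3 | ||||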
|         """ | ||||
|         return Retokenizer(self) | ||||
| 
 | ||||
|     def _bulk_merge(self, spans, attributes): | ||||
|  | @ -882,9 +925,10 @@ cdef class Doc: | |||
|         RETURNS (Token): The first newly merged token. | ||||
|         """ | ||||
|         cdef unicode tag, lemma, ent_type | ||||
| 
 | ||||
|         assert len(attributes) == len(spans), "attribute length should be equal to span length" + str(len(attributes)) +\ | ||||
|                                               str(len(spans)) | ||||
|         attr_len = len(attributes) | ||||
|         span_len = len(spans) | ||||
|         if attr_len != span_len: | ||||
|             raise ValueError(Errors.E121.format(attr_len=attr_len, span_len=span_len)) | ||||
|         with self.retokenize() as retokenizer: | ||||
|             for i, span in enumerate(spans): | ||||
|                 fix_attributes(self, attributes[i]) | ||||
|  | @ -915,13 +959,10 @@ cdef class Doc: | |||
|         elif not args: | ||||
|             fix_attributes(self, attributes) | ||||
|         elif args: | ||||
|             raise ValueError(Errors.E034.format(n_args=len(args), | ||||
|                                                 args=repr(args), | ||||
|             raise ValueError(Errors.E034.format(n_args=len(args), args=repr(args), | ||||
|                                                 kwargs=repr(attributes))) | ||||
|         remove_label_if_necessary(attributes) | ||||
| 
 | ||||
|         attributes = intify_attrs(attributes, strings_map=self.vocab.strings) | ||||
| 
 | ||||
|         cdef int start = token_by_start(self.c, self.length, start_idx) | ||||
|         if start == -1: | ||||
|             return None | ||||
|  | @ -938,44 +979,47 @@ cdef class Doc: | |||
|         raise ValueError(Errors.E105) | ||||
| 
 | ||||
|     def to_json(self, underscore=None): | ||||
|         """Convert a Doc to JSON. Produces the same format used by the spacy | ||||
|         train command. | ||||
|         """Convert a Doc to JSON. The format it produces will be the new format | ||||
|         for the `spacy train` command (not implemented yet). | ||||
| 
 | ||||
|         underscore (list): Optional list of string names of custom doc._. | ||||
|             attributes. Attribute values need to be JSON-serializable. Values | ||||
|             will be added to an "_" key in the data, e.g. "_": {"foo": "bar"}. | ||||
|         RETURNS (dict): The data in spaCy's JSON format. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#to_json | ||||
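|         EXAMPLE: An illustrative sketch (assumes a loaded `nlp` pipeline): | ||||
|             >>> doc = nlp(u"Hello") | ||||
|             >>> json_doc = doc.to_json() | ||||
|             >>> assert json_doc["text"] == u"Hello" | ||||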
|         """ | ||||
|         data = {'text': self.text} | ||||
|         data['ents'] = [{'start': ent.start_char, 'end': ent.end_char, | ||||
|                          'label': ent.label_} for ent in self.ents] | ||||
|         data = {"text": self.text} | ||||
|         if self.ents: | ||||
|             data["ents"] = [{"start": ent.start_char, "end": ent.end_char, | ||||
|                             "label": ent.label_} for ent in self.ents] | ||||
|         sents = list(self.sents) | ||||
|         if sents: | ||||
|             data['sents'] = [{'start': sent.start_char, 'end': sent.end_char} | ||||
|             data["sents"] = [{"start": sent.start_char, "end": sent.end_char} | ||||
|                              for sent in sents] | ||||
|         if self.cats: | ||||
|             data['cats'] = self.cats | ||||
|         data['tokens'] = [] | ||||
|             data["cats"] = self.cats | ||||
|         data["tokens"] = [] | ||||
|         for token in self: | ||||
|             token_data = {'id': token.i, 'start': token.idx, 'end': token.idx + len(token)} | ||||
|             token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)} | ||||
|             if token.pos_: | ||||
|                 token_data['pos'] = token.pos_ | ||||
|                 token_data["pos"] = token.pos_ | ||||
|             if token.tag_: | ||||
|                 token_data['tag'] = token.tag_ | ||||
|                 token_data["tag"] = token.tag_ | ||||
|             if token.dep_: | ||||
|                 token_data['dep'] = token.dep_ | ||||
|                 token_data["dep"] = token.dep_ | ||||
|             if token.head: | ||||
|                 token_data['head'] = token.head.i | ||||
|             data['tokens'].append(token_data) | ||||
|                 token_data["head"] = token.head.i | ||||
|             data["tokens"].append(token_data) | ||||
|         if underscore: | ||||
|             data['_'] = {} | ||||
|             data["_"] = {} | ||||
|             for attr in underscore: | ||||
|                 if not self.has_extension(attr): | ||||
|                     raise ValueError(Errors.E106.format(attr=attr, opts=underscore)) | ||||
|                 value = self._.get(attr) | ||||
|                 if not srsly.is_json_serializable(value): | ||||
|                     raise ValueError(Errors.E107.format(attr=attr, value=repr(value))) | ||||
|                 data['_'][attr] = value | ||||
|                 data["_"][attr] = value | ||||
|         return data | ||||
| 
 | ||||
| 
 | ||||
|  | @ -1007,9 +1051,8 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: | |||
|         tokens[i].r_kids = 0 | ||||
|         tokens[i].l_edge = i | ||||
|         tokens[i].r_edge = i | ||||
|     # Three times, for non-projectivity | ||||
|     # See issue #3170. This isn't a very satisfying fix, but I think it's | ||||
|     # sufficient. | ||||
|     # Three times, for non-projectivity. See issue #3170. This isn't a very | ||||
|     # satisfying fix, but I think it's sufficient. | ||||
|     for loop_count in range(3): | ||||
|         # Set left edges | ||||
|         for i in range(length): | ||||
|  | @ -1021,7 +1064,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: | |||
|                 head.l_edge = child.l_edge | ||||
|             if child.r_edge > head.r_edge: | ||||
|                 head.r_edge = child.r_edge | ||||
|         # Set right edges --- same as above, but iterate in reverse | ||||
|         # Set right edges - same as above, but iterate in reverse | ||||
|         for i in range(length-1, -1, -1): | ||||
|             child = &tokens[i] | ||||
|             head = &tokens[i + child.head] | ||||
|  | @ -1052,20 +1095,14 @@ cdef int _get_tokens_lca(Token token_j, Token token_k): | |||
|         return token_k.i | ||||
|     elif token_k.head == token_j: | ||||
|         return token_j.i | ||||
| 
 | ||||
|     token_j_ancestors = set(token_j.ancestors) | ||||
| 
 | ||||
|     if token_k in token_j_ancestors: | ||||
|         return token_k.i | ||||
| 
 | ||||
|     for token_k_ancestor in token_k.ancestors: | ||||
| 
 | ||||
|         if token_k_ancestor == token_j: | ||||
|             return token_j.i | ||||
| 
 | ||||
|         if token_k_ancestor in token_j_ancestors: | ||||
|             return token_k_ancestor.i | ||||
| 
 | ||||
|     return -1 | ||||
| 
 | ||||
| 
 | ||||
|  | @ -1083,12 +1120,10 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): | |||
|         with shape (n, n), where n = len(doc). | ||||
|     """ | ||||
|     cdef int [:,:] lca_matrix | ||||
| 
 | ||||
|     n_tokens = end - start | ||||
|     lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32) | ||||
|     lca_mat.fill(-1) | ||||
|     lca_matrix = lca_mat | ||||
| 
 | ||||
|     for j in range(n_tokens): | ||||
|         token_j = doc[start + j] | ||||
|         # the common ancestor of token and itself is itself: | ||||
|  | @ -1109,7 +1144,6 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): | |||
|             else: | ||||
|                 lca_matrix[j, k] = lca - start | ||||
|                 lca_matrix[k, j] = lca - start | ||||
| 
 | ||||
|     return lca_matrix | ||||
| 
 | ||||
| 
 | ||||
|  | @ -1123,8 +1157,7 @@ def pickle_doc(doc): | |||
| def unpickle_doc(vocab, hooks_and_data, bytes_data): | ||||
|     user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data) | ||||
| 
 | ||||
|     doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, | ||||
|                                                      exclude='user_data') | ||||
|     doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude="user_data") | ||||
|     doc.user_hooks.update(doc_hooks) | ||||
|     doc.user_span_hooks.update(span_hooks) | ||||
|     doc.user_token_hooks.update(token_hooks) | ||||
|  | @ -1133,19 +1166,22 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data): | |||
| 
 | ||||
| copy_reg.pickle(Doc, pickle_doc, unpickle_doc) | ||||
| 
 | ||||
| 
 | ||||
| def remove_label_if_necessary(attributes): | ||||
|     # More deprecated attribute handling =/ | ||||
|     if 'label' in attributes: | ||||
|         attributes['ent_type'] = attributes.pop('label') | ||||
|     if "label" in attributes: | ||||
|         attributes["ent_type"] = attributes.pop("label") | ||||
| 
 | ||||
| 
 | ||||
| def fix_attributes(doc, attributes): | ||||
|     if 'label' in attributes and 'ent_type' not in attributes: | ||||
|         if isinstance(attributes['label'], int): | ||||
|             attributes[ENT_TYPE] = attributes['label'] | ||||
|     if "label" in attributes and "ent_type" not in attributes: | ||||
|         if isinstance(attributes["label"], int): | ||||
|             attributes[ENT_TYPE] = attributes["label"] | ||||
|         else: | ||||
|             attributes[ENT_TYPE] = doc.vocab.strings[attributes['label']] | ||||
|     if 'ent_type' in attributes: | ||||
|         attributes[ENT_TYPE] = attributes['ent_type'] | ||||
|             attributes[ENT_TYPE] = doc.vocab.strings[attributes["label"]] | ||||
|     if "ent_type" in attributes: | ||||
|         attributes[ENT_TYPE] = attributes["ent_type"] | ||||
| 
 | ||||
| 
 | ||||
| def get_entity_info(ent_info): | ||||
|     if isinstance(ent_info, Span): | ||||
|  |  | |||
|  | @ -1,12 +1,13 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| from collections import defaultdict | ||||
| 
 | ||||
| cimport numpy as np | ||||
| from libc.math cimport sqrt | ||||
| 
 | ||||
| import numpy | ||||
| import numpy.linalg | ||||
| from libc.math cimport sqrt | ||||
| from thinc.neural.util import get_array_module | ||||
| from collections import defaultdict | ||||
| 
 | ||||
| from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix | ||||
| from .token cimport TokenC | ||||
|  | @ -14,9 +15,10 @@ from ..structs cimport TokenC, LexemeC | |||
| from ..typedefs cimport flags_t, attr_t, hash_t | ||||
| from ..attrs cimport attr_id_t | ||||
| from ..parts_of_speech cimport univ_pos_t | ||||
| from ..util import normalize_slice | ||||
| from ..attrs cimport * | ||||
| from ..lexeme cimport Lexeme | ||||
| 
 | ||||
| from ..util import normalize_slice | ||||
| from ..compat import is_config, basestring_ | ||||
| from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning | ||||
| from ..errors import deprecation_warning | ||||
|  | @ -24,29 +26,66 @@ from .underscore import Underscore, get_ext_args | |||
| 
 | ||||
| 
 | ||||
| cdef class Span: | ||||
|     """A slice from a Doc object.""" | ||||
|     """A slice from a Doc object. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/span | ||||
|     """ | ||||
|     @classmethod | ||||
|     def set_extension(cls, name, **kwargs): | ||||
|         if cls.has_extension(name) and not kwargs.get('force', False): | ||||
|             raise ValueError(Errors.E090.format(name=name, obj='Span')) | ||||
|         """Define a custom attribute which becomes available as `Span._`. | ||||
| 
 | ||||
|         name (unicode): Name of the attribute to set. | ||||
|         default: Optional default value of the attribute. | ||||
|         getter (callable): Optional getter function. | ||||
|         setter (callable): Optional setter function. | ||||
|         method (callable): Optional method for method extension. | ||||
|         force (bool): Force overwriting existing attribute. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#set_extension | ||||
|         USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes | ||||
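|         EXAMPLE: An illustrative sketch (the attribute name and getter are | ||||
|             arbitrary; assumes a loaded `nlp` pipeline): | ||||
|             >>> from spacy.tokens import Span | ||||
|             >>> city_getter = lambda span: u"New York" in span.text | ||||
|             >>> Span.set_extension("has_new_york", getter=city_getter) | ||||
|             >>> doc = nlp(u"I like New York in Autumn") | ||||
|             >>> assert doc[1:4]._.has_new_york | ||||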
|         """ | ||||
|         if cls.has_extension(name) and not kwargs.get("force", False): | ||||
|             raise ValueError(Errors.E090.format(name=name, obj="Span")) | ||||
|         Underscore.span_extensions[name] = get_ext_args(**kwargs) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def get_extension(cls, name): | ||||
|         """Look up a previously registered extension by name. | ||||
| 
 | ||||
|         name (unicode): Name of the extension. | ||||
|         RETURNS (tuple): A `(default, method, getter, setter)` tuple. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#get_extension | ||||
|         """ | ||||
|         return Underscore.span_extensions.get(name) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def has_extension(cls, name): | ||||
|         """Check whether an extension has been registered. | ||||
| 
 | ||||
|         name (unicode): Name of the extension. | ||||
|         RETURNS (bool): Whether the extension has been registered. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#has_extension | ||||
|         """ | ||||
|         return name in Underscore.span_extensions | ||||
| 
 | ||||
|     @classmethod | ||||
|     def remove_extension(cls, name): | ||||
|         """Remove a previously registered extension. | ||||
| 
 | ||||
|         name (unicode): Name of the extension. | ||||
|         RETURNS (tuple): A `(default, method, getter, setter)` tuple of the | ||||
|             removed extension. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#remove_extension | ||||
|         """ | ||||
|         if not cls.has_extension(name): | ||||
|             raise ValueError(Errors.E046.format(name=name)) | ||||
|         return Underscore.span_extensions.pop(name) | ||||
| 
 | ||||
|     def __cinit__(self, Doc doc, int start, int end, label=0, | ||||
|                   vector=None, vector_norm=None): | ||||
|     def __cinit__(self, Doc doc, int start, int end, label=0, vector=None, | ||||
|                   vector_norm=None): | ||||
|         """Create a `Span` object from the slice `doc[start : end]`. | ||||
| 
 | ||||
|         doc (Doc): The parent document. | ||||
|  | @ -56,6 +95,8 @@ cdef class Span: | |||
|         vector (ndarray[ndim=1, dtype='float32']): A meaning representation | ||||
|             of the span. | ||||
|         RETURNS (Span): The newly constructed object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#init | ||||
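|         EXAMPLE: A construction sketch (assumes a loaded `nlp` pipeline; | ||||
|             the label is arbitrary): | ||||
|             >>> from spacy.tokens import Span | ||||
|             >>> doc = nlp(u"Give it back! He pleaded.") | ||||
|             >>> span = Span(doc, 1, 4, label=u"frumble") | ||||
|             >>> assert span.text == u"it back!" | ||||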
|         """ | ||||
|         if not (0 <= start <= end <= len(doc)): | ||||
|             raise IndexError(Errors.E035.format(start=start, end=end, length=len(doc))) | ||||
|  | @ -102,6 +143,8 @@ cdef class Span: | |||
|         """Get the number of tokens in the span. | ||||
| 
 | ||||
|         RETURNS (int): The number of tokens in the span. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#len | ||||
|         """ | ||||
|         self._recalculate_indices() | ||||
|         if self.end < self.start: | ||||
|  | @ -111,7 +154,7 @@ cdef class Span: | |||
|     def __repr__(self): | ||||
|         if is_config(python3=True): | ||||
|             return self.text | ||||
|         return self.text.encode('utf-8') | ||||
|         return self.text.encode("utf-8") | ||||
| 
 | ||||
|     def __getitem__(self, object i): | ||||
|         """Get a `Token` or a `Span` object | ||||
|  | @ -120,9 +163,7 @@ cdef class Span: | |||
|             the span to get. | ||||
|         RETURNS (Token or Span): The token at `span[i]`. | ||||
| 
 | ||||
|         EXAMPLE: | ||||
|             >>> span[0] | ||||
|             >>> span[1:3] | ||||
|         DOCS: https://spacy.io/api/span#getitem | ||||
|         """ | ||||
|         self._recalculate_indices() | ||||
|         if isinstance(i, slice): | ||||
|  | @ -138,6 +179,8 @@ cdef class Span: | |||
|         """Iterate over `Token` objects. | ||||
| 
 | ||||
|         YIELDS (Token): A `Token` object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#iter | ||||
|         """ | ||||
|         self._recalculate_indices() | ||||
|         for i in range(self.start, self.end): | ||||
|  | @ -148,31 +191,32 @@ cdef class Span: | |||
| 
 | ||||
|     @property | ||||
|     def _(self): | ||||
|         """User space for adding custom attribute extensions.""" | ||||
|         """Custom extension attributes registered via `set_extension`.""" | ||||
|         return Underscore(Underscore.span_extensions, self, | ||||
|                           start=self.start_char, end=self.end_char) | ||||
| 
 | ||||
|     def as_doc(self): | ||||
|         # TODO: fix | ||||
|         """Create a `Doc` object with a copy of the Span's data. | ||||
|         """Create a `Doc` object with a copy of the `Span`'s data. | ||||
| 
 | ||||
|         RETURNS (Doc): The `Doc` copy of the span. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#as_doc | ||||
|         """ | ||||
|         cdef Doc doc = Doc(self.doc.vocab, | ||||
|             words=[t.text for t in self], | ||||
|             spaces=[bool(t.whitespace_) for t in self]) | ||||
|         # TODO: Fix! | ||||
|         words = [t.text for t in self] | ||||
|         spaces = [bool(t.whitespace_) for t in self] | ||||
|         cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces) | ||||
|         array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE] | ||||
|         if self.doc.is_tagged: | ||||
|             array_head.append(TAG) | ||||
|         # if doc parsed add head and dep attribute | ||||
|         # If doc parsed add head and dep attribute | ||||
|         if self.doc.is_parsed: | ||||
|             array_head.extend([HEAD, DEP]) | ||||
|         # otherwise add sent_start | ||||
|         # Otherwise add sent_start | ||||
|         else: | ||||
|             array_head.append(SENT_START) | ||||
|         array = self.doc.to_array(array_head) | ||||
|         doc.from_array(array_head, array[self.start : self.end]) | ||||
| 
 | ||||
|         doc.noun_chunks_iterator = self.doc.noun_chunks_iterator | ||||
|         doc.user_hooks = self.doc.user_hooks | ||||
|         doc.user_span_hooks = self.doc.user_span_hooks | ||||
|  | @ -181,7 +225,7 @@ cdef class Span: | |||
|         doc.vector_norm = self.vector_norm | ||||
|         doc.tensor = self.doc.tensor[self.start : self.end] | ||||
|         for key, value in self.doc.cats.items(): | ||||
|             if hasattr(key, '__len__') and len(key) == 3: | ||||
|             if hasattr(key, "__len__") and len(key) == 3: | ||||
|                 cat_start, cat_end, cat_label = key | ||||
|                 if cat_start == self.start_char and cat_end == self.end_char: | ||||
|                     doc.cats[cat_label] = value | ||||
|  | @ -207,6 +251,8 @@ cdef class Span: | |||
| 
 | ||||
|         RETURNS (np.array[ndim=2, dtype=numpy.int32]): LCA matrix with shape | ||||
|             (n, n), where n = len(self). | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#get_lca_matrix | ||||
|         """ | ||||
|         return numpy.asarray(_get_lca_matrix(self.doc, self.start, self.end)) | ||||
| 
 | ||||
|  | @ -217,22 +263,24 @@ cdef class Span: | |||
|         other (object): The object to compare with. By default, accepts `Doc`, | ||||
|             `Span`, `Token` and `Lexeme` objects. | ||||
|         RETURNS (float): A scalar similarity score. Higher is more similar. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#similarity | ||||
|         """ | ||||
|         if 'similarity' in self.doc.user_span_hooks: | ||||
|             self.doc.user_span_hooks['similarity'](self, other) | ||||
|         if len(self) == 1 and hasattr(other, 'orth'): | ||||
|         if "similarity" in self.doc.user_span_hooks: | ||||
|             return self.doc.user_span_hooks["similarity"](self, other) | ||||
|         if len(self) == 1 and hasattr(other, "orth"): | ||||
|             if self[0].orth == other.orth: | ||||
|                 return 1.0 | ||||
|         elif hasattr(other, '__len__') and len(self) == len(other): | ||||
|         elif hasattr(other, "__len__") and len(self) == len(other): | ||||
|             for i in range(len(self)): | ||||
|                 if self[i].orth != getattr(other[i], 'orth', None): | ||||
|                 if self[i].orth != getattr(other[i], "orth", None): | ||||
|                     break | ||||
|             else: | ||||
|                 return 1.0 | ||||
|         if self.vocab.vectors.n_keys == 0: | ||||
|             models_warning(Warnings.W007.format(obj='Span')) | ||||
|             models_warning(Warnings.W007.format(obj="Span")) | ||||
|         if self.vector_norm == 0.0 or other.vector_norm == 0.0: | ||||
|             user_warning(Warnings.W008.format(obj='Span')) | ||||
|             user_warning(Warnings.W008.format(obj="Span")) | ||||
|             return 0.0 | ||||
|         vector = self.vector | ||||
|         xp = get_array_module(vector) | ||||
|  | @ -251,8 +299,8 @@ cdef class Span: | |||
|         cdef int i, j | ||||
|         cdef attr_id_t feature | ||||
|         cdef np.ndarray[attr_t, ndim=2] output | ||||
|         # Make an array from the attributes --- otherwise our inner loop is Python | ||||
|         # dict iteration. | ||||
|         # Make an array from the attributes - otherwise our inner loop is Python | ||||
|         # dict iteration | ||||
|         cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) | ||||
|         cdef int length = self.end - self.start | ||||
|         output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64) | ||||
|  | @ -282,12 +330,11 @@ cdef class Span: | |||
|     property sent: | ||||
|         """RETURNS (Span): The sentence span that the span is a part of.""" | ||||
|         def __get__(self): | ||||
|             if 'sent' in self.doc.user_span_hooks: | ||||
|                 return self.doc.user_span_hooks['sent'](self) | ||||
|             # This should raise if we're not parsed | ||||
|             # or doesen't have any sbd component :) | ||||
|             if "sent" in self.doc.user_span_hooks: | ||||
|                 return self.doc.user_span_hooks["sent"](self) | ||||
|             # This should raise if not parsed / no custom sentence boundaries | ||||
|             self.doc.sents | ||||
|             # if doc is parsed we can use the deps to find the sentence | ||||
|             # If doc is parsed we can use the deps to find the sentence | ||||
|             # otherwise we use the `sent_start` token attribute | ||||
|             cdef int n = 0 | ||||
|             cdef int i | ||||
|  | @ -300,11 +347,11 @@ cdef class Span: | |||
|                         raise RuntimeError(Errors.E038) | ||||
|                 return self.doc[root.l_edge:root.r_edge + 1] | ||||
|             elif self.doc.is_sentenced: | ||||
|                 # find start of the sentence | ||||
|                 # Find start of the sentence | ||||
|                 start = self.start | ||||
|                 while self.doc.c[start].sent_start != 1 and start > 0: | ||||
|                     start += -1 | ||||
|                 # find end of the sentence | ||||
|                 # Find end of the sentence | ||||
|                 end = self.end | ||||
|                 n = 0 | ||||
|                 while end < self.doc.length and self.doc.c[end].sent_start != 1: | ||||
|  | @ -315,7 +362,13 @@ cdef class Span: | |||
|                 return self.doc[start:end] | ||||
| 
 | ||||
|     property ents: | ||||
|         """RETURNS (list): A list of tokens that belong to the current span.""" | ||||
|         """The named entities in the span. Returns a tuple of named entity | ||||
|         `Span` objects, if the entity recognizer has been applied. | ||||
| 
 | ||||
|         RETURNS (tuple): Entities in the span, one `Span` per entity. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#ents | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             ents = [] | ||||
|             for ent in self.doc.ents: | ||||
|  | @ -324,11 +377,16 @@ cdef class Span: | |||
|             return ents | ||||
| 
 | ||||
|     property has_vector: | ||||
|         """RETURNS (bool): Whether a word vector is associated with the object. | ||||
|         """A boolean value indicating whether a word vector is associated with | ||||
|         the object. | ||||
| 
 | ||||
|         RETURNS (bool): Whether a word vector is associated with the object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#has_vector | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             if 'has_vector' in self.doc.user_span_hooks: | ||||
|                 return self.doc.user_span_hooks['has_vector'](self) | ||||
|             if "has_vector" in self.doc.user_span_hooks: | ||||
|                 return self.doc.user_span_hooks["has_vector"](self) | ||||
|             elif self.vocab.vectors.data.size > 0: | ||||
|                 return any(token.has_vector for token in self) | ||||
|             elif self.doc.tensor.size > 0: | ||||
|  | @ -342,19 +400,26 @@ cdef class Span: | |||
| 
 | ||||
|         RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array | ||||
|             representing the span's semantics. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#vector | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             if 'vector' in self.doc.user_span_hooks: | ||||
|                 return self.doc.user_span_hooks['vector'](self) | ||||
|             if "vector" in self.doc.user_span_hooks: | ||||
|                 return self.doc.user_span_hooks["vector"](self) | ||||
|             if self._vector is None: | ||||
|                 self._vector = sum(t.vector for t in self) / len(self) | ||||
|             return self._vector | ||||
| 
 | ||||
|     property vector_norm: | ||||
|         """RETURNS (float): The L2 norm of the vector representation.""" | ||||
|         """The L2 norm of the span's vector representation. | ||||
| 
 | ||||
|         RETURNS (float): The L2 norm of the vector representation. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#vector_norm | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             if 'vector_norm' in self.doc.user_span_hooks: | ||||
|                 return self.doc.user_span_hooks['vector'](self) | ||||
|             if "vector_norm" in self.doc.user_span_hooks: | ||||
|                 return self.doc.user_span_hooks["vector_norm"](self) | ||||
|             cdef float value | ||||
|             cdef double norm = 0 | ||||
|             if self._vector_norm is None: | ||||
|  | @ -369,8 +434,8 @@ cdef class Span: | |||
|             negativity of the span. | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             if 'sentiment' in self.doc.user_span_hooks: | ||||
|                 return self.doc.user_span_hooks['sentiment'](self) | ||||
|             if "sentiment" in self.doc.user_span_hooks: | ||||
|                 return self.doc.user_span_hooks["sentiment"](self) | ||||
|             else: | ||||
|                 return sum([token.sentiment for token in self]) / len(self) | ||||
| 
 | ||||
|  | @ -390,7 +455,7 @@ cdef class Span: | |||
|             whitespace). | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             return u''.join([t.text_with_ws for t in self]) | ||||
|             return "".join([t.text_with_ws for t in self]) | ||||
| 
 | ||||
|     property noun_chunks: | ||||
|         """Yields base noun-phrase `Span` objects, if the document has been | ||||
|  | @ -399,7 +464,9 @@ cdef class Span: | |||
|         NP-level coordination, no prepositional phrases, and no relative | ||||
|         clauses. | ||||
| 
 | ||||
|         YIELDS (Span): Base noun-phrase `Span` objects | ||||
|         YIELDS (Span): Base noun-phrase `Span` objects. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#noun_chunks | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             if not self.doc.is_parsed: | ||||
|  | @ -418,52 +485,18 @@ cdef class Span: | |||
|                 yield span | ||||
| 
 | ||||
|     property root: | ||||
|         """The token within the span that's highest in the parse tree. | ||||
|         If there's a tie, the earliest is prefered. | ||||
|         """The token with the shortest path to the root of the | ||||
|         sentence (or the root itself). If multiple tokens are equally | ||||
|         high in the tree, the first token is taken. | ||||
| 
 | ||||
|         RETURNS (Token): The root token. | ||||
| 
 | ||||
|         EXAMPLE: The root token has the shortest path to the root of the | ||||
|             sentence (or is the root itself). If multiple words are equally | ||||
|             high in the tree, the first word is taken. For example: | ||||
| 
 | ||||
|             >>> toks = nlp(u'I like New York in Autumn.') | ||||
| 
 | ||||
|             Let's name the indices – easier than writing `toks[4]` etc. | ||||
| 
 | ||||
|             >>> i, like, new, york, in_, autumn, dot = range(len(toks)) | ||||
| 
 | ||||
|             The head of 'new' is 'York', and the head of "York" is "like" | ||||
| 
 | ||||
|             >>> toks[new].head.text | ||||
|             'York' | ||||
|             >>> toks[york].head.text | ||||
|             'like' | ||||
| 
 | ||||
|             Create a span for "New York". Its root is "York". | ||||
| 
 | ||||
|             >>> new_york = toks[new:york+1] | ||||
|             >>> new_york.root.text | ||||
|             'York' | ||||
| 
 | ||||
|             Here's a more complicated case, raised by issue #214: | ||||
| 
 | ||||
|             >>> toks = nlp(u'to, north and south carolina') | ||||
|             >>> to, north, and_, south, carolina = toks | ||||
|             >>> south.head.text, carolina.head.text | ||||
|             ('north', 'to') | ||||
| 
 | ||||
|             Here "south" is a child of "north", which is a child of "carolina". | ||||
|             Carolina is the root of the span: | ||||
| 
 | ||||
|             >>> south_carolina = toks[-2:] | ||||
|             >>> south_carolina.root.text | ||||
|             'carolina' | ||||
|         DOCS: https://spacy.io/api/span#root | ||||
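|         EXAMPLE: An illustrative sketch (assumes a pipeline with a parser): | ||||
|             >>> doc = nlp(u"I like New York in Autumn.") | ||||
|             >>> assert doc[2:4].root.text == u"York" | ||||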
|         """ | ||||
|         def __get__(self): | ||||
|             self._recalculate_indices() | ||||
|             if 'root' in self.doc.user_span_hooks: | ||||
|                 return self.doc.user_span_hooks['root'](self) | ||||
|             if "root" in self.doc.user_span_hooks: | ||||
|                 return self.doc.user_span_hooks["root"](self) | ||||
|             # This should probably be called 'head', and the other one called | ||||
|             # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/ | ||||
|             cdef int i | ||||
|  | @ -499,6 +532,8 @@ cdef class Span: | |||
|         `Span`. | ||||
| 
 | ||||
|         YIELDS (Token): A left-child of a token of the span. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#lefts | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             for token in reversed(self):  # Reverse, so we get tokens in order | ||||
|  | @ -511,6 +546,8 @@ cdef class Span: | |||
|         `Span`. | ||||
| 
 | ||||
|         YIELDS (Token): A right-child of a token of the span. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#rights | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             for token in self: | ||||
|  | @ -519,15 +556,25 @@ cdef class Span: | |||
|                         yield right | ||||
| 
 | ||||
|     property n_lefts: | ||||
|         """RETURNS (int): The number of leftward immediate children of the | ||||
|         """The number of tokens that are to the left of the span, whose | ||||
|         heads are within the span. | ||||
| 
 | ||||
|         RETURNS (int): The number of leftward immediate children of the | ||||
|             span, in the syntactic dependency parse. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#n_lefts | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             return len(list(self.lefts)) | ||||
| 
 | ||||
|     property n_rights: | ||||
|         """RETURNS (int): The number of rightward immediate children of the | ||||
|         """The number of tokens that are to the right of the span, whose | ||||
|         heads are within the span. | ||||
| 
 | ||||
|         RETURNS (int): The number of rightward immediate children of the | ||||
|             span, in the syntactic dependency parse. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#n_rights | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             return len(list(self.rights)) | ||||
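As a rough illustration of `Span.lefts`, `Span.rights`, `n_lefts` and `n_rights`, a hedged sketch (assumes an English model with a parser is installed; the exact tokens and counts depend on the parse):

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: a trained model is available
doc = nlp(u"I like New York in Autumn.")
span = doc[2:6]  # "New York in Autumn"
print([t.text for t in span.lefts], span.n_lefts)
print([t.text for t in span.rights], span.n_rights)
```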
|  | @ -536,6 +583,8 @@ cdef class Span: | |||
|         """Tokens within the span and tokens which descend from them. | ||||
| 
 | ||||
|         YIELDS (Token): A token within the span, or a descendant from it. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#subtree | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             for word in self.lefts: | ||||
|  | @ -550,7 +599,7 @@ cdef class Span: | |||
|             return self.root.ent_id | ||||
| 
 | ||||
|         def __set__(self, hash_t key): | ||||
|             raise NotImplementedError(TempErrors.T007.format(attr='ent_id')) | ||||
|             raise NotImplementedError(TempErrors.T007.format(attr="ent_id")) | ||||
| 
 | ||||
|     property ent_id_: | ||||
|         """RETURNS (unicode): The (string) entity ID.""" | ||||
|  | @ -558,10 +607,10 @@ cdef class Span: | |||
|             return self.root.ent_id_ | ||||
| 
 | ||||
|         def __set__(self, hash_t key): | ||||
|             raise NotImplementedError(TempErrors.T007.format(attr='ent_id_')) | ||||
|             raise NotImplementedError(TempErrors.T007.format(attr="ent_id_")) | ||||
| 
 | ||||
|     property orth_: | ||||
|         """Verbatim text content (identical to Span.text). Exists mostly for | ||||
|         """Verbatim text content (identical to `Span.text`). Exists mostly for | ||||
|         consistency with other attributes. | ||||
| 
 | ||||
|         RETURNS (unicode): The span's text.""" | ||||
|  | @ -571,27 +620,28 @@ cdef class Span: | |||
|     property lemma_: | ||||
|         """RETURNS (unicode): The span's lemma.""" | ||||
|         def __get__(self): | ||||
|             return ' '.join([t.lemma_ for t in self]).strip() | ||||
|             return " ".join([t.lemma_ for t in self]).strip() | ||||
| 
 | ||||
|     property upper_: | ||||
|         """Deprecated. Use Span.text.upper() instead.""" | ||||
|         """Deprecated. Use `Span.text.upper()` instead.""" | ||||
|         def __get__(self): | ||||
|             return ''.join([t.text_with_ws.upper() for t in self]).strip() | ||||
|             return "".join([t.text_with_ws.upper() for t in self]).strip() | ||||
| 
 | ||||
|     property lower_: | ||||
|         """Deprecated. Use Span.text.lower() instead.""" | ||||
|         """Deprecated. Use `Span.text.lower()` instead.""" | ||||
|         def __get__(self): | ||||
|             return ''.join([t.text_with_ws.lower() for t in self]).strip() | ||||
|             return "".join([t.text_with_ws.lower() for t in self]).strip() | ||||
| 
 | ||||
|     property string: | ||||
|         """Deprecated: Use Span.text_with_ws instead.""" | ||||
|         """Deprecated: Use `Span.text_with_ws` instead.""" | ||||
|         def __get__(self): | ||||
|             return ''.join([t.text_with_ws for t in self]) | ||||
|             return "".join([t.text_with_ws for t in self]) | ||||
| 
 | ||||
|     property label_: | ||||
|         """RETURNS (unicode): The span's label.""" | ||||
|         def __get__(self): | ||||
|             return self.doc.vocab.strings[self.label] | ||||
| 
 | ||||
|         def __set__(self, unicode label_): | ||||
|             self.label = self.doc.vocab.strings.add(label_) | ||||
| 
 | ||||
|  |  | |||
|  | @ -8,42 +8,82 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free | |||
| from cython.view cimport array as cvarray | ||||
| cimport numpy as np | ||||
| np.import_array() | ||||
| 
 | ||||
| import numpy | ||||
| from thinc.neural.util import get_array_module | ||||
| 
 | ||||
| from ..typedefs cimport hash_t | ||||
| from ..lexeme cimport Lexeme | ||||
| from .. import parts_of_speech | ||||
| from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE | ||||
| from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT | ||||
| from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL | ||||
| from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX | ||||
| from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP | ||||
| from ..symbols cimport conj | ||||
| 
 | ||||
| from .. import parts_of_speech | ||||
| from .. import util | ||||
| from ..compat import is_config | ||||
| from ..errors import Errors, Warnings, user_warning, models_warning | ||||
| from .. import util | ||||
| from .underscore import Underscore, get_ext_args | ||||
| 
 | ||||
| 
 | ||||
| cdef class Token: | ||||
|     """An individual token – i.e. a word, punctuation symbol, whitespace, | ||||
|     etc.""" | ||||
|     etc. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/token | ||||
|     """ | ||||
|     @classmethod | ||||
|     def set_extension(cls, name, **kwargs): | ||||
|         if cls.has_extension(name) and not kwargs.get('force', False): | ||||
|             raise ValueError(Errors.E090.format(name=name, obj='Token')) | ||||
|         """Define a custom attribute which becomes available as `Token._`. | ||||
| 
 | ||||
|         name (unicode): Name of the attribute to set. | ||||
|         default: Optional default value of the attribute. | ||||
|         getter (callable): Optional getter function. | ||||
|         setter (callable): Optional setter function. | ||||
|         method (callable): Optional method for method extension. | ||||
|         force (bool): Force overwriting existing attribute. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#set_extension | ||||
|         USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes | ||||
|         """ | ||||
|         if cls.has_extension(name) and not kwargs.get("force", False): | ||||
|             raise ValueError(Errors.E090.format(name=name, obj="Token")) | ||||
|         Underscore.token_extensions[name] = get_ext_args(**kwargs) | ||||
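As a quick illustration of the extension API documented above, a hedged sketch (the attribute name `is_fruit` and the blank English pipeline are illustrative assumptions, not part of this change):

```python
import spacy
from spacy.tokens import Token

# Register a boolean attribute with a default value (hypothetical name).
Token.set_extension("is_fruit", default=False)

nlp = spacy.blank("en")
doc = nlp(u"I like apples")
doc[2]._.is_fruit = True
assert doc[2]._.is_fruit
```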
| 
 | ||||
|     @classmethod | ||||
|     def get_extension(cls, name): | ||||
|         """Look up a previously registered extension by name. | ||||
| 
 | ||||
|         name (unicode): Name of the extension. | ||||
|         RETURNS (tuple): A `(default, method, getter, setter)` tuple. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#get_extension | ||||
|         """ | ||||
|         return Underscore.token_extensions.get(name) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def has_extension(cls, name): | ||||
|         """Check whether an extension has been registered. | ||||
| 
 | ||||
|         name (unicode): Name of the extension. | ||||
|         RETURNS (bool): Whether the extension has been registered. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#has_extension | ||||
|         """ | ||||
|         return name in Underscore.token_extensions | ||||
| 
 | ||||
|     @classmethod | ||||
|     def remove_extension(cls, name): | ||||
|         """Remove a previously registered extension. | ||||
| 
 | ||||
|         name (unicode): Name of the extension. | ||||
|         RETURNS (tuple): A `(default, method, getter, setter)` tuple of the | ||||
|             removed extension. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#remove_extension | ||||
|         """ | ||||
|         if not cls.has_extension(name): | ||||
|             raise ValueError(Errors.E046.format(name=name)) | ||||
|         return Underscore.token_extensions.pop(name) | ||||
|  | @ -54,6 +94,8 @@ cdef class Token: | |||
|         vocab (Vocab): A storage container for lexical types. | ||||
|         doc (Doc): The parent document. | ||||
|         offset (int): The index of the token within the document. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#init | ||||
|         """ | ||||
|         self.vocab = vocab | ||||
|         self.doc = doc | ||||
|  | @ -67,6 +109,8 @@ cdef class Token: | |||
|         """The number of unicode characters in the token, i.e. `token.text`. | ||||
| 
 | ||||
|         RETURNS (int): The number of unicode characters in the token. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#len | ||||
|         """ | ||||
|         return self.c.lex.length | ||||
| 
 | ||||
|  | @ -121,6 +165,7 @@ cdef class Token: | |||
| 
 | ||||
|     @property | ||||
|     def _(self): | ||||
|         """Custom extension attributes registered via `set_extension`.""" | ||||
|         return Underscore(Underscore.token_extensions, self, | ||||
|                           start=self.idx, end=None) | ||||
| 
 | ||||
|  | @ -130,12 +175,7 @@ cdef class Token: | |||
|         flag_id (int): The ID of the flag attribute. | ||||
|         RETURNS (bool): Whether the flag is set. | ||||
| 
 | ||||
|         EXAMPLE: | ||||
|             >>> from spacy.attrs import IS_TITLE | ||||
|             >>> doc = nlp(u'Give it back! He pleaded.') | ||||
|             >>> token = doc[0] | ||||
|             >>> token.check_flag(IS_TITLE) | ||||
|             True | ||||
|         DOCS: https://spacy.io/api/token#check_flag | ||||
|         """ | ||||
|         return Lexeme.c_check_flag(self.c.lex, flag_id) | ||||
| 
 | ||||
|  | @ -144,6 +184,8 @@ cdef class Token: | |||
| 
 | ||||
|         i (int): The relative position of the token to get. Defaults to 1. | ||||
|         RETURNS (Token): The token at position `self.doc[self.i+i]`. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#nbor | ||||
|         """ | ||||
|         if self.i+i < 0 or (self.i+i >= len(self.doc)): | ||||
|             raise IndexError(Errors.E042.format(i=self.i, j=i, length=len(self.doc))) | ||||
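A minimal sketch of `nbor`, assuming only the blank English tokenizer (no trained model needed):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp(u"Give it back! He pleaded.")
assert doc[1].nbor().text == "back"    # the following token
assert doc[1].nbor(-1).text == "Give"  # the preceding token
```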
|  | @ -156,19 +198,21 @@ cdef class Token: | |||
|         other (object): The object to compare with. By default, accepts `Doc`, | ||||
|             `Span`, `Token` and `Lexeme` objects. | ||||
|         RETURNS (float): A scalar similarity score. Higher is more similar. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#similarity | ||||
|         """ | ||||
|         if 'similarity' in self.doc.user_token_hooks: | ||||
|             return self.doc.user_token_hooks['similarity'](self) | ||||
|         if hasattr(other, '__len__') and len(other) == 1 and hasattr(other, "__getitem__"): | ||||
|             if self.c.lex.orth == getattr(other[0], 'orth', None): | ||||
|         if "similarity" in self.doc.user_token_hooks: | ||||
|             return self.doc.user_token_hooks["similarity"](self) | ||||
|         if hasattr(other, "__len__") and len(other) == 1 and hasattr(other, "__getitem__"): | ||||
|             if self.c.lex.orth == getattr(other[0], "orth", None): | ||||
|                 return 1.0 | ||||
|         elif hasattr(other, 'orth'): | ||||
|         elif hasattr(other, "orth"): | ||||
|             if self.c.lex.orth == other.orth: | ||||
|                 return 1.0 | ||||
|         if self.vocab.vectors.n_keys == 0: | ||||
|             models_warning(Warnings.W007.format(obj='Token')) | ||||
|             models_warning(Warnings.W007.format(obj="Token")) | ||||
|         if self.vector_norm == 0 or other.vector_norm == 0: | ||||
|             user_warning(Warnings.W008.format(obj='Token')) | ||||
|             user_warning(Warnings.W008.format(obj="Token")) | ||||
|             return 0.0 | ||||
|         vector = self.vector | ||||
|         xp = get_array_module(vector) | ||||
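A hedged usage sketch for `Token.similarity` (assumes a model with word vectors, e.g. `en_core_web_md`, is installed; without vectors the W007/W008 warnings above are raised instead):

```python
import spacy

nlp = spacy.load("en_core_web_md")  # assumption: a vectors model is available
doc = nlp(u"apples and oranges")
apples, _, oranges = doc
print(apples.similarity(oranges))  # higher means more similar
```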
|  | @ -202,7 +246,7 @@ cdef class Token: | |||
|         def __get__(self): | ||||
|             cdef unicode orth = self.vocab.strings[self.c.lex.orth] | ||||
|             if self.c.spacy: | ||||
|                 return orth + u' ' | ||||
|                 return orth + " " | ||||
|             else: | ||||
|                 return orth | ||||
| 
 | ||||
|  | @ -215,8 +259,8 @@ cdef class Token: | |||
|         """RETURNS (float): A scalar value indicating the positivity or | ||||
|             negativity of the token.""" | ||||
|         def __get__(self): | ||||
|             if 'sentiment' in self.doc.user_token_hooks: | ||||
|                 return self.doc.user_token_hooks['sentiment'](self) | ||||
|             if "sentiment" in self.doc.user_token_hooks: | ||||
|                 return self.doc.user_token_hooks["sentiment"](self) | ||||
|             return self.c.lex.sentiment | ||||
| 
 | ||||
|     property lang: | ||||
|  | @ -298,6 +342,7 @@ cdef class Token: | |||
|         """RETURNS (uint64): ID of coarse-grained part-of-speech tag.""" | ||||
|         def __get__(self): | ||||
|             return self.c.pos | ||||
| 
 | ||||
|         def __set__(self, pos): | ||||
|             self.c.pos = pos | ||||
| 
 | ||||
|  | @ -322,10 +367,12 @@ cdef class Token: | |||
|         the object. | ||||
| 
 | ||||
|         RETURNS (bool): Whether a word vector is associated with the object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#has_vector | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             if 'has_vector' in self.doc.user_token_hooks: | ||||
|                 return self.doc.user_token_hooks['has_vector'](self) | ||||
|                 return self.doc.user_token_hooks["has_vector"](self) | ||||
|             if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: | ||||
|                 return True | ||||
|             return self.vocab.has_vector(self.c.lex.orth) | ||||
|  | @ -335,10 +382,12 @@ cdef class Token: | |||
| 
 | ||||
|         RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array | ||||
|             representing the token's semantics. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#vector | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             if 'vector' in self.doc.user_token_hooks: | ||||
|                 return self.doc.user_token_hooks['vector'](self) | ||||
|                 return self.doc.user_token_hooks["vector"](self) | ||||
|             if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: | ||||
|                 return self.doc.tensor[self.i] | ||||
|             else: | ||||
|  | @ -348,23 +397,35 @@ cdef class Token: | |||
|         """The L2 norm of the token's vector representation. | ||||
| 
 | ||||
|         RETURNS (float): The L2 norm of the vector representation. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#vector_norm | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             if 'vector_norm' in self.doc.user_token_hooks: | ||||
|                 return self.doc.user_token_hooks['vector_norm'](self) | ||||
|                 return self.doc.user_token_hooks["vector_norm"](self) | ||||
|             vector = self.vector | ||||
|             return numpy.sqrt((vector ** 2).sum()) | ||||
| 
 | ||||
|     property n_lefts: | ||||
|         """RETURNS (int): The number of leftward immediate children of the | ||||
|         """The number of leftward immediate children of the word, in the | ||||
|         syntactic dependency parse. | ||||
| 
 | ||||
|         RETURNS (int): The number of leftward immediate children of the | ||||
|             word, in the syntactic dependency parse. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#n_lefts | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             return self.c.l_kids | ||||
| 
 | ||||
|     property n_rights: | ||||
|         """RETURNS (int): The number of rightward immediate children of the | ||||
|         """The number of rightward immediate children of the word, in the | ||||
|         syntactic dependency parse. | ||||
| 
 | ||||
|         RETURNS (int): The number of rightward immediate children of the | ||||
|             word, in the syntactic dependency parse. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#n_rights | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             return self.c.r_kids | ||||
|  | @ -373,7 +434,7 @@ cdef class Token: | |||
|         """RETURNS (Span): The sentence span that the token is a part of.""" | ||||
|         def __get__(self): | ||||
|             if 'sent' in self.doc.user_token_hooks: | ||||
|                 return self.doc.user_token_hooks['sent'](self) | ||||
|                 return self.doc.user_token_hooks["sent"](self) | ||||
|             return self.doc[self.i : self.i+1].sent | ||||
| 
 | ||||
|     property sent_start: | ||||
|  | @ -390,8 +451,13 @@ cdef class Token: | |||
|             self.is_sent_start = value | ||||
| 
 | ||||
|     property is_sent_start: | ||||
|         """RETURNS (bool / None): Whether the token starts a sentence. | ||||
|         """A boolean value indicating whether the token starts a sentence. | ||||
|         `None` if unknown. Defaults to `True` for the first token in the `Doc`. | ||||
| 
 | ||||
|         RETURNS (bool / None): Whether the token starts a sentence. | ||||
|             None if unknown. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#is_sent_start | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             if self.c.sent_start == 0: | ||||
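A short sketch of reading `is_sent_start` (assumes a trained English model with a parser, so that sentence boundaries are set):

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: model is installed
doc = nlp(u"Give it back! He pleaded.")
assert doc[4].is_sent_start       # "He" opens the second sentence
print(doc[1].is_sent_start)       # False or None, depending on the pipeline
```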
|  | @ -418,6 +484,8 @@ cdef class Token: | |||
|         dependency parse. | ||||
| 
 | ||||
|         YIELDS (Token): A left-child of the token. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#lefts | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             cdef int nr_iter = 0 | ||||
|  | @ -429,13 +497,15 @@ cdef class Token: | |||
|                 nr_iter += 1 | ||||
|                 # This is ugly, but it's a way to guard out infinite loops | ||||
|                 if nr_iter >= 10000000: | ||||
|                     raise RuntimeError(Errors.E045.format(attr='token.lefts')) | ||||
|                     raise RuntimeError(Errors.E045.format(attr="token.lefts")) | ||||
| 
 | ||||
|     property rights: | ||||
|         """The rightward immediate children of the word, in the syntactic | ||||
|         dependency parse. | ||||
| 
 | ||||
|         YIELDS (Token): A right-child of the token. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#rights | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) | ||||
|  | @ -447,7 +517,7 @@ cdef class Token: | |||
|                 ptr -= 1 | ||||
|                 nr_iter += 1 | ||||
|                 if nr_iter >= 10000000: | ||||
|                     raise RuntimeError(Errors.E045.format(attr='token.rights')) | ||||
|                     raise RuntimeError(Errors.E045.format(attr="token.rights")) | ||||
|             tokens.reverse() | ||||
|             for t in tokens: | ||||
|                 yield t | ||||
|  | @ -455,7 +525,9 @@ cdef class Token: | |||
|     property children: | ||||
|         """A sequence of the token's immediate syntactic children. | ||||
| 
 | ||||
|         YIELDS (Token): A child token such that child.head==self | ||||
|         YIELDS (Token): A child token such that `child.head==self`. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#children | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             yield from self.lefts | ||||
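A small sketch of walking the local tree via `children`, `lefts` and `rights` (assumes an installed English model with a parser; the printed tokens depend on the parse produced):

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: model is installed
doc = nlp(u"I like New York in Autumn.")
like = doc[1]
print([t.text for t in like.children])  # immediate dependents of "like"
print([t.text for t in like.lefts])     # dependents to the left
print([t.text for t in like.rights])    # dependents to the right
```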
|  | @ -467,6 +539,8 @@ cdef class Token: | |||
| 
 | ||||
|         YIELDS (Token): A descendant token such that | ||||
|             `self.is_ancestor(descendant) or token == self`. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#subtree | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             for word in self.lefts: | ||||
|  | @ -496,11 +570,13 @@ cdef class Token: | |||
| 
 | ||||
|         YIELDS (Token): A sequence of ancestor tokens such that | ||||
|             `ancestor.is_ancestor(self)`. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#ancestors | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             cdef const TokenC* head_ptr = self.c | ||||
|             # guard against infinite loop, no token can have | ||||
|             # more ancestors than tokens in the tree | ||||
|             # Guard against infinite loop, no token can have | ||||
|             # more ancestors than tokens in the tree. | ||||
|             cdef int i = 0 | ||||
|             while head_ptr.head != 0 and i < self.doc.length: | ||||
|                 head_ptr += head_ptr.head | ||||
|  | @ -513,6 +589,8 @@ cdef class Token: | |||
| 
 | ||||
|         descendant (Token): Another token. | ||||
|         RETURNS (bool): Whether this token is the ancestor of the descendant. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#is_ancestor | ||||
|         """ | ||||
|         if self.doc is not descendant.doc: | ||||
|             return False | ||||
|  | @ -528,34 +606,28 @@ cdef class Token: | |||
|             return self.doc[self.i + self.c.head] | ||||
| 
 | ||||
|         def __set__(self, Token new_head): | ||||
|             # this function sets the head of self to new_head | ||||
|             # and updates the counters for left/right dependents | ||||
|             # and left/right corner for the new and the old head | ||||
| 
 | ||||
|             # do nothing if old head is new head | ||||
|             # This function sets the head of self to new_head and updates the | ||||
|             # counters for left/right dependents and left/right corner for the | ||||
|             # new and the old head | ||||
|             # Do nothing if old head is new head | ||||
|             if self.i + self.c.head == new_head.i: | ||||
|                 return | ||||
| 
 | ||||
|             cdef Token old_head = self.head | ||||
|             cdef int rel_newhead_i = new_head.i - self.i | ||||
| 
 | ||||
|             # is the new head a descendant of the old head | ||||
|             # Is the new head a descendant of the old head | ||||
|             cdef bint is_desc = old_head.is_ancestor(new_head) | ||||
| 
 | ||||
|             cdef int new_edge | ||||
|             cdef Token anc, child | ||||
| 
 | ||||
|             # update number of deps of old head | ||||
|             # Update number of deps of old head | ||||
|             if self.c.head > 0:  # left dependent | ||||
|                 old_head.c.l_kids -= 1 | ||||
|                 if self.c.l_edge == old_head.c.l_edge: | ||||
|                     # the token dominates the left edge so the left edge of | ||||
|                     # The token dominates the left edge so the left edge of | ||||
|                     # the head may change when the token is reattached, it may | ||||
|                     # not change if the new head is a descendant of the current | ||||
|                     # head | ||||
| 
 | ||||
|                     # head. | ||||
|                     new_edge = self.c.l_edge | ||||
|                     # the new l_edge is the left-most l_edge on any of the | ||||
|                     # The new l_edge is the left-most l_edge on any of the | ||||
|                     # other dependents where the l_edge is left of the head, | ||||
|                     # otherwise it is the head | ||||
|                     if not is_desc: | ||||
|  | @ -566,21 +638,18 @@ cdef class Token: | |||
|                             if child.c.l_edge < new_edge: | ||||
|                                 new_edge = child.c.l_edge | ||||
|                         old_head.c.l_edge = new_edge | ||||
| 
 | ||||
|                     # walk up the tree from old_head and assign new l_edge to | ||||
|                     # Walk up the tree from old_head and assign new l_edge to | ||||
|                     # ancestors until an ancestor already has an l_edge that's | ||||
|                     # further left | ||||
|                     for anc in old_head.ancestors: | ||||
|                         if anc.c.l_edge <= new_edge: | ||||
|                             break | ||||
|                         anc.c.l_edge = new_edge | ||||
| 
 | ||||
|             elif self.c.head < 0:  # right dependent | ||||
|                 old_head.c.r_kids -= 1 | ||||
|                 # do the same thing as for l_edge | ||||
|                 # Do the same thing as for l_edge | ||||
|                 if self.c.r_edge == old_head.c.r_edge: | ||||
|                     new_edge = self.c.r_edge | ||||
| 
 | ||||
|                     if not is_desc: | ||||
|                         new_edge = old_head.i | ||||
|                         for child in old_head.children: | ||||
|  | @ -589,16 +658,14 @@ cdef class Token: | |||
|                             if child.c.r_edge > new_edge: | ||||
|                                 new_edge = child.c.r_edge | ||||
|                         old_head.c.r_edge = new_edge | ||||
| 
 | ||||
|                     for anc in old_head.ancestors: | ||||
|                         if anc.c.r_edge >= new_edge: | ||||
|                             break | ||||
|                         anc.c.r_edge = new_edge | ||||
| 
 | ||||
|             # update number of deps of new head | ||||
|             # Update number of deps of new head | ||||
|             if rel_newhead_i > 0:  # left dependent | ||||
|                 new_head.c.l_kids += 1 | ||||
|                 # walk up the tree from new head and set l_edge to self.l_edge | ||||
|                 # Walk up the tree from new head and set l_edge to self.l_edge | ||||
|                 # until you hit a token with an l_edge further to the left | ||||
|                 if self.c.l_edge < new_head.c.l_edge: | ||||
|                     new_head.c.l_edge = self.c.l_edge | ||||
|  | @ -606,34 +673,33 @@ cdef class Token: | |||
|                         if anc.c.l_edge <= self.c.l_edge: | ||||
|                             break | ||||
|                         anc.c.l_edge = self.c.l_edge | ||||
| 
 | ||||
|             elif rel_newhead_i < 0:  # right dependent | ||||
|                 new_head.c.r_kids += 1 | ||||
|                 # do the same as for l_edge | ||||
|                 # Do the same as for l_edge | ||||
|                 if self.c.r_edge > new_head.c.r_edge: | ||||
|                     new_head.c.r_edge = self.c.r_edge | ||||
|                     for anc in new_head.ancestors: | ||||
|                         if anc.c.r_edge >= self.c.r_edge: | ||||
|                             break | ||||
|                         anc.c.r_edge = self.c.r_edge | ||||
| 
 | ||||
|             # set new head | ||||
|             # Set new head | ||||
|             self.c.head = rel_newhead_i | ||||
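Because the setter above keeps the child counts and left/right edges consistent, reattaching a token is just an assignment. A hedged sketch (assumes a parsed `Doc` from an installed English model):

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: model is installed
doc = nlp(u"I like New York in Autumn")
doc[2].head = doc[3]               # attach "New" to "York"
assert doc[2].head.text == "York"
```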
| 
 | ||||
|     property conjuncts: | ||||
|         """A sequence of coordinated tokens, including the token itself. | ||||
| 
 | ||||
|         YIELDS (Token): A coordinated token. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/token#conjuncts | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             """Get a list of conjoined words.""" | ||||
|             cdef Token word | ||||
|             if 'conjuncts' in self.doc.user_token_hooks: | ||||
|                 yield from self.doc.user_token_hooks['conjuncts'](self) | ||||
|             if "conjuncts" in self.doc.user_token_hooks: | ||||
|                 yield from self.doc.user_token_hooks["conjuncts"](self) | ||||
|             else: | ||||
|                 if self.dep_ != 'conj': | ||||
|                 if self.dep != conj: | ||||
|                     for word in self.rights: | ||||
|                         if word.dep_ == 'conj': | ||||
|                         if word.dep == conj: | ||||
|                             yield word | ||||
|                             yield from word.conjuncts | ||||
| 
 | ||||
|  | @ -670,7 +736,7 @@ cdef class Token: | |||
|         RETURNS (unicode): IOB code of named entity tag. | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             iob_strings = ('', 'I', 'O', 'B') | ||||
|             iob_strings = ("", "I", "O", "B") | ||||
|             return iob_strings[self.c.ent_iob] | ||||
| 
 | ||||
|     property ent_id: | ||||
|  | @ -697,7 +763,7 @@ cdef class Token: | |||
|         """RETURNS (unicode): The trailing whitespace character, if present. | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             return ' ' if self.c.spacy else '' | ||||
|             return " " if self.c.spacy else "" | ||||
| 
 | ||||
|     property orth_: | ||||
|         """RETURNS (unicode): Verbatim text content (identical to | ||||
|  | @ -770,6 +836,7 @@ cdef class Token: | |||
|         """RETURNS (unicode): Coarse-grained part-of-speech tag.""" | ||||
|         def __get__(self): | ||||
|             return parts_of_speech.NAMES[self.c.pos] | ||||
| 
 | ||||
|         def __set__(self, pos_name): | ||||
|             self.c.pos = parts_of_speech.IDS[pos_name] | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,30 +1,31 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| cimport numpy as np | ||||
| from cython.operator cimport dereference as deref | ||||
| from libcpp.set cimport set as cppset | ||||
| 
 | ||||
| import functools | ||||
| import numpy | ||||
| from collections import OrderedDict | ||||
| import srsly | ||||
| 
 | ||||
| cimport numpy as np | ||||
| from thinc.neural.util import get_array_module | ||||
| from thinc.neural._classes.model import Model | ||||
| 
 | ||||
| from .strings cimport StringStore | ||||
| 
 | ||||
| from .strings import get_string_id | ||||
| from .compat import basestring_, path2str | ||||
| from .errors import Errors | ||||
| from . import util | ||||
| 
 | ||||
| from cython.operator cimport dereference as deref | ||||
| from libcpp.set cimport set as cppset | ||||
| 
 | ||||
| def unpickle_vectors(bytes_data): | ||||
|     return Vectors().from_bytes(bytes_data) | ||||
| 
 | ||||
| 
 | ||||
| class GlobalRegistry(object): | ||||
|     '''Global store of vectors, to avoid repeatedly loading the data.''' | ||||
|     """Global store of vectors, to avoid repeatedly loading the data.""" | ||||
|     data = {} | ||||
| 
 | ||||
|     @classmethod | ||||
|  | @ -46,8 +47,10 @@ cdef class Vectors: | |||
|     rows in the vectors.data table. | ||||
| 
 | ||||
|     Multiple keys can be mapped to the same vector, and not all of the rows in | ||||
|     the table need to be assigned --- so len(list(vectors.keys())) may be | ||||
|     the table need to be assigned - so len(list(vectors.keys())) may be | ||||
|     greater or smaller than vectors.shape[0]. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/vectors | ||||
|     """ | ||||
|     cdef public object name | ||||
|     cdef public object data | ||||
|  | @ -62,12 +65,14 @@ cdef class Vectors: | |||
|         keys (iterable): A sequence of keys, aligned with the data. | ||||
|         name (string): A name to identify the vectors table. | ||||
|         RETURNS (Vectors): The newly created object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#init | ||||
|         """ | ||||
|         self.name = name | ||||
|         if data is None: | ||||
|             if shape is None: | ||||
|                 shape = (0,0) | ||||
|             data = numpy.zeros(shape, dtype='f') | ||||
|             data = numpy.zeros(shape, dtype="f") | ||||
|         self.data = data | ||||
|         self.key2row = OrderedDict() | ||||
|         if self.data is not None: | ||||
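Two common ways of constructing the table, as a hedged sketch (the shapes and keys are illustrative):

```python
import numpy
from spacy.vectors import Vectors

# An empty table with room for 10,000 vectors of width 300.
empty_vectors = Vectors(shape=(10000, 300))

# A table built from existing data, with keys aligned to the rows.
data = numpy.zeros((3, 300), dtype="f")
keys = [u"cat", u"dog", u"rat"]
vectors = Vectors(data=data, keys=keys)
```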
|  | @ -84,23 +89,40 @@ cdef class Vectors: | |||
|         in the vector table. | ||||
| 
 | ||||
|         RETURNS (tuple): A `(rows, dims)` pair. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#shape | ||||
|         """ | ||||
|         return self.data.shape | ||||
| 
 | ||||
|     @property | ||||
|     def size(self): | ||||
|         """RETURNS (int): rows*dims""" | ||||
|         """The vector size i,e. rows * dims. | ||||
| 
 | ||||
|         RETURNS (int): The vector size. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#size | ||||
|         """ | ||||
|         return self.data.shape[0] * self.data.shape[1] | ||||
| 
 | ||||
|     @property | ||||
|     def is_full(self): | ||||
|         """RETURNS (bool): `True` if no slots are available for new keys.""" | ||||
|         """Whether the vectors table is full. | ||||
| 
 | ||||
|         RETURNS (bool): `True` if no slots are available for new keys. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#is_full | ||||
|         """ | ||||
|         return self._unset.size() == 0 | ||||
| 
 | ||||
|     @property | ||||
|     def n_keys(self): | ||||
|         """RETURNS (int) The number of keys in the table. Note that this is the | ||||
|         number of all keys, not just unique vectors.""" | ||||
|         """Get the number of keys in the table. Note that this is the number | ||||
|         of all keys, not just unique vectors. | ||||
| 
 | ||||
|         RETURNS (int): The number of keys in the table. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#n_keys | ||||
|         """ | ||||
|         return len(self.key2row) | ||||
| 
 | ||||
|     def __reduce__(self): | ||||
|  | @ -111,6 +133,8 @@ cdef class Vectors: | |||
| 
 | ||||
|         key (int): The key to get the vector for. | ||||
|         RETURNS (ndarray): The vector for the key. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#getitem | ||||
|         """ | ||||
|         i = self.key2row[key] | ||||
|         if i is None: | ||||
|  | @ -123,6 +147,8 @@ cdef class Vectors: | |||
| 
 | ||||
|         key (int): The key to set the vector for. | ||||
|         vector (ndarray): The vector to set. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#setitem | ||||
|         """ | ||||
|         i = self.key2row[key] | ||||
|         self.data[i] = vector | ||||
|  | @ -133,6 +159,8 @@ cdef class Vectors: | |||
|         """Iterate over the keys in the table. | ||||
| 
 | ||||
|         YIELDS (int): A key in the table. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#iter | ||||
|         """ | ||||
|         yield from self.key2row | ||||
| 
 | ||||
|  | @ -140,6 +168,8 @@ cdef class Vectors: | |||
|         """Return the number of vectors in the table. | ||||
| 
 | ||||
|         RETURNS (int): The number of vectors in the data. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#len | ||||
|         """ | ||||
|         return self.data.shape[0] | ||||
| 
 | ||||
|  | @ -148,6 +178,8 @@ cdef class Vectors: | |||
| 
 | ||||
|         key (int): The key to check. | ||||
|         RETURNS (bool): Whether the key has a vector entry. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#contains | ||||
|         """ | ||||
|         return key in self.key2row | ||||
| 
 | ||||
|  | @ -159,6 +191,12 @@ cdef class Vectors: | |||
|         If the number of vectors is reduced, keys mapped to rows that have been | ||||
|         deleted are removed. These removed items are returned as a list of | ||||
|         `(key, row)` tuples. | ||||
| 
 | ||||
|         shape (tuple): A `(rows, dims)` tuple. | ||||
|         inplace (bool): Reallocate the memory. | ||||
|         RETURNS (list): The removed items as a list of `(key, row)` tuples. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#resize | ||||
|         """ | ||||
|         if inplace: | ||||
|             self.data.resize(shape, refcheck=False) | ||||
|  | @ -175,10 +213,7 @@ cdef class Vectors: | |||
|         return removed_items | ||||
| 
 | ||||
|     def keys(self): | ||||
|         """A sequence of the keys in the table. | ||||
| 
 | ||||
|         RETURNS (iterable): The keys. | ||||
|         """ | ||||
|         """RETURNS (iterable): A sequence of keys in the table.""" | ||||
|         return self.key2row.keys() | ||||
| 
 | ||||
|     def values(self): | ||||
|  | @ -188,6 +223,8 @@ cdef class Vectors: | |||
|         returned may be less than the length of the vectors table. | ||||
| 
 | ||||
|         YIELDS (ndarray): A vector in the table. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#values | ||||
|         """ | ||||
|         for row, vector in enumerate(range(self.data.shape[0])): | ||||
|             if not self._unset.count(row): | ||||
|  | @ -197,6 +234,8 @@ cdef class Vectors: | |||
|         """Iterate over `(key, vector)` pairs. | ||||
| 
 | ||||
|         YIELDS (tuple): A key/vector pair. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#items | ||||
|         """ | ||||
|         for key, row in self.key2row.items(): | ||||
|             yield key, self.data[row] | ||||
|  | @ -215,7 +254,7 @@ cdef class Vectors: | |||
|         RETURNS: The requested key, keys, row or rows. | ||||
|         """ | ||||
|         if sum(arg is None for arg in (key, keys, row, rows)) != 3: | ||||
|             bad_kwargs = {'key': key, 'keys': keys, 'row': row, 'rows': rows} | ||||
|             bad_kwargs = {"key": key, "keys": keys, "row": row, "rows": rows} | ||||
|             raise ValueError(Errors.E059.format(kwargs=bad_kwargs)) | ||||
|         xp = get_array_module(self.data) | ||||
|         if key is not None: | ||||
|  | @ -224,7 +263,7 @@ cdef class Vectors: | |||
|         elif keys is not None: | ||||
|             keys = [get_string_id(key) for key in keys] | ||||
|             rows = [self.key2row.get(key, -1.) for key in keys] | ||||
|             return xp.asarray(rows, dtype='i') | ||||
|             return xp.asarray(rows, dtype="i") | ||||
|         else: | ||||
|             targets = set() | ||||
|             if row is not None: | ||||
|  | @ -236,7 +275,7 @@ cdef class Vectors: | |||
|                 if row in targets: | ||||
|                     results.append(key) | ||||
|                     targets.remove(row) | ||||
|             return xp.asarray(results, dtype='uint64') | ||||
|             return xp.asarray(results, dtype="uint64") | ||||
| 
 | ||||
|     def add(self, key, *, vector=None, row=None): | ||||
|         """Add a key to the table. Keys can be mapped to an existing vector | ||||
|  | @ -246,6 +285,8 @@ cdef class Vectors: | |||
|         vector (ndarray / None): A vector to add for the key. | ||||
|         row (int / None): The row number of a vector to map the key to. | ||||
|         RETURNS (int): The row the vector was added to. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#add | ||||
|         """ | ||||
|         key = get_string_id(key) | ||||
|         if row is None and key in self.key2row: | ||||
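Because `add` passes keys through `get_string_id`, both strings and hashes can be used. A hedged sketch on a small standalone table (values are illustrative):

```python
import numpy
from spacy.vectors import Vectors

vectors = Vectors(shape=(2, 4))
vectors.add(u"cat", vector=numpy.asarray([1.0, 2.0, 3.0, 4.0], dtype="f"))
vectors.add(u"kitten", row=0)  # map a second key onto the same row
assert vectors.n_keys == 2
```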
|  | @ -292,11 +333,10 @@ cdef class Vectors: | |||
|             sims = xp.dot(batch, vectors.T) | ||||
|             best_rows[i:i+batch_size] = sims.argmax(axis=1) | ||||
|             scores[i:i+batch_size] = sims.max(axis=1) | ||||
| 
 | ||||
|         xp = get_array_module(self.data) | ||||
|         row2key = {row: key for key, row in self.key2row.items()} | ||||
|         keys = xp.asarray( | ||||
|             [row2key[row] for row in best_rows if row in row2key], dtype='uint64') | ||||
|             [row2key[row] for row in best_rows if row in row2key], dtype="uint64") | ||||
|         return (keys, best_rows, scores) | ||||
| 
 | ||||
|     def from_glove(self, path): | ||||
|  | @ -308,29 +348,30 @@ cdef class Vectors: | |||
| 
 | ||||
|         path (unicode / Path): The path to load the GloVe vectors from. | ||||
|         RETURNS: A `StringStore` object, holding the key-to-string mapping. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#from_glove | ||||
|         """ | ||||
|         path = util.ensure_path(path) | ||||
|         width = None | ||||
|         for name in path.iterdir(): | ||||
|             if name.parts[-1].startswith('vectors'): | ||||
|             if name.parts[-1].startswith("vectors"): | ||||
|                 _, dims, dtype, _2 = name.parts[-1].split('.') | ||||
|                 width = int(dims) | ||||
|                 break | ||||
|         else: | ||||
|             raise IOError(Errors.E061.format(filename=path)) | ||||
|         bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims, | ||||
|                                                              dtype=dtype) | ||||
|         bin_loc = path / "vectors.{dims}.{dtype}.bin".format(dims=dims, dtype=dtype) | ||||
|         xp = get_array_module(self.data) | ||||
|         self.data = None | ||||
|         with bin_loc.open('rb') as file_: | ||||
|         with bin_loc.open("rb") as file_: | ||||
|             self.data = xp.fromfile(file_, dtype=dtype) | ||||
|             if dtype != 'float32': | ||||
|                 self.data = xp.ascontiguousarray(self.data, dtype='float32') | ||||
|             if dtype != "float32": | ||||
|                 self.data = xp.ascontiguousarray(self.data, dtype="float32") | ||||
|         if self.data.ndim == 1: | ||||
|             self.data = self.data.reshape((self.data.size//width, width)) | ||||
|         n = 0 | ||||
|         strings = StringStore() | ||||
|         with (path / 'vocab.txt').open('r') as file_: | ||||
|         with (path / "vocab.txt").open("r") as file_: | ||||
|             for i, line in enumerate(file_): | ||||
|                 key = strings.add(line.strip()) | ||||
|                 self.add(key, row=i) | ||||
|  | @ -341,16 +382,17 @@ cdef class Vectors: | |||
| 
 | ||||
|         path (unicode / Path): A path to a directory, which will be created if | ||||
|             it doesn't exists. Either a string or a Path-like object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#to_disk | ||||
|         """ | ||||
|         xp = get_array_module(self.data) | ||||
|         if xp is numpy: | ||||
|             save_array = lambda arr, file_: xp.save(file_, arr, | ||||
|                                                     allow_pickle=False) | ||||
|             save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) | ||||
|         else: | ||||
|             save_array = lambda arr, file_: xp.save(file_, arr) | ||||
|         serializers = OrderedDict(( | ||||
|             ('vectors', lambda p: save_array(self.data, p.open('wb'))), | ||||
|             ('key2row', lambda p: srsly.write_msgpack(p, self.key2row)) | ||||
|             ("vectors", lambda p: save_array(self.data, p.open("wb"))), | ||||
|             ("key2row", lambda p: srsly.write_msgpack(p, self.key2row)) | ||||
|         )) | ||||
|         return util.to_disk(path, serializers, exclude) | ||||
| 
 | ||||
|  | @ -360,6 +402,8 @@ cdef class Vectors: | |||
| 
 | ||||
|         path (unicode / Path): Directory path, string or Path-like object. | ||||
|         RETURNS (Vectors): The modified object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#from_disk | ||||
|         """ | ||||
|         def load_key2row(path): | ||||
|             if path.exists(): | ||||
|  | @ -380,9 +424,9 @@ cdef class Vectors: | |||
|                 self.data = xp.load(str(path)) | ||||
| 
 | ||||
|         serializers = OrderedDict(( | ||||
|             ('key2row', load_key2row), | ||||
|             ('keys', load_keys), | ||||
|             ('vectors', load_vectors), | ||||
|             ("key2row", load_key2row), | ||||
|             ("keys", load_keys), | ||||
|             ("vectors", load_vectors), | ||||
|         )) | ||||
|         util.from_disk(path, serializers, exclude) | ||||
|         return self | ||||
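A round-trip sketch for the disk serializers above (the directory path is a placeholder and will be created if missing):

```python
from spacy.vectors import Vectors

vectors = Vectors(shape=(10, 300))
vectors.to_disk("/tmp/my_vectors")             # hypothetical path
restored = Vectors().from_disk("/tmp/my_vectors")
assert restored.shape == (10, 300)
```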
|  | @ -392,15 +436,17 @@ cdef class Vectors: | |||
| 
 | ||||
|         **exclude: Named attributes to prevent from being serialized. | ||||
|         RETURNS (bytes): The serialized form of the `Vectors` object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#to_bytes | ||||
|         """ | ||||
|         def serialize_weights(): | ||||
|             if hasattr(self.data, 'to_bytes'): | ||||
|             if hasattr(self.data, "to_bytes"): | ||||
|                 return self.data.to_bytes() | ||||
|             else: | ||||
|                 return srsly.msgpack_dumps(self.data) | ||||
|         serializers = OrderedDict(( | ||||
|             ('key2row', lambda: srsly.msgpack_dumps(self.key2row)), | ||||
|             ('vectors', serialize_weights) | ||||
|             ("key2row", lambda: srsly.msgpack_dumps(self.key2row)), | ||||
|             ("vectors", serialize_weights) | ||||
|         )) | ||||
|         return util.to_bytes(serializers, exclude) | ||||
| 
 | ||||
|  | @ -410,16 +456,18 @@ cdef class Vectors: | |||
|         data (bytes): The data to load from. | ||||
|         **exclude: Named attributes to prevent from being loaded. | ||||
|         RETURNS (Vectors): The `Vectors` object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vectors#from_bytes | ||||
|         """ | ||||
|         def deserialize_weights(b): | ||||
|             if hasattr(self.data, 'from_bytes'): | ||||
|             if hasattr(self.data, "from_bytes"): | ||||
|                 self.data.from_bytes() | ||||
|             else: | ||||
|                 self.data = srsly.msgpack_loads(b) | ||||
| 
 | ||||
|         deserializers = OrderedDict(( | ||||
|             ('key2row', lambda b: self.key2row.update(srsly.msgpack_loads(b))), | ||||
|             ('vectors', deserialize_weights) | ||||
|             ("key2row", lambda b: self.key2row.update(srsly.msgpack_loads(b))), | ||||
|             ("vectors", deserialize_weights) | ||||
|         )) | ||||
|         util.from_bytes(data, deserializers, exclude) | ||||
|         return self | ||||
|  |  | |||
|  | @ -4,9 +4,9 @@ from __future__ import unicode_literals | |||
| 
 | ||||
| import numpy | ||||
| import srsly | ||||
| 
 | ||||
| from collections import OrderedDict | ||||
| from thinc.neural.util import get_array_module | ||||
| 
 | ||||
| from .lexeme cimport EMPTY_LEXEME | ||||
| from .lexeme cimport Lexeme | ||||
| from .typedefs cimport attr_t | ||||
|  | @ -27,6 +27,8 @@ cdef class Vocab: | |||
|     """A look-up table that allows you to access `Lexeme` objects. The `Vocab` | ||||
|     instance also provides access to the `StringStore`, and owns underlying | ||||
|     C-data that is shared between `Doc` objects. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/vocab | ||||
|     """ | ||||
|     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, | ||||
|                  strings=tuple(), oov_prob=-20., **deprecated_kwargs): | ||||
|  | @ -62,7 +64,7 @@ cdef class Vocab: | |||
|             langfunc = None | ||||
|             if self.lex_attr_getters: | ||||
|                 langfunc = self.lex_attr_getters.get(LANG, None) | ||||
|             return langfunc('_') if langfunc else '' | ||||
|             return langfunc("_") if langfunc else "" | ||||
| 
 | ||||
|     def __len__(self): | ||||
|         """The current number of lexemes stored. | ||||
|  | @ -87,11 +89,7 @@ cdef class Vocab: | |||
|             available bit will be chosen. | ||||
|         RETURNS (int): The integer ID by which the flag value can be checked. | ||||
| 
 | ||||
|         EXAMPLE: | ||||
|             >>> my_product_getter = lambda text: text in ['spaCy', 'dislaCy'] | ||||
|             >>> MY_PRODUCT = nlp.vocab.add_flag(my_product_getter) | ||||
|             >>> doc = nlp(u'I like spaCy') | ||||
|             >>> assert doc[2].check_flag(MY_PRODUCT) == True | ||||
|         DOCS: https://spacy.io/api/vocab#add_flag | ||||
|         """ | ||||
|         if flag_id == -1: | ||||
|             for bit in range(1, 64): | ||||
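The usage example removed from this docstring still illustrates the method well; as a hedged sketch (the getter and product names are illustrative):

```python
import spacy

nlp = spacy.blank("en")
my_product_getter = lambda text: text in ("spaCy", "dislaCy")
MY_PRODUCT = nlp.vocab.add_flag(my_product_getter)
doc = nlp(u"I like spaCy")
assert doc[2].check_flag(MY_PRODUCT)
```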
|  | @ -112,7 +110,7 @@ cdef class Vocab: | |||
|         `Lexeme` if necessary using memory acquired from the given pool. If the | ||||
|         pool is the lexicon's own memory, the lexeme is saved in the lexicon. | ||||
|         """ | ||||
|         if string == u'': | ||||
|         if string == "": | ||||
|             return &EMPTY_LEXEME | ||||
|         cdef LexemeC* lex | ||||
|         cdef hash_t key = self.strings[string] | ||||
|  | @ -176,10 +174,12 @@ cdef class Vocab: | |||
| 
 | ||||
|         string (unicode): The ID string. | ||||
|         RETURNS (bool): Whether the string has an entry in the vocabulary. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vocab#contains | ||||
|         """ | ||||
|         cdef hash_t int_key | ||||
|         if isinstance(key, bytes): | ||||
|             int_key = self.strings[key.decode('utf8')] | ||||
|             int_key = self.strings[key.decode("utf8")] | ||||
|         elif isinstance(key, unicode): | ||||
|             int_key = self.strings[key] | ||||
|         else: | ||||
|  | @ -191,6 +191,8 @@ cdef class Vocab: | |||
|         """Iterate over the lexemes in the vocabulary. | ||||
| 
 | ||||
|         YIELDS (Lexeme): An entry in the vocabulary. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vocab#iter | ||||
|         """ | ||||
|         cdef attr_t key | ||||
|         cdef size_t addr | ||||
|  | @ -210,8 +212,10 @@ cdef class Vocab: | |||
|         RETURNS (Lexeme): The lexeme indicated by the given ID. | ||||
| 
 | ||||
|         EXAMPLE: | ||||
|             >>> apple = nlp.vocab.strings['apple'] | ||||
|             >>> assert nlp.vocab[apple] == nlp.vocab[u'apple'] | ||||
|             >>> apple = nlp.vocab.strings["apple"] | ||||
|             >>> assert nlp.vocab[apple] == nlp.vocab[u"apple"] | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vocab#getitem | ||||
|         """ | ||||
|         cdef attr_t orth | ||||
|         if isinstance(id_or_string, unicode): | ||||
|  | @ -284,6 +288,8 @@ cdef class Vocab: | |||
|             `(string, score)` tuples, where `string` is the entry the removed | ||||
|             word was mapped to, and `score` the similarity score between the | ||||
|             two words. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vocab#prune_vectors | ||||
|         """ | ||||
|         xp = get_array_module(self.vectors.data) | ||||
|         # Make prob negative so it sorts by rank ascending | ||||
|  | @ -291,16 +297,12 @@ cdef class Vocab: | |||
|         priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth) | ||||
|                     for lex in self if lex.orth in self.vectors.key2row] | ||||
|         priority.sort() | ||||
|         indices = xp.asarray([i for (prob, i, key) in priority], dtype='i') | ||||
|         keys = xp.asarray([key for (prob, i, key) in priority], dtype='uint64') | ||||
| 
 | ||||
|         indices = xp.asarray([i for (prob, i, key) in priority], dtype="i") | ||||
|         keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") | ||||
|         keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]]) | ||||
|         toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) | ||||
| 
 | ||||
|         self.vectors = Vectors(data=keep, keys=keys) | ||||
| 
 | ||||
|         syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size) | ||||
| 
 | ||||
|         remap = {} | ||||
|         for i, key in enumerate(keys[nr_row:]): | ||||
|             self.vectors.add(key, row=syn_rows[i]) | ||||
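A hedged sketch of pruning (assumes a model with a large vectors table, e.g. `en_core_web_lg`, is installed; pruning is destructive, so it is usually done once before saving a trimmed model):

```python
import spacy

nlp = spacy.load("en_core_web_lg")  # assumption: large vectors model is available
remap = nlp.vocab.prune_vectors(10000)
# remap maps each removed word to the (kept word, similarity score) it now points to
print(len(remap))
```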
|  | @ -319,21 +321,22 @@ cdef class Vocab: | |||
|         RETURNS (numpy.ndarray): A word vector. Size | ||||
|             and shape determined by the `vocab.vectors` instance. Usually, a | ||||
|             numpy ndarray of shape (300,) and dtype float32. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vocab#get_vector | ||||
|         """ | ||||
|         if isinstance(orth, basestring_): | ||||
|             orth = self.strings.add(orth) | ||||
|         word = self[orth].orth_ | ||||
|         if orth in self.vectors.key2row: | ||||
|             return self.vectors[orth] | ||||
| 
 | ||||
|         # Assign default ngram limits to minn and maxn which is the length of the word. | ||||
|         if minn is None: | ||||
|             minn = len(word) | ||||
|         if maxn is None: | ||||
|             maxn = len(word) | ||||
|         vectors = numpy.zeros((self.vectors_length,), dtype='f') | ||||
| 
 | ||||
|         # Fasttext's ngram computation taken from https://github.com/facebookresearch/fastText | ||||
|         vectors = numpy.zeros((self.vectors_length,), dtype="f") | ||||
|         # Fasttext's ngram computation taken from | ||||
|         # https://github.com/facebookresearch/fastText | ||||
|         ngrams_size = 0 | ||||
|         for i in range(len(word)): | ||||
|             ngram = "" | ||||
|  | @ -356,12 +359,16 @@ cdef class Vocab: | |||
|                 n = n + 1 | ||||
|         if ngrams_size > 0: | ||||
|             vectors = vectors * (1.0/ngrams_size) | ||||
| 
 | ||||
|         return vectors | ||||
| 
 | ||||
|     def set_vector(self, orth, vector): | ||||
|         """Set a vector for a word in the vocabulary. Words can be referenced | ||||
|         by string or int ID. | ||||
| 
 | ||||
|         orth (int / unicode): The word. | ||||
|         vector (numpy.ndarray[ndim=1, dtype='float32']): The vector to set. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vocab#set_vector | ||||
|         """ | ||||
|         if isinstance(orth, basestring_): | ||||
|             orth = self.strings.add(orth) | ||||
|  | @ -372,13 +379,19 @@ cdef class Vocab: | |||
|             else: | ||||
|                 width = self.vectors.shape[1] | ||||
|             self.vectors.resize((new_rows, width)) | ||||
|             lex = self[orth] # Adds worse to vocab | ||||
|             lex = self[orth]  # Adds words to vocab | ||||
|             self.vectors.add(orth, vector=vector) | ||||
|         self.vectors.add(orth, vector=vector) | ||||
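A hedged sketch of overwriting and reading back a vector by string (assumes `en_core_web_md` is installed and that "apple" has an entry in its vectors table, which the stock model does):

```python
import numpy
import spacy

nlp = spacy.load("en_core_web_md")  # assumption: a vectors model is available
width = nlp.vocab.vectors.shape[1]
nlp.vocab.set_vector(u"apple", numpy.zeros((width,), dtype="float32"))
print(nlp.vocab.get_vector(u"apple")[:5])  # now all zeros
```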
| 
 | ||||
|     def has_vector(self, orth): | ||||
|         """Check whether a word has a vector. Returns False if no vectors have | ||||
|         been loaded. Words can be looked up by string or int ID.""" | ||||
|         been loaded. Words can be looked up by string or int ID. | ||||
| 
 | ||||
|         orth (int / unicode): The word. | ||||
|         RETURNS (bool): Whether the word has a vector. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vocab#has_vector | ||||
|         """ | ||||
|         if isinstance(orth, basestring_): | ||||
|             orth = self.strings.add(orth) | ||||
|         return orth in self.vectors | ||||
|  | @ -388,12 +401,14 @@ cdef class Vocab: | |||
| 
 | ||||
|         path (unicode or Path): A path to a directory, which will be created if | ||||
|             it doesn't exist. Paths may be either strings or Path-like objects. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vocab#to_disk | ||||
|         """ | ||||
|         path = util.ensure_path(path) | ||||
|         if not path.exists(): | ||||
|             path.mkdir() | ||||
|         self.strings.to_disk(path / 'strings.json') | ||||
|         with (path / 'lexemes.bin').open('wb') as file_: | ||||
|         self.strings.to_disk(path / "strings.json") | ||||
|         with (path / "lexemes.bin").open('wb') as file_: | ||||
|             file_.write(self.lexemes_to_bytes()) | ||||
|         if self.vectors is not None: | ||||
|             self.vectors.to_disk(path) | ||||
|  | @ -405,13 +420,15 @@ cdef class Vocab: | |||
|         path (unicode or Path): A path to a directory. Paths may be either | ||||
|             strings or `Path`-like objects. | ||||
|         RETURNS (Vocab): The modified `Vocab` object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vocab#from_disk | ||||
|         """ | ||||
|         path = util.ensure_path(path) | ||||
|         self.strings.from_disk(path / 'strings.json') | ||||
|         with (path / 'lexemes.bin').open('rb') as file_: | ||||
|         self.strings.from_disk(path / "strings.json") | ||||
|         with (path / "lexemes.bin").open("rb") as file_: | ||||
|             self.lexemes_from_bytes(file_.read()) | ||||
|         if self.vectors is not None: | ||||
|             self.vectors.from_disk(path, exclude='strings.json') | ||||
|             self.vectors.from_disk(path, exclude="strings.json") | ||||
|         if self.vectors.name is not None: | ||||
|             link_vectors_to_models(self) | ||||
|         return self | ||||
|  | @ -421,6 +438,8 @@ cdef class Vocab: | |||
| 
 | ||||
|         **exclude: Named attributes to prevent from being serialized. | ||||
|         RETURNS (bytes): The serialized form of the `Vocab` object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vocab#to_bytes | ||||
|         """ | ||||
|         def deserialize_vectors(): | ||||
|             if self.vectors is None: | ||||
|  | @ -429,9 +448,9 @@ cdef class Vocab: | |||
|                 return self.vectors.to_bytes() | ||||
| 
 | ||||
|         getters = OrderedDict(( | ||||
|             ('strings', lambda: self.strings.to_bytes()), | ||||
|             ('lexemes', lambda: self.lexemes_to_bytes()), | ||||
|             ('vectors', deserialize_vectors) | ||||
|             ("strings", lambda: self.strings.to_bytes()), | ||||
|             ("lexemes", lambda: self.lexemes_to_bytes()), | ||||
|             ("vectors", deserialize_vectors) | ||||
|         )) | ||||
|         return util.to_bytes(getters, exclude) | ||||
| 
 | ||||
|  | @ -441,6 +460,8 @@ cdef class Vocab: | |||
|         bytes_data (bytes): The data to load from. | ||||
|         **exclude: Named attributes to prevent from being loaded. | ||||
|         RETURNS (Vocab): The `Vocab` object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/vocab#from_bytes | ||||
|         """ | ||||
|         def serialize_vectors(b): | ||||
|             if self.vectors is None: | ||||
|  | @ -448,9 +469,9 @@ cdef class Vocab: | |||
|             else: | ||||
|                 return self.vectors.from_bytes(b) | ||||
|         setters = OrderedDict(( | ||||
|             ('strings', lambda b: self.strings.from_bytes(b)), | ||||
|             ('lexemes', lambda b: self.lexemes_from_bytes(b)), | ||||
|             ('vectors', lambda b: serialize_vectors(b)) | ||||
|             ("strings", lambda b: self.strings.from_bytes(b)), | ||||
|             ("lexemes", lambda b: self.lexemes_from_bytes(b)), | ||||
|             ("vectors", lambda b: serialize_vectors(b)) | ||||
|         )) | ||||
|         util.from_bytes(bytes_data, setters, exclude) | ||||
|         if self.vectors.name is not None: | ||||
|  | @ -467,7 +488,7 @@ cdef class Vocab: | |||
|             if addr == 0: | ||||
|                 continue | ||||
|             size += sizeof(lex_data.data) | ||||
|         byte_string = b'\0' * size | ||||
|         byte_string = b"\0" * size | ||||
|         byte_ptr = <unsigned char*>byte_string | ||||
|         cdef int j | ||||
|         cdef int i = 0 | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| --- | ||||
| title: DependencyParser | ||||
| tag: class | ||||
| source: spacy/pipeline.pyx | ||||
| source: spacy/pipeline/pipes.pyx | ||||
| --- | ||||
| 
 | ||||
| This class is a subclass of `Pipe` and follows the same API. The pipeline | ||||
|  | @ -211,7 +211,7 @@ Modify the pipe's model, to use the given parameter values. | |||
| > ```python | ||||
| > parser = DependencyParser(nlp.vocab) | ||||
| > with parser.use_params(): | ||||
| >     parser.to_disk('/best_model') | ||||
| >     parser.to_disk("/best_model") | ||||
| > ``` | ||||
| 
 | ||||
| | Name     | Type | Description                                                                                                | | ||||
|  | @ -226,7 +226,7 @@ Add a new label to the pipe. | |||
| > | ||||
| > ```python | ||||
| > parser = DependencyParser(nlp.vocab) | ||||
| > parser.add_label('MY_LABEL') | ||||
| > parser.add_label("MY_LABEL") | ||||
| > ``` | ||||
| 
 | ||||
| | Name    | Type    | Description       | | ||||
|  | @ -241,7 +241,7 @@ Serialize the pipe to disk. | |||
| > | ||||
| > ```python | ||||
| > parser = DependencyParser(nlp.vocab) | ||||
| > parser.to_disk('/path/to/parser') | ||||
| > parser.to_disk("/path/to/parser") | ||||
| > ``` | ||||
| 
 | ||||
| | Name   | Type             | Description                                                                                                           | | ||||
|  | @ -256,7 +256,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | |||
| > | ||||
| > ```python | ||||
| > parser = DependencyParser(nlp.vocab) | ||||
| > parser.from_disk('/path/to/parser') | ||||
| > parser.from_disk("/path/to/parser") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type               | Description                                                                | | ||||
|  | @ -266,7 +266,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | |||
| 
 | ||||
| ## DependencyParser.to_bytes {#to_bytes tag="method"} | ||||
| 
 | ||||
| > #### example | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > parser = DependencyParser(nlp.vocab) | ||||
|  |  | |||
|  | @ -127,6 +127,7 @@ details, see the documentation on | |||
| | `method`  | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`.                                                          | | ||||
| | `getter`  | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute.          | | ||||
| | `setter`  | callable | Setter function that takes the `Doc` and a value, and modifies the object. Is called when the user writes to the `Doc._` attribute. | | ||||
| | `force`   | bool     | Force overwriting existing attribute.                                                                                               | | ||||
| 
 | ||||
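| For instance, re-registering an extension name that already exists would | ||||
| normally raise an error, while passing `force=True` overwrites the existing | ||||
| extension instead. A minimal sketch (the attribute name is only for | ||||
| illustration): | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.tokens import Doc | ||||
| > | ||||
| > Doc.set_extension("my_attr", default=False) | ||||
| > # Registering "my_attr" again without force=True would raise an error | ||||
| > Doc.set_extension("my_attr", default=True, force=True) | ||||
| > ``` | ||||
| 
 | ||||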
| ## Doc.get_extension {#get_extension tag="classmethod" new="2"} | ||||
| 
 | ||||
|  | @ -263,6 +264,46 @@ ancestor is found, e.g. if span excludes a necessary ancestor. | |||
| | ----------- | -------------------------------------- | ----------------------------------------------- | | ||||
| | **RETURNS** | `numpy.ndarray[ndim=2, dtype='int32']` | The lowest common ancestor matrix of the `Doc`. | | ||||
| 
 | ||||
| ## Doc.to_json {#to_json tag="method" new="2.1"} | ||||
| 
 | ||||
| Convert a Doc to JSON. The format it produces will be the new format for the | ||||
| [`spacy train`](/api/cli#train) command (not implemented yet). If custom | ||||
| underscore attributes are specified, their values need to be JSON-serializable. | ||||
| They'll be added to an `"_"` key in the data, e.g. `"_": {"foo": "bar"}`. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > doc = nlp(u"Hello") | ||||
| > json_doc = doc.to_json() | ||||
| > ``` | ||||
| > | ||||
| > #### Result | ||||
| > | ||||
| > ```python | ||||
| > { | ||||
| >   "text": "Hello", | ||||
| >   "ents": [], | ||||
| >   "sents": [{"start": 0, "end": 5}], | ||||
| >   "tokens": [{"id": 0, "start": 0, "end": 5, "pos": "INTJ", "tag": "UH", "dep": "ROOT", "head": 0} | ||||
| >   ] | ||||
| > } | ||||
| > ``` | ||||
| 
 | ||||
| | Name         | Type | Description                                                                    | | ||||
| | ------------ | ---- | ------------------------------------------------------------------------------ | | ||||
| | `underscore` | list | Optional list of string names of custom JSON-serializable `doc._.` attributes. | | ||||
| | **RETURNS**  | dict | The JSON-formatted data.                                                       | | ||||
| 
 | ||||
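| As a small sketch of the `underscore` argument (the extension name used here | ||||
| is hypothetical), custom attribute values end up under the `"_"` key: | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.tokens import Doc | ||||
| > | ||||
| > Doc.set_extension("is_greeting", default=False) | ||||
| > doc = nlp(u"Hello") | ||||
| > doc._.is_greeting = True | ||||
| > json_doc = doc.to_json(underscore=["is_greeting"]) | ||||
| > assert json_doc["_"] == {"is_greeting": True} | ||||
| > ``` | ||||
| 
 | ||||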
| <Infobox title="Deprecation note" variant="warning"> | ||||
| 
 | ||||
| spaCy previously implemented a `Doc.print_tree` method that returned a similar | ||||
| JSON-formatted representation of a `Doc`. As of v2.1, this method is deprecated | ||||
| in favor of `Doc.to_json`. If you need more complex nested representations, you | ||||
| might want to write your own function to extract the data. | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
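| A custom replacement for the old nested output could look roughly like the | ||||
| sketch below, walking the parse tree via `Token.children`: | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > def doc_to_tree(doc): | ||||
| >     # Recursively collect each token's children into nested dicts | ||||
| >     def node(token): | ||||
| >         return {"text": token.text, "dep": token.dep_, | ||||
| >                 "children": [node(child) for child in token.children]} | ||||
| >     return [node(sent.root) for sent in doc.sents] | ||||
| > ``` | ||||
| 
 | ||||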
| ## Doc.to_array {#to_array tag="method"} | ||||
| 
 | ||||
| Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence | ||||
|  | @ -310,7 +351,7 @@ array of attributes. | |||
| 
 | ||||
| | Name        | Type                                   | Description                   | | ||||
| | ----------- | -------------------------------------- | ----------------------------- | | ||||
| | `attrs`     | ints                                   | A list of attribute ID ints.  | | ||||
| | `attrs`     | list                                   | A list of attribute ID ints.  | | ||||
| | `array`     | `numpy.ndarray[ndim=2, dtype='int32']` | The attribute values to load. | | ||||
| | **RETURNS** | `Doc`                                  | Itself.                       | | ||||
| 
 | ||||
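| For illustration, a round trip through `Doc.to_array` and `Doc.from_array` | ||||
| might look like this (a sketch, assuming a loaded `nlp` object): | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.attrs import LOWER, POS | ||||
| > from spacy.tokens import Doc | ||||
| > | ||||
| > doc = nlp(u"Give it back! He pleaded.") | ||||
| > array = doc.to_array([LOWER, POS]) | ||||
| > doc2 = Doc(doc.vocab, words=[t.text for t in doc]) | ||||
| > doc2.from_array([LOWER, POS], array) | ||||
| > assert doc[0].pos == doc2[0].pos | ||||
| > ``` | ||||
| 
 | ||||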
|  | @ -429,14 +470,16 @@ to specify how the new subtokens should be integrated into the dependency tree. | |||
| The list of per-token heads can either be a token in the original document, e.g. | ||||
| `doc[2]`, or a tuple consisting of the token in the original document and its | ||||
| subtoken index. For example, `(doc[3], 1)` will attach the subtoken to the | ||||
| second subtoken of `doc[3]`. This mechanism allows attaching subtokens to other | ||||
| newly created subtokens, without having to keep track of the changing token | ||||
| indices. If the specified head token will be split within the retokenizer block | ||||
| and no subtoken index is specified, it will default to `0`. Attributes to set on | ||||
| subtokens can be provided as a list of values. They'll be applied to the | ||||
| resulting token (if they're context-dependent token attributes like `LEMMA` or | ||||
| `DEP`) or to the underlying lexeme (if they're context-independent lexical | ||||
| attributes like `LOWER` or `IS_STOP`). | ||||
| second subtoken of `doc[3]`. | ||||
| 
 | ||||
| This mechanism allows attaching subtokens to other newly created subtokens, | ||||
| without having to keep track of the changing token indices. If the specified | ||||
| head token will be split within the retokenizer block and no subtoken index is | ||||
| specified, it will default to `0`. Attributes to set on subtokens can be | ||||
| provided as a list of values. They'll be applied to the resulting token (if | ||||
| they're context-dependent token attributes like `LEMMA` or `DEP`) or to the | ||||
| underlying lexeme (if they're context-independent lexical attributes like | ||||
| `LOWER` or `IS_STOP`). | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -487,8 +530,8 @@ and end token boundaries, the document remains unchanged. | |||
| 
 | ||||
| ## Doc.ents {#ents tag="property" model="NER"} | ||||
| 
 | ||||
| Iterate over the entities in the document. Yields named-entity `Span` objects, | ||||
| if the entity recognizer has been applied to the document. | ||||
| The named entities in the document. Returns a tuple of named entity `Span` | ||||
| objects, if the entity recognizer has been applied. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -501,8 +544,8 @@ if the entity recognizer has been applied to the document. | |||
| > ``` | ||||
| 
 | ||||
| | Name        | Type  | Description                                      | | ||||
| | ---------- | ------ | ------------------------- | | ||||
| | **YIELDS** | `Span` | Entities in the document. | | ||||
| | ----------- | ----- | ------------------------------------------------ | | ||||
| | **RETURNS** | tuple | Entities in the document, one `Span` per entity. | | ||||
| 
 | ||||
| ## Doc.noun_chunks {#noun_chunks tag="property" model="parser"} | ||||
| 
 | ||||
|  | @ -542,8 +585,8 @@ will be unavailable. | |||
| > ``` | ||||
| 
 | ||||
| | Name       | Type   | Description                | | ||||
| | ---------- | ---------------------------------- | ----------- | | ||||
| | **YIELDS** | `Span | Sentences in the document. | | ||||
| | ---------- | ------ | -------------------------- | | ||||
| | **YIELDS** | `Span` | Sentences in the document. | | ||||
| 
 | ||||
| ## Doc.has_vector {#has_vector tag="property" model="vectors"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| --- | ||||
| title: EntityRecognizer | ||||
| tag: class | ||||
| source: spacy/pipeline.pyx | ||||
| source: spacy/pipeline/pipes.pyx | ||||
| --- | ||||
| 
 | ||||
| This class is a subclass of `Pipe` and follows the same API. The pipeline | ||||
|  | @ -211,7 +211,7 @@ Modify the pipe's model, to use the given parameter values. | |||
| > ```python | ||||
| > ner = EntityRecognizer(nlp.vocab) | ||||
| > with ner.use_params(): | ||||
| >     ner.to_disk('/best_model') | ||||
| >     ner.to_disk("/best_model") | ||||
| > ``` | ||||
| 
 | ||||
| | Name     | Type | Description                                                                                                | | ||||
|  | @ -226,7 +226,7 @@ Add a new label to the pipe. | |||
| > | ||||
| > ```python | ||||
| > ner = EntityRecognizer(nlp.vocab) | ||||
| > ner.add_label('MY_LABEL') | ||||
| > ner.add_label("MY_LABEL") | ||||
| > ``` | ||||
| 
 | ||||
| | Name    | Type    | Description       | | ||||
|  | @ -241,7 +241,7 @@ Serialize the pipe to disk. | |||
| > | ||||
| > ```python | ||||
| > ner = EntityRecognizer(nlp.vocab) | ||||
| > ner.to_disk('/path/to/ner') | ||||
| > ner.to_disk("/path/to/ner") | ||||
| > ``` | ||||
| 
 | ||||
| | Name   | Type             | Description                                                                                                           | | ||||
|  | @ -256,7 +256,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | |||
| > | ||||
| > ```python | ||||
| > ner = EntityRecognizer(nlp.vocab) | ||||
| > ner.from_disk('/path/to/ner') | ||||
| > ner.from_disk("/path/to/ner") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type               | Description                                                                | | ||||
|  | @ -266,7 +266,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | |||
| 
 | ||||
| ## EntityRecognizer.to_bytes {#to_bytes tag="method"} | ||||
| 
 | ||||
| > #### example | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > ner = EntityRecognizer(nlp.vocab) | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| --- | ||||
| title: EntityRuler | ||||
| tag: class | ||||
| source: spacy/pipeline.pyx | ||||
| source: spacy/pipeline/entityruler.py | ||||
| new: 2.1 | ||||
| --- | ||||
| 
 | ||||
|  | @ -128,7 +128,7 @@ newline-delimited JSON (JSONL). | |||
| > | ||||
| > ```python | ||||
| > ruler = EntityRuler(nlp) | ||||
| > ruler.to_disk('/path/to/rules.jsonl') | ||||
| > ruler.to_disk("/path/to/rules.jsonl") | ||||
| > ``` | ||||
| 
 | ||||
| | Name   | Type             | Description                                                                                                      | | ||||
|  | @ -144,7 +144,7 @@ JSON (JSONL) with one entry per line. | |||
| > | ||||
| > ```python | ||||
| > ruler = EntityRuler(nlp) | ||||
| > ruler.from_disk('/path/to/rules.jsonl') | ||||
| > ruler.from_disk("/path/to/rules.jsonl") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type             | Description                                                                 | | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| --- | ||||
| title: Pipeline Functions | ||||
| teaser: Other built-in pipeline components and helpers | ||||
| source: spacy/pipeline.pyx | ||||
| source: spacy/pipeline/functions.py | ||||
| menu: | ||||
|   - ['merge_noun_chunks', 'merge_noun_chunks'] | ||||
|   - ['merge_entities', 'merge_entities'] | ||||
|  | @ -73,10 +73,10 @@ components to the end of the pipeline and after all other components. | |||
| | `doc`       | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | | ||||
| | **RETURNS** | `Doc` | The modified `Doc` with merged entities.                     | | ||||
| 
 | ||||
| ## merge_subtokens {#merge_entities tag="function" new="2.1"} | ||||
| ## merge_subtokens {#merge_subtokens tag="function" new="2.1"} | ||||
| 
 | ||||
| Merge subtokens into a single token. Also available via the string name | ||||
| `"merge_entities"`. After initialization, the component is typically added to | ||||
| `"merge_subtokens"`. After initialization, the component is typically added to | ||||
| the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe). | ||||
| 
 | ||||
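| A minimal usage sketch, assuming the helper is importable from | ||||
| `spacy.pipeline` like the other functions on this page and that the pipeline | ||||
| contains a parser: | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.pipeline import merge_subtokens | ||||
| > | ||||
| > nlp.add_pipe(merge_subtokens, after="parser") | ||||
| > ``` | ||||
| 
 | ||||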
| As of v2.1, the parser is able to predict "subtokens" that should be merged into | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| --- | ||||
| title: SentenceSegmenter | ||||
| tag: class | ||||
| source: spacy/pipeline.pyx | ||||
| source: spacy/pipeline/hooks.py | ||||
| --- | ||||
| 
 | ||||
| A simple spaCy hook, to allow custom sentence boundary detection logic that | ||||
|  |  | |||
|  | @ -260,8 +260,8 @@ Retokenize the document, such that the span is merged into a single token. | |||
| 
 | ||||
| ## Span.ents {#ents tag="property" new="2.0.12" model="ner"} | ||||
| 
 | ||||
| Iterate over the entities in the span. Yields named-entity `Span` objects, if | ||||
| the entity recognizer has been applied to the parent document. | ||||
| The named entities in the span. Returns a tuple of named entity `Span` objects, | ||||
| if the entity recognizer has been applied. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -275,8 +275,8 @@ the entity recognizer has been applied to the parent document. | |||
| > ``` | ||||
| 
 | ||||
| | Name        | Type  | Description                                  | | ||||
| | ---------- | ------ | ------------------------- | | ||||
| | **YIELDS** | `Span` | Entities in the document. | | ||||
| | ----------- | ----- | -------------------------------------------- | | ||||
| | **RETURNS** | tuple | Entities in the span, one `Span` per entity. | | ||||
| 
 | ||||
| ## Span.as_doc {#as_doc tag="method"} | ||||
| 
 | ||||
|  | @ -297,8 +297,9 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data. | |||
| 
 | ||||
| ## Span.root {#root tag="property" model="parser"} | ||||
| 
 | ||||
| The token within the span that's highest in the parse tree. If there's a tie, | ||||
| the earliest is preferred. | ||||
| The token with the shortest path to the root of the sentence (or the root | ||||
| itself). If multiple tokens are equally high in the tree, the first token is | ||||
| taken. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| --- | ||||
| title: Tagger | ||||
| tag: class | ||||
| source: spacy/pipeline.pyx | ||||
| source: spacy/pipeline/pipes.pyx | ||||
| --- | ||||
| 
 | ||||
| This class is a subclass of `Pipe` and follows the same API. The pipeline | ||||
|  | @ -209,7 +209,7 @@ Modify the pipe's model, to use the given parameter values. | |||
| > ```python | ||||
| > tagger = Tagger(nlp.vocab) | ||||
| > with tagger.use_params(): | ||||
| >     tagger.to_disk('/best_model') | ||||
| >     tagger.to_disk("/best_model") | ||||
| > ``` | ||||
| 
 | ||||
| | Name     | Type | Description                                                                                                | | ||||
|  | @ -225,7 +225,7 @@ Add a new label to the pipe. | |||
| > ```python | ||||
| > from spacy.symbols import POS | ||||
| > tagger = Tagger(nlp.vocab) | ||||
| > tagger.add_label('MY_LABEL', {POS: 'NOUN'}) | ||||
| > tagger.add_label("MY_LABEL", {POS: 'NOUN'}) | ||||
| > ``` | ||||
| 
 | ||||
| | Name     | Type    | Description                                                     | | ||||
|  | @ -241,7 +241,7 @@ Serialize the pipe to disk. | |||
| > | ||||
| > ```python | ||||
| > tagger = Tagger(nlp.vocab) | ||||
| > tagger.to_disk('/path/to/tagger') | ||||
| > tagger.to_disk("/path/to/tagger") | ||||
| > ``` | ||||
| 
 | ||||
| | Name   | Type             | Description                                                                                                           | | ||||
|  | @ -256,7 +256,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | |||
| > | ||||
| > ```python | ||||
| > tagger = Tagger(nlp.vocab) | ||||
| > tagger.from_disk('/path/to/tagger') | ||||
| > tagger.from_disk("/path/to/tagger") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type             | Description                                                                | | ||||
|  | @ -266,7 +266,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | |||
| 
 | ||||
| ## Tagger.to_bytes {#to_bytes tag="method"} | ||||
| 
 | ||||
| > #### example | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > tagger = Tagger(nlp.vocab) | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| --- | ||||
| title: TextCategorizer | ||||
| tag: class | ||||
| source: spacy/pipeline.pyx | ||||
| source: spacy/pipeline/pipes.pyx | ||||
| new: 2 | ||||
| --- | ||||
| 
 | ||||
|  | @ -227,7 +227,7 @@ Modify the pipe's model, to use the given parameter values. | |||
| > ```python | ||||
| > textcat = TextCategorizer(nlp.vocab) | ||||
| > with textcat.use_params(): | ||||
| >     textcat.to_disk('/best_model') | ||||
| >     textcat.to_disk("/best_model") | ||||
| > ``` | ||||
| 
 | ||||
| | Name     | Type | Description                                                                                                | | ||||
|  | @ -242,7 +242,7 @@ Add a new label to the pipe. | |||
| > | ||||
| > ```python | ||||
| > textcat = TextCategorizer(nlp.vocab) | ||||
| > textcat.add_label('MY_LABEL') | ||||
| > textcat.add_label("MY_LABEL") | ||||
| > ``` | ||||
| 
 | ||||
| | Name    | Type    | Description       | | ||||
|  | @ -257,7 +257,7 @@ Serialize the pipe to disk. | |||
| > | ||||
| > ```python | ||||
| > textcat = TextCategorizer(nlp.vocab) | ||||
| > textcat.to_disk('/path/to/textcat') | ||||
| > textcat.to_disk("/path/to/textcat") | ||||
| > ``` | ||||
| 
 | ||||
| | Name   | Type             | Description                                                                                                           | | ||||
|  | @ -272,7 +272,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | |||
| > | ||||
| > ```python | ||||
| > textcat = TextCategorizer(nlp.vocab) | ||||
| > textcat.from_disk('/path/to/textcat') | ||||
| > textcat.from_disk("/path/to/textcat") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type              | Description                                                                | | ||||
|  | @ -282,7 +282,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | |||
| 
 | ||||
| ## TextCategorizer.to_bytes {#to_bytes tag="method"} | ||||
| 
 | ||||
| > #### example | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > textcat = TextCategorizer(nlp.vocab) | ||||
|  |  | |||
|  | @ -324,7 +324,7 @@ A sequence containing the token and all the token's syntactic descendants. | |||
| ## Token.is_sent_start {#is_sent_start tag="property" new="2"} | ||||
| 
 | ||||
| A boolean value indicating whether the token starts a sentence. `None` if | ||||
| unknown. Defaults to `True` for the first token in the `doc`. | ||||
| unknown. Defaults to `True` for the first token in the `Doc`. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  |  | |||
|  | @ -116,6 +116,72 @@ details and examples. | |||
| | `string`      | unicode  | The string to specially tokenize.                                                                                                                                        | | ||||
| | `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. | | ||||
| 
 | ||||
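| For instance, the concatenation constraint means a special case like the one | ||||
| below is valid, because `"gim" + "me"` reproduces the original string (a | ||||
| sketch, assuming an English pipeline loaded as `nlp`): | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.attrs import ORTH, LEMMA | ||||
| > | ||||
| > special_case = [{ORTH: u"gim", LEMMA: u"give"}, {ORTH: u"me"}] | ||||
| > nlp.tokenizer.add_special_case(u"gimme", special_case) | ||||
| > assert [t.text for t in nlp(u"gimme that")] == [u"gim", u"me", u"that"] | ||||
| > ``` | ||||
| 
 | ||||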
| ## Tokenizer.to_disk {#to_disk tag="method"} | ||||
| 
 | ||||
| Serialize the tokenizer to disk. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > tokenizer = Tokenizer(nlp.vocab) | ||||
| > tokenizer.to_disk("/path/to/tokenizer") | ||||
| > ``` | ||||
| 
 | ||||
| | Name   | Type             | Description                                                                                                           | | ||||
| | ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | ||||
| | `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | ||||
| 
 | ||||
| ## Tokenizer.from_disk {#from_disk tag="method"} | ||||
| 
 | ||||
| Load the tokenizer from disk. Modifies the object in place and returns it. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > tokenizer = Tokenizer(nlp.vocab) | ||||
| > tokenizer.from_disk("/path/to/tokenizer") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type             | Description                                                                | | ||||
| | ----------- | ---------------- | -------------------------------------------------------------------------- | | ||||
| | `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | ||||
| | **RETURNS** | `Tokenizer`      | The modified `Tokenizer` object.                                           | | ||||
| 
 | ||||
| ## Tokenizer.to_bytes {#to_bytes tag="method"} | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > tokenizer = Tokenizer(nlp.vocab) | ||||
| > tokenizer_bytes = tokenizer.to_bytes() | ||||
| > ``` | ||||
| 
 | ||||
| Serialize the tokenizer to a bytestring. | ||||
| 
 | ||||
| | Name        | Type  | Description                                        | | ||||
| | ----------- | ----- | -------------------------------------------------- | | ||||
| | `**exclude` | -     | Named attributes to prevent from being serialized. | | ||||
| | **RETURNS** | bytes | The serialized form of the `Tokenizer` object.     | | ||||
| 
 | ||||
| ## Tokenizer.from_bytes {#from_bytes tag="method"} | ||||
| 
 | ||||
| Load the tokenizer from a bytestring. Modifies the object in place and returns | ||||
| it. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > tokenizer_bytes = tokenizer.to_bytes() | ||||
| > tokenizer = Tokenizer(nlp.vocab) | ||||
| > tokenizer.from_bytes(tokenizer_bytes) | ||||
| > ``` | ||||
| 
 | ||||
| | Name         | Type        | Description                                    | | ||||
| | ------------ | ----------- | ---------------------------------------------- | | ||||
| | `bytes_data` | bytes       | The data to load from.                         | | ||||
| | `**exclude`  | -           | Named attributes to prevent from being loaded. | | ||||
| | **RETURNS**  | `Tokenizer` | The `Tokenizer` object.                        | | ||||
| 
 | ||||
| ## Attributes {#attributes} | ||||
| 
 | ||||
| | Name             | Type    | Description                                                                                                                | | ||||
|  |  | |||
|  | @ -642,7 +642,7 @@ All Python code is written in an **intersection of Python 2 and Python 3**. This | |||
| is easy in Cython, but somewhat ugly in Python. Logic that deals with Python or | ||||
| platform compatibility only lives in `spacy.compat`. To distinguish them from | ||||
| the builtin functions, replacement functions are suffixed with an underscore, | ||||
| e.e `unicode_`. | ||||
| e.g. `unicode_`. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -660,7 +660,7 @@ e.e `unicode_`. | |||
| | `compat.input_`      | `raw_input`                        | `input`     | | ||||
| | `compat.path2str`    | `str(path)` with `.decode('utf8')` | `str(path)` | | ||||
| 
 | ||||
| ### compat.is_config {#is_config tag="function"} | ||||
| ### compat.is_config {#compat.is_config tag="function"} | ||||
| 
 | ||||
| Check if a specific configuration of Python version and operating system matches | ||||
| the user's setup. Mostly used to display targeted error messages. | ||||
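| 
 | ||||
| A quick sketch, assuming the boolean keyword arguments `python2`, `python3`, | ||||
| `windows`, `linux` and `osx`: | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.compat import is_config | ||||
| > | ||||
| > if is_config(python2=True, windows=True): | ||||
| >     print("You're running Python 2 on Windows") | ||||
| > ``` | ||||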
|  |  | |||
|  | @ -424,7 +424,7 @@ take a path to a JSON file containing the patterns. This lets you reuse the | |||
| component with different patterns, depending on your application: | ||||
| 
 | ||||
| ```python | ||||
| html_merger = BadHTMLMerger(nlp, path='/path/to/patterns.json') | ||||
| html_merger = BadHTMLMerger(nlp, path="/path/to/patterns.json") | ||||
| ``` | ||||
| 
 | ||||
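| For illustration, the component might then read the file as a list of | ||||
| [`Matcher`](/api/matcher) patterns, roughly like this (the file layout shown | ||||
| here is hypothetical and depends on how `BadHTMLMerger` is implemented): | ||||
| 
 | ||||
| ```python | ||||
| import json | ||||
| from pathlib import Path | ||||
| # Hypothetical layout of patterns.json: one Matcher pattern per entry, e.g. | ||||
| # [[{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}]] | ||||
| patterns = json.loads(Path("/path/to/patterns.json").read_text()) | ||||
| ``` | ||||
| 
 | ||||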
| <Infobox title="📖 Processing pipelines"> | ||||
|  |  | |||