From 96b91a8898799bb5cce0b264f0a7685643f46b9f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 7 Mar 2019 12:25:00 +0100 Subject: [PATCH 01/55] Fix noqa [ci skip] --- spacy/tests/lang/test_initialize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 4a01ba50a..9cd0a78c3 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -18,8 +18,8 @@ LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", @pytest.mark.parametrize("lang", LANGUAGES) def test_lang_initialize(lang, capfd): """Test that languages can be initialized.""" - nlp = get_lang_class(lang)() # noqa: F841 + nlp = get_lang_class(lang)() # Check for stray print statements (see #3342) - doc = nlp("test") + doc = nlp("test") # noqa: F841 captured = capfd.readouterr() assert not captured.out From 88909a9adbaf4f3710603db2697c39f9a7274357 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Thu, 7 Mar 2019 21:07:19 +0100 Subject: [PATCH 02/55] Fix egg fragments in direct download (#3369) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description The egg fragment in the URL must be of the form `#egg=package_name==version` instead of `#egg=package_name-version`. One of the consequences of specifying wrong egg fragments is that `pip` does not recognize the package and its version properly, and thus it re-downloads the package systematically. I'm not sure how this should be tested properly. Here is what I had before the fix when running the same direct download twice: ``` $ python -m spacy download en_core_web_sm-2.0.0 --direct Looking in indexes: https://pypi.python.org/simple/ Collecting en_core_web_sm-2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm-2.0.0 Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB) 100% |████████████████████████████████| 37.4MB 1.6MB/s Generating metadata for package en-core-web-sm-2.0.0 produced metadata for project name en-core-web-sm. Fix your #egg=en-core-web-sm-2.0.0 fragments. Installing collected packages: en-core-web-sm Running setup.py install for en-core-web-sm ... done Successfully installed en-core-web-sm-2.0.0 $ python -m spacy download en_core_web_sm-2.0.0 --direct Looking in indexes: https://pypi.python.org/simple/ Collecting en_core_web_sm-2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm-2.0.0 Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB) 100% |████████████████████████████████| 37.4MB 919kB/s Generating metadata for package en-core-web-sm-2.0.0 produced metadata for project name en-core-web-sm. Fix your #egg=en-core-web-sm-2.0.0 fragments. 
Requirement already satisfied (use --upgrade to upgrade): en-core-web-sm from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm-2.0.0 in ./venv3/lib/python3.6/site-packages ``` And after the fix: ``` $ python -m spacy download en_core_web_sm-2.0.0 --direct Looking in indexes: https://pypi.python.org/simple/ Collecting en_core_web_sm==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0 Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB) 100% |████████████████████████████████| 37.4MB 1.1MB/s Installing collected packages: en-core-web-sm Running setup.py install for en-core-web-sm ... done Successfully installed en-core-web-sm-2.0.0 $ python -m spacy download en_core_web_sm-2.0.0 --direct Looking in indexes: https://pypi.python.org/simple/ Requirement already satisfied: en_core_web_sm==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0 in ./venv3/lib/python3.6/site-packages (2.0.0) ``` ### Types of change This is an enhancement as it avoids unnecessary downloads of (potentially big) spacy models, when they have already been downloaded. ## Checklist - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. --- .github/contributors/adrienball.md | 106 +++++++++++++++++++++++++++++ spacy/cli/download.py | 7 +- 2 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/adrienball.md diff --git a/.github/contributors/adrienball.md b/.github/contributors/adrienball.md new file mode 100644 index 000000000..29d957bf6 --- /dev/null +++ b/.github/contributors/adrienball.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------------- | +| Name | Adrien Ball | +| Company name (if applicable) | | +| Title or role (if applicable) | Machine Learning Engineer | +| Date | 2019-03-07 | +| GitHub username | adrienball | +| Website (optional) | https://medium.com/@adrien_ball | \ No newline at end of file diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 82c810fcb..01e0d93ec 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -26,7 +26,12 @@ def download(model, direct=False, *pip_args): with version. """ if direct: - dl = download_model('{m}/{m}.tar.gz#egg={m}'.format(m=model), pip_args) + components = model.split("-") + model_name = "".join(components[:-1]) + version = components[-1] + dl = download_model( + '{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}'.format( + m=model_name, v=version), pip_args) else: shortcuts = get_json(about.__shortcuts__, "available shortcuts") model_name = shortcuts.get(model, model) From 296446a1c8afe99da9858e2816f954e78c61c184 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 8 Mar 2019 11:42:26 +0100 Subject: [PATCH 03/55] Tidy up and improve docs and docstrings (#3370) ## Description * tidy up and adjust Cython code to code style * improve docstrings and make calling `help()` nicer * add URLs to new docs pages to docstrings wherever possible, mostly to user-facing objects * fix various typos and inconsistencies in docs ### Types of change enhancement, docs ## Checklist - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. --- spacy/compat.py | 60 +- spacy/displacy/__init__.py | 12 + spacy/errors.py | 11 + spacy/gold.pyx | 196 ++-- spacy/language.py | 3 +- spacy/lemmatizer.py | 7 + spacy/lexeme.pyx | 20 +- spacy/matcher/__init__.py | 8 +- spacy/matcher/dependencymatcher.pyx | 164 +-- spacy/matcher/matcher.pyx | 1141 +++++++++++---------- spacy/matcher/phrasematcher.pyx | 42 +- spacy/pipeline/__init__.py | 25 +- spacy/pipeline/entityruler.py | 30 +- spacy/pipeline/functions.py | 14 +- spacy/pipeline/hooks.py | 2 + spacy/pipeline/pipes.pyx | 175 ++-- spacy/strings.pyx | 23 +- spacy/tokenizer.pyx | 85 +- spacy/tokens/__init__.py | 5 +- spacy/tokens/_retokenize.pyx | 35 +- spacy/tokens/_serialize.py | 6 +- spacy/tokens/doc.pyx | 356 ++++--- spacy/tokens/span.pyx | 258 +++-- spacy/tokens/token.pyx | 205 ++-- spacy/vectors.pyx | 128 ++- spacy/vocab.pyx | 93 +- website/docs/api/dependencyparser.md | 12 +- website/docs/api/doc.md | 77 +- website/docs/api/entityrecognizer.md | 12 +- website/docs/api/entityruler.md | 6 +- website/docs/api/pipeline-functions.md | 6 +- website/docs/api/sentencesegmenter.md | 2 +- website/docs/api/span.md | 15 +- website/docs/api/tagger.md | 12 +- website/docs/api/textcategorizer.md | 12 +- website/docs/api/token.md | 2 +- website/docs/api/tokenizer.md | 66 ++ website/docs/api/top-level.md | 4 +- website/docs/usage/rule-based-matching.md | 2 +- 39 files changed, 1942 insertions(+), 1390 deletions(-) diff --git a/spacy/compat.py b/spacy/compat.py index c1869b85f..8af49f254 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -1,4 +1,11 @@ # coding: utf8 +""" +Helpers for Python and platform compatibility. To distinguish them from +the builtin functions, replacement functions are suffixed with an underscore, +e.g. `unicode_`. 
+ +DOCS: https://spacy.io/api/top-level#compat +""" from __future__ import unicode_literals import os @@ -64,19 +71,23 @@ elif is_python3: def b_to_str(b_str): + """Convert a bytes object to a string. + + b_str (bytes): The object to convert. + RETURNS (unicode): The converted string. + """ if is_python2: return b_str - # important: if no encoding is set, string becomes "b'...'" + # Important: if no encoding is set, string becomes "b'...'" return str(b_str, encoding="utf8") -def getattr_(obj, name, *default): - if is_python3 and isinstance(name, bytes): - name = name.decode("utf8") - return getattr(obj, name, *default) - - def symlink_to(orig, dest): + """Create a symlink. Used for model shortcut links. + + orig (unicode / Path): The origin path. + dest (unicode / Path): The destination path of the symlink. + """ if is_windows: import subprocess @@ -86,6 +97,10 @@ def symlink_to(orig, dest): def symlink_remove(link): + """Remove a symlink. Used for model shortcut links. + + link (unicode / Path): The path to the symlink. + """ # https://stackoverflow.com/q/26554135/6400719 if os.path.isdir(path2str(link)) and is_windows: # this should only be on Py2.7 and windows @@ -95,6 +110,18 @@ def symlink_remove(link): def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): + """Check if a specific configuration of Python version and operating system + matches the user's setup. Mostly used to display targeted error messages. + + python2 (bool): spaCy is executed with Python 2.x. + python3 (bool): spaCy is executed with Python 3.x. + windows (bool): spaCy is executed on Windows. + linux (bool): spaCy is executed on Linux. + osx (bool): spaCy is executed on OS X or macOS. + RETURNS (bool): Whether the configuration matches the user's platform. + + DOCS: https://spacy.io/api/top-level#compat.is_config + """ return ( python2 in (None, is_python2) and python3 in (None, is_python3) @@ -104,19 +131,14 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): ) -def normalize_string_keys(old): - """Given a dictionary, make sure keys are unicode strings, not bytes.""" - new = {} - for key, value in old.items(): - if isinstance(key, bytes_): - new[key.decode("utf8")] = value - else: - new[key] = value - return new - - def import_file(name, loc): - loc = str(loc) + """Import module from a file. Used to load models from a directory. + + name (unicode): Name of module to load. + loc (unicode / Path): Path to the file. + RETURNS: The loaded module. + """ + loc = path2str(loc) if is_python_pre_3_5: import imp diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index f8886848d..6c5509b14 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -1,4 +1,10 @@ # coding: utf8 +""" +spaCy's built in visualization suite for dependencies and named entities. + +DOCS: https://spacy.io/api/top-level#displacy +USAGE: https://spacy.io/usage/visualizers +""" from __future__ import unicode_literals from .render import DependencyRenderer, EntityRenderer @@ -25,6 +31,9 @@ def render( options (dict): Visualiser-specific options, e.g. colors. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. RETURNS (unicode): Rendered HTML markup. + + DOCS: https://spacy.io/api/top-level#displacy.render + USAGE: https://spacy.io/usage/visualizers """ factories = { "dep": (DependencyRenderer, parse_deps), @@ -71,6 +80,9 @@ def serve( manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. 
port (int): Port to serve visualisation. host (unicode): Host to serve visualisation. + + DOCS: https://spacy.io/api/top-level#displacy.serve + USAGE: https://spacy.io/usage/visualizers """ from wsgiref import simple_server diff --git a/spacy/errors.py b/spacy/errors.py index 2a501089d..13382d146 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -338,6 +338,17 @@ class Errors(object): "or with a getter AND setter.") E120 = ("Can't set custom extension attributes during retokenization. " "Expected dict mapping attribute names to values, but got: {value}") + E121 = ("Can't bulk merge spans. Attribute length {attr_len} should be " + "equal to span length ({span_len}).") + E122 = ("Cannot find token to be split. Did it get merged?") + E123 = ("Cannot find head of token to be split. Did it get merged?") + E124 = ("Cannot read from file: {path}. Supported formats: .json, .msg") + E125 = ("Unexpected value: {value}") + E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. " + "This is likely a bug in spaCy, so feel free to open an issue.") + E127 = ("Cannot create phrase pattern representation for length 0. This " + "is likely a bug in spaCy.") + @add_codes diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 756e6a5fa..d03d13d2d 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -14,34 +14,38 @@ from . import _align from .syntax import nonproj from .tokens import Doc, Span from .errors import Errors +from .compat import path2str from . import util from .util import minibatch, itershuffle from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek +punct_re = re.compile(r"\W") + + def tags_to_entities(tags): entities = [] start = None for i, tag in enumerate(tags): if tag is None: continue - if tag.startswith('O'): + if tag.startswith("O"): # TODO: We shouldn't be getting these malformed inputs. Fix this. if start is not None: start = None continue - elif tag == '-': + elif tag == "-": continue - elif tag.startswith('I'): + elif tag.startswith("I"): if start is None: - raise ValueError(Errors.E067.format(tags=tags[:i+1])) + raise ValueError(Errors.E067.format(tags=tags[:i + 1])) continue - if tag.startswith('U'): + if tag.startswith("U"): entities.append((tag[2:], i, i)) - elif tag.startswith('B'): + elif tag.startswith("B"): start = i - elif tag.startswith('L'): + elif tag.startswith("L"): entities.append((tag[2:], start, i)) start = None else: @@ -60,19 +64,18 @@ def merge_sents(sents): m_deps[3].extend(head + i for head in heads) m_deps[4].extend(labels) m_deps[5].extend(ner) - m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) + m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) for b in brackets) i += len(ids) return [(m_deps, m_brackets)] -punct_re = re.compile(r'\W') def align(cand_words, gold_words): if cand_words == gold_words: alignment = numpy.arange(len(cand_words)) return 0, alignment, alignment, {}, {} - cand_words = [w.replace(' ', '').lower() for w in cand_words] - gold_words = [w.replace(' ', '').lower() for w in gold_words] + cand_words = [w.replace(" ", "").lower() for w in cand_words] + gold_words = [w.replace(" ", "").lower() for w in gold_words] cost, i2j, j2i, matrix = _align.align(cand_words, gold_words) i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in cand_words], [len(w) for w in gold_words]) @@ -89,7 +92,10 @@ def align(cand_words, gold_words): class GoldCorpus(object): """An annotated corpus, using the JSON file format. 
Manages - annotations for tagging, dependency parsing and NER.""" + annotations for tagging, dependency parsing and NER. + + DOCS: https://spacy.io/api/goldcorpus + """ def __init__(self, train, dev, gold_preproc=False, limit=None): """Create a GoldCorpus. @@ -101,12 +107,10 @@ class GoldCorpus(object): if isinstance(train, str) or isinstance(train, Path): train = self.read_tuples(self.walk_corpus(train)) dev = self.read_tuples(self.walk_corpus(dev)) - - # Write temp directory with one doc per file, so we can shuffle - # and stream + # Write temp directory with one doc per file, so we can shuffle and stream self.tmp_dir = Path(tempfile.mkdtemp()) - self.write_msgpack(self.tmp_dir / 'train', train, limit=self.limit) - self.write_msgpack(self.tmp_dir / 'dev', dev, limit=self.limit) + self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit) + self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit) def __del__(self): shutil.rmtree(self.tmp_dir) @@ -117,7 +121,7 @@ class GoldCorpus(object): directory.mkdir() n = 0 for i, doc_tuple in enumerate(doc_tuples): - srsly.write_msgpack(directory / '{}.msg'.format(i), [doc_tuple]) + srsly.write_msgpack(directory / "{}.msg".format(i), [doc_tuple]) n += len(doc_tuple[1]) if limit and n >= limit: break @@ -134,11 +138,11 @@ class GoldCorpus(object): if str(path) in seen: continue seen.add(str(path)) - if path.parts[-1].startswith('.'): + if path.parts[-1].startswith("."): continue elif path.is_dir(): paths.extend(path.iterdir()) - elif path.parts[-1].endswith('.json'): + elif path.parts[-1].endswith(".json"): locs.append(path) return locs @@ -147,13 +151,12 @@ class GoldCorpus(object): i = 0 for loc in locs: loc = util.ensure_path(loc) - if loc.parts[-1].endswith('json'): + if loc.parts[-1].endswith("json"): gold_tuples = read_json_file(loc) - elif loc.parts[-1].endswith('msg'): + elif loc.parts[-1].endswith("msg"): gold_tuples = srsly.read_msgpack(loc) else: - msg = "Cannot read from file: %s. 
Supported formats: .json, .msg" - raise ValueError(msg % loc) + raise ValueError(Errors.E124.format(path=path2str(loc))) for item in gold_tuples: yield item i += len(item[1]) @@ -162,12 +165,12 @@ class GoldCorpus(object): @property def dev_tuples(self): - locs = (self.tmp_dir / 'dev').iterdir() + locs = (self.tmp_dir / "dev").iterdir() yield from self.read_tuples(locs, limit=self.limit) @property def train_tuples(self): - locs = (self.tmp_dir / 'train').iterdir() + locs = (self.tmp_dir / "train").iterdir() yield from self.read_tuples(locs, limit=self.limit) def count_train(self): @@ -193,8 +196,7 @@ class GoldCorpus(object): yield from gold_docs def dev_docs(self, nlp, gold_preproc=False): - gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, - gold_preproc=gold_preproc) + gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc=gold_preproc) yield from gold_docs @classmethod @@ -205,32 +207,29 @@ class GoldCorpus(object): raw_text = None else: paragraph_tuples = merge_sents(paragraph_tuples) - docs = cls._make_docs(nlp, raw_text, paragraph_tuples, - gold_preproc, noise_level=noise_level) + docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc, + noise_level=noise_level) golds = cls._make_golds(docs, paragraph_tuples, make_projective) for doc, gold in zip(docs, golds): if (not max_length) or len(doc) < max_length: yield doc, gold @classmethod - def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, - noise_level=0.0): + def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0): if raw_text is not None: raw_text = add_noise(raw_text, noise_level) return [nlp.make_doc(raw_text)] else: - return [Doc(nlp.vocab, - words=add_noise(sent_tuples[1], noise_level)) + return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level)) for (sent_tuples, brackets) in paragraph_tuples] @classmethod def _make_golds(cls, docs, paragraph_tuples, make_projective): if len(docs) != len(paragraph_tuples): - raise ValueError(Errors.E070.format(n_docs=len(docs), - n_annots=len(paragraph_tuples))) + n_annots = len(paragraph_tuples) + raise ValueError(Errors.E070.format(n_docs=len(docs), n_annots=n_annots)) if len(docs) == 1: - return [GoldParse.from_annot_tuples(docs[0], - paragraph_tuples[0][0], + return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0], make_projective=make_projective)] else: return [GoldParse.from_annot_tuples(doc, sent_tuples, @@ -247,18 +246,18 @@ def add_noise(orig, noise_level): corrupted = [w for w in corrupted if w] return corrupted else: - return ''.join(_corrupt(c, noise_level) for c in orig) + return "".join(_corrupt(c, noise_level) for c in orig) def _corrupt(c, noise_level): if random.random() >= noise_level: return c - elif c == ' ': - return '\n' - elif c == '\n': - return ' ' - elif c in ['.', "'", "!", "?", ',']: - return '' + elif c == " ": + return "\n" + elif c == "\n": + return " " + elif c in [".", "'", "!", "?", ","]: + return "" else: return c.lower() @@ -284,30 +283,30 @@ def json_to_tuple(doc): YIELDS (tuple): The reformatted data. 
""" paragraphs = [] - for paragraph in doc['paragraphs']: + for paragraph in doc["paragraphs"]: sents = [] - for sent in paragraph['sentences']: + for sent in paragraph["sentences"]: words = [] ids = [] tags = [] heads = [] labels = [] ner = [] - for i, token in enumerate(sent['tokens']): - words.append(token['orth']) + for i, token in enumerate(sent["tokens"]): + words.append(token["orth"]) ids.append(i) - tags.append(token.get('tag', '-')) - heads.append(token.get('head', 0) + i) - labels.append(token.get('dep', '')) + tags.append(token.get('tag', "-")) + heads.append(token.get("head", 0) + i) + labels.append(token.get("dep", "")) # Ensure ROOT label is case-insensitive - if labels[-1].lower() == 'root': - labels[-1] = 'ROOT' - ner.append(token.get('ner', '-')) + if labels[-1].lower() == "root": + labels[-1] = "ROOT" + ner.append(token.get("ner", "-")) sents.append([ [ids, words, tags, heads, labels, ner], - sent.get('brackets', [])]) + sent.get("brackets", [])]) if sents: - yield [paragraph.get('raw', None), sents] + yield [paragraph.get("raw", None), sents] def read_json_file(loc, docs_filter=None, limit=None): @@ -329,7 +328,7 @@ def _json_iterate(loc): # It's okay to read in the whole file -- just don't parse it into JSON. cdef bytes py_raw loc = util.ensure_path(loc) - with loc.open('rb') as file_: + with loc.open("rb") as file_: py_raw = file_.read() raw = py_raw cdef int square_depth = 0 @@ -339,11 +338,11 @@ def _json_iterate(loc): cdef int start = -1 cdef char c cdef char quote = ord('"') - cdef char backslash = ord('\\') - cdef char open_square = ord('[') - cdef char close_square = ord(']') - cdef char open_curly = ord('{') - cdef char close_curly = ord('}') + cdef char backslash = ord("\\") + cdef char open_square = ord("[") + cdef char close_square = ord("]") + cdef char open_curly = ord("{") + cdef char close_curly = ord("}") for i in range(len(py_raw)): c = raw[i] if escape: @@ -368,7 +367,7 @@ def _json_iterate(loc): elif c == close_curly: curly_depth -= 1 if square_depth == 1 and curly_depth == 0: - py_str = py_raw[start : i+1].decode('utf8') + py_str = py_raw[start : i + 1].decode("utf8") try: yield srsly.json_loads(py_str) except Exception: @@ -388,7 +387,7 @@ def iob_to_biluo(tags): def _consume_os(tags): - while tags and tags[0] == 'O': + while tags and tags[0] == "O": yield tags.pop(0) @@ -396,24 +395,27 @@ def _consume_ent(tags): if not tags: return [] tag = tags.pop(0) - target_in = 'I' + tag[1:] - target_last = 'L' + tag[1:] + target_in = "I" + tag[1:] + target_last = "L" + tag[1:] length = 1 while tags and tags[0] in {target_in, target_last}: length += 1 tags.pop(0) label = tag[2:] if length == 1: - return ['U-' + label] + return ["U-" + label] else: - start = 'B-' + label - end = 'L-' + label - middle = ['I-%s' % label for _ in range(1, length - 1)] + start = "B-" + label + end = "L-" + label + middle = ["I-%s" % label for _ in range(1, length - 1)] return [start] + middle + [end] cdef class GoldParse: - """Collection for training annotations.""" + """Collection for training annotations. 
+ + DOCS: https://spacy.io/api/goldparse + """ @classmethod def from_annot_tuples(cls, doc, annot_tuples, make_projective=False): _, words, tags, heads, deps, entities = annot_tuples @@ -456,13 +458,13 @@ cdef class GoldParse: if deps is None: deps = [None for _ in doc] if entities is None: - entities = ['-' for _ in doc] + entities = ["-" for _ in doc] elif len(entities) == 0: - entities = ['O' for _ in doc] + entities = ["O" for _ in doc] else: # Translate the None values to '-', to make processing easier. # See Issue #2603 - entities = [(ent if ent is not None else '-') for ent in entities] + entities = [(ent if ent is not None else "-") for ent in entities] if not isinstance(entities[0], basestring): # Assume we have entities specified by character offset. entities = biluo_tags_from_offsets(doc, entities) @@ -508,10 +510,10 @@ cdef class GoldParse: for i, gold_i in enumerate(self.cand_to_gold): if doc[i].text.isspace(): self.words[i] = doc[i].text - self.tags[i] = '_SP' + self.tags[i] = "_SP" self.heads[i] = None self.labels[i] = None - self.ner[i] = 'O' + self.ner[i] = "O" if gold_i is None: if i in i2j_multi: self.words[i] = words[i2j_multi[i]] @@ -521,7 +523,7 @@ cdef class GoldParse: # Set next word in multi-token span as head, until last if not is_last: self.heads[i] = i+1 - self.labels[i] = 'subtok' + self.labels[i] = "subtok" else: self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]] self.labels[i] = deps[i2j_multi[i]] @@ -530,24 +532,24 @@ cdef class GoldParse: # BILOU tags. We can't have BB or LL etc. # Case 1: O -- easy. ner_tag = entities[i2j_multi[i]] - if ner_tag == 'O': - self.ner[i] = 'O' + if ner_tag == "O": + self.ner[i] = "O" # Case 2: U. This has to become a B I* L sequence. - elif ner_tag.startswith('U-'): + elif ner_tag.startswith("U-"): if is_first: - self.ner[i] = ner_tag.replace('U-', 'B-', 1) + self.ner[i] = ner_tag.replace("U-", "B-", 1) elif is_last: - self.ner[i] = ner_tag.replace('U-', 'L-', 1) + self.ner[i] = ner_tag.replace("U-", "L-", 1) else: - self.ner[i] = ner_tag.replace('U-', 'I-', 1) + self.ner[i] = ner_tag.replace("U-", "I-", 1) # Case 3: L. If not last, change to I. - elif ner_tag.startswith('L-'): + elif ner_tag.startswith("L-"): if is_last: self.ner[i] = ner_tag else: - self.ner[i] = ner_tag.replace('L-', 'I-', 1) + self.ner[i] = ner_tag.replace("L-", "I-", 1) # Case 4: I. Stays correct - elif ner_tag.startswith('I-'): + elif ner_tag.startswith("I-"): self.ner[i] = ner_tag else: self.words[i] = words[gold_i] @@ -608,7 +610,7 @@ def docs_to_json(docs, underscore=None): return [doc.to_json(underscore=underscore) for doc in docs] -def biluo_tags_from_offsets(doc, entities, missing='O'): +def biluo_tags_from_offsets(doc, entities, missing="O"): """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out scheme (BILUO). 
@@ -631,11 +633,11 @@ def biluo_tags_from_offsets(doc, entities, missing='O'): >>> entities = [(len('I like '), len('I like London'), 'LOC')] >>> doc = nlp.tokenizer(text) >>> tags = biluo_tags_from_offsets(doc, entities) - >>> assert tags == ['O', 'O', 'U-LOC', 'O'] + >>> assert tags == ["O", "O", 'U-LOC', "O"] """ starts = {token.idx: token.i for token in doc} - ends = {token.idx+len(token): token.i for token in doc} - biluo = ['-' for _ in doc] + ends = {token.idx + len(token): token.i for token in doc} + biluo = ["-" for _ in doc] # Handle entity cases for start_char, end_char, label in entities: start_token = starts.get(start_char) @@ -643,19 +645,19 @@ def biluo_tags_from_offsets(doc, entities, missing='O'): # Only interested if the tokenization is correct if start_token is not None and end_token is not None: if start_token == end_token: - biluo[start_token] = 'U-%s' % label + biluo[start_token] = "U-%s" % label else: - biluo[start_token] = 'B-%s' % label + biluo[start_token] = "B-%s" % label for i in range(start_token+1, end_token): - biluo[i] = 'I-%s' % label - biluo[end_token] = 'L-%s' % label + biluo[i] = "I-%s" % label + biluo[end_token] = "L-%s" % label # Now distinguish the O cases from ones where we miss the tokenization entity_chars = set() for start_char, end_char, label in entities: for i in range(start_char, end_char): entity_chars.add(i) for token in doc: - for i in range(token.idx, token.idx+len(token)): + for i in range(token.idx, token.idx + len(token)): if i in entity_chars: break else: @@ -697,4 +699,4 @@ def offsets_from_biluo_tags(doc, tags): def is_punct_label(label): - return label == 'P' or label.lower() == 'punct' + return label == "P" or label.lower() == "punct" diff --git a/spacy/language.py b/spacy/language.py index 0c0cf8854..723c49ef7 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -103,8 +103,9 @@ class Language(object): Defaults (class): Settings, data and factory methods for creating the `nlp` object and processing pipeline. lang (unicode): Two-letter language ID, i.e. ISO code. - """ + DOCS: https://spacy.io/api/language + """ Defaults = BaseDefaults lang = None diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 842bf3041..1aea308f9 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -6,6 +6,13 @@ from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos class Lemmatizer(object): + """ + The Lemmatizer supports simple part-of-speech-sensitive suffix rules and + lookup tables. + + DOCS: https://spacy.io/api/lemmatizer + """ + @classmethod def load(cls, path, index=None, exc=None, rules=None, lookup=None): return cls(index, exc, rules, lookup) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 4f614e6fd..8a1c0b2de 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -4,17 +4,19 @@ from __future__ import unicode_literals, print_function # Compiler crashes on memory view coercion without this. Should report bug. 
from cython.view cimport array as cvarray +from libc.string cimport memset cimport numpy as np np.import_array() -from libc.string cimport memset + import numpy from thinc.neural.util import get_array_module from .typedefs cimport attr_t, flags_t from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP -from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV -from .attrs cimport PROB +from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT +from .attrs cimport IS_CURRENCY, IS_OOV, PROB + from .attrs import intify_attrs from .errors import Errors, Warnings, user_warning @@ -27,6 +29,8 @@ cdef class Lexeme: word-type, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse, or lemma (lemmatization depends on the part-of-speech tag). + + DOCS: https://spacy.io/api/lexeme """ def __init__(self, Vocab vocab, attr_t orth): """Create a Lexeme object. @@ -115,15 +119,15 @@ cdef class Lexeme: RETURNS (float): A scalar similarity score. Higher is more similar. """ # Return 1.0 similarity for matches - if hasattr(other, 'orth'): + if hasattr(other, "orth"): if self.c.orth == other.orth: return 1.0 - elif hasattr(other, '__len__') and len(other) == 1 \ - and hasattr(other[0], 'orth'): + elif hasattr(other, "__len__") and len(other) == 1 \ + and hasattr(other[0], "orth"): if self.c.orth == other[0].orth: return 1.0 if self.vector_norm == 0 or other.vector_norm == 0: - user_warning(Warnings.W008.format(obj='Lexeme')) + user_warning(Warnings.W008.format(obj="Lexeme")) return 0.0 vector = self.vector xp = get_array_module(vector) @@ -136,7 +140,7 @@ cdef class Lexeme: if (end-start) != sizeof(lex_data.data): raise ValueError(Errors.E072.format(length=end-start, bad_length=sizeof(lex_data.data))) - byte_string = b'\0' * sizeof(lex_data.data) + byte_string = b"\0" * sizeof(lex_data.data) byte_chars = byte_string for i in range(sizeof(lex_data.data)): byte_chars[i] = lex_data.data[i] diff --git a/spacy/matcher/__init__.py b/spacy/matcher/__init__.py index e9105d78b..d3923754b 100644 --- a/spacy/matcher/__init__.py +++ b/spacy/matcher/__init__.py @@ -1,6 +1,8 @@ # coding: utf8 from __future__ import unicode_literals -from .matcher import Matcher # noqa: F401 -from .phrasematcher import PhraseMatcher # noqa: F401 -from .dependencymatcher import DependencyTreeMatcher # noqa: F401 +from .matcher import Matcher +from .phrasematcher import PhraseMatcher +from .dependencymatcher import DependencyTreeMatcher + +__all__ = ["Matcher", "PhraseMatcher", "DependencyTreeMatcher"] diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 1a4b23e11..8fca95a2d 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -13,7 +13,7 @@ from .matcher import unpickle_matcher from ..errors import Errors -DELIMITER = '||' +DELIMITER = "||" INDEX_HEAD = 1 INDEX_RELOP = 0 @@ -55,7 +55,8 @@ cdef class DependencyTreeMatcher: return (unpickle_matcher, data, None, None) def __len__(self): - """Get the number of rules, which are edges ,added to the dependency tree matcher. + """Get the number of rules, which are edges, added to the dependency + tree matcher. RETURNS (int): The number of rules. 
""" @@ -73,19 +74,30 @@ cdef class DependencyTreeMatcher: idx = 0 visitedNodes = {} for relation in pattern: - if 'PATTERN' not in relation or 'SPEC' not in relation: + if "PATTERN" not in relation or "SPEC" not in relation: raise ValueError(Errors.E098.format(key=key)) if idx == 0: - if not('NODE_NAME' in relation['SPEC'] and 'NBOR_RELOP' not in relation['SPEC'] and 'NBOR_NAME' not in relation['SPEC']): + if not( + "NODE_NAME" in relation["SPEC"] + and "NBOR_RELOP" not in relation["SPEC"] + and "NBOR_NAME" not in relation["SPEC"] + ): raise ValueError(Errors.E099.format(key=key)) - visitedNodes[relation['SPEC']['NODE_NAME']] = True + visitedNodes[relation["SPEC"]["NODE_NAME"]] = True else: - if not('NODE_NAME' in relation['SPEC'] and 'NBOR_RELOP' in relation['SPEC'] and 'NBOR_NAME' in relation['SPEC']): + if not( + "NODE_NAME" in relation["SPEC"] + and "NBOR_RELOP" in relation["SPEC"] + and "NBOR_NAME" in relation["SPEC"] + ): raise ValueError(Errors.E100.format(key=key)) - if relation['SPEC']['NODE_NAME'] in visitedNodes or relation['SPEC']['NBOR_NAME'] not in visitedNodes: + if ( + relation["SPEC"]["NODE_NAME"] in visitedNodes + or relation["SPEC"]["NBOR_NAME"] not in visitedNodes + ): raise ValueError(Errors.E101.format(key=key)) - visitedNodes[relation['SPEC']['NODE_NAME']] = True - visitedNodes[relation['SPEC']['NBOR_NAME']] = True + visitedNodes[relation["SPEC"]["NODE_NAME"]] = True + visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True idx = idx + 1 def add(self, key, on_match, *patterns): @@ -93,55 +105,46 @@ cdef class DependencyTreeMatcher: if len(pattern) == 0: raise ValueError(Errors.E012.format(key=key)) self.validateInput(pattern,key) - key = self._normalize_key(key) - _patterns = [] for pattern in patterns: token_patterns = [] for i in range(len(pattern)): - token_pattern = [pattern[i]['PATTERN']] + token_pattern = [pattern[i]["PATTERN"]] token_patterns.append(token_pattern) # self.patterns.append(token_patterns) _patterns.append(token_patterns) - self._patterns.setdefault(key, []) self._callbacks[key] = on_match self._patterns[key].extend(_patterns) - - # Add each node pattern of all the input patterns individually to the matcher. - # This enables only a single instance of Matcher to be used. + # Add each node pattern of all the input patterns individually to the + # matcher. This enables only a single instance of Matcher to be used. # Multiple adds are required to track each node pattern. _keys_to_token_list = [] for i in range(len(_patterns)): _keys_to_token = {} - # TODO : Better ways to hash edges in pattern? + # TODO: Better ways to hash edges in pattern? for j in range(len(_patterns[i])): - k = self._normalize_key(unicode(key)+DELIMITER+unicode(i)+DELIMITER+unicode(j)) - self.token_matcher.add(k,None,_patterns[i][j]) + k = self._normalize_key(unicode(key) + DELIMITER + unicode(i) + DELIMITER + unicode(j)) + self.token_matcher.add(k, None, _patterns[i][j]) _keys_to_token[k] = j _keys_to_token_list.append(_keys_to_token) - self._keys_to_token.setdefault(key, []) self._keys_to_token[key].extend(_keys_to_token_list) - _nodes_list = [] for pattern in patterns: nodes = {} for i in range(len(pattern)): - nodes[pattern[i]['SPEC']['NODE_NAME']]=i + nodes[pattern[i]["SPEC"]["NODE_NAME"]] = i _nodes_list.append(nodes) - self._nodes.setdefault(key, []) self._nodes[key].extend(_nodes_list) + # Create an object tree to traverse later on. This data structure + # enables easy tree pattern match. 
Doc-Token based tree cannot be + # reused since it is memory-heavy and tightly coupled with the Doc. + self.retrieve_tree(patterns, _nodes_list,key) - # Create an object tree to traverse later on. - # This datastructure enable easy tree pattern match. - # Doc-Token based tree cannot be reused since it is memory heavy and - # tightly coupled with doc - self.retrieve_tree(patterns,_nodes_list,key) - - def retrieve_tree(self,patterns,_nodes_list,key): + def retrieve_tree(self, patterns, _nodes_list, key): _heads_list = [] _root_list = [] for i in range(len(patterns)): @@ -149,31 +152,29 @@ cdef class DependencyTreeMatcher: root = -1 for j in range(len(patterns[i])): token_pattern = patterns[i][j] - if('NBOR_RELOP' not in token_pattern['SPEC']): - heads[j] = ('root',j) + if ("NBOR_RELOP" not in token_pattern["SPEC"]): + heads[j] = ('root', j) root = j else: - heads[j] = (token_pattern['SPEC']['NBOR_RELOP'],_nodes_list[i][token_pattern['SPEC']['NBOR_NAME']]) - + heads[j] = ( + token_pattern["SPEC"]["NBOR_RELOP"], + _nodes_list[i][token_pattern["SPEC"]["NBOR_NAME"]] + ) _heads_list.append(heads) _root_list.append(root) - _tree_list = [] for i in range(len(patterns)): tree = {} for j in range(len(patterns[i])): if(_heads_list[i][j][INDEX_HEAD] == j): continue - head = _heads_list[i][j][INDEX_HEAD] if(head not in tree): tree[head] = [] - tree[head].append( (_heads_list[i][j][INDEX_RELOP],j) ) + tree[head].append((_heads_list[i][j][INDEX_RELOP], j)) _tree_list.append(tree) - self._tree.setdefault(key, []) self._tree[key].extend(_tree_list) - self._root.setdefault(key, []) self._root[key].extend(_root_list) @@ -199,7 +200,6 @@ cdef class DependencyTreeMatcher: def __call__(self, Doc doc): matched_trees = [] - matches = self.token_matcher(doc) for key in list(self._patterns.keys()): _patterns_list = self._patterns[key] @@ -216,39 +216,51 @@ cdef class DependencyTreeMatcher: id_to_position = {} for i in range(len(_nodes)): id_to_position[i]=[] - - # This could be taken outside to improve running time..? + # TODO: This could be taken outside to improve running time..? for match_id, start, end in matches: if match_id in _keys_to_token: id_to_position[_keys_to_token[match_id]].append(start) - - _node_operator_map = self.get_node_operator_map(doc,_tree,id_to_position,_nodes,_root) + _node_operator_map = self.get_node_operator_map( + doc, + _tree, + id_to_position, + _nodes,_root + ) length = len(_nodes) if _root in id_to_position: candidates = id_to_position[_root] for candidate in candidates: isVisited = {} - self.dfs(candidate,_root,_tree,id_to_position,doc,isVisited,_node_operator_map) - # To check if the subtree pattern is completely identified. This is a heuristic. - # This is done to reduce the complexity of exponential unordered subtree matching. - # Will give approximate matches in some cases. + self.dfs( + candidate, + _root,_tree, + id_to_position, + doc, + isVisited, + _node_operator_map + ) + # To check if the subtree pattern is completely + # identified. This is a heuristic. This is done to + # reduce the complexity of exponential unordered subtree + # matching. Will give approximate matches in some cases. 
if(len(isVisited) == length): matched_trees.append((key,list(isVisited))) - for i, (ent_id, nodes) in enumerate(matched_trees): on_match = self._callbacks.get(ent_id) if on_match is not None: on_match(self, doc, i, matches) - return matched_trees def dfs(self,candidate,root,tree,id_to_position,doc,isVisited,_node_operator_map): - if(root in id_to_position and candidate in id_to_position[root]): - # color the node since it is valid + if (root in id_to_position and candidate in id_to_position[root]): + # Color the node since it is valid isVisited[candidate] = True if root in tree: for root_child in tree[root]: - if candidate in _node_operator_map and root_child[INDEX_RELOP] in _node_operator_map[candidate]: + if ( + candidate in _node_operator_map + and root_child[INDEX_RELOP] in _node_operator_map[candidate] + ): candidate_children = _node_operator_map[candidate][root_child[INDEX_RELOP]] for candidate_child in candidate_children: result = self.dfs( @@ -275,72 +287,68 @@ cdef class DependencyTreeMatcher: for child in tree[node]: all_operators.append(child[INDEX_RELOP]) all_operators = list(set(all_operators)) - all_nodes = [] for node in all_node_indices: all_nodes = all_nodes + id_to_position[node] all_nodes = list(set(all_nodes)) - for node in all_nodes: _node_operator_map[node] = {} for operator in all_operators: _node_operator_map[node][operator] = [] - # Used to invoke methods for each operator switcher = { - '<':self.dep, - '>':self.gov, - '>>':self.dep_chain, - '<<':self.gov_chain, - '.':self.imm_precede, - '$+':self.imm_right_sib, - '$-':self.imm_left_sib, - '$++':self.right_sib, - '$--':self.left_sib + "<": self.dep, + ">": self.gov, + ">>": self.dep_chain, + "<<": self.gov_chain, + ".": self.imm_precede, + "$+": self.imm_right_sib, + "$-": self.imm_left_sib, + "$++": self.right_sib, + "$--": self.left_sib } for operator in all_operators: for node in all_nodes: _node_operator_map[node][operator] = switcher.get(operator)(doc,node) - return _node_operator_map - def dep(self,doc,node): + def dep(self, doc, node): return list(doc[node].head) def gov(self,doc,node): return list(doc[node].children) - def dep_chain(self,doc,node): + def dep_chain(self, doc, node): return list(doc[node].ancestors) - def gov_chain(self,doc,node): + def gov_chain(self, doc, node): return list(doc[node].subtree) - def imm_precede(self,doc,node): - if node>0: - return [doc[node-1]] + def imm_precede(self, doc, node): + if node > 0: + return [doc[node - 1]] return [] - def imm_right_sib(self,doc,node): + def imm_right_sib(self, doc, node): for idx in range(list(doc[node].head.children)): - if idx == node-1: + if idx == node - 1: return [doc[idx]] return [] - def imm_left_sib(self,doc,node): + def imm_left_sib(self, doc, node): for idx in range(list(doc[node].head.children)): - if idx == node+1: + if idx == node + 1: return [doc[idx]] return [] - def right_sib(self,doc,node): + def right_sib(self, doc, node): candidate_children = [] for idx in range(list(doc[node].head.children)): if idx < node: candidate_children.append(doc[idx]) return candidate_children - def left_sib(self,doc,node): + def left_sib(self, doc, node): candidate_children = [] for idx in range(list(doc[node].head.children)): if idx > node: diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 17b0a4a36..89615e7b6 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -24,575 +24,15 @@ from ..strings import get_string_id from ..attrs import IDS -cdef find_matches(TokenPatternC** patterns, int n, Doc doc, 
extensions=None, - predicates=tuple()): - '''Find matches in a doc, with a compiled array of patterns. Matches are - returned as a list of (id, start, end) tuples. - - To augment the compiled patterns, we optionally also take two Python lists. - - The "predicates" list contains functions that take a Python list and return a - boolean value. It's mostly used for regular expressions. - - The "extra_getters" list contains functions that take a Python list and return - an attr ID. It's mostly used for extension attributes. - ''' - cdef vector[PatternStateC] states - cdef vector[MatchC] matches - cdef PatternStateC state - cdef int i, j, nr_extra_attr - cdef Pool mem = Pool() - predicate_cache = mem.alloc(doc.length * len(predicates), sizeof(char)) - if extensions is not None and len(extensions) >= 1: - nr_extra_attr = max(extensions.values()) + 1 - extra_attr_values = mem.alloc(doc.length * nr_extra_attr, sizeof(attr_t)) - else: - nr_extra_attr = 0 - extra_attr_values = mem.alloc(doc.length, sizeof(attr_t)) - for i, token in enumerate(doc): - for name, index in extensions.items(): - value = token._.get(name) - if isinstance(value, basestring): - value = token.vocab.strings[value] - extra_attr_values[i * nr_extra_attr + index] = value - # Main loop - cdef int nr_predicate = len(predicates) - for i in range(doc.length): - for j in range(n): - states.push_back(PatternStateC(patterns[j], i, 0)) - transition_states(states, matches, predicate_cache, - doc[i], extra_attr_values, predicates) - extra_attr_values += nr_extra_attr - predicate_cache += len(predicates) - # Handle matches that end in 0-width patterns - finish_states(matches, states) - output = [] - seen = set() - for i in range(matches.size()): - match = ( - matches[i].pattern_id, - matches[i].start, - matches[i].start+matches[i].length - ) - - # We need to deduplicate, because we could otherwise arrive at the same - # match through two paths, e.g. .?.? matching 'a'. Are we matching the - # first .?, or the second .? -- it doesn't matter, it's just one match. - if match not in seen: - output.append(match) - seen.add(match) - return output - - -cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: - # The code was originally designed to always have pattern[1].attrs.value - # be the ent_id when we get to the end of a pattern. However, Issue #2671 - # showed this wasn't the case when we had a reject-and-continue before a - # match. I still don't really understand what's going on here, but this - # workaround does resolve the issue. - while pattern.attrs.attr != ID and \ - (pattern.nr_attr > 0 or pattern.nr_extra_attr > 0 or pattern.nr_py > 0): - pattern += 1 - return pattern.attrs.value - - -cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, - char* cached_py_predicates, - Token token, const attr_t* extra_attrs, py_predicates) except *: - cdef int q = 0 - cdef vector[PatternStateC] new_states - cdef int nr_predicate = len(py_predicates) - for i in range(states.size()): - if states[i].pattern.nr_py >= 1: - update_predicate_cache(cached_py_predicates, - states[i].pattern, token, py_predicates) - action = get_action(states[i], token.c, extra_attrs, - cached_py_predicates) - if action == REJECT: - continue - # Keep only a subset of states (the active ones). Index q is the - # states which are still alive. If we reject a state, we overwrite - # it in the states list, because q doesn't advance. 
- state = states[i] - states[q] = state - while action in (RETRY, RETRY_ADVANCE, RETRY_EXTEND): - if action == RETRY_EXTEND: - # This handles the 'extend' - new_states.push_back( - PatternStateC(pattern=state.pattern, start=state.start, - length=state.length+1)) - if action == RETRY_ADVANCE: - # This handles the 'advance' - new_states.push_back( - PatternStateC(pattern=state.pattern+1, start=state.start, - length=state.length+1)) - states[q].pattern += 1 - - if states[q].pattern.nr_py != 0: - update_predicate_cache(cached_py_predicates, - states[q].pattern, token, py_predicates) - action = get_action(states[q], token.c, extra_attrs, - cached_py_predicates) - if action == REJECT: - pass - elif action == ADVANCE: - states[q].pattern += 1 - states[q].length += 1 - q += 1 - else: - ent_id = get_ent_id(&state.pattern[1]) - if action == MATCH: - matches.push_back( - MatchC(pattern_id=ent_id, start=state.start, - length=state.length+1)) - elif action == MATCH_REJECT: - matches.push_back( - MatchC(pattern_id=ent_id, start=state.start, - length=state.length)) - elif action == MATCH_EXTEND: - matches.push_back( - MatchC(pattern_id=ent_id, start=state.start, - length=state.length)) - states[q].length += 1 - q += 1 - states.resize(q) - for i in range(new_states.size()): - states.push_back(new_states[i]) - - -cdef int update_predicate_cache(char* cache, - const TokenPatternC* pattern, Token token, predicates) except -1: - # If the state references any extra predicates, check whether they match. - # These are cached, so that we don't call these potentially expensive - # Python functions more than we need to. - for i in range(pattern.nr_py): - index = pattern.py_predicates[i] - if cache[index] == 0: - predicate = predicates[index] - result = predicate(token) - if result is True: - cache[index] = 1 - elif result is False: - cache[index] = -1 - elif result is None: - pass - else: - raise ValueError("Unexpected value: %s" % result) - - -cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *: - '''Handle states that end in zero-width patterns.''' - cdef PatternStateC state - for i in range(states.size()): - state = states[i] - while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE): - is_final = get_is_final(state) - if is_final: - ent_id = get_ent_id(state.pattern) - matches.push_back( - MatchC(pattern_id=ent_id, start=state.start, length=state.length)) - break - else: - state.pattern += 1 - - -cdef action_t get_action(PatternStateC state, - const TokenC* token, const attr_t* extra_attrs, - const char* predicate_matches) nogil: - '''We need to consider: - - a) Does the token match the specification? [Yes, No] - b) What's the quantifier? [1, 0+, ?] - c) Is this the last specification? [final, non-final] - - We can transition in the following ways: - - a) Do we emit a match? - b) Do we add a state with (next state, next token)? - c) Do we add a state with (next state, same token)? - d) Do we add a state with (same state, next token)? - - We'll code the actions as boolean strings, so 0000 means no to all 4, - 1000 means match but no states added, etc. - - 1: - Yes, final: - 1000 - Yes, non-final: - 0100 - No, final: - 0000 - No, non-final - 0000 - 0+: - Yes, final: - 1001 - Yes, non-final: - 0011 - No, final: - 1000 (note: Don't include last token!) - No, non-final: - 0010 - ?: - Yes, final: - 1000 - Yes, non-final: - 0100 - No, final: - 1000 (note: Don't include last token!) 
- No, non-final: - 0010 - - Possible combinations: 1000, 0100, 0000, 1001, 0110, 0011, 0010, - - We'll name the bits "match", "advance", "retry", "extend" - REJECT = 0000 - MATCH = 1000 - ADVANCE = 0100 - RETRY = 0010 - MATCH_EXTEND = 1001 - RETRY_ADVANCE = 0110 - RETRY_EXTEND = 0011 - MATCH_REJECT = 2000 # Match, but don't include last token - - Problem: If a quantifier is matching, we're adding a lot of open partials - ''' - cdef char is_match - is_match = get_is_match(state, token, extra_attrs, predicate_matches) - quantifier = get_quantifier(state) - is_final = get_is_final(state) - if quantifier == ZERO: - is_match = not is_match - quantifier = ONE - if quantifier == ONE: - if is_match and is_final: - # Yes, final: 1000 - return MATCH - elif is_match and not is_final: - # Yes, non-final: 0100 - return ADVANCE - elif not is_match and is_final: - # No, final: 0000 - return REJECT - else: - return REJECT - elif quantifier == ZERO_PLUS: - if is_match and is_final: - # Yes, final: 1001 - return MATCH_EXTEND - elif is_match and not is_final: - # Yes, non-final: 0011 - return RETRY_EXTEND - elif not is_match and is_final: - # No, final 2000 (note: Don't include last token!) - return MATCH_REJECT - else: - # No, non-final 0010 - return RETRY - elif quantifier == ZERO_ONE: - if is_match and is_final: - # Yes, final: 1000 - return MATCH - elif is_match and not is_final: - # Yes, non-final: 0110 - # We need both branches here, consider a pair like: - # pattern: .?b string: b - # If we 'ADVANCE' on the .?, we miss the match. - return RETRY_ADVANCE - elif not is_match and is_final: - # No, final 2000 (note: Don't include last token!) - return MATCH_REJECT - else: - # No, non-final 0010 - return RETRY - - -cdef char get_is_match(PatternStateC state, - const TokenC* token, const attr_t* extra_attrs, - const char* predicate_matches) nogil: - for i in range(state.pattern.nr_py): - if predicate_matches[state.pattern.py_predicates[i]] == -1: - return 0 - spec = state.pattern - for attr in spec.attrs[:spec.nr_attr]: - if get_token_attr(token, attr.attr) != attr.value: - return 0 - for i in range(spec.nr_extra_attr): - if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]: - return 0 - return True - - -cdef char get_is_final(PatternStateC state) nogil: - if state.pattern[1].attrs[0].attr == ID and state.pattern[1].nr_attr == 0: - return 1 - else: - return 0 - - -cdef char get_quantifier(PatternStateC state) nogil: - return state.pattern.quantifier - - DEF PADDING = 5 -cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) except NULL: - pattern = mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC)) - cdef int i, index - for i, (quantifier, spec, extensions, predicates) in enumerate(token_specs): - pattern[i].quantifier = quantifier - pattern[i].attrs = mem.alloc(len(spec), sizeof(AttrValueC)) - pattern[i].nr_attr = len(spec) - for j, (attr, value) in enumerate(spec): - pattern[i].attrs[j].attr = attr - pattern[i].attrs[j].value = value - pattern[i].extra_attrs = mem.alloc(len(extensions), sizeof(IndexValueC)) - for j, (index, value) in enumerate(extensions): - pattern[i].extra_attrs[j].index = index - pattern[i].extra_attrs[j].value = value - pattern[i].nr_extra_attr = len(extensions) - pattern[i].py_predicates = mem.alloc(len(predicates), sizeof(int32_t)) - for j, index in enumerate(predicates): - pattern[i].py_predicates[j] = index - pattern[i].nr_py = len(predicates) - pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0) - i 
= len(token_specs) - pattern[i].attrs = mem.alloc(2, sizeof(AttrValueC)) - pattern[i].attrs[0].attr = ID - pattern[i].attrs[0].value = entity_id - pattern[i].nr_attr = 0 - pattern[i].nr_extra_attr = 0 - pattern[i].nr_py = 0 - return pattern - - -cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil: - while pattern.nr_attr != 0 or pattern.nr_extra_attr != 0 or pattern.nr_py != 0: - pattern += 1 - id_attr = pattern[0].attrs[0] - if id_attr.attr != ID: - with gil: - raise ValueError(Errors.E074.format(attr=ID, bad_attr=id_attr.attr)) - return id_attr.value - - -def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates): - """This function interprets the pattern, converting the various bits of - syntactic sugar before we compile it into a struct with init_pattern. - - We need to split the pattern up into three parts: - * Normal attribute/value pairs, which are stored on either the token or lexeme, - can be handled directly. - * Extension attributes are handled specially, as we need to prefetch the - values from Python for the doc before we begin matching. - * Extra predicates also call Python functions, so we have to create the - functions and store them. So we store these specially as well. - * Extension attributes that have extra predicates are stored within the - extra_predicates. - """ - tokens = [] - for spec in token_specs: - if not spec: - # Signifier for 'any token' - tokens.append((ONE, [(NULL_ATTR, 0)], [], [])) - continue - ops = _get_operators(spec) - attr_values = _get_attr_values(spec, string_store) - extensions = _get_extensions(spec, string_store, extensions_table) - predicates = _get_extra_predicates(spec, extra_predicates) - for op in ops: - tokens.append((op, list(attr_values), list(extensions), list(predicates))) - return tokens - - -def _get_attr_values(spec, string_store): - attr_values = [] - for attr, value in spec.items(): - if isinstance(attr, basestring): - if attr == '_': - continue - elif attr.upper() == 'OP': - continue - if attr.upper() == 'TEXT': - attr = 'ORTH' - attr = IDS.get(attr.upper()) - if isinstance(value, basestring): - value = string_store.add(value) - elif isinstance(value, bool): - value = int(value) - elif isinstance(value, dict): - continue - if attr is not None: - attr_values.append((attr, value)) - return attr_values - -# These predicate helper classes are used to match the REGEX, IN, >= etc -# extensions to the matcher introduced in #3173. 
- -class _RegexPredicate(object): - def __init__(self, i, attr, value, predicate, is_extension=False): - self.i = i - self.attr = attr - self.value = re.compile(value) - self.predicate = predicate - self.is_extension = is_extension - self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) - assert self.predicate == 'REGEX' - - def __call__(self, Token token): - if self.is_extension: - value = token._.get(self.attr) - else: - value = token.vocab.strings[get_token_attr(token.c, self.attr)] - return bool(self.value.search(value)) - - -class _SetMemberPredicate(object): - def __init__(self, i, attr, value, predicate, is_extension=False): - self.i = i - self.attr = attr - self.value = set(get_string_id(v) for v in value) - self.predicate = predicate - self.is_extension = is_extension - self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) - assert self.predicate in ('IN', 'NOT_IN') - - def __call__(self, Token token): - if self.is_extension: - value = get_string_id(token._.get(self.attr)) - else: - value = get_token_attr(token.c, self.attr) - if self.predicate == 'IN': - return value in self.value - else: - return value not in self.value - - def __repr__(self): - return repr(('SetMemberPredicate', self.i, self.attr, self.value, self.predicate)) - - -class _ComparisonPredicate(object): - def __init__(self, i, attr, value, predicate, is_extension=False): - self.i = i - self.attr = attr - self.value = value - self.predicate = predicate - self.is_extension = is_extension - self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) - assert self.predicate in ('==', '!=', '>=', '<=', '>', '<') - - def __call__(self, Token token): - if self.is_extension: - value = token._.get(self.attr) - else: - value = get_token_attr(token.c, self.attr) - if self.predicate == '==': - return value == self.value - if self.predicate == '!=': - return value != self.value - elif self.predicate == '>=': - return value >= self.value - elif self.predicate == '<=': - return value <= self.value - elif self.predicate == '>': - return value > self.value - elif self.predicate == '<': - return value < self.value - - -def _get_extra_predicates(spec, extra_predicates): - predicate_types = { - 'REGEX': _RegexPredicate, - 'IN': _SetMemberPredicate, - 'NOT_IN': _SetMemberPredicate, - '==': _ComparisonPredicate, - '>=': _ComparisonPredicate, - '<=': _ComparisonPredicate, - '>': _ComparisonPredicate, - '<': _ComparisonPredicate, - } - seen_predicates = {pred.key: pred.i for pred in extra_predicates} - output = [] - for attr, value in spec.items(): - if isinstance(attr, basestring): - if attr == '_': - output.extend( - _get_extension_extra_predicates( - value, extra_predicates, predicate_types, - seen_predicates)) - continue - elif attr.upper() == 'OP': - continue - if attr.upper() == 'TEXT': - attr = 'ORTH' - attr = IDS.get(attr.upper()) - if isinstance(value, dict): - for type_, cls in predicate_types.items(): - if type_ in value: - predicate = cls(len(extra_predicates), attr, value[type_], type_) - # Don't create a redundant predicates. - # This helps with efficiency, as we're caching the results. 
- if predicate.key in seen_predicates: - output.append(seen_predicates[predicate.key]) - else: - extra_predicates.append(predicate) - output.append(predicate.i) - seen_predicates[predicate.key] = predicate.i - return output - - -def _get_extension_extra_predicates(spec, extra_predicates, predicate_types, - seen_predicates): - output = [] - for attr, value in spec.items(): - if isinstance(value, dict): - for type_, cls in predicate_types.items(): - if type_ in value: - key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True)) - if key in seen_predicates: - output.append(seen_predicates[key]) - else: - predicate = cls(len(extra_predicates), attr, value[type_], type_, - is_extension=True) - extra_predicates.append(predicate) - output.append(predicate.i) - seen_predicates[key] = predicate.i - return output - - -def _get_operators(spec): - # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS - lookup = {'*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS), - '?': (ZERO_ONE,), '1': (ONE,), '!': (ZERO,)} - # Fix casing - spec = {key.upper(): values for key, values in spec.items() - if isinstance(key, basestring)} - if 'OP' not in spec: - return (ONE,) - elif spec['OP'] in lookup: - return lookup[spec['OP']] - else: - keys = ', '.join(lookup.keys()) - raise KeyError(Errors.E011.format(op=spec['OP'], opts=keys)) - - -def _get_extensions(spec, string_store, name2index): - attr_values = [] - for name, value in spec.get('_', {}).items(): - if isinstance(value, dict): - # Handle predicates (e.g. "IN", in the extra_predicates, not here. - continue - if isinstance(value, basestring): - value = string_store.add(value) - if name not in name2index: - name2index[name] = len(name2index) - attr_values.append((name2index[name], value)) - return attr_values - - cdef class Matcher: - """Match sequences of tokens, based on pattern rules.""" + """Match sequences of tokens, based on pattern rules. + + DOCS: https://spacy.io/api/matcher + USAGE: https://spacy.io/usage/rule-based-matching + """ def __init__(self, vocab, validate=False): """Create the Matcher. @@ -756,3 +196,574 @@ def unpickle_matcher(vocab, patterns, callbacks): callback = callbacks.get(key, None) matcher.add(key, callback, *specs) return matcher + + + +cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None, + predicates=tuple()): + """Find matches in a doc, with a compiled array of patterns. Matches are + returned as a list of (id, start, end) tuples. + + To augment the compiled patterns, we optionally also take two Python lists. + + The "predicates" list contains functions that take a Python list and return a + boolean value. It's mostly used for regular expressions. + + The "extra_getters" list contains functions that take a Python list and return + an attr ID. It's mostly used for extension attributes. 
+ """ + cdef vector[PatternStateC] states + cdef vector[MatchC] matches + cdef PatternStateC state + cdef int i, j, nr_extra_attr + cdef Pool mem = Pool() + predicate_cache = mem.alloc(doc.length * len(predicates), sizeof(char)) + if extensions is not None and len(extensions) >= 1: + nr_extra_attr = max(extensions.values()) + 1 + extra_attr_values = mem.alloc(doc.length * nr_extra_attr, sizeof(attr_t)) + else: + nr_extra_attr = 0 + extra_attr_values = mem.alloc(doc.length, sizeof(attr_t)) + for i, token in enumerate(doc): + for name, index in extensions.items(): + value = token._.get(name) + if isinstance(value, basestring): + value = token.vocab.strings[value] + extra_attr_values[i * nr_extra_attr + index] = value + # Main loop + cdef int nr_predicate = len(predicates) + for i in range(doc.length): + for j in range(n): + states.push_back(PatternStateC(patterns[j], i, 0)) + transition_states(states, matches, predicate_cache, + doc[i], extra_attr_values, predicates) + extra_attr_values += nr_extra_attr + predicate_cache += len(predicates) + # Handle matches that end in 0-width patterns + finish_states(matches, states) + output = [] + seen = set() + for i in range(matches.size()): + match = ( + matches[i].pattern_id, + matches[i].start, + matches[i].start+matches[i].length + ) + # We need to deduplicate, because we could otherwise arrive at the same + # match through two paths, e.g. .?.? matching 'a'. Are we matching the + # first .?, or the second .? -- it doesn't matter, it's just one match. + if match not in seen: + output.append(match) + seen.add(match) + return output + + +cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: + # The code was originally designed to always have pattern[1].attrs.value + # be the ent_id when we get to the end of a pattern. However, Issue #2671 + # showed this wasn't the case when we had a reject-and-continue before a + # match. I still don't really understand what's going on here, but this + # workaround does resolve the issue. + while pattern.attrs.attr != ID and \ + (pattern.nr_attr > 0 or pattern.nr_extra_attr > 0 or pattern.nr_py > 0): + pattern += 1 + return pattern.attrs.value + + +cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches, + char* cached_py_predicates, + Token token, const attr_t* extra_attrs, py_predicates) except *: + cdef int q = 0 + cdef vector[PatternStateC] new_states + cdef int nr_predicate = len(py_predicates) + for i in range(states.size()): + if states[i].pattern.nr_py >= 1: + update_predicate_cache(cached_py_predicates, + states[i].pattern, token, py_predicates) + action = get_action(states[i], token.c, extra_attrs, + cached_py_predicates) + if action == REJECT: + continue + # Keep only a subset of states (the active ones). Index q is the + # states which are still alive. If we reject a state, we overwrite + # it in the states list, because q doesn't advance. 
+ state = states[i] + states[q] = state + while action in (RETRY, RETRY_ADVANCE, RETRY_EXTEND): + if action == RETRY_EXTEND: + # This handles the 'extend' + new_states.push_back( + PatternStateC(pattern=state.pattern, start=state.start, + length=state.length+1)) + if action == RETRY_ADVANCE: + # This handles the 'advance' + new_states.push_back( + PatternStateC(pattern=state.pattern+1, start=state.start, + length=state.length+1)) + states[q].pattern += 1 + if states[q].pattern.nr_py != 0: + update_predicate_cache(cached_py_predicates, + states[q].pattern, token, py_predicates) + action = get_action(states[q], token.c, extra_attrs, + cached_py_predicates) + if action == REJECT: + pass + elif action == ADVANCE: + states[q].pattern += 1 + states[q].length += 1 + q += 1 + else: + ent_id = get_ent_id(&state.pattern[1]) + if action == MATCH: + matches.push_back( + MatchC(pattern_id=ent_id, start=state.start, + length=state.length+1)) + elif action == MATCH_REJECT: + matches.push_back( + MatchC(pattern_id=ent_id, start=state.start, + length=state.length)) + elif action == MATCH_EXTEND: + matches.push_back( + MatchC(pattern_id=ent_id, start=state.start, + length=state.length)) + states[q].length += 1 + q += 1 + states.resize(q) + for i in range(new_states.size()): + states.push_back(new_states[i]) + + +cdef int update_predicate_cache(char* cache, + const TokenPatternC* pattern, Token token, predicates) except -1: + # If the state references any extra predicates, check whether they match. + # These are cached, so that we don't call these potentially expensive + # Python functions more than we need to. + for i in range(pattern.nr_py): + index = pattern.py_predicates[i] + if cache[index] == 0: + predicate = predicates[index] + result = predicate(token) + if result is True: + cache[index] = 1 + elif result is False: + cache[index] = -1 + elif result is None: + pass + else: + raise ValueError(Errors.E125.format(value=result)) + + +cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *: + """Handle states that end in zero-width patterns.""" + cdef PatternStateC state + for i in range(states.size()): + state = states[i] + while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE): + is_final = get_is_final(state) + if is_final: + ent_id = get_ent_id(state.pattern) + matches.push_back( + MatchC(pattern_id=ent_id, start=state.start, length=state.length)) + break + else: + state.pattern += 1 + + +cdef action_t get_action(PatternStateC state, + const TokenC* token, const attr_t* extra_attrs, + const char* predicate_matches) nogil: + """We need to consider: + a) Does the token match the specification? [Yes, No] + b) What's the quantifier? [1, 0+, ?] + c) Is this the last specification? [final, non-final] + + We can transition in the following ways: + a) Do we emit a match? + b) Do we add a state with (next state, next token)? + c) Do we add a state with (next state, same token)? + d) Do we add a state with (same state, next token)? + + We'll code the actions as boolean strings, so 0000 means no to all 4, + 1000 means match but no states added, etc. + + 1: + Yes, final: + 1000 + Yes, non-final: + 0100 + No, final: + 0000 + No, non-final + 0000 + 0+: + Yes, final: + 1001 + Yes, non-final: + 0011 + No, final: + 1000 (note: Don't include last token!) + No, non-final: + 0010 + ?: + Yes, final: + 1000 + Yes, non-final: + 0100 + No, final: + 1000 (note: Don't include last token!) 
+ No, non-final: + 0010 + + Possible combinations: 1000, 0100, 0000, 1001, 0110, 0011, 0010, + + We'll name the bits "match", "advance", "retry", "extend" + REJECT = 0000 + MATCH = 1000 + ADVANCE = 0100 + RETRY = 0010 + MATCH_EXTEND = 1001 + RETRY_ADVANCE = 0110 + RETRY_EXTEND = 0011 + MATCH_REJECT = 2000 # Match, but don't include last token + + Problem: If a quantifier is matching, we're adding a lot of open partials + """ + cdef char is_match + is_match = get_is_match(state, token, extra_attrs, predicate_matches) + quantifier = get_quantifier(state) + is_final = get_is_final(state) + if quantifier == ZERO: + is_match = not is_match + quantifier = ONE + if quantifier == ONE: + if is_match and is_final: + # Yes, final: 1000 + return MATCH + elif is_match and not is_final: + # Yes, non-final: 0100 + return ADVANCE + elif not is_match and is_final: + # No, final: 0000 + return REJECT + else: + return REJECT + elif quantifier == ZERO_PLUS: + if is_match and is_final: + # Yes, final: 1001 + return MATCH_EXTEND + elif is_match and not is_final: + # Yes, non-final: 0011 + return RETRY_EXTEND + elif not is_match and is_final: + # No, final 2000 (note: Don't include last token!) + return MATCH_REJECT + else: + # No, non-final 0010 + return RETRY + elif quantifier == ZERO_ONE: + if is_match and is_final: + # Yes, final: 1000 + return MATCH + elif is_match and not is_final: + # Yes, non-final: 0110 + # We need both branches here, consider a pair like: + # pattern: .?b string: b + # If we 'ADVANCE' on the .?, we miss the match. + return RETRY_ADVANCE + elif not is_match and is_final: + # No, final 2000 (note: Don't include last token!) + return MATCH_REJECT + else: + # No, non-final 0010 + return RETRY + + +cdef char get_is_match(PatternStateC state, + const TokenC* token, const attr_t* extra_attrs, + const char* predicate_matches) nogil: + for i in range(state.pattern.nr_py): + if predicate_matches[state.pattern.py_predicates[i]] == -1: + return 0 + spec = state.pattern + for attr in spec.attrs[:spec.nr_attr]: + if get_token_attr(token, attr.attr) != attr.value: + return 0 + for i in range(spec.nr_extra_attr): + if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]: + return 0 + return True + + +cdef char get_is_final(PatternStateC state) nogil: + if state.pattern[1].attrs[0].attr == ID and state.pattern[1].nr_attr == 0: + return 1 + else: + return 0 + + +cdef char get_quantifier(PatternStateC state) nogil: + return state.pattern.quantifier + + +cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) except NULL: + pattern = mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC)) + cdef int i, index + for i, (quantifier, spec, extensions, predicates) in enumerate(token_specs): + pattern[i].quantifier = quantifier + pattern[i].attrs = mem.alloc(len(spec), sizeof(AttrValueC)) + pattern[i].nr_attr = len(spec) + for j, (attr, value) in enumerate(spec): + pattern[i].attrs[j].attr = attr + pattern[i].attrs[j].value = value + pattern[i].extra_attrs = mem.alloc(len(extensions), sizeof(IndexValueC)) + for j, (index, value) in enumerate(extensions): + pattern[i].extra_attrs[j].index = index + pattern[i].extra_attrs[j].value = value + pattern[i].nr_extra_attr = len(extensions) + pattern[i].py_predicates = mem.alloc(len(predicates), sizeof(int32_t)) + for j, index in enumerate(predicates): + pattern[i].py_predicates[j] = index + pattern[i].nr_py = len(predicates) + pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0) + i = 
len(token_specs) + pattern[i].attrs = mem.alloc(2, sizeof(AttrValueC)) + pattern[i].attrs[0].attr = ID + pattern[i].attrs[0].value = entity_id + pattern[i].nr_attr = 0 + pattern[i].nr_extra_attr = 0 + pattern[i].nr_py = 0 + return pattern + + +cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil: + while pattern.nr_attr != 0 or pattern.nr_extra_attr != 0 or pattern.nr_py != 0: + pattern += 1 + id_attr = pattern[0].attrs[0] + if id_attr.attr != ID: + with gil: + raise ValueError(Errors.E074.format(attr=ID, bad_attr=id_attr.attr)) + return id_attr.value + + +def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates): + """This function interprets the pattern, converting the various bits of + syntactic sugar before we compile it into a struct with init_pattern. + + We need to split the pattern up into three parts: + * Normal attribute/value pairs, which are stored on either the token or lexeme, + can be handled directly. + * Extension attributes are handled specially, as we need to prefetch the + values from Python for the doc before we begin matching. + * Extra predicates also call Python functions, so we have to create the + functions and store them. So we store these specially as well. + * Extension attributes that have extra predicates are stored within the + extra_predicates. + """ + tokens = [] + for spec in token_specs: + if not spec: + # Signifier for 'any token' + tokens.append((ONE, [(NULL_ATTR, 0)], [], [])) + continue + ops = _get_operators(spec) + attr_values = _get_attr_values(spec, string_store) + extensions = _get_extensions(spec, string_store, extensions_table) + predicates = _get_extra_predicates(spec, extra_predicates) + for op in ops: + tokens.append((op, list(attr_values), list(extensions), list(predicates))) + return tokens + + +def _get_attr_values(spec, string_store): + attr_values = [] + for attr, value in spec.items(): + if isinstance(attr, basestring): + if attr == '_': + continue + elif attr.upper() == "OP": + continue + if attr.upper() == "TEXT": + attr = "ORTH" + attr = IDS.get(attr.upper()) + if isinstance(value, basestring): + value = string_store.add(value) + elif isinstance(value, bool): + value = int(value) + elif isinstance(value, dict): + continue + if attr is not None: + attr_values.append((attr, value)) + return attr_values + + +# These predicate helper classes are used to match the REGEX, IN, >= etc +# extensions to the matcher introduced in #3173. 
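For context, the helper classes in the next hunk implement the rich pattern syntax (`REGEX`, `IN`/`NOT_IN`, and the `==`/`>=`-style comparisons) introduced in #3173. A minimal sketch of the user-facing side, not part of this diff, assuming the spaCy 2.x `Matcher.add(key, on_match, *patterns)` signature and a blank English pipeline:

```python
# Illustrative only: shows the nested-dict pattern values that the
# _RegexPredicate and _SetMemberPredicate helpers below evaluate.
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
pattern = [
    {"LOWER": {"IN": ["hello", "hi"]}},   # handled by _SetMemberPredicate
    {"TEXT": {"REGEX": r"[Ww]orld"}},     # handled by _RegexPredicate
]
matcher.add("GREETING", None, pattern)    # spaCy 2.x signature: key, callback, *patterns
doc = nlp("hello world")
print(matcher(doc))                       # [(match_id, start, end)]
```

Internally, each nested dict value becomes one predicate instance, keyed by `(attr, operator, serialized value)` in `seen_predicates`, so identical predicates shared across patterns are created once and their results cached per token during matching.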
+ +class _RegexPredicate(object): + operators = ("REGEX",) + + def __init__(self, i, attr, value, predicate, is_extension=False): + self.i = i + self.attr = attr + self.value = re.compile(value) + self.predicate = predicate + self.is_extension = is_extension + self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + if self.predicate not in self.operators: + raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) + + def __call__(self, Token token): + if self.is_extension: + value = token._.get(self.attr) + else: + value = token.vocab.strings[get_token_attr(token.c, self.attr)] + return bool(self.value.search(value)) + + +class _SetMemberPredicate(object): + operators = ("IN", "NOT_IN") + + def __init__(self, i, attr, value, predicate, is_extension=False): + self.i = i + self.attr = attr + self.value = set(get_string_id(v) for v in value) + self.predicate = predicate + self.is_extension = is_extension + self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + if self.predicate not in self.operators: + raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) + + def __call__(self, Token token): + if self.is_extension: + value = get_string_id(token._.get(self.attr)) + else: + value = get_token_attr(token.c, self.attr) + if self.predicate == "IN": + return value in self.value + else: + return value not in self.value + + def __repr__(self): + return repr(("SetMemberPredicate", self.i, self.attr, self.value, self.predicate)) + + +class _ComparisonPredicate(object): + operators = ("==", "!=", ">=", "<=", ">", "<") + + def __init__(self, i, attr, value, predicate, is_extension=False): + self.i = i + self.attr = attr + self.value = value + self.predicate = predicate + self.is_extension = is_extension + self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + if self.predicate not in self.operators: + raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) + + def __call__(self, Token token): + if self.is_extension: + value = token._.get(self.attr) + else: + value = get_token_attr(token.c, self.attr) + if self.predicate == "==": + return value == self.value + if self.predicate == "!=": + return value != self.value + elif self.predicate == ">=": + return value >= self.value + elif self.predicate == "<=": + return value <= self.value + elif self.predicate == ">": + return value > self.value + elif self.predicate == "<": + return value < self.value + + +def _get_extra_predicates(spec, extra_predicates): + predicate_types = { + "REGEX": _RegexPredicate, + "IN": _SetMemberPredicate, + "NOT_IN": _SetMemberPredicate, + "==": _ComparisonPredicate, + ">=": _ComparisonPredicate, + "<=": _ComparisonPredicate, + ">": _ComparisonPredicate, + "<": _ComparisonPredicate, + } + seen_predicates = {pred.key: pred.i for pred in extra_predicates} + output = [] + for attr, value in spec.items(): + if isinstance(attr, basestring): + if attr == "_": + output.extend( + _get_extension_extra_predicates( + value, extra_predicates, predicate_types, + seen_predicates)) + continue + elif attr.upper() == "OP": + continue + if attr.upper() == "TEXT": + attr = "ORTH" + attr = IDS.get(attr.upper()) + if isinstance(value, dict): + for type_, cls in predicate_types.items(): + if type_ in value: + predicate = cls(len(extra_predicates), attr, value[type_], type_) + # Don't create a redundant predicates. + # This helps with efficiency, as we're caching the results. 
+ if predicate.key in seen_predicates: + output.append(seen_predicates[predicate.key]) + else: + extra_predicates.append(predicate) + output.append(predicate.i) + seen_predicates[predicate.key] = predicate.i + return output + + +def _get_extension_extra_predicates(spec, extra_predicates, predicate_types, + seen_predicates): + output = [] + for attr, value in spec.items(): + if isinstance(value, dict): + for type_, cls in predicate_types.items(): + if type_ in value: + key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True)) + if key in seen_predicates: + output.append(seen_predicates[key]) + else: + predicate = cls(len(extra_predicates), attr, value[type_], type_, + is_extension=True) + extra_predicates.append(predicate) + output.append(predicate.i) + seen_predicates[key] = predicate.i + return output + + +def _get_operators(spec): + # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS + lookup = {"*": (ZERO_PLUS,), "+": (ONE, ZERO_PLUS), + "?": (ZERO_ONE,), "1": (ONE,), "!": (ZERO,)} + # Fix casing + spec = {key.upper(): values for key, values in spec.items() + if isinstance(key, basestring)} + if "OP" not in spec: + return (ONE,) + elif spec["OP"] in lookup: + return lookup[spec["OP"]] + else: + keys = ", ".join(lookup.keys()) + raise KeyError(Errors.E011.format(op=spec["OP"], opts=keys)) + + +def _get_extensions(spec, string_store, name2index): + attr_values = [] + for name, value in spec.get("_", {}).items(): + if isinstance(value, dict): + # Handle predicates (e.g. "IN", in the extra_predicates, not here. + continue + if isinstance(value, basestring): + value = string_store.add(value) + if name not in name2index: + name2index[name] = len(name2index) + attr_values.append((name2index[name], value)) + return attr_values diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 04c8ad7dd..5fb59e47f 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -12,7 +12,7 @@ from ..vocab cimport Vocab from ..tokens.doc cimport Doc, get_token_attr from ..typedefs cimport attr_t, hash_t -from ..errors import Warnings, deprecation_warning, user_warning +from ..errors import Errors, Warnings, deprecation_warning, user_warning from ..attrs import FLAG61 as U_ENT from ..attrs import FLAG60 as B2_ENT from ..attrs import FLAG59 as B3_ENT @@ -25,6 +25,13 @@ from ..attrs import FLAG41 as I4_ENT cdef class PhraseMatcher: + """Efficiently match large terminology lists. While the `Matcher` matches + sequences based on lists of token descriptions, the `PhraseMatcher` accepts + match patterns in the form of `Doc` objects. + + DOCS: https://spacy.io/api/phrasematcher + USAGE: https://spacy.io/usage/rule-based-matching#phrasematcher + """ cdef Pool mem cdef Vocab vocab cdef Matcher matcher @@ -36,7 +43,16 @@ cdef class PhraseMatcher: cdef public object _docs cdef public object _validate - def __init__(self, Vocab vocab, max_length=0, attr='ORTH', validate=False): + def __init__(self, Vocab vocab, max_length=0, attr="ORTH", validate=False): + """Initialize the PhraseMatcher. + + vocab (Vocab): The shared vocabulary. + attr (int / unicode): Token attribute to match on. + validate (bool): Perform additional validation when patterns are added. + RETURNS (PhraseMatcher): The newly constructed object. 
+ + DOCS: https://spacy.io/api/phrasematcher#init + """ if max_length != 0: deprecation_warning(Warnings.W010) self.mem = Pool() @@ -54,7 +70,7 @@ cdef class PhraseMatcher: [{B3_ENT: True}, {I3_ENT: True}, {L3_ENT: True}], [{B4_ENT: True}, {I4_ENT: True}, {I4_ENT: True, "OP": "+"}, {L4_ENT: True}], ] - self.matcher.add('Candidate', None, *abstract_patterns) + self.matcher.add("Candidate", None, *abstract_patterns) self._callbacks = {} self._docs = {} self._validate = validate @@ -65,6 +81,8 @@ cdef class PhraseMatcher: number of individual patterns. RETURNS (int): The number of rules. + + DOCS: https://spacy.io/api/phrasematcher#len """ return len(self._docs) @@ -73,6 +91,8 @@ cdef class PhraseMatcher: key (unicode): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. + + DOCS: https://spacy.io/api/phrasematcher#contains """ cdef hash_t ent_id = self.matcher._normalize_key(key) return ent_id in self._callbacks @@ -88,6 +108,8 @@ cdef class PhraseMatcher: key (unicode): The match ID. on_match (callable): Callback executed on match. *docs (Doc): `Doc` objects representing match patterns. + + DOCS: https://spacy.io/api/phrasematcher#add """ cdef Doc doc cdef hash_t ent_id = self.matcher._normalize_key(key) @@ -112,8 +134,7 @@ cdef class PhraseMatcher: lexeme = self.vocab[attr_value] lexeme.set_flag(tag, True) phrase_key[i] = lexeme.orth - phrase_hash = hash64(phrase_key, - length * sizeof(attr_t), 0) + phrase_hash = hash64(phrase_key, length * sizeof(attr_t), 0) self.phrase_ids.set(phrase_hash, ent_id) def __call__(self, Doc doc): @@ -123,6 +144,8 @@ cdef class PhraseMatcher: RETURNS (list): A list of `(key, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `label_id` and `key` are both integers. + + DOCS: https://spacy.io/api/phrasematcher#call """ matches = [] if self.attr == ORTH: @@ -158,6 +181,8 @@ cdef class PhraseMatcher: If both return_matches and as_tuples are True, the output will be a sequence of ((doc, matches), context) tuples. YIELDS (Doc): Documents, in order. + + DOCS: https://spacy.io/api/phrasematcher#pipe """ if as_tuples: for doc, context in stream: @@ -180,8 +205,7 @@ cdef class PhraseMatcher: phrase_key = mem.alloc(end-start, sizeof(attr_t)) for i, j in enumerate(range(start, end)): phrase_key[i] = doc.c[j].lex.orth - cdef hash_t key = hash64(phrase_key, - (end-start) * sizeof(attr_t), 0) + cdef hash_t key = hash64(phrase_key, (end-start) * sizeof(attr_t), 0) ent_id = self.phrase_ids.get(key) if ent_id == 0: return None @@ -203,12 +227,12 @@ cdef class PhraseMatcher: # Concatenate the attr name and value to not pollute lexeme space # e.g. 
'POS-VERB' instead of just 'VERB', which could otherwise # create false positive matches - return 'matcher:{}-{}'.format(string_attr_name, string_attr_value) + return "matcher:{}-{}".format(string_attr_name, string_attr_value) def get_bilou(length): if length == 0: - raise ValueError("Length must be >= 1") + raise ValueError(Errors.E127) elif length == 1: return [U_ENT] elif length == 2: diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index d683cc989..64286832f 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -1,8 +1,23 @@ # coding: utf8 from __future__ import unicode_literals -from .pipes import Tagger, DependencyParser, EntityRecognizer # noqa -from .pipes import TextCategorizer, Tensorizer, Pipe # noqa -from .entityruler import EntityRuler # noqa -from .hooks import SentenceSegmenter, SimilarityHook # noqa -from .functions import merge_entities, merge_noun_chunks, merge_subtokens # noqa +from .pipes import Tagger, DependencyParser, EntityRecognizer +from .pipes import TextCategorizer, Tensorizer, Pipe +from .entityruler import EntityRuler +from .hooks import SentenceSegmenter, SimilarityHook +from .functions import merge_entities, merge_noun_chunks, merge_subtokens + +__all__ = [ + "Tagger", + "DependencyParser", + "EntityRecognizer", + "TextCategorizer", + "Tensorizer", + "Pipe", + "EntityRuler", + "SentenceSegmenter", + "SimilarityHook", + "merge_entities", + "merge_noun_chunks", + "merge_subtokens", +] diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 6b757fe7c..09a0c0491 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -12,10 +12,20 @@ from ..matcher import Matcher, PhraseMatcher class EntityRuler(object): + """The EntityRuler lets you add spans to the `Doc.ents` using token-based + rules or exact phrase matches. It can be combined with the statistical + `EntityRecognizer` to boost accuracy, or used on its own to implement a + purely rule-based entity recognition system. After initialization, the + component is typically added to the pipeline using `nlp.add_pipe`. + + DOCS: https://spacy.io/api/entityruler + USAGE: https://spacy.io/usage/rule-based-matching#entityruler + """ + name = "entity_ruler" def __init__(self, nlp, **cfg): - """Initialise the entitiy ruler. If patterns are supplied here, they + """Initialize the entitiy ruler. If patterns are supplied here, they need to be a list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either be a token pattern (list) or a phrase pattern (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`. @@ -29,6 +39,8 @@ class EntityRuler(object): of a model pipeline, this will include all keyword arguments passed to `spacy.load`. RETURNS (EntityRuler): The newly constructed object. + + DOCS: https://spacy.io/api/entityruler#init """ self.nlp = nlp self.overwrite = cfg.get("overwrite_ents", False) @@ -55,6 +67,8 @@ class EntityRuler(object): doc (Doc): The Doc object in the pipeline. RETURNS (Doc): The Doc with added entities, if available. + + DOCS: https://spacy.io/api/entityruler#call """ matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) matches = set( @@ -83,6 +97,8 @@ class EntityRuler(object): """All labels present in the match patterns. RETURNS (set): The string labels. 
+ + DOCS: https://spacy.io/api/entityruler#labels """ all_labels = set(self.token_patterns.keys()) all_labels.update(self.phrase_patterns.keys()) @@ -93,6 +109,8 @@ class EntityRuler(object): """Get all patterns that were added to the entity ruler. RETURNS (list): The original patterns, one dictionary per pattern. + + DOCS: https://spacy.io/api/entityruler#patterns """ all_patterns = [] for label, patterns in self.token_patterns.items(): @@ -110,6 +128,8 @@ class EntityRuler(object): {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]} patterns (list): The patterns to add. + + DOCS: https://spacy.io/api/entityruler#add_patterns """ for entry in patterns: label = entry["label"] @@ -131,6 +151,8 @@ class EntityRuler(object): patterns_bytes (bytes): The bytestring to load. **kwargs: Other config paramters, mostly for consistency. RETURNS (EntityRuler): The loaded entity ruler. + + DOCS: https://spacy.io/api/entityruler#from_bytes """ patterns = srsly.msgpack_loads(patterns_bytes) self.add_patterns(patterns) @@ -140,6 +162,8 @@ class EntityRuler(object): """Serialize the entity ruler patterns to a bytestring. RETURNS (bytes): The serialized patterns. + + DOCS: https://spacy.io/api/entityruler#to_bytes """ return srsly.msgpack_dumps(self.patterns) @@ -150,6 +174,8 @@ class EntityRuler(object): path (unicode / Path): The JSONL file to load. **kwargs: Other config paramters, mostly for consistency. RETURNS (EntityRuler): The loaded entity ruler. + + DOCS: https://spacy.io/api/entityruler#from_disk """ path = ensure_path(path) path = path.with_suffix(".jsonl") @@ -164,6 +190,8 @@ class EntityRuler(object): path (unicode / Path): The JSONL file to load. **kwargs: Other config paramters, mostly for consistency. RETURNS (EntityRuler): The loaded entity ruler. + + DOCS: https://spacy.io/api/entityruler """ path = ensure_path(path) path = path.with_suffix(".jsonl") diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 925f0e0fc..0f7d94df2 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -9,6 +9,8 @@ def merge_noun_chunks(doc): doc (Doc): The Doc object. RETURNS (Doc): The Doc object with merged noun chunks. + + DOCS: https://spacy.io/api/pipeline-functions#merge_noun_chunks """ if not doc.is_parsed: return doc @@ -23,7 +25,9 @@ def merge_entities(doc): """Merge entities into a single token. doc (Doc): The Doc object. - RETURNS (Doc): The Doc object with merged noun entities. + RETURNS (Doc): The Doc object with merged entities. + + DOCS: https://spacy.io/api/pipeline-functions#merge_entities """ with doc.retokenize() as retokenizer: for ent in doc.ents: @@ -33,6 +37,14 @@ def merge_entities(doc): def merge_subtokens(doc, label="subtok"): + """Merge subtokens into a single token. + + doc (Doc): The Doc object. + label (unicode): The subtoken dependency label. + RETURNS (Doc): The Doc object with merged subtokens. + + DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens + """ merger = Matcher(doc.vocab) merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}]) matches = merger(doc) diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py index 7e9d09c54..e998ee0cb 100644 --- a/spacy/pipeline/hooks.py +++ b/spacy/pipeline/hooks.py @@ -15,6 +15,8 @@ class SentenceSegmenter(object): initialization, or assign a new strategy to the .strategy attribute. Sentence detection strategies should be generators that take `Doc` objects and yield `Span` objects for each sentence. 
+ + DOCS: https://spacy.io/api/sentencesegmenter """ name = "sentencizer" diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index bde794e75..b3c3db04d 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -6,9 +6,8 @@ from __future__ import unicode_literals cimport numpy as np import numpy -from collections import OrderedDict import srsly - +from collections import OrderedDict from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax from thinc.misc import LayerNorm @@ -284,9 +283,7 @@ class Tensorizer(Pipe): """ for doc, tensor in zip(docs, tensors): if tensor.shape[0] != len(doc): - raise ValueError( - Errors.E076.format(rows=tensor.shape[0], words=len(doc)) - ) + raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc))) doc.tensor = tensor def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): @@ -346,14 +343,19 @@ class Tensorizer(Pipe): class Tagger(Pipe): - name = 'tagger' + """Pipeline component for part-of-speech tagging. + + DOCS: https://spacy.io/api/tagger + """ + + name = "tagger" def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model self._rehearsal_model = None self.cfg = OrderedDict(sorted(cfg.items())) - self.cfg.setdefault('cnn_maxout_pieces', 2) + self.cfg.setdefault("cnn_maxout_pieces", 2) @property def labels(self): @@ -404,7 +406,7 @@ class Tagger(Pipe): cdef Vocab vocab = self.vocab for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] - if hasattr(doc_tag_ids, 'get'): + if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): # Don't clobber preset POS tags @@ -453,9 +455,9 @@ class Tagger(Pipe): scores = self.model.ops.flatten(scores) tag_index = {tag: i for i, tag in enumerate(self.labels)} cdef int idx = 0 - correct = numpy.zeros((scores.shape[0],), dtype='i') + correct = numpy.zeros((scores.shape[0],), dtype="i") guesses = scores.argmax(axis=1) - known_labels = numpy.ones((scores.shape[0], 1), dtype='f') + known_labels = numpy.ones((scores.shape[0], 1), dtype="f") for gold in golds: for tag in gold.tags: if tag is None: @@ -466,7 +468,7 @@ class Tagger(Pipe): correct[idx] = 0 known_labels[idx] = 0. 
idx += 1 - correct = self.model.ops.xp.array(correct, dtype='i') + correct = self.model.ops.xp.array(correct, dtype="i") d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) d_scores *= self.model.ops.asarray(known_labels) loss = (d_scores**2).sum() @@ -490,9 +492,9 @@ class Tagger(Pipe): vocab.morphology = Morphology(vocab.strings, new_tag_map, vocab.morphology.lemmatizer, exc=vocab.morphology.exc) - self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors') + self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors") if self.model is True: - for hp in ['token_vector_width', 'conv_depth']: + for hp in ["token_vector_width", "conv_depth"]: if hp in kwargs: self.cfg[hp] = kwargs[hp] self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) @@ -503,7 +505,7 @@ class Tagger(Pipe): @classmethod def Model(cls, n_tags, **cfg): - if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'): + if cfg.get("pretrained_dims") and not cfg.get("pretrained_vectors"): raise ValueError(TempErrors.T008) return build_tagger_model(n_tags, **cfg) @@ -538,25 +540,23 @@ class Tagger(Pipe): def to_bytes(self, **exclude): serialize = OrderedDict() if self.model not in (None, True, False): - serialize['model'] = self.model.to_bytes - serialize['vocab'] = self.vocab.to_bytes - serialize['cfg'] = lambda: srsly.json_dumps(self.cfg) + serialize["model"] = self.model.to_bytes + serialize["vocab"] = self.vocab.to_bytes + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) - serialize['tag_map'] = lambda: srsly.msgpack_dumps(tag_map) + serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map) return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, **exclude): def load_model(b): # TODO: Remove this once we don't have to handle previous models - if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg: - self.cfg['pretrained_vectors'] = self.vocab.vectors.name - + if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: + self.cfg["pretrained_vectors"] = self.vocab.vectors.name if self.model is True: token_vector_width = util.env_opt( - 'token_vector_width', - self.cfg.get('token_vector_width', 96)) - self.model = self.Model(self.vocab.morphology.n_tags, - **self.cfg) + "token_vector_width", + self.cfg.get("token_vector_width", 96)) + self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) self.model.from_bytes(b) def load_tag_map(b): @@ -567,10 +567,10 @@ class Tagger(Pipe): exc=self.vocab.morphology.exc) deserialize = OrderedDict(( - ('vocab', lambda b: self.vocab.from_bytes(b)), - ('tag_map', load_tag_map), - ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))), - ('model', lambda b: load_model(b)), + ("vocab", lambda b: self.vocab.from_bytes(b)), + ("tag_map", load_tag_map), + ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))), + ("model", lambda b: load_model(b)), )) util.from_bytes(bytes_data, deserialize, exclude) return self @@ -580,7 +580,7 @@ class Tagger(Pipe): serialize = OrderedDict(( ('vocab', lambda p: self.vocab.to_disk(p)), ('tag_map', lambda p: srsly.write_msgpack(p, tag_map)), - ('model', lambda p: p.open('wb').write(self.model.to_bytes())), + ('model', lambda p: p.open("wb").write(self.model.to_bytes())), ('cfg', lambda p: srsly.write_json(p, self.cfg)) )) util.to_disk(path, serialize, exclude) @@ -588,11 +588,11 @@ class Tagger(Pipe): def from_disk(self, path, **exclude): def load_model(p): # TODO: Remove this 
once we don't have to handle previous models - if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg: - self.cfg['pretrained_vectors'] = self.vocab.vectors.name + if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: + self.cfg["pretrained_vectors"] = self.vocab.vectors.name if self.model is True: self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) - with p.open('rb') as file_: + with p.open("rb") as file_: self.model.from_bytes(file_.read()) def load_tag_map(p): @@ -603,10 +603,10 @@ class Tagger(Pipe): exc=self.vocab.morphology.exc) deserialize = OrderedDict(( - ('cfg', lambda p: self.cfg.update(_load_cfg(p))), - ('vocab', lambda p: self.vocab.from_disk(p)), - ('tag_map', load_tag_map), - ('model', load_model), + ("cfg", lambda p: self.cfg.update(_load_cfg(p))), + ("vocab", lambda p: self.vocab.from_disk(p)), + ("tag_map", load_tag_map), + ("model", load_model), )) util.from_disk(path, deserialize, exclude) return self @@ -616,37 +616,38 @@ class MultitaskObjective(Tagger): """Experimental: Assist training of a parser or tagger, by training a side-objective. """ - name = 'nn_labeller' + + name = "nn_labeller" def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg): self.vocab = vocab self.model = model - if target == 'dep': + if target == "dep": self.make_label = self.make_dep - elif target == 'tag': + elif target == "tag": self.make_label = self.make_tag - elif target == 'ent': + elif target == "ent": self.make_label = self.make_ent - elif target == 'dep_tag_offset': + elif target == "dep_tag_offset": self.make_label = self.make_dep_tag_offset - elif target == 'ent_tag': + elif target == "ent_tag": self.make_label = self.make_ent_tag - elif target == 'sent_start': + elif target == "sent_start": self.make_label = self.make_sent_start - elif hasattr(target, '__call__'): + elif hasattr(target, "__call__"): self.make_label = target else: raise ValueError(Errors.E016) self.cfg = dict(cfg) - self.cfg.setdefault('cnn_maxout_pieces', 2) + self.cfg.setdefault("cnn_maxout_pieces", 2) @property def labels(self): - return self.cfg.setdefault('labels', {}) + return self.cfg.setdefault("labels", {}) @labels.setter def labels(self, value): - self.cfg['labels'] = value + self.cfg["labels"] = value def set_annotations(self, docs, dep_ids, tensors=None): pass @@ -662,7 +663,7 @@ class MultitaskObjective(Tagger): if label is not None and label not in self.labels: self.labels[label] = len(self.labels) if self.model is True: - token_vector_width = util.env_opt('token_vector_width') + token_vector_width = util.env_opt("token_vector_width") self.model = self.Model(len(self.labels), tok2vec=tok2vec) link_vectors_to_models(self.vocab) if sgd is None: @@ -671,7 +672,7 @@ class MultitaskObjective(Tagger): @classmethod def Model(cls, n_tags, tok2vec=None, **cfg): - token_vector_width = util.env_opt('token_vector_width', 96) + token_vector_width = util.env_opt("token_vector_width", 96) softmax = Softmax(n_tags, token_vector_width*2) model = chain( tok2vec, @@ -690,10 +691,10 @@ class MultitaskObjective(Tagger): def get_loss(self, docs, golds, scores): if len(docs) != len(golds): - raise ValueError(Errors.E077.format(value='loss', n_docs=len(docs), + raise ValueError(Errors.E077.format(value="loss", n_docs=len(docs), n_golds=len(golds))) cdef int idx = 0 - correct = numpy.zeros((scores.shape[0],), dtype='i') + correct = numpy.zeros((scores.shape[0],), dtype="i") guesses = scores.argmax(axis=1) for i, gold in enumerate(golds): for j in 
range(len(docs[i])): @@ -705,7 +706,7 @@ class MultitaskObjective(Tagger): else: correct[idx] = self.labels[label] idx += 1 - correct = self.model.ops.xp.array(correct, dtype='i') + correct = self.model.ops.xp.array(correct, dtype="i") d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) loss = (d_scores**2).sum() return float(loss), d_scores @@ -733,25 +734,25 @@ class MultitaskObjective(Tagger): offset = heads[i] - i offset = min(offset, 2) offset = max(offset, -2) - return '%s-%s:%d' % (deps[i], tags[i], offset) + return "%s-%s:%d" % (deps[i], tags[i], offset) @staticmethod def make_ent_tag(i, words, tags, heads, deps, ents): if ents is None or ents[i] is None: return None else: - return '%s-%s' % (tags[i], ents[i]) + return "%s-%s" % (tags[i], ents[i]) @staticmethod def make_sent_start(target, words, tags, heads, deps, ents, cache=True, _cache={}): - '''A multi-task objective for representing sentence boundaries, + """A multi-task objective for representing sentence boundaries, using BILU scheme. (O is impossible) The implementation of this method uses an internal cache that relies on the identity of the heads array, to avoid requiring a new piece of gold data. You can pass cache=False if you know the cache will do the wrong thing. - ''' + """ assert len(words) == len(heads) assert target < len(words), (target, len(words)) if cache: @@ -760,10 +761,10 @@ class MultitaskObjective(Tagger): else: for key in list(_cache.keys()): _cache.pop(key) - sent_tags = ['I-SENT'] * len(words) + sent_tags = ["I-SENT"] * len(words) _cache[id(heads)] = sent_tags else: - sent_tags = ['I-SENT'] * len(words) + sent_tags = ["I-SENT"] * len(words) def _find_root(child): seen = set([child]) @@ -781,10 +782,10 @@ class MultitaskObjective(Tagger): sentences.setdefault(root, []).append(i) for root, span in sorted(sentences.items()): if len(span) == 1: - sent_tags[span[0]] = 'U-SENT' + sent_tags[span[0]] = "U-SENT" else: - sent_tags[span[0]] = 'B-SENT' - sent_tags[span[-1]] = 'L-SENT' + sent_tags[span[0]] = "B-SENT" + sent_tags[span[-1]] = "L-SENT" return sent_tags[target] @@ -854,6 +855,10 @@ class ClozeMultitask(Pipe): class TextCategorizer(Pipe): + """Pipeline component for text classification. 
+ + DOCS: https://spacy.io/api/textcategorizer + """ name = 'textcat' @classmethod @@ -863,7 +868,7 @@ class TextCategorizer(Pipe): token_vector_width = cfg["token_vector_width"] else: token_vector_width = util.env_opt("token_vector_width", 96) - if cfg.get('architecture') == 'simple_cnn': + if cfg.get("architecture") == "simple_cnn": tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg) return build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg) else: @@ -884,11 +889,11 @@ class TextCategorizer(Pipe): @property def labels(self): - return tuple(self.cfg.setdefault('labels', [])) + return tuple(self.cfg.setdefault("labels", [])) @labels.setter def labels(self, value): - self.cfg['labels'] = tuple(value) + self.cfg["labels"] = tuple(value) def __call__(self, doc): scores, tensors = self.predict([doc]) @@ -934,8 +939,8 @@ class TextCategorizer(Pipe): losses[self.name] += (gradient**2).sum() def get_loss(self, docs, golds, scores): - truths = numpy.zeros((len(golds), len(self.labels)), dtype='f') - not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f') + truths = numpy.zeros((len(golds), len(self.labels)), dtype="f") + not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f") for i, gold in enumerate(golds): for j, label in enumerate(self.labels): if label in gold.cats: @@ -956,20 +961,19 @@ class TextCategorizer(Pipe): # This functionality was available previously, but was broken. # The problem is that we resize the last layer, but the last layer # is actually just an ensemble. We're not resizing the child layers - # -- a huge problem. + # - a huge problem. raise ValueError(Errors.E116) - #smaller = self.model._layers[-1] - #larger = Affine(len(self.labels)+1, smaller.nI) - #copy_array(larger.W[:smaller.nO], smaller.W) - #copy_array(larger.b[:smaller.nO], smaller.b) - #self.model._layers[-1] = larger + # smaller = self.model._layers[-1] + # larger = Affine(len(self.labels)+1, smaller.nI) + # copy_array(larger.W[:smaller.nO], smaller.W) + # copy_array(larger.b[:smaller.nO], smaller.b) + # self.model._layers[-1] = larger self.labels = tuple(list(self.labels) + [label]) return 1 - def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, - **kwargs): + def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): if self.model is True: - self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors') + self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors") self.model = self.Model(len(self.labels), **self.cfg) link_vectors_to_models(self.vocab) if sgd is None: @@ -978,7 +982,12 @@ class TextCategorizer(Pipe): cdef class DependencyParser(Parser): - name = 'parser' + """Pipeline component for dependency parsing. + + DOCS: https://spacy.io/api/dependencyparser + """ + + name = "parser" TransitionSystem = ArcEager @property @@ -986,7 +995,7 @@ cdef class DependencyParser(Parser): return [nonproj.deprojectivize] def add_multitask_objective(self, target): - if target == 'cloze': + if target == "cloze": cloze = ClozeMultitask(self.vocab) self._multitasks.append(cloze) else: @@ -1000,8 +1009,7 @@ cdef class DependencyParser(Parser): tok2vec=tok2vec, sgd=sgd) def __reduce__(self): - return (DependencyParser, (self.vocab, self.moves, self.model), - None, None) + return (DependencyParser, (self.vocab, self.moves, self.model), None, None) @property def labels(self): @@ -1010,6 +1018,11 @@ cdef class DependencyParser(Parser): cdef class EntityRecognizer(Parser): + """Pipeline component for named entity recognition. 
+ + DOCS: https://spacy.io/api/entityrecognizer + """ + name = "ner" TransitionSystem = BiluoPushDown nr_feature = 6 @@ -1040,4 +1053,4 @@ cdef class EntityRecognizer(Parser): if move[0] in ("B", "I", "L", "U"))) -__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer'] +__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer"] diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 26407ec59..433b30e8b 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -20,7 +20,7 @@ from . import util def get_string_id(key): """Get a string ID, handling the reserved symbols correctly. If the key is already an ID, return it. - + This function optimises for convenience over performance, so shouldn't be used in tight loops. """ @@ -31,12 +31,12 @@ def get_string_id(key): elif not key: return 0 else: - chars = key.encode('utf8') + chars = key.encode("utf8") return hash_utf8(chars, len(chars)) cpdef hash_t hash_string(unicode string) except 0: - chars = string.encode('utf8') + chars = string.encode("utf8") return hash_utf8(chars, len(chars)) @@ -51,9 +51,9 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil: cdef unicode decode_Utf8Str(const Utf8Str* string): cdef int i, length if string.s[0] < sizeof(string.s) and string.s[0] != 0: - return string.s[1:string.s[0]+1].decode('utf8') + return string.s[1:string.s[0]+1].decode("utf8") elif string.p[0] < 255: - return string.p[1:string.p[0]+1].decode('utf8') + return string.p[1:string.p[0]+1].decode("utf8") else: i = 0 length = 0 @@ -62,7 +62,7 @@ cdef unicode decode_Utf8Str(const Utf8Str* string): length += 255 length += string.p[i] i += 1 - return string.p[i:length + i].decode('utf8') + return string.p[i:length + i].decode("utf8") cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *: @@ -91,7 +91,10 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e cdef class StringStore: - """Look up strings by 64-bit hashes.""" + """Look up strings by 64-bit hashes. + + DOCS: https://spacy.io/api/stringstore + """ def __init__(self, strings=None, freeze=False): """Create the StringStore. @@ -113,7 +116,7 @@ cdef class StringStore: if isinstance(string_or_id, basestring) and len(string_or_id) == 0: return 0 elif string_or_id == 0: - return u'' + return "" elif string_or_id in SYMBOLS_BY_STR: return SYMBOLS_BY_STR[string_or_id] @@ -181,7 +184,7 @@ cdef class StringStore: elif isinstance(string, unicode): key = hash_string(string) else: - string = string.encode('utf8') + string = string.encode("utf8") key = hash_utf8(string, len(string)) if key < len(SYMBOLS_BY_INT): return True @@ -296,7 +299,7 @@ cdef class StringStore: cdef const Utf8Str* intern_unicode(self, unicode py_string): # 0 means missing, but we don't bother offsetting the index. 
- cdef bytes byte_string = py_string.encode('utf8') + cdef bytes byte_string = py_string.encode("utf8") return self._intern_utf8(byte_string, len(byte_string)) @cython.final diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 15a5f7274..4e0d49b59 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -3,16 +3,18 @@ # coding: utf8 from __future__ import unicode_literals -from collections import OrderedDict from cython.operator cimport dereference as deref from cython.operator cimport preincrement as preinc from cymem.cymem cimport Pool from preshed.maps cimport PreshMap -import re cimport cython +from collections import OrderedDict +import re + from .tokens.doc cimport Doc from .strings cimport hash_string + from .errors import Errors, Warnings, deprecation_warning from . import util @@ -20,6 +22,8 @@ from . import util cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries. + + DOCS: https://spacy.io/api/tokenizer """ def __init__(self, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, infix_finditer=None, token_match=None): @@ -40,6 +44,8 @@ cdef class Tokenizer: EXAMPLE: >>> tokenizer = Tokenizer(nlp.vocab) >>> tokenizer = English().Defaults.create_tokenizer(nlp) + + DOCS: https://spacy.io/api/tokenizer#init """ self.mem = Pool() self._cache = PreshMap() @@ -73,6 +79,8 @@ cdef class Tokenizer: string (unicode): The string to tokenize. RETURNS (Doc): A container for linguistic annotations. + + DOCS: https://spacy.io/api/tokenizer#call """ if len(string) >= (2 ** 30): raise ValueError(Errors.E025.format(length=len(string))) @@ -114,7 +122,7 @@ cdef class Tokenizer: cache_hit = self._try_cache(key, doc) if not cache_hit: self._tokenize(doc, span, key) - doc.c[doc.length - 1].spacy = string[-1] == ' ' and not in_ws + doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws return doc def pipe(self, texts, batch_size=1000, n_threads=2): @@ -122,9 +130,9 @@ cdef class Tokenizer: texts: A sequence of unicode texts. batch_size (int): Number of texts to accumulate in an internal buffer. - n_threads (int): Number of threads to use, if the implementation - supports multi-threading. The default tokenizer is single-threaded. YIELDS (Doc): A sequence of Doc objects, in order. + + DOCS: https://spacy.io/api/tokenizer#pipe """ for text in texts: yield self(text) @@ -235,7 +243,7 @@ cdef class Tokenizer: if not matches: tokens.push_back(self.vocab.get(tokens.mem, string), False) else: - # let's say we have dyn-o-mite-dave - the regex finds the + # Let's say we have dyn-o-mite-dave - the regex finds the # start and end positions of the hyphens start = 0 start_before_infixes = start @@ -257,7 +265,6 @@ cdef class Tokenizer: # https://github.com/explosion/spaCy/issues/768) infix_span = string[infix_start:infix_end] tokens.push_back(self.vocab.get(tokens.mem, infix_span), False) - start = infix_end span = string[start:] if span: @@ -274,7 +281,7 @@ cdef class Tokenizer: for i in range(n): if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL: return 0 - # See https://github.com/explosion/spaCy/issues/1250 + # See #1250 if has_special: return 0 cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) @@ -293,6 +300,8 @@ cdef class Tokenizer: RETURNS (list): A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. 
+ + DOCS: https://spacy.io/api/tokenizer#find_infix """ if self.infix_finditer is None: return 0 @@ -304,6 +313,8 @@ cdef class Tokenizer: string (unicode): The string to segment. RETURNS (int): The length of the prefix if present, otherwise `None`. + + DOCS: https://spacy.io/api/tokenizer#find_prefix """ if self.prefix_search is None: return 0 @@ -316,6 +327,8 @@ cdef class Tokenizer: string (unicode): The string to segment. Returns (int): The length of the suffix if present, otherwise `None`. + + DOCS: https://spacy.io/api/tokenizer#find_suffix """ if self.suffix_search is None: return 0 @@ -334,6 +347,8 @@ cdef class Tokenizer: token_attrs (iterable): A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. + + DOCS: https://spacy.io/api/tokenizer#add_special_case """ substrings = list(substrings) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) @@ -350,8 +365,10 @@ cdef class Tokenizer: path (unicode or Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or Path-like objects. + + DOCS: https://spacy.io/api/tokenizer#to_disk """ - with path.open('wb') as file_: + with path.open("wb") as file_: file_.write(self.to_bytes(**exclude)) def from_disk(self, path, **exclude): @@ -361,8 +378,10 @@ cdef class Tokenizer: path (unicode or Path): A path to a directory. Paths may be either strings or `Path`-like objects. RETURNS (Tokenizer): The modified `Tokenizer` object. + + DOCS: https://spacy.io/api/tokenizer#from_disk """ - with path.open('rb') as file_: + with path.open("rb") as file_: bytes_data = file_.read() self.from_bytes(bytes_data, **exclude) return self @@ -372,14 +391,16 @@ cdef class Tokenizer: **exclude: Named attributes to prevent from being serialized. RETURNS (bytes): The serialized form of the `Tokenizer` object. + + DOCS: https://spacy.io/api/tokenizer#to_bytes """ serializers = OrderedDict(( - ('vocab', lambda: self.vocab.to_bytes()), - ('prefix_search', lambda: _get_regex_pattern(self.prefix_search)), - ('suffix_search', lambda: _get_regex_pattern(self.suffix_search)), - ('infix_finditer', lambda: _get_regex_pattern(self.infix_finditer)), - ('token_match', lambda: _get_regex_pattern(self.token_match)), - ('exceptions', lambda: OrderedDict(sorted(self._rules.items()))) + ("vocab", lambda: self.vocab.to_bytes()), + ("prefix_search", lambda: _get_regex_pattern(self.prefix_search)), + ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)), + ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), + ("token_match", lambda: _get_regex_pattern(self.token_match)), + ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) )) return util.to_bytes(serializers, exclude) @@ -389,26 +410,28 @@ cdef class Tokenizer: bytes_data (bytes): The data to load from. **exclude: Named attributes to prevent from being loaded. RETURNS (Tokenizer): The `Tokenizer` object. 
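The special-case API documented in this hunk is easiest to see in use; a minimal sketch, with a made-up `gimme` rule on a blank English pipeline:

```
from spacy.attrs import ORTH, LEMMA
from spacy.lang.en import English

nlp = English()
# "gimme" should always be split into "gim" + "me", with a lemma override.
nlp.tokenizer.add_special_case(u"gimme", [{ORTH: u"gim", LEMMA: u"give"},
                                          {ORTH: u"me"}])
doc = nlp(u"gimme that")
assert [t.text for t in doc] == [u"gim", u"me", u"that"]
```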
+ + DOCS: https://spacy.io/api/tokenizer#from_bytes """ data = OrderedDict() deserializers = OrderedDict(( - ('vocab', lambda b: self.vocab.from_bytes(b)), - ('prefix_search', lambda b: data.setdefault('prefix_search', b)), - ('suffix_search', lambda b: data.setdefault('suffix_search', b)), - ('infix_finditer', lambda b: data.setdefault('infix_finditer', b)), - ('token_match', lambda b: data.setdefault('token_match', b)), - ('exceptions', lambda b: data.setdefault('rules', b)) + ("vocab", lambda b: self.vocab.from_bytes(b)), + ("prefix_search", lambda b: data.setdefault("prefix_search", b)), + ("suffix_search", lambda b: data.setdefault("suffix_search", b)), + ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), + ("token_match", lambda b: data.setdefault("token_match", b)), + ("exceptions", lambda b: data.setdefault("rules", b)) )) msg = util.from_bytes(bytes_data, deserializers, exclude) - if data.get('prefix_search'): - self.prefix_search = re.compile(data['prefix_search']).search - if data.get('suffix_search'): - self.suffix_search = re.compile(data['suffix_search']).search - if data.get('infix_finditer'): - self.infix_finditer = re.compile(data['infix_finditer']).finditer - if data.get('token_match'): - self.token_match = re.compile(data['token_match']).match - for string, substrings in data.get('rules', {}).items(): + if data.get("prefix_search"): + self.prefix_search = re.compile(data["prefix_search"]).search + if data.get("suffix_search"): + self.suffix_search = re.compile(data["suffix_search"]).search + if data.get("infix_finditer"): + self.infix_finditer = re.compile(data["infix_finditer"]).finditer + if data.get("token_match"): + self.token_match = re.compile(data["token_match"]).match + for string, substrings in data.get("rules", {}).items(): self.add_special_case(string, substrings) return self diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py index b4815abd2..5722d45bc 100644 --- a/spacy/tokens/__init__.py +++ b/spacy/tokens/__init__.py @@ -1,5 +1,8 @@ +# coding: utf8 +from __future__ import unicode_literals + from .doc import Doc from .token import Token from .span import Span -__all__ = ['Doc', 'Token', 'Span'] +__all__ = ["Doc", "Token", "Span"] diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 50ab82e0e..b25a1a697 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -6,11 +6,11 @@ from __future__ import unicode_literals from libc.string cimport memcpy, memset from libc.stdlib cimport malloc, free - -import numpy from cymem.cymem cimport Pool from thinc.neural.util import get_array_module +import numpy + from .doc cimport Doc, set_children_from_heads, token_by_start, token_by_end from .span cimport Span from .token cimport Token @@ -26,11 +26,16 @@ from ..strings import get_string_id cdef class Retokenizer: - """Helper class for doc.retokenize() context manager.""" + """Helper class for doc.retokenize() context manager. + + DOCS: https://spacy.io/api/doc#retokenize + USAGE: https://spacy.io/usage/linguistic-features#retokenization + """ cdef Doc doc cdef list merges cdef list splits cdef set tokens_to_merge + def __init__(self, doc): self.doc = doc self.merges = [] @@ -40,6 +45,11 @@ cdef class Retokenizer: def merge(self, Span span, attrs=SimpleFrozenDict()): """Mark a span for merging. The attrs will be applied to the resulting token. + + span (Span): The span to merge. + attrs (dict): Attributes to set on the merged token. 
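A minimal usage sketch of the merge call described here, assuming a blank English pipeline used only for tokenization:

```
from spacy.lang.en import English

nlp = English()
doc = nlp(u"I live in New York City")
with doc.retokenize() as retokenizer:
    # Merge the three tokens of "New York City" into one, overriding the lemma.
    retokenizer.merge(doc[3:6], attrs={"LEMMA": u"New York City"})
assert len(doc) == 4
assert doc[3].text == u"New York City"
```

All marked merges are applied at once when the context manager exits, which is what keeps the bulk path efficient.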
+ + DOCS: https://spacy.io/api/doc#retokenizer.merge """ for token in span: if token.i in self.tokens_to_merge: @@ -58,6 +68,16 @@ cdef class Retokenizer: def split(self, Token token, orths, heads, attrs=SimpleFrozenDict()): """Mark a Token for splitting, into the specified orths. The attrs will be applied to each subtoken. + + token (Token): The token to split. + orths (list): The verbatim text of the split tokens. Needs to match the + text of the original token. + heads (list): List of token or `(token, subtoken)` tuples specifying the + tokens to attach the newly split subtokens to. + attrs (dict): Attributes to set on all split tokens. Attribute names + mapped to list of per-token attribute values. + + DOCS: https://spacy.io/api/doc#retokenizer.split """ if ''.join(orths) != token.text: raise ValueError(Errors.E117.format(new=''.join(orths), old=token.text)) @@ -104,14 +124,12 @@ cdef class Retokenizer: # referred to in the splits. If we merged these tokens previously, we # have to raise an error if token_index == -1: - raise IndexError( - "Cannot find token to be split. Did it get merged?") + raise IndexError(Errors.E122) head_indices = [] for head_char, subtoken in heads: head_index = token_by_start(self.doc.c, self.doc.length, head_char) if head_index == -1: - raise IndexError( - "Cannot find head of token to be split. Did it get merged?") + raise IndexError(Errors.E123) # We want to refer to the token index of the head *after* the # mergery. We need to account for the extra tokens introduced. # e.g., let's say we have [ab, c] and we want a and b to depend @@ -206,7 +224,6 @@ def _merge(Doc doc, int start, int end, attributes): doc.c[i].head -= i # Set the left/right children, left/right edges set_children_from_heads(doc.c, doc.length) - # Clear the cached Python objects # Return the merged Python object return doc[start] @@ -336,7 +353,7 @@ def _bulk_merge(Doc doc, merges): # Make sure ent_iob remains consistent for (span, _) in merges: if(span.end < len(offsets)): - #if it's not the last span + # If it's not the last span token_after_span_position = offsets[span.end] if doc.c[token_after_span_position].ent_iob == 1\ and doc.c[token_after_span_position - 1].ent_iob in (0, 2): diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 5c3bf9c70..43ea78242 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals import numpy @@ -16,9 +17,8 @@ class Binder(object): def __init__(self, attrs=None): """Create a Binder object, to hold serialized annotations. - attrs (list): - List of attributes to serialize. 'orth' and 'spacy' are always - serialized, so they're not required. Defaults to None. + attrs (list): List of attributes to serialize. 'orth' and 'spacy' are + always serialized, so they're not required. Defaults to None. 
""" attrs = attrs or [] self.attrs = list(attrs) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2bef44cbc..1dfcd1687 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -7,28 +7,25 @@ from __future__ import unicode_literals cimport cython cimport numpy as np +from libc.string cimport memcpy, memset +from libc.math cimport sqrt + import numpy import numpy.linalg import struct import srsly from thinc.neural.util import get_array_module, copy_array -import srsly -from libc.string cimport memcpy, memset -from libc.math cimport sqrt - -from .span cimport Span -from .token cimport Token from .span cimport Span from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t -from ..attrs import intify_attrs, IDS -from ..attrs cimport attr_id_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB -from ..attrs cimport ENT_TYPE, SENT_START +from ..attrs cimport ENT_TYPE, SENT_START, attr_id_t from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t + +from ..attrs import intify_attrs, IDS from ..util import normalize_slice from ..compat import is_config, copy_reg, pickle, basestring_ from ..errors import deprecation_warning, models_warning, user_warning @@ -37,6 +34,7 @@ from .. import util from .underscore import Underscore, get_ext_args from ._retokenize import Retokenizer + DEF PADDING = 5 @@ -77,7 +75,7 @@ def _get_chunker(lang): return None except KeyError: return None - return cls.Defaults.syntax_iterators.get(u'noun_chunks') + return cls.Defaults.syntax_iterators.get("noun_chunks") cdef class Doc: @@ -94,23 +92,60 @@ cdef class Doc: >>> from spacy.tokens import Doc >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False]) + + DOCS: https://spacy.io/api/doc """ + @classmethod def set_extension(cls, name, **kwargs): - if cls.has_extension(name) and not kwargs.get('force', False): - raise ValueError(Errors.E090.format(name=name, obj='Doc')) + """Define a custom attribute which becomes available as `Doc._`. + + name (unicode): Name of the attribute to set. + default: Optional default value of the attribute. + getter (callable): Optional getter function. + setter (callable): Optional setter function. + method (callable): Optional method for method extension. + force (bool): Force overwriting existing attribute. + + DOCS: https://spacy.io/api/doc#set_extension + USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes + """ + if cls.has_extension(name) and not kwargs.get("force", False): + raise ValueError(Errors.E090.format(name=name, obj="Doc")) Underscore.doc_extensions[name] = get_ext_args(**kwargs) @classmethod def get_extension(cls, name): + """Look up a previously registered extension by name. + + name (unicode): Name of the extension. + RETURNS (tuple): A `(default, method, getter, setter)` tuple. + + DOCS: https://spacy.io/api/doc#get_extension + """ return Underscore.doc_extensions.get(name) @classmethod def has_extension(cls, name): + """Check whether an extension has been registered. + + name (unicode): Name of the extension. + RETURNS (bool): Whether the extension has been registered. + + DOCS: https://spacy.io/api/doc#has_extension + """ return name in Underscore.doc_extensions @classmethod def remove_extension(cls, name): + """Remove a previously registered extension. + + name (unicode): Name of the extension. 
+ RETURNS (tuple): A `(default, method, getter, setter)` tuple of the + removed extension. + + DOCS: https://spacy.io/api/doc#remove_extension + """ if not cls.has_extension(name): raise ValueError(Errors.E046.format(name=name)) return Underscore.doc_extensions.pop(name) @@ -128,6 +163,8 @@ cdef class Doc: it is not. If `None`, defaults to `[True]*len(words)` user_data (dict or None): Optional extra data to attach to the Doc. RETURNS (Doc): The newly constructed object. + + DOCS: https://spacy.io/api/doc#init """ self.vocab = vocab size = 20 @@ -151,7 +188,7 @@ cdef class Doc: self.user_hooks = {} self.user_token_hooks = {} self.user_span_hooks = {} - self.tensor = numpy.zeros((0,), dtype='float32') + self.tensor = numpy.zeros((0,), dtype="float32") self.user_data = {} if user_data is None else user_data self._vector = None self.noun_chunks_iterator = _get_chunker(self.vocab.lang) @@ -184,6 +221,7 @@ cdef class Doc: @property def _(self): + """Custom extension attributes registered via `set_extension`.""" return Underscore(Underscore.doc_extensions, self) @property @@ -195,7 +233,7 @@ cdef class Doc: b) sent.is_parsed is set to True; c) At least one token other than the first where sent_start is not None. """ - if 'sents' in self.user_hooks: + if "sents" in self.user_hooks: return True if self.is_parsed: return True @@ -227,11 +265,12 @@ cdef class Doc: supported, as `Span` objects must be contiguous (cannot have gaps). You can use negative indices and open-ended ranges, which have their normal Python semantics. + + DOCS: https://spacy.io/api/doc#getitem """ if isinstance(i, slice): start, stop = normalize_slice(len(self), i.start, i.stop, i.step) return Span(self, start, stop, label=0) - if i < 0: i = self.length + i bounds_check(i, self.length, PADDING) @@ -244,8 +283,7 @@ cdef class Doc: than-Python speeds are required, you can instead access the annotations as a numpy array, or access the underlying C data directly from Cython. - EXAMPLE: - >>> for token in doc + DOCS: https://spacy.io/api/doc#iter """ cdef int i for i in range(self.length): @@ -256,16 +294,15 @@ cdef class Doc: RETURNS (int): The number of tokens in the document. - EXAMPLE: - >>> len(doc) + DOCS: https://spacy.io/api/doc#len """ return self.length def __unicode__(self): - return u''.join([t.text_with_ws for t in self]) + return "".join([t.text_with_ws for t in self]) def __bytes__(self): - return u''.join([t.text_with_ws for t in self]).encode('utf-8') + return "".join([t.text_with_ws for t in self]).encode("utf-8") def __str__(self): if is_config(python3=True): @@ -290,6 +327,8 @@ cdef class Doc: vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. RETURNS (Span): The newly constructed object. + + DOCS: https://spacy.io/api/doc#char_span """ if not isinstance(label, int): label = self.vocab.strings.add(label) @@ -311,9 +350,11 @@ cdef class Doc: other (object): The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. RETURNS (float): A scalar similarity score. Higher is more similar. 
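A short usage sketch for the similarity hook above; the score comes from averaged word vectors, so a vectors model such as `en_core_web_md` is assumed here:

```
import spacy

nlp = spacy.load("en_core_web_md")  # assumed model with word vectors
doc1 = nlp(u"I like apples")
doc2 = nlp(u"I like oranges")
# Cosine similarity of the averaged token vectors; higher is more similar.
print(doc1.similarity(doc2))
```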
+ + DOCS: https://spacy.io/api/doc#similarity """ - if 'similarity' in self.user_hooks: - return self.user_hooks['similarity'](self, other) + if "similarity" in self.user_hooks: + return self.user_hooks["similarity"](self, other) if isinstance(other, (Lexeme, Token)) and self.length == 1: if self.c[0].lex.orth == other.orth: return 1.0 @@ -325,9 +366,9 @@ cdef class Doc: else: return 1.0 if self.vocab.vectors.n_keys == 0: - models_warning(Warnings.W007.format(obj='Doc')) + models_warning(Warnings.W007.format(obj="Doc")) if self.vector_norm == 0 or other.vector_norm == 0: - user_warning(Warnings.W008.format(obj='Doc')) + user_warning(Warnings.W008.format(obj="Doc")) return 0.0 vector = self.vector xp = get_array_module(vector) @@ -338,10 +379,12 @@ cdef class Doc: the object. RETURNS (bool): Whether a word vector is associated with the object. + + DOCS: https://spacy.io/api/doc#has_vector """ def __get__(self): - if 'has_vector' in self.user_hooks: - return self.user_hooks['has_vector'](self) + if "has_vector" in self.user_hooks: + return self.user_hooks["has_vector"](self) elif self.vocab.vectors.data.size: return True elif self.tensor.size: @@ -355,15 +398,16 @@ cdef class Doc: RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array representing the document's semantics. + + DOCS: https://spacy.io/api/doc#vector """ def __get__(self): - if 'vector' in self.user_hooks: - return self.user_hooks['vector'](self) + if "vector" in self.user_hooks: + return self.user_hooks["vector"](self) if self._vector is not None: return self._vector elif not len(self): - self._vector = numpy.zeros((self.vocab.vectors_length,), - dtype='f') + self._vector = numpy.zeros((self.vocab.vectors_length,), dtype="f") return self._vector elif self.vocab.vectors.data.size > 0: self._vector = sum(t.vector for t in self) / len(self) @@ -372,8 +416,7 @@ cdef class Doc: self._vector = self.tensor.mean(axis=0) return self._vector else: - return numpy.zeros((self.vocab.vectors_length,), - dtype='float32') + return numpy.zeros((self.vocab.vectors_length,), dtype="float32") def __set__(self, value): self._vector = value @@ -382,10 +425,12 @@ cdef class Doc: """The L2 norm of the document's vector representation. RETURNS (float): The L2 norm of the vector representation. + + DOCS: https://spacy.io/api/doc#vector_norm """ def __get__(self): - if 'vector_norm' in self.user_hooks: - return self.user_hooks['vector_norm'](self) + if "vector_norm" in self.user_hooks: + return self.user_hooks["vector_norm"](self) cdef float value cdef double norm = 0 if self._vector_norm is None: @@ -404,7 +449,7 @@ cdef class Doc: RETURNS (unicode): The original verbatim text of the document. """ def __get__(self): - return u''.join(t.text_with_ws for t in self) + return "".join(t.text_with_ws for t in self) property text_with_ws: """An alias of `Doc.text`, provided for duck-type compatibility with @@ -416,21 +461,12 @@ cdef class Doc: return self.text property ents: - """Iterate over the entities in the document. Yields named-entity - `Span` objects, if the entity recognizer has been applied to the - document. + """The named entities in the document. Returns a tuple of named entity + `Span` objects, if the entity recognizer has been applied. - YIELDS (Span): Entities in the document. + RETURNS (tuple): Entities in the document, one `Span` per entity. - EXAMPLE: Iterate over the span to get individual Token objects, - or access the label: - - >>> tokens = nlp(u'Mr. 
Best flew to New York on Saturday morning.') - >>> ents = list(tokens.ents) - >>> assert ents[0].label == 346 - >>> assert ents[0].label_ == 'PERSON' - >>> assert ents[0].orth_ == 'Best' - >>> assert ents[0].text == 'Mr. Best' + DOCS: https://spacy.io/api/doc#ents """ def __get__(self): cdef int i @@ -442,8 +478,8 @@ cdef class Doc: token = &self.c[i] if token.ent_iob == 1: if start == -1: - seq = ['%s|%s' % (t.text, t.ent_iob_) for t in self[i-5:i+5]] - raise ValueError(Errors.E093.format(seq=' '.join(seq))) + seq = ["%s|%s" % (t.text, t.ent_iob_) for t in self[i-5:i+5]] + raise ValueError(Errors.E093.format(seq=" ".join(seq))) elif token.ent_iob == 2 or token.ent_iob == 0: if start != -1: output.append(Span(self, start, i, label=label)) @@ -465,7 +501,6 @@ cdef class Doc: # prediction # 3. Test basic data-driven ORTH gazetteer # 4. Test more nuanced date and currency regex - tokens_in_ents = {} cdef attr_t entity_type cdef int ent_start, ent_end @@ -479,7 +514,6 @@ cdef class Doc: self.vocab.strings[tokens_in_ents[token_index][2]]), span2=(ent_start, ent_end, self.vocab.strings[entity_type]))) tokens_in_ents[token_index] = (ent_start, ent_end, entity_type) - cdef int i for i in range(self.length): self.c[i].ent_type = 0 @@ -510,6 +544,8 @@ cdef class Doc: clauses. YIELDS (Span): Noun chunks in the document. + + DOCS: https://spacy.io/api/doc#noun_chunks """ def __get__(self): if not self.is_parsed: @@ -533,15 +569,15 @@ cdef class Doc: dependency parse. If the parser is disabled, the `sents` iterator will be unavailable. - EXAMPLE: - >>> doc = nlp("This is a sentence. Here's another...") - >>> assert [s.root.text for s in doc.sents] == ["is", "'s"] + YIELDS (Span): Sentences in the document. + + DOCS: https://spacy.io/api/doc#sents """ def __get__(self): if not self.is_sentenced: raise ValueError(Errors.E030) - if 'sents' in self.user_hooks: - yield from self.user_hooks['sents'](self) + if "sents" in self.user_hooks: + yield from self.user_hooks["sents"](self) else: start = 0 for i in range(1, self.length): @@ -606,17 +642,16 @@ cdef class Doc: if isinstance(py_attr_ids, basestring_): # Handle inputs like doc.to_array('ORTH') py_attr_ids = [py_attr_ids] - elif not hasattr(py_attr_ids, '__iter__'): + elif not hasattr(py_attr_ids, "__iter__"): # Handle inputs like doc.to_array(ORTH) py_attr_ids = [py_attr_ids] # Allow strings, e.g. 'lemma' or 'LEMMA' - py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_) + py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) for id_ in py_attr_ids] # Make an array from the attributes --- otherwise our inner loop is # Python dict iteration. - cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype='i') - output = numpy.ndarray(shape=(self.length, len(attr_ids)), - dtype=numpy.uint64) + cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i") + output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64) c_output = output.data c_attr_ids = attr_ids.data cdef TokenC* token @@ -628,8 +663,7 @@ cdef class Doc: # Handle 1d case return output if len(attr_ids) >= 2 else output.reshape((self.length,)) - def count_by(self, attr_id_t attr_id, exclude=None, - PreshCounter counts=None): + def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): """Count the frequencies of a given attribute. Produces a dict of `{attribute (int): count (ints)}` frequencies, keyed by the values of the given attribute ID. @@ -637,13 +671,7 @@ cdef class Doc: attr_id (int): The attribute ID to key the counts. 
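A sketch of `count_by` in use; the returned dict is keyed by attribute hashes, so the values are resolved back through the vocab's `StringStore` (blank English pipeline assumed):

```
from spacy.attrs import ORTH
from spacy.lang.en import English

nlp = English()
doc = nlp(u"apple apple orange banana")
counts = doc.count_by(ORTH)  # maps ORTH hashes to counts
readable = {doc.vocab.strings[key]: count for key, count in counts.items()}
assert readable[u"apple"] == 2
```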
RETURNS (dict): A dictionary mapping attributes to integer counts. - EXAMPLE: - >>> from spacy import attrs - >>> doc = nlp(u'apple apple orange banana') - >>> tokens.count_by(attrs.ORTH) - {12800L: 1, 11880L: 2, 7561L: 1} - >>> tokens.to_array([attrs.ORTH]) - array([[11880], [11880], [7561], [12800]]) + DOCS: https://spacy.io/api/doc#count_by """ cdef int i cdef attr_t attr @@ -684,13 +712,21 @@ cdef class Doc: cdef void set_parse(self, const TokenC* parsed) nogil: # TODO: This method is fairly misleading atm. It's used by Parser # to actually apply the parse calculated. Need to rethink this. - # Probably we should use from_array? self.is_parsed = True for i in range(self.length): self.c[i] = parsed[i] def from_array(self, attrs, array): + """Load attributes from a numpy array. Write to a `Doc` object, from an + `(M, N)` array of attributes. + + attrs (list) A list of attribute ID ints. + array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values. + RETURNS (Doc): Itself. + + DOCS: https://spacy.io/api/doc#from_array + """ if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) cdef int i, col @@ -714,10 +750,10 @@ cdef class Doc: for i in range(length): if array[i, col] != 0: self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) - # set flags + # Set flags self.is_parsed = bool(self.is_parsed or HEAD in attrs or DEP in attrs) self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) - # if document is parsed, set children + # If document is parsed, set children if self.is_parsed: set_children_from_heads(self.c, self.length) return self @@ -729,6 +765,8 @@ cdef class Doc: RETURNS (np.array[ndim=2, dtype=numpy.int32]): LCA matrix with shape (n, n), where n = len(self). + + DOCS: https://spacy.io/api/doc#get_lca_matrix """ return numpy.asarray(_get_lca_matrix(self, 0, len(self))) @@ -737,9 +775,11 @@ cdef class Doc: path (unicode or Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or Path-like objects. + + DOCS: https://spacy.io/api/doc#to_disk """ path = util.ensure_path(path) - with path.open('wb') as file_: + with path.open("wb") as file_: file_.write(self.to_bytes(**exclude)) def from_disk(self, path, **exclude): @@ -749,9 +789,11 @@ cdef class Doc: path (unicode or Path): A path to a directory. Paths may be either strings or `Path`-like objects. RETURNS (Doc): The modified `Doc` object. + + DOCS: https://spacy.io/api/doc#from_disk """ path = util.ensure_path(path) - with path.open('rb') as file_: + with path.open("rb") as file_: bytes_data = file_.read() return self.from_bytes(bytes_data, **exclude) @@ -760,15 +802,16 @@ cdef class Doc: RETURNS (bytes): A losslessly serialized copy of the `Doc`, including all annotations. + + DOCS: https://spacy.io/api/doc#to_bytes """ array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE] - if self.is_tagged: array_head.append(TAG) - # if doc parsed add head and dep attribute + # If doc parsed add head and dep attribute if self.is_parsed: array_head.extend([HEAD, DEP]) - # otherwise add sent_start + # Otherwise add sent_start else: array_head.append(SENT_START) # Msgpack doesn't distinguish between lists and tuples, which is @@ -776,17 +819,16 @@ cdef class Doc: # keys, we must have tuples. In values we just have to hope # users don't mind getting a list instead of a tuple. 
serializers = { - 'text': lambda: self.text, - 'array_head': lambda: array_head, - 'array_body': lambda: self.to_array(array_head), - 'sentiment': lambda: self.sentiment, - 'tensor': lambda: self.tensor, + "text": lambda: self.text, + "array_head": lambda: array_head, + "array_body": lambda: self.to_array(array_head), + "sentiment": lambda: self.sentiment, + "tensor": lambda: self.tensor, } - if 'user_data' not in exclude and self.user_data: + if "user_data" not in exclude and self.user_data: user_data_keys, user_data_values = list(zip(*self.user_data.items())) - serializers['user_data_keys'] = lambda: srsly.msgpack_dumps(user_data_keys) - serializers['user_data_values'] = lambda: srsly.msgpack_dumps(user_data_values) - + serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys) + serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) return util.to_bytes(serializers, exclude) def from_bytes(self, bytes_data, **exclude): @@ -794,42 +836,40 @@ cdef class Doc: data (bytes): The string to load from. RETURNS (Doc): Itself. + + DOCS: https://spacy.io/api/doc#from_bytes """ if self.length != 0: raise ValueError(Errors.E033.format(length=self.length)) deserializers = { - 'text': lambda b: None, - 'array_head': lambda b: None, - 'array_body': lambda b: None, - 'sentiment': lambda b: None, - 'tensor': lambda b: None, - 'user_data_keys': lambda b: None, - 'user_data_values': lambda b: None, + "text": lambda b: None, + "array_head": lambda b: None, + "array_body": lambda b: None, + "sentiment": lambda b: None, + "tensor": lambda b: None, + "user_data_keys": lambda b: None, + "user_data_values": lambda b: None, } - msg = util.from_bytes(bytes_data, deserializers, exclude) # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. As a best guess, we *know* that within # keys, we must have tuples. In values we just have to hope # users don't mind getting a list instead of a tuple. - if 'user_data' not in exclude and 'user_data_keys' in msg: - user_data_keys = srsly.msgpack_loads(msg['user_data_keys'], use_list=False) - user_data_values = srsly.msgpack_loads(msg['user_data_values']) + if "user_data" not in exclude and "user_data_keys" in msg: + user_data_keys = srsly.msgpack_loads(msg["user_data_keys"], use_list=False) + user_data_values = srsly.msgpack_loads(msg["user_data_values"]) for key, value in zip(user_data_keys, user_data_values): self.user_data[key] = value - cdef int i, start, end, has_space - - if 'sentiment' not in exclude and 'sentiment' in msg: - self.sentiment = msg['sentiment'] - if 'tensor' not in exclude and 'tensor' in msg: - self.tensor = msg['tensor'] - + if "sentiment" not in exclude and "sentiment" in msg: + self.sentiment = msg["sentiment"] + if "tensor" not in exclude and "tensor" in msg: + self.tensor = msg["tensor"] start = 0 cdef const LexemeC* lex cdef unicode orth_ - text = msg['text'] - attrs = msg['array_body'] + text = msg["text"] + attrs = msg["array_body"] for i in range(attrs.shape[0]): end = start + attrs[i, 0] has_space = attrs[i, 1] @@ -837,11 +877,11 @@ cdef class Doc: lex = self.vocab.get(self.mem, orth_) self.push_back(lex, has_space) start = end + has_space - self.from_array(msg['array_head'][2:], attrs[:, 2:]) + self.from_array(msg["array_head"][2:], attrs[:, 2:]) return self def extend_tensor(self, tensor): - '''Concatenate a new tensor onto the doc.tensor object. + """Concatenate a new tensor onto the doc.tensor object. 
The doc.tensor attribute holds dense feature vectors computed by the models in the pipeline. Let's say a @@ -849,7 +889,7 @@ cdef class Doc: per word. doc.tensor.shape will be (30, 128). After calling doc.extend_tensor with an array of shape (30, 64), doc.tensor == (30, 192). - ''' + """ xp = get_array_module(self.tensor) if self.tensor.size == 0: self.tensor.resize(tensor.shape, refcheck=False) @@ -858,7 +898,7 @@ cdef class Doc: self.tensor = xp.hstack((self.tensor, tensor)) def retokenize(self): - '''Context manager to handle retokenization of the Doc. + """Context manager to handle retokenization of the Doc. Modifications to the Doc's tokenization are stored, and then made all at once when the context manager exits. This is much more efficient, and less error-prone. @@ -866,7 +906,10 @@ cdef class Doc: All views of the Doc (Span and Token) created before the retokenization are invalidated, although they may accidentally continue to work. - ''' + + DOCS: https://spacy.io/api/doc#retokenize + USAGE: https://spacy.io/usage/linguistic-features#retokenization + """ return Retokenizer(self) def _bulk_merge(self, spans, attributes): @@ -882,9 +925,10 @@ cdef class Doc: RETURNS (Token): The first newly merged token. """ cdef unicode tag, lemma, ent_type - - assert len(attributes) == len(spans), "attribute length should be equal to span length" + str(len(attributes)) +\ - str(len(spans)) + attr_len = len(attributes) + span_len = len(spans) + if not attr_len == span_len: + raise ValueError(Errors.E121.format(attr_len=attr_len, span_len=span_len)) with self.retokenize() as retokenizer: for i, span in enumerate(spans): fix_attributes(self, attributes[i]) @@ -915,13 +959,10 @@ cdef class Doc: elif not args: fix_attributes(self, attributes) elif args: - raise ValueError(Errors.E034.format(n_args=len(args), - args=repr(args), + raise ValueError(Errors.E034.format(n_args=len(args), args=repr(args), kwargs=repr(attributes))) remove_label_if_necessary(attributes) - attributes = intify_attrs(attributes, strings_map=self.vocab.strings) - cdef int start = token_by_start(self.c, self.length, start_idx) if start == -1: return None @@ -938,44 +979,47 @@ cdef class Doc: raise ValueError(Errors.E105) def to_json(self, underscore=None): - """Convert a Doc to JSON. Produces the same format used by the spacy - train command. + """Convert a Doc to JSON. The format it produces will be the new format + for the `spacy train` command (not implemented yet). underscore (list): Optional list of string names of custom doc._. attributes. Attribute values need to be JSON-serializable. Values will be added to an "_" key in the data, e.g. "_": {"foo": "bar"}. RETURNS (dict): The data in spaCy's JSON format. 
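A sketch of the JSON output described above, assuming a full statistical model such as `en_core_web_sm` so that sentence boundaries and annotations are available:

```
import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model with tagger/parser/NER
doc = nlp(u"Apple is looking at U.K. startups")
data = doc.to_json()
assert data["text"] == doc.text
# Each token dict carries offsets plus tag/pos/dep/head where they are set.
print(data["tokens"][0])
```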
+ + DOCS: https://spacy.io/api/doc#to_json """ - data = {'text': self.text} - data['ents'] = [{'start': ent.start_char, 'end': ent.end_char, - 'label': ent.label_} for ent in self.ents] + data = {"text": self.text} + if self.ents: + data["ents"] = [{"start": ent.start_char, "end": ent.end_char, + "label": ent.label_} for ent in self.ents] sents = list(self.sents) if sents: - data['sents'] = [{'start': sent.start_char, 'end': sent.end_char} + data["sents"] = [{"start": sent.start_char, "end": sent.end_char} for sent in sents] if self.cats: - data['cats'] = self.cats - data['tokens'] = [] + data["cats"] = self.cats + data["tokens"] = [] for token in self: - token_data = {'id': token.i, 'start': token.idx, 'end': token.idx + len(token)} + token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)} if token.pos_: - token_data['pos'] = token.pos_ + token_data["pos"] = token.pos_ if token.tag_: - token_data['tag'] = token.tag_ + token_data["tag"] = token.tag_ if token.dep_: - token_data['dep'] = token.dep_ + token_data["dep"] = token.dep_ if token.head: - token_data['head'] = token.head.i - data['tokens'].append(token_data) + token_data["head"] = token.head.i + data["tokens"].append(token_data) if underscore: - data['_'] = {} + data["_"] = {} for attr in underscore: if not self.has_extension(attr): raise ValueError(Errors.E106.format(attr=attr, opts=underscore)) value = self._.get(attr) if not srsly.is_json_serializable(value): raise ValueError(Errors.E107.format(attr=attr, value=repr(value))) - data['_'][attr] = value + data["_"][attr] = value return data @@ -1007,9 +1051,8 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: tokens[i].r_kids = 0 tokens[i].l_edge = i tokens[i].r_edge = i - # Three times, for non-projectivity - # See issue #3170. This isn't a very satisfying fix, but I think it's - # sufficient. + # Three times, for non-projectivity. See issue #3170. This isn't a very + # satisfying fix, but I think it's sufficient. for loop_count in range(3): # Set left edges for i in range(length): @@ -1021,7 +1064,7 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: head.l_edge = child.l_edge if child.r_edge > head.r_edge: head.r_edge = child.r_edge - # Set right edges --- same as above, but iterate in reverse + # Set right edges - same as above, but iterate in reverse for i in range(length-1, -1, -1): child = &tokens[i] head = &tokens[i + child.head] @@ -1052,20 +1095,14 @@ cdef int _get_tokens_lca(Token token_j, Token token_k): return token_k.i elif token_k.head == token_j: return token_j.i - token_j_ancestors = set(token_j.ancestors) - if token_k in token_j_ancestors: return token_k.i - for token_k_ancestor in token_k.ancestors: - if token_k_ancestor == token_j: return token_j.i - if token_k_ancestor in token_j_ancestors: return token_k_ancestor.i - return -1 @@ -1083,12 +1120,10 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): with shape (n, n), where n = len(doc). 
""" cdef int [:,:] lca_matrix - n_tokens= end - start lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32) lca_mat.fill(-1) lca_matrix = lca_mat - for j in range(n_tokens): token_j = doc[start + j] # the common ancestor of token and itself is itself: @@ -1109,7 +1144,6 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): else: lca_matrix[j, k] = lca - start lca_matrix[k, j] = lca - start - return lca_matrix @@ -1123,8 +1157,7 @@ def pickle_doc(doc): def unpickle_doc(vocab, hooks_and_data, bytes_data): user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data) - doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, - exclude='user_data') + doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude="user_data") doc.user_hooks.update(doc_hooks) doc.user_span_hooks.update(span_hooks) doc.user_token_hooks.update(token_hooks) @@ -1133,19 +1166,22 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data): copy_reg.pickle(Doc, pickle_doc, unpickle_doc) + def remove_label_if_necessary(attributes): # More deprecated attribute handling =/ - if 'label' in attributes: - attributes['ent_type'] = attributes.pop('label') + if "label" in attributes: + attributes["ent_type"] = attributes.pop("label") + def fix_attributes(doc, attributes): - if 'label' in attributes and 'ent_type' not in attributes: - if isinstance(attributes['label'], int): - attributes[ENT_TYPE] = attributes['label'] + if "label" in attributes and "ent_type" not in attributes: + if isinstance(attributes["label"], int): + attributes[ENT_TYPE] = attributes["label"] else: - attributes[ENT_TYPE] = doc.vocab.strings[attributes['label']] - if 'ent_type' in attributes: - attributes[ENT_TYPE] = attributes['ent_type'] + attributes[ENT_TYPE] = doc.vocab.strings[attributes["label"]] + if "ent_type" in attributes: + attributes[ENT_TYPE] = attributes["ent_type"] + def get_entity_info(ent_info): if isinstance(ent_info, Span): diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 1450eb214..48e791102 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -1,12 +1,13 @@ # coding: utf8 from __future__ import unicode_literals -from collections import defaultdict cimport numpy as np +from libc.math cimport sqrt + import numpy import numpy.linalg -from libc.math cimport sqrt from thinc.neural.util import get_array_module +from collections import defaultdict from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix from .token cimport TokenC @@ -14,9 +15,10 @@ from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t, hash_t from ..attrs cimport attr_id_t from ..parts_of_speech cimport univ_pos_t -from ..util import normalize_slice from ..attrs cimport * from ..lexeme cimport Lexeme + +from ..util import normalize_slice from ..compat import is_config, basestring_ from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning from ..errors import deprecation_warning @@ -24,29 +26,66 @@ from .underscore import Underscore, get_ext_args cdef class Span: - """A slice from a Doc object.""" + """A slice from a Doc object. + + DOCS: https://spacy.io/api/span + """ @classmethod def set_extension(cls, name, **kwargs): - if cls.has_extension(name) and not kwargs.get('force', False): - raise ValueError(Errors.E090.format(name=name, obj='Span')) + """Define a custom attribute which becomes available as `Span._`. + + name (unicode): Name of the attribute to set. + default: Optional default value of the attribute. 
+ getter (callable): Optional getter function. + setter (callable): Optional setter function. + method (callable): Optional method for method extension. + force (bool): Force overwriting existing attribute. + + DOCS: https://spacy.io/api/span#set_extension + USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes + """ + if cls.has_extension(name) and not kwargs.get("force", False): + raise ValueError(Errors.E090.format(name=name, obj="Span")) Underscore.span_extensions[name] = get_ext_args(**kwargs) @classmethod def get_extension(cls, name): + """Look up a previously registered extension by name. + + name (unicode): Name of the extension. + RETURNS (tuple): A `(default, method, getter, setter)` tuple. + + DOCS: https://spacy.io/api/span#get_extension + """ return Underscore.span_extensions.get(name) @classmethod def has_extension(cls, name): + """Check whether an extension has been registered. + + name (unicode): Name of the extension. + RETURNS (bool): Whether the extension has been registered. + + DOCS: https://spacy.io/api/span#has_extension + """ return name in Underscore.span_extensions @classmethod def remove_extension(cls, name): + """Remove a previously registered extension. + + name (unicode): Name of the extension. + RETURNS (tuple): A `(default, method, getter, setter)` tuple of the + removed extension. + + DOCS: https://spacy.io/api/span#remove_extension + """ if not cls.has_extension(name): raise ValueError(Errors.E046.format(name=name)) return Underscore.span_extensions.pop(name) - def __cinit__(self, Doc doc, int start, int end, label=0, - vector=None, vector_norm=None): + def __cinit__(self, Doc doc, int start, int end, label=0, vector=None, + vector_norm=None): """Create a `Span` object from the slice `doc[start : end]`. doc (Doc): The parent document. @@ -56,6 +95,8 @@ cdef class Span: vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. RETURNS (Span): The newly constructed object. + + DOCS: https://spacy.io/api/span#init """ if not (0 <= start <= end <= len(doc)): raise IndexError(Errors.E035.format(start=start, end=end, length=len(doc))) @@ -102,6 +143,8 @@ cdef class Span: """Get the number of tokens in the span. RETURNS (int): The number of tokens in the span. + + DOCS: https://spacy.io/api/span#len """ self._recalculate_indices() if self.end < self.start: @@ -111,7 +154,7 @@ cdef class Span: def __repr__(self): if is_config(python3=True): return self.text - return self.text.encode('utf-8') + return self.text.encode("utf-8") def __getitem__(self, object i): """Get a `Token` or a `Span` object @@ -120,9 +163,7 @@ cdef class Span: the span to get. RETURNS (Token or Span): The token at `span[i]`. - EXAMPLE: - >>> span[0] - >>> span[1:3] + DOCS: https://spacy.io/api/span#getitem """ self._recalculate_indices() if isinstance(i, slice): @@ -138,6 +179,8 @@ cdef class Span: """Iterate over `Token` objects. YIELDS (Token): A `Token` object. + + DOCS: https://spacy.io/api/span#iter """ self._recalculate_indices() for i in range(self.start, self.end): @@ -148,31 +191,32 @@ cdef class Span: @property def _(self): - """User space for adding custom attribute extensions.""" + """Custom extension attributes registered via `set_extension`.""" return Underscore(Underscore.span_extensions, self, start=self.start_char, end=self.end_char) def as_doc(self): - # TODO: fix - """Create a `Doc` object with a copy of the Span's data. + """Create a `Doc` object with a copy of the `Span`'s data. RETURNS (Doc): The `Doc` copy of the span. 
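A small sketch of the copy semantics described for `as_doc`, using a blank English pipeline:

```
from spacy.lang.en import English

nlp = English()
doc = nlp(u"New York is a busy city")
span = doc[0:2]
span_doc = span.as_doc()  # standalone Doc holding a copy of the span's tokens
assert span_doc.text == u"New York"
assert len(span_doc) == 2
```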
+ + DOCS: https://spacy.io/api/span#as_doc """ - cdef Doc doc = Doc(self.doc.vocab, - words=[t.text for t in self], - spaces=[bool(t.whitespace_) for t in self]) + # TODO: Fix! + words = [t.text for t in self] + spaces = [bool(t.whitespace_) for t in self] + cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces) array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE] if self.doc.is_tagged: array_head.append(TAG) - # if doc parsed add head and dep attribute + # If doc parsed add head and dep attribute if self.doc.is_parsed: array_head.extend([HEAD, DEP]) - # otherwise add sent_start + # Otherwise add sent_start else: array_head.append(SENT_START) array = self.doc.to_array(array_head) doc.from_array(array_head, array[self.start : self.end]) - doc.noun_chunks_iterator = self.doc.noun_chunks_iterator doc.user_hooks = self.doc.user_hooks doc.user_span_hooks = self.doc.user_span_hooks @@ -181,7 +225,7 @@ cdef class Span: doc.vector_norm = self.vector_norm doc.tensor = self.doc.tensor[self.start : self.end] for key, value in self.doc.cats.items(): - if hasattr(key, '__len__') and len(key) == 3: + if hasattr(key, "__len__") and len(key) == 3: cat_start, cat_end, cat_label = key if cat_start == self.start_char and cat_end == self.end_char: doc.cats[cat_label] = value @@ -207,6 +251,8 @@ cdef class Span: RETURNS (np.array[ndim=2, dtype=numpy.int32]): LCA matrix with shape (n, n), where n = len(self). + + DOCS: https://spacy.io/api/span#get_lca_matrix """ return numpy.asarray(_get_lca_matrix(self.doc, self.start, self.end)) @@ -217,22 +263,24 @@ cdef class Span: other (object): The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. RETURNS (float): A scalar similarity score. Higher is more similar. + + DOCS: https://spacy.io/api/span#similarity """ - if 'similarity' in self.doc.user_span_hooks: - self.doc.user_span_hooks['similarity'](self, other) - if len(self) == 1 and hasattr(other, 'orth'): + if "similarity" in self.doc.user_span_hooks: + self.doc.user_span_hooks["similarity"](self, other) + if len(self) == 1 and hasattr(other, "orth"): if self[0].orth == other.orth: return 1.0 - elif hasattr(other, '__len__') and len(self) == len(other): + elif hasattr(other, "__len__") and len(self) == len(other): for i in range(len(self)): - if self[i].orth != getattr(other[i], 'orth', None): + if self[i].orth != getattr(other[i], "orth", None): break else: return 1.0 if self.vocab.vectors.n_keys == 0: - models_warning(Warnings.W007.format(obj='Span')) + models_warning(Warnings.W007.format(obj="Span")) if self.vector_norm == 0.0 or other.vector_norm == 0.0: - user_warning(Warnings.W008.format(obj='Span')) + user_warning(Warnings.W008.format(obj="Span")) return 0.0 vector = self.vector xp = get_array_module(vector) @@ -251,8 +299,8 @@ cdef class Span: cdef int i, j cdef attr_id_t feature cdef np.ndarray[attr_t, ndim=2] output - # Make an array from the attributes --- otherwise our inner loop is Python - # dict iteration. 
+ # Make an array from the attributes - otherwise our inner loop is Python + # dict iteration cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) cdef int length = self.end - self.start output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64) @@ -282,12 +330,11 @@ cdef class Span: property sent: """RETURNS (Span): The sentence span that the span is a part of.""" def __get__(self): - if 'sent' in self.doc.user_span_hooks: - return self.doc.user_span_hooks['sent'](self) - # This should raise if we're not parsed - # or doesen't have any sbd component :) + if "sent" in self.doc.user_span_hooks: + return self.doc.user_span_hooks["sent"](self) + # This should raise if not parsed / no custom sentence boundaries self.doc.sents - # if doc is parsed we can use the deps to find the sentence + # If doc is parsed we can use the deps to find the sentence # otherwise we use the `sent_start` token attribute cdef int n = 0 cdef int i @@ -300,11 +347,11 @@ cdef class Span: raise RuntimeError(Errors.E038) return self.doc[root.l_edge:root.r_edge + 1] elif self.doc.is_sentenced: - # find start of the sentence + # Find start of the sentence start = self.start while self.doc.c[start].sent_start != 1 and start > 0: start += -1 - # find end of the sentence + # Find end of the sentence end = self.end n = 0 while end < self.doc.length and self.doc.c[end].sent_start != 1: @@ -315,7 +362,13 @@ cdef class Span: return self.doc[start:end] property ents: - """RETURNS (list): A list of tokens that belong to the current span.""" + """The named entities in the span. Returns a tuple of named entity + `Span` objects, if the entity recognizer has been applied. + + RETURNS (tuple): Entities in the span, one `Span` per entity. + + DOCS: https://spacy.io/api/span#ents + """ def __get__(self): ents = [] for ent in self.doc.ents: @@ -324,11 +377,16 @@ cdef class Span: return ents property has_vector: - """RETURNS (bool): Whether a word vector is associated with the object. + """A boolean value indicating whether a word vector is associated with + the object. + + RETURNS (bool): Whether a word vector is associated with the object. + + DOCS: https://spacy.io/api/span#has_vector """ def __get__(self): - if 'has_vector' in self.doc.user_span_hooks: - return self.doc.user_span_hooks['has_vector'](self) + if "has_vector" in self.doc.user_span_hooks: + return self.doc.user_span_hooks["has_vector"](self) elif self.vocab.vectors.data.size > 0: return any(token.has_vector for token in self) elif self.doc.tensor.size > 0: @@ -342,19 +400,26 @@ cdef class Span: RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array representing the span's semantics. + + DOCS: https://spacy.io/api/span#vector """ def __get__(self): - if 'vector' in self.doc.user_span_hooks: - return self.doc.user_span_hooks['vector'](self) + if "vector" in self.doc.user_span_hooks: + return self.doc.user_span_hooks["vector"](self) if self._vector is None: self._vector = sum(t.vector for t in self) / len(self) return self._vector property vector_norm: - """RETURNS (float): The L2 norm of the vector representation.""" + """The L2 norm of the span's vector representation. + + RETURNS (float): The L2 norm of the vector representation. 
+ + DOCS: https://spacy.io/api/span#vector_norm + """ def __get__(self): - if 'vector_norm' in self.doc.user_span_hooks: - return self.doc.user_span_hooks['vector'](self) + if "vector_norm" in self.doc.user_span_hooks: + return self.doc.user_span_hooks["vector"](self) cdef float value cdef double norm = 0 if self._vector_norm is None: @@ -369,8 +434,8 @@ cdef class Span: negativity of the span. """ def __get__(self): - if 'sentiment' in self.doc.user_span_hooks: - return self.doc.user_span_hooks['sentiment'](self) + if "sentiment" in self.doc.user_span_hooks: + return self.doc.user_span_hooks["sentiment"](self) else: return sum([token.sentiment for token in self]) / len(self) @@ -390,7 +455,7 @@ cdef class Span: whitespace). """ def __get__(self): - return u''.join([t.text_with_ws for t in self]) + return "".join([t.text_with_ws for t in self]) property noun_chunks: """Yields base noun-phrase `Span` objects, if the document has been @@ -399,7 +464,9 @@ cdef class Span: NP-level coordination, no prepositional phrases, and no relative clauses. - YIELDS (Span): Base noun-phrase `Span` objects + YIELDS (Span): Base noun-phrase `Span` objects. + + DOCS: https://spacy.io/api/span#noun_chunks """ def __get__(self): if not self.doc.is_parsed: @@ -418,52 +485,18 @@ cdef class Span: yield span property root: - """The token within the span that's highest in the parse tree. - If there's a tie, the earliest is prefered. + """The token with the shortest path to the root of the + sentence (or the root itself). If multiple tokens are equally + high in the tree, the first token is taken. RETURNS (Token): The root token. - EXAMPLE: The root token has the shortest path to the root of the - sentence (or is the root itself). If multiple words are equally - high in the tree, the first word is taken. For example: - - >>> toks = nlp(u'I like New York in Autumn.') - - Let's name the indices – easier than writing `toks[4]` etc. - - >>> i, like, new, york, in_, autumn, dot = range(len(toks)) - - The head of 'new' is 'York', and the head of "York" is "like" - - >>> toks[new].head.text - 'York' - >>> toks[york].head.text - 'like' - - Create a span for "New York". Its root is "York". - - >>> new_york = toks[new:york+1] - >>> new_york.root.text - 'York' - - Here's a more complicated case, raised by issue #214: - - >>> toks = nlp(u'to, north and south carolina') - >>> to, north, and_, south, carolina = toks - >>> south.head.text, carolina.head.text - ('north', 'to') - - Here "south" is a child of "north", which is a child of "carolina". - Carolina is the root of the span: - - >>> south_carolina = toks[-2:] - >>> south_carolina.root.text - 'carolina' + DOCS: https://spacy.io/api/span#root """ def __get__(self): self._recalculate_indices() - if 'root' in self.doc.user_span_hooks: - return self.doc.user_span_hooks['root'](self) + if "root" in self.doc.user_span_hooks: + return self.doc.user_span_hooks["root"](self) # This should probably be called 'head', and the other one called # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/ cdef int i @@ -495,10 +528,12 @@ cdef class Span: return self.doc[root] property lefts: - """ Tokens that are to the left of the span, whose head is within the + """Tokens that are to the left of the span, whose head is within the `Span`. YIELDS (Token):A left-child of a token of the span. + + DOCS: https://spacy.io/api/span#lefts """ def __get__(self): for token in reversed(self): # Reverse, so we get tokens in order @@ -511,6 +546,8 @@ cdef class Span: `Span`. 
YIELDS (Token): A right-child of a token of the span. + + DOCS: https://spacy.io/api/span#rights """ def __get__(self): for token in self: @@ -519,15 +556,25 @@ cdef class Span: yield right property n_lefts: - """RETURNS (int): The number of leftward immediate children of the + """The number of tokens that are to the left of the span, whose + heads are within the span. + + RETURNS (int): The number of leftward immediate children of the span, in the syntactic dependency parse. + + DOCS: https://spacy.io/api/span#n_lefts """ def __get__(self): return len(list(self.lefts)) property n_rights: - """RETURNS (int): The number of rightward immediate children of the + """The number of tokens that are to the right of the span, whose + heads are within the span. + + RETURNS (int): The number of rightward immediate children of the span, in the syntactic dependency parse. + + DOCS: https://spacy.io/api/span#n_rights """ def __get__(self): return len(list(self.rights)) @@ -536,6 +583,8 @@ cdef class Span: """Tokens within the span and tokens which descend from them. YIELDS (Token): A token within the span, or a descendant from it. + + DOCS: https://spacy.io/api/span#subtree """ def __get__(self): for word in self.lefts: @@ -550,7 +599,7 @@ cdef class Span: return self.root.ent_id def __set__(self, hash_t key): - raise NotImplementedError(TempErrors.T007.format(attr='ent_id')) + raise NotImplementedError(TempErrors.T007.format(attr="ent_id")) property ent_id_: """RETURNS (unicode): The (string) entity ID.""" @@ -558,10 +607,10 @@ cdef class Span: return self.root.ent_id_ def __set__(self, hash_t key): - raise NotImplementedError(TempErrors.T007.format(attr='ent_id_')) + raise NotImplementedError(TempErrors.T007.format(attr="ent_id_")) property orth_: - """Verbatim text content (identical to Span.text). Exists mostly for + """Verbatim text content (identical to `Span.text`). Exists mostly for consistency with other attributes. RETURNS (unicode): The span's text.""" @@ -571,27 +620,28 @@ cdef class Span: property lemma_: """RETURNS (unicode): The span's lemma.""" def __get__(self): - return ' '.join([t.lemma_ for t in self]).strip() + return " ".join([t.lemma_ for t in self]).strip() property upper_: - """Deprecated. Use Span.text.upper() instead.""" + """Deprecated. Use `Span.text.upper()` instead.""" def __get__(self): - return ''.join([t.text_with_ws.upper() for t in self]).strip() + return "".join([t.text_with_ws.upper() for t in self]).strip() property lower_: - """Deprecated. Use Span.text.lower() instead.""" + """Deprecated. 
Use `Span.text.lower()` instead.""" def __get__(self): - return ''.join([t.text_with_ws.lower() for t in self]).strip() + return "".join([t.text_with_ws.lower() for t in self]).strip() property string: - """Deprecated: Use Span.text_with_ws instead.""" + """Deprecated: Use `Span.text_with_ws` instead.""" def __get__(self): - return ''.join([t.text_with_ws for t in self]) + return "".join([t.text_with_ws for t in self]) property label_: """RETURNS (unicode): The span's label.""" def __get__(self): return self.doc.vocab.strings[self.label] + def __set__(self, unicode label_): self.label = self.doc.vocab.strings.add(label_) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 00de4897c..771e43549 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -8,42 +8,82 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free from cython.view cimport array as cvarray cimport numpy as np np.import_array() + import numpy from thinc.neural.util import get_array_module from ..typedefs cimport hash_t from ..lexeme cimport Lexeme -from .. import parts_of_speech from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP +from ..symbols cimport conj + +from .. import parts_of_speech +from .. import util from ..compat import is_config from ..errors import Errors, Warnings, user_warning, models_warning -from .. import util from .underscore import Underscore, get_ext_args cdef class Token: """An individual token – i.e. a word, punctuation symbol, whitespace, - etc.""" + etc. + + DOCS: https://spacy.io/api/token + """ @classmethod def set_extension(cls, name, **kwargs): - if cls.has_extension(name) and not kwargs.get('force', False): - raise ValueError(Errors.E090.format(name=name, obj='Token')) + """Define a custom attribute which becomes available as `Token._`. + + name (unicode): Name of the attribute to set. + default: Optional default value of the attribute. + getter (callable): Optional getter function. + setter (callable): Optional setter function. + method (callable): Optional method for method extension. + force (bool): Force overwriting existing attribute. + + DOCS: https://spacy.io/api/token#set_extension + USAGE: https://spacy.io/usage/processing-pipelines#custom-components-attributes + """ + if cls.has_extension(name) and not kwargs.get("force", False): + raise ValueError(Errors.E090.format(name=name, obj="Token")) Underscore.token_extensions[name] = get_ext_args(**kwargs) @classmethod def get_extension(cls, name): + """Look up a previously registered extension by name. + + name (unicode): Name of the extension. + RETURNS (tuple): A `(default, method, getter, setter)` tuple. + + DOCS: https://spacy.io/api/token#get_extension + """ return Underscore.token_extensions.get(name) @classmethod def has_extension(cls, name): + """Check whether an extension has been registered. + + name (unicode): Name of the extension. + RETURNS (bool): Whether the extension has been registered. + + DOCS: https://spacy.io/api/token#has_extension + """ return name in Underscore.token_extensions @classmethod def remove_extension(cls, name): + """Remove a previously registered extension. + + name (unicode): Name of the extension. 
+ RETURNS (tuple): A `(default, method, getter, setter)` tuple of the + removed extension. + + DOCS: https://spacy.io/api/token#remove_extension + """ if not cls.has_extension(name): raise ValueError(Errors.E046.format(name=name)) return Underscore.token_extensions.pop(name) @@ -54,6 +94,8 @@ cdef class Token: vocab (Vocab): A storage container for lexical types. doc (Doc): The parent document. offset (int): The index of the token within the document. + + DOCS: https://spacy.io/api/token#init """ self.vocab = vocab self.doc = doc @@ -67,6 +109,8 @@ cdef class Token: """The number of unicode characters in the token, i.e. `token.text`. RETURNS (int): The number of unicode characters in the token. + + DOCS: https://spacy.io/api/token#len """ return self.c.lex.length @@ -121,6 +165,7 @@ cdef class Token: @property def _(self): + """Custom extension attributes registered via `set_extension`.""" return Underscore(Underscore.token_extensions, self, start=self.idx, end=None) @@ -130,12 +175,7 @@ cdef class Token: flag_id (int): The ID of the flag attribute. RETURNS (bool): Whether the flag is set. - EXAMPLE: - >>> from spacy.attrs import IS_TITLE - >>> doc = nlp(u'Give it back! He pleaded.') - >>> token = doc[0] - >>> token.check_flag(IS_TITLE) - True + DOCS: https://spacy.io/api/token#check_flag """ return Lexeme.c_check_flag(self.c.lex, flag_id) @@ -144,6 +184,8 @@ cdef class Token: i (int): The relative position of the token to get. Defaults to 1. RETURNS (Token): The token at position `self.doc[self.i+i]`. + + DOCS: https://spacy.io/api/token#nbor """ if self.i+i < 0 or (self.i+i >= len(self.doc)): raise IndexError(Errors.E042.format(i=self.i, j=i, length=len(self.doc))) @@ -156,19 +198,21 @@ cdef class Token: other (object): The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. RETURNS (float): A scalar similarity score. Higher is more similar. 
+ + DOCS: https://spacy.io/api/token#similarity """ - if 'similarity' in self.doc.user_token_hooks: - return self.doc.user_token_hooks['similarity'](self) - if hasattr(other, '__len__') and len(other) == 1 and hasattr(other, "__getitem__"): - if self.c.lex.orth == getattr(other[0], 'orth', None): + if "similarity" in self.doc.user_token_hooks: + return self.doc.user_token_hooks["similarity"](self) + if hasattr(other, "__len__") and len(other) == 1 and hasattr(other, "__getitem__"): + if self.c.lex.orth == getattr(other[0], "orth", None): return 1.0 - elif hasattr(other, 'orth'): + elif hasattr(other, "orth"): if self.c.lex.orth == other.orth: return 1.0 if self.vocab.vectors.n_keys == 0: - models_warning(Warnings.W007.format(obj='Token')) + models_warning(Warnings.W007.format(obj="Token")) if self.vector_norm == 0 or other.vector_norm == 0: - user_warning(Warnings.W008.format(obj='Token')) + user_warning(Warnings.W008.format(obj="Token")) return 0.0 vector = self.vector xp = get_array_module(vector) @@ -202,7 +246,7 @@ cdef class Token: def __get__(self): cdef unicode orth = self.vocab.strings[self.c.lex.orth] if self.c.spacy: - return orth + u' ' + return orth + " " else: return orth @@ -215,8 +259,8 @@ cdef class Token: """RETURNS (float): A scalar value indicating the positivity or negativity of the token.""" def __get__(self): - if 'sentiment' in self.doc.user_token_hooks: - return self.doc.user_token_hooks['sentiment'](self) + if "sentiment" in self.doc.user_token_hooks: + return self.doc.user_token_hooks["sentiment"](self) return self.c.lex.sentiment property lang: @@ -298,6 +342,7 @@ cdef class Token: """RETURNS (uint64): ID of coarse-grained part-of-speech tag.""" def __get__(self): return self.c.pos + def __set__(self, pos): self.c.pos = pos @@ -322,10 +367,12 @@ cdef class Token: the object. RETURNS (bool): Whether a word vector is associated with the object. + + DOCS: https://spacy.io/api/token#has_vector """ def __get__(self): if 'has_vector' in self.doc.user_token_hooks: - return self.doc.user_token_hooks['has_vector'](self) + return self.doc.user_token_hooks["has_vector"](self) if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: return True return self.vocab.has_vector(self.c.lex.orth) @@ -335,10 +382,12 @@ cdef class Token: RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array representing the token's semantics. + + DOCS: https://spacy.io/api/token#vector """ def __get__(self): if 'vector' in self.doc.user_token_hooks: - return self.doc.user_token_hooks['vector'](self) + return self.doc.user_token_hooks["vector"](self) if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: return self.doc.tensor[self.i] else: @@ -348,23 +397,35 @@ cdef class Token: """The L2 norm of the token's vector representation. RETURNS (float): The L2 norm of the vector representation. + + DOCS: https://spacy.io/api/token#vector_norm """ def __get__(self): if 'vector_norm' in self.doc.user_token_hooks: - return self.doc.user_token_hooks['vector_norm'](self) + return self.doc.user_token_hooks["vector_norm"](self) vector = self.vector return numpy.sqrt((vector ** 2).sum()) property n_lefts: - """RETURNS (int): The number of leftward immediate children of the + """The number of leftward immediate children of the word, in the + syntactic dependency parse. + + RETURNS (int): The number of leftward immediate children of the word, in the syntactic dependency parse. 
+ + DOCS: https://spacy.io/api/token#n_lefts """ def __get__(self): return self.c.l_kids property n_rights: - """RETURNS (int): The number of rightward immediate children of the + """The number of rightward immediate children of the word, in the + syntactic dependency parse. + + RETURNS (int): The number of rightward immediate children of the word, in the syntactic dependency parse. + + DOCS: https://spacy.io/api/token#n_rights """ def __get__(self): return self.c.r_kids @@ -373,7 +434,7 @@ cdef class Token: """RETURNS (Span): The sentence span that the token is a part of.""" def __get__(self): if 'sent' in self.doc.user_token_hooks: - return self.doc.user_token_hooks['sent'](self) + return self.doc.user_token_hooks["sent"](self) return self.doc[self.i : self.i+1].sent property sent_start: @@ -390,8 +451,13 @@ cdef class Token: self.is_sent_start = value property is_sent_start: - """RETURNS (bool / None): Whether the token starts a sentence. + """A boolean value indicating whether the token starts a sentence. + `None` if unknown. Defaults to `True` for the first token in the `Doc`. + + RETURNS (bool / None): Whether the token starts a sentence. None if unknown. + + DOCS: https://spacy.io/api/token#is_sent_start """ def __get__(self): if self.c.sent_start == 0: @@ -418,6 +484,8 @@ cdef class Token: dependency parse. YIELDS (Token): A left-child of the token. + + DOCS: https://spacy.io/api/token#lefts """ def __get__(self): cdef int nr_iter = 0 @@ -429,13 +497,15 @@ cdef class Token: nr_iter += 1 # This is ugly, but it's a way to guard out infinite loops if nr_iter >= 10000000: - raise RuntimeError(Errors.E045.format(attr='token.lefts')) + raise RuntimeError(Errors.E045.format(attr="token.lefts")) property rights: """The rightward immediate children of the word, in the syntactic dependency parse. YIELDS (Token): A right-child of the token. + + DOCS: https://spacy.io/api/token#rights """ def __get__(self): cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) @@ -447,7 +517,7 @@ cdef class Token: ptr -= 1 nr_iter += 1 if nr_iter >= 10000000: - raise RuntimeError(Errors.E045.format(attr='token.rights')) + raise RuntimeError(Errors.E045.format(attr="token.rights")) tokens.reverse() for t in tokens: yield t @@ -455,7 +525,9 @@ cdef class Token: property children: """A sequence of the token's immediate syntactic children. - YIELDS (Token): A child token such that child.head==self + YIELDS (Token): A child token such that `child.head==self`. + + DOCS: https://spacy.io/api/token#children """ def __get__(self): yield from self.lefts @@ -467,6 +539,8 @@ cdef class Token: YIELDS (Token): A descendent token such that `self.is_ancestor(descendent) or token == self`. + + DOCS: https://spacy.io/api/token#subtree """ def __get__(self): for word in self.lefts: @@ -496,11 +570,13 @@ cdef class Token: YIELDS (Token): A sequence of ancestor tokens such that `ancestor.is_ancestor(self)`. + + DOCS: https://spacy.io/api/token#ancestors """ def __get__(self): cdef const TokenC* head_ptr = self.c - # guard against infinite loop, no token can have - # more ancestors than tokens in the tree + # Guard against infinite loop, no token can have + # more ancestors than tokens in the tree. cdef int i = 0 while head_ptr.head != 0 and i < self.doc.length: head_ptr += head_ptr.head @@ -513,6 +589,8 @@ cdef class Token: descendant (Token): Another token. RETURNS (bool): Whether this token is the ancestor of the descendant. 
+ + DOCS: https://spacy.io/api/token#is_ancestor """ if self.doc is not descendant.doc: return False @@ -528,34 +606,28 @@ cdef class Token: return self.doc[self.i + self.c.head] def __set__(self, Token new_head): - # this function sets the head of self to new_head - # and updates the counters for left/right dependents - # and left/right corner for the new and the old head - - # do nothing if old head is new head + # This function sets the head of self to new_head and updates the + # counters for left/right dependents and left/right corner for the + # new and the old head + # Do nothing if old head is new head if self.i + self.c.head == new_head.i: return - cdef Token old_head = self.head cdef int rel_newhead_i = new_head.i - self.i - - # is the new head a descendant of the old head + # Is the new head a descendant of the old head cdef bint is_desc = old_head.is_ancestor(new_head) - cdef int new_edge cdef Token anc, child - - # update number of deps of old head + # Update number of deps of old head if self.c.head > 0: # left dependent old_head.c.l_kids -= 1 if self.c.l_edge == old_head.c.l_edge: - # the token dominates the left edge so the left edge of - # the head may change when the token is reattached, it may + # The token dominates the left edge so the left edge of + # the head may change when the token is reattached, it may # not change if the new head is a descendant of the current - # head - + # head. new_edge = self.c.l_edge - # the new l_edge is the left-most l_edge on any of the + # The new l_edge is the left-most l_edge on any of the # other dependents where the l_edge is left of the head, # otherwise it is the head if not is_desc: @@ -566,21 +638,18 @@ cdef class Token: if child.c.l_edge < new_edge: new_edge = child.c.l_edge old_head.c.l_edge = new_edge - - # walk up the tree from old_head and assign new l_edge to + # Walk up the tree from old_head and assign new l_edge to # ancestors until an ancestor already has an l_edge that's # further left for anc in old_head.ancestors: if anc.c.l_edge <= new_edge: break anc.c.l_edge = new_edge - elif self.c.head < 0: # right dependent old_head.c.r_kids -= 1 - # do the same thing as for l_edge + # Do the same thing as for l_edge if self.c.r_edge == old_head.c.r_edge: new_edge = self.c.r_edge - if not is_desc: new_edge = old_head.i for child in old_head.children: @@ -589,16 +658,14 @@ cdef class Token: if child.c.r_edge > new_edge: new_edge = child.c.r_edge old_head.c.r_edge = new_edge - for anc in old_head.ancestors: if anc.c.r_edge >= new_edge: break anc.c.r_edge = new_edge - - # update number of deps of new head + # Update number of deps of new head if rel_newhead_i > 0: # left dependent new_head.c.l_kids += 1 - # walk up the tree from new head and set l_edge to self.l_edge + # Walk up the tree from new head and set l_edge to self.l_edge # until you hit a token with an l_edge further to the left if self.c.l_edge < new_head.c.l_edge: new_head.c.l_edge = self.c.l_edge @@ -606,34 +673,33 @@ cdef class Token: if anc.c.l_edge <= self.c.l_edge: break anc.c.l_edge = self.c.l_edge - elif rel_newhead_i < 0: # right dependent new_head.c.r_kids += 1 - # do the same as for l_edge + # Do the same as for l_edge if self.c.r_edge > new_head.c.r_edge: new_head.c.r_edge = self.c.r_edge for anc in new_head.ancestors: if anc.c.r_edge >= self.c.r_edge: break anc.c.r_edge = self.c.r_edge - - # set new head + # Set new head self.c.head = rel_newhead_i property conjuncts: """A sequence of coordinated tokens, including the token itself. 
YIELDS (Token): A coordinated token. + + DOCS: https://spacy.io/api/token#conjuncts """ def __get__(self): - """Get a list of conjoined words.""" cdef Token word - if 'conjuncts' in self.doc.user_token_hooks: - yield from self.doc.user_token_hooks['conjuncts'](self) + if "conjuncts" in self.doc.user_token_hooks: + yield from self.doc.user_token_hooks["conjuncts"](self) else: - if self.dep_ != 'conj': + if self.dep != conj: for word in self.rights: - if word.dep_ == 'conj': + if word.dep == conj: yield word yield from word.conjuncts @@ -670,7 +736,7 @@ cdef class Token: RETURNS (unicode): IOB code of named entity tag. """ def __get__(self): - iob_strings = ('', 'I', 'O', 'B') + iob_strings = ("", "I", "O", "B") return iob_strings[self.c.ent_iob] property ent_id: @@ -697,7 +763,7 @@ cdef class Token: """RETURNS (unicode): The trailing whitespace character, if present. """ def __get__(self): - return ' ' if self.c.spacy else '' + return " " if self.c.spacy else "" property orth_: """RETURNS (unicode): Verbatim text content (identical to @@ -770,6 +836,7 @@ cdef class Token: """RETURNS (unicode): Coarse-grained part-of-speech tag.""" def __get__(self): return parts_of_speech.NAMES[self.c.pos] + def __set__(self, pos_name): self.c.pos = parts_of_speech.IDS[pos_name] diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 9a1ad0e25..311bb9634 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,30 +1,31 @@ # coding: utf8 from __future__ import unicode_literals +cimport numpy as np +from cython.operator cimport dereference as deref +from libcpp.set cimport set as cppset + import functools import numpy from collections import OrderedDict import srsly - -cimport numpy as np from thinc.neural.util import get_array_module from thinc.neural._classes.model import Model from .strings cimport StringStore + from .strings import get_string_id from .compat import basestring_, path2str from .errors import Errors from . import util -from cython.operator cimport dereference as deref -from libcpp.set cimport set as cppset def unpickle_vectors(bytes_data): return Vectors().from_bytes(bytes_data) class GlobalRegistry(object): - '''Global store of vectors, to avoid repeatedly loading the data.''' + """Global store of vectors, to avoid repeatedly loading the data.""" data = {} @classmethod @@ -46,8 +47,10 @@ cdef class Vectors: rows in the vectors.data table. Multiple keys can be mapped to the same vector, and not all of the rows in - the table need to be assigned --- so len(list(vectors.keys())) may be + the table need to be assigned - so len(list(vectors.keys())) may be greater or smaller than vectors.shape[0]. + + DOCS: https://spacy.io/api/vectors """ cdef public object name cdef public object data @@ -62,12 +65,14 @@ cdef class Vectors: keys (iterable): A sequence of keys, aligned with the data. name (string): A name to identify the vectors table. RETURNS (Vectors): The newly created object. + + DOCS: https://spacy.io/api/vectors#init """ self.name = name if data is None: if shape is None: shape = (0,0) - data = numpy.zeros(shape, dtype='f') + data = numpy.zeros(shape, dtype="f") self.data = data self.key2row = OrderedDict() if self.data is not None: @@ -84,23 +89,40 @@ cdef class Vectors: in the vector table. RETURNS (tuple): A `(rows, dims)` pair. + + DOCS: https://spacy.io/api/vectors#shape """ return self.data.shape @property def size(self): - """RETURNS (int): rows*dims""" + """The vector size i,e. rows * dims. + + RETURNS (int): The vector size. 
+ + DOCS: https://spacy.io/api/vectors#size + """ return self.data.shape[0] * self.data.shape[1] @property def is_full(self): - """RETURNS (bool): `True` if no slots are available for new keys.""" + """Whether the vectors table is full. + + RETURNS (bool): `True` if no slots are available for new keys. + + DOCS: https://spacy.io/api/vectors#is_full + """ return self._unset.size() == 0 @property def n_keys(self): - """RETURNS (int) The number of keys in the table. Note that this is the - number of all keys, not just unique vectors.""" + """Get the number of keys in the table. Note that this is the number + of all keys, not just unique vectors. + + RETURNS (int): The number of keys in the table. + + DOCS: https://spacy.io/api/vectors#n_keys + """ return len(self.key2row) def __reduce__(self): @@ -111,6 +133,8 @@ cdef class Vectors: key (int): The key to get the vector for. RETURNS (ndarray): The vector for the key. + + DOCS: https://spacy.io/api/vectors#getitem """ i = self.key2row[key] if i is None: @@ -123,6 +147,8 @@ cdef class Vectors: key (int): The key to set the vector for. vector (ndarray): The vector to set. + + DOCS: https://spacy.io/api/vectors#setitem """ i = self.key2row[key] self.data[i] = vector @@ -133,6 +159,8 @@ cdef class Vectors: """Iterate over the keys in the table. YIELDS (int): A key in the table. + + DOCS: https://spacy.io/api/vectors#iter """ yield from self.key2row @@ -140,6 +168,8 @@ cdef class Vectors: """Return the number of vectors in the table. RETURNS (int): The number of vectors in the data. + + DOCS: https://spacy.io/api/vectors#len """ return self.data.shape[0] @@ -148,6 +178,8 @@ cdef class Vectors: key (int): The key to check. RETURNS (bool): Whether the key has a vector entry. + + DOCS: https://spacy.io/api/vectors#contains """ return key in self.key2row @@ -159,6 +191,12 @@ cdef class Vectors: If the number of vectors is reduced, keys mapped to rows that have been deleted are removed. These removed items are returned as a list of `(key, row)` tuples. + + shape (tuple): A `(rows, dims)` tuple. + inplace (bool): Reallocate the memory. + RETURNS (list): The removed items as a list of `(key, row)` tuples. + + DOCS: https://spacy.io/api/vectors#resize """ if inplace: self.data.resize(shape, refcheck=False) @@ -175,10 +213,7 @@ cdef class Vectors: return removed_items def keys(self): - """A sequence of the keys in the table. - - RETURNS (iterable): The keys. - """ + """RETURNS (iterable): A sequence of keys in the table.""" return self.key2row.keys() def values(self): @@ -188,6 +223,8 @@ cdef class Vectors: returned may be less than the length of the vectors table. YIELDS (ndarray): A vector in the table. + + DOCS: https://spacy.io/api/vectors#values """ for row, vector in enumerate(range(self.data.shape[0])): if not self._unset.count(row): @@ -197,6 +234,8 @@ cdef class Vectors: """Iterate over `(key, vector)` pairs. YIELDS (tuple): A key/vector pair. + + DOCS: https://spacy.io/api/vectors#items """ for key, row in self.key2row.items(): yield key, self.data[row] @@ -215,7 +254,7 @@ cdef class Vectors: RETURNS: The requested key, keys, row or rows. 
""" if sum(arg is None for arg in (key, keys, row, rows)) != 3: - bad_kwargs = {'key': key, 'keys': keys, 'row': row, 'rows': rows} + bad_kwargs = {"key": key, "keys": keys, "row": row, "rows": rows} raise ValueError(Errors.E059.format(kwargs=bad_kwargs)) xp = get_array_module(self.data) if key is not None: @@ -224,7 +263,7 @@ cdef class Vectors: elif keys is not None: keys = [get_string_id(key) for key in keys] rows = [self.key2row.get(key, -1.) for key in keys] - return xp.asarray(rows, dtype='i') + return xp.asarray(rows, dtype="i") else: targets = set() if row is not None: @@ -236,7 +275,7 @@ cdef class Vectors: if row in targets: results.append(key) targets.remove(row) - return xp.asarray(results, dtype='uint64') + return xp.asarray(results, dtype="uint64") def add(self, key, *, vector=None, row=None): """Add a key to the table. Keys can be mapped to an existing vector @@ -246,6 +285,8 @@ cdef class Vectors: vector (ndarray / None): A vector to add for the key. row (int / None): The row number of a vector to map the key to. RETURNS (int): The row the vector was added to. + + DOCS: https://spacy.io/api/vectors#add """ key = get_string_id(key) if row is None and key in self.key2row: @@ -292,11 +333,10 @@ cdef class Vectors: sims = xp.dot(batch, vectors.T) best_rows[i:i+batch_size] = sims.argmax(axis=1) scores[i:i+batch_size] = sims.max(axis=1) - xp = get_array_module(self.data) row2key = {row: key for key, row in self.key2row.items()} keys = xp.asarray( - [row2key[row] for row in best_rows if row in row2key], dtype='uint64') + [row2key[row] for row in best_rows if row in row2key], dtype="uint64") return (keys, best_rows, scores) def from_glove(self, path): @@ -308,29 +348,30 @@ cdef class Vectors: path (unicode / Path): The path to load the GloVe vectors from. RETURNS: A `StringStore` object, holding the key-to-string mapping. + + DOCS: https://spacy.io/api/vectors#from_glove """ path = util.ensure_path(path) width = None for name in path.iterdir(): - if name.parts[-1].startswith('vectors'): + if name.parts[-1].startswith("vectors"): _, dims, dtype, _2 = name.parts[-1].split('.') width = int(dims) break else: raise IOError(Errors.E061.format(filename=path)) - bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims, - dtype=dtype) + bin_loc = path / "vectors.{dims}.{dtype}.bin".format(dims=dims, dtype=dtype) xp = get_array_module(self.data) self.data = None - with bin_loc.open('rb') as file_: + with bin_loc.open("rb") as file_: self.data = xp.fromfile(file_, dtype=dtype) - if dtype != 'float32': - self.data = xp.ascontiguousarray(self.data, dtype='float32') + if dtype != "float32": + self.data = xp.ascontiguousarray(self.data, dtype="float32") if self.data.ndim == 1: self.data = self.data.reshape((self.data.size//width, width)) n = 0 strings = StringStore() - with (path / 'vocab.txt').open('r') as file_: + with (path / "vocab.txt").open("r") as file_: for i, line in enumerate(file_): key = strings.add(line.strip()) self.add(key, row=i) @@ -341,16 +382,17 @@ cdef class Vectors: path (unicode / Path): A path to a directory, which will be created if it doesn't exists. Either a string or a Path-like object. 
+ + DOCS: https://spacy.io/api/vectors#to_disk """ xp = get_array_module(self.data) if xp is numpy: - save_array = lambda arr, file_: xp.save(file_, arr, - allow_pickle=False) + save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) else: save_array = lambda arr, file_: xp.save(file_, arr) serializers = OrderedDict(( - ('vectors', lambda p: save_array(self.data, p.open('wb'))), - ('key2row', lambda p: srsly.write_msgpack(p, self.key2row)) + ("vectors", lambda p: save_array(self.data, p.open("wb"))), + ("key2row", lambda p: srsly.write_msgpack(p, self.key2row)) )) return util.to_disk(path, serializers, exclude) @@ -360,6 +402,8 @@ cdef class Vectors: path (unicode / Path): Directory path, string or Path-like object. RETURNS (Vectors): The modified object. + + DOCS: https://spacy.io/api/vectors#from_disk """ def load_key2row(path): if path.exists(): @@ -380,9 +424,9 @@ cdef class Vectors: self.data = xp.load(str(path)) serializers = OrderedDict(( - ('key2row', load_key2row), - ('keys', load_keys), - ('vectors', load_vectors), + ("key2row", load_key2row), + ("keys", load_keys), + ("vectors", load_vectors), )) util.from_disk(path, serializers, exclude) return self @@ -392,15 +436,17 @@ cdef class Vectors: **exclude: Named attributes to prevent from being serialized. RETURNS (bytes): The serialized form of the `Vectors` object. + + DOCS: https://spacy.io/api/vectors#to_bytes """ def serialize_weights(): - if hasattr(self.data, 'to_bytes'): + if hasattr(self.data, "to_bytes"): return self.data.to_bytes() else: return srsly.msgpack_dumps(self.data) serializers = OrderedDict(( - ('key2row', lambda: srsly.msgpack_dumps(self.key2row)), - ('vectors', serialize_weights) + ("key2row", lambda: srsly.msgpack_dumps(self.key2row)), + ("vectors", serialize_weights) )) return util.to_bytes(serializers, exclude) @@ -410,16 +456,18 @@ cdef class Vectors: data (bytes): The data to load from. **exclude: Named attributes to prevent from being loaded. RETURNS (Vectors): The `Vectors` object. + + DOCS: https://spacy.io/api/vectors#from_bytes """ def deserialize_weights(b): - if hasattr(self.data, 'from_bytes'): + if hasattr(self.data, "from_bytes"): self.data.from_bytes() else: self.data = srsly.msgpack_loads(b) deserializers = OrderedDict(( - ('key2row', lambda b: self.key2row.update(srsly.msgpack_loads(b))), - ('vectors', deserialize_weights) + ("key2row", lambda b: self.key2row.update(srsly.msgpack_loads(b))), + ("vectors", deserialize_weights) )) util.from_bytes(data, deserializers, exclude) return self diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 7b1dd5f03..44a69351f 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -4,9 +4,9 @@ from __future__ import unicode_literals import numpy import srsly - from collections import OrderedDict from thinc.neural.util import get_array_module + from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme from .typedefs cimport attr_t @@ -27,6 +27,8 @@ cdef class Vocab: """A look-up table that allows you to access `Lexeme` objects. The `Vocab` instance also provides access to the `StringStore`, and owns underlying C-data that is shared between `Doc` objects. 
+ + DOCS: https://spacy.io/api/vocab """ def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, strings=tuple(), oov_prob=-20., **deprecated_kwargs): @@ -62,7 +64,7 @@ cdef class Vocab: langfunc = None if self.lex_attr_getters: langfunc = self.lex_attr_getters.get(LANG, None) - return langfunc('_') if langfunc else '' + return langfunc("_") if langfunc else "" def __len__(self): """The current number of lexemes stored. @@ -87,11 +89,7 @@ cdef class Vocab: available bit will be chosen. RETURNS (int): The integer ID by which the flag value can be checked. - EXAMPLE: - >>> my_product_getter = lambda text: text in ['spaCy', 'dislaCy'] - >>> MY_PRODUCT = nlp.vocab.add_flag(my_product_getter) - >>> doc = nlp(u'I like spaCy') - >>> assert doc[2].check_flag(MY_PRODUCT) == True + DOCS: https://spacy.io/api/vocab#add_flag """ if flag_id == -1: for bit in range(1, 64): @@ -112,7 +110,7 @@ cdef class Vocab: `Lexeme` if necessary using memory acquired from the given pool. If the pool is the lexicon's own memory, the lexeme is saved in the lexicon. """ - if string == u'': + if string == "": return &EMPTY_LEXEME cdef LexemeC* lex cdef hash_t key = self.strings[string] @@ -176,10 +174,12 @@ cdef class Vocab: string (unicode): The ID string. RETURNS (bool) Whether the string has an entry in the vocabulary. + + DOCS: https://spacy.io/api/vocab#contains """ cdef hash_t int_key if isinstance(key, bytes): - int_key = self.strings[key.decode('utf8')] + int_key = self.strings[key.decode("utf8")] elif isinstance(key, unicode): int_key = self.strings[key] else: @@ -191,6 +191,8 @@ cdef class Vocab: """Iterate over the lexemes in the vocabulary. YIELDS (Lexeme): An entry in the vocabulary. + + DOCS: https://spacy.io/api/vocab#iter """ cdef attr_t key cdef size_t addr @@ -210,8 +212,10 @@ cdef class Vocab: RETURNS (Lexeme): The lexeme indicated by the given ID. EXAMPLE: - >>> apple = nlp.vocab.strings['apple'] - >>> assert nlp.vocab[apple] == nlp.vocab[u'apple'] + >>> apple = nlp.vocab.strings["apple"] + >>> assert nlp.vocab[apple] == nlp.vocab[u"apple"] + + DOCS: https://spacy.io/api/vocab#getitem """ cdef attr_t orth if isinstance(id_or_string, unicode): @@ -284,6 +288,8 @@ cdef class Vocab: `(string, score)` tuples, where `string` is the entry the removed word was mapped to, and `score` the similarity score between the two words. + + DOCS: https://spacy.io/api/vocab#prune_vectors """ xp = get_array_module(self.vectors.data) # Make prob negative so it sorts by rank ascending @@ -291,16 +297,12 @@ cdef class Vocab: priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth) for lex in self if lex.orth in self.vectors.key2row] priority.sort() - indices = xp.asarray([i for (prob, i, key) in priority], dtype='i') - keys = xp.asarray([key for (prob, i, key) in priority], dtype='uint64') - + indices = xp.asarray([i for (prob, i, key) in priority], dtype="i") + keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]]) toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) - self.vectors = Vectors(data=keep, keys=keys) - syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size) - remap = {} for i, key in enumerate(keys[nr_row:]): self.vectors.add(key, row=syn_rows[i]) @@ -319,21 +321,22 @@ cdef class Vocab: RETURNS (numpy.ndarray): A word vector. Size and shape determined by the `vocab.vectors` instance. Usually, a numpy ndarray of shape (300,) and dtype float32. 
+ + DOCS: https://spacy.io/api/vocab#get_vector """ if isinstance(orth, basestring_): orth = self.strings.add(orth) word = self[orth].orth_ if orth in self.vectors.key2row: return self.vectors[orth] - # Assign default ngram limits to minn and maxn which is the length of the word. if minn is None: minn = len(word) if maxn is None: maxn = len(word) - vectors = numpy.zeros((self.vectors_length,), dtype='f') - - # Fasttext's ngram computation taken from https://github.com/facebookresearch/fastText + vectors = numpy.zeros((self.vectors_length,), dtype="f") + # Fasttext's ngram computation taken from + # https://github.com/facebookresearch/fastText ngrams_size = 0; for i in range(len(word)): ngram = "" @@ -356,12 +359,16 @@ cdef class Vocab: n = n + 1 if ngrams_size > 0: vectors = vectors * (1.0/ngrams_size) - return vectors def set_vector(self, orth, vector): """Set a vector for a word in the vocabulary. Words can be referenced by string or int ID. + + orth (int / unicode): The word. + vector (numpy.ndarray[ndim=1, dtype='float32']): The vector to set. + + DOCS: https://spacy.io/api/vocab#set_vector """ if isinstance(orth, basestring_): orth = self.strings.add(orth) @@ -372,13 +379,19 @@ cdef class Vocab: else: width = self.vectors.shape[1] self.vectors.resize((new_rows, width)) - lex = self[orth] # Adds worse to vocab + lex = self[orth] # Adds words to vocab self.vectors.add(orth, vector=vector) self.vectors.add(orth, vector=vector) def has_vector(self, orth): """Check whether a word has a vector. Returns False if no vectors have - been loaded. Words can be looked up by string or int ID.""" + been loaded. Words can be looked up by string or int ID. + + orth (int / unicode): The word. + RETURNS (bool): Whether the word has a vector. + + DOCS: https://spacy.io/api/vocab#has_vector + """ if isinstance(orth, basestring_): orth = self.strings.add(orth) return orth in self.vectors @@ -388,12 +401,14 @@ cdef class Vocab: path (unicode or Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or Path-like objects. + + DOCS: https://spacy.io/api/vocab#to_disk """ path = util.ensure_path(path) if not path.exists(): path.mkdir() - self.strings.to_disk(path / 'strings.json') - with (path / 'lexemes.bin').open('wb') as file_: + self.strings.to_disk(path / "strings.json") + with (path / "lexemes.bin").open('wb') as file_: file_.write(self.lexemes_to_bytes()) if self.vectors is not None: self.vectors.to_disk(path) @@ -405,13 +420,15 @@ cdef class Vocab: path (unicode or Path): A path to a directory. Paths may be either strings or `Path`-like objects. RETURNS (Vocab): The modified `Vocab` object. + + DOCS: https://spacy.io/api/vocab#to_disk """ path = util.ensure_path(path) - self.strings.from_disk(path / 'strings.json') - with (path / 'lexemes.bin').open('rb') as file_: + self.strings.from_disk(path / "strings.json") + with (path / "lexemes.bin").open("rb") as file_: self.lexemes_from_bytes(file_.read()) if self.vectors is not None: - self.vectors.from_disk(path, exclude='strings.json') + self.vectors.from_disk(path, exclude="strings.json") if self.vectors.name is not None: link_vectors_to_models(self) return self @@ -421,6 +438,8 @@ cdef class Vocab: **exclude: Named attributes to prevent from being serialized. RETURNS (bytes): The serialized form of the `Vocab` object. 
+ + DOCS: https://spacy.io/api/vocab#to_bytes """ def deserialize_vectors(): if self.vectors is None: @@ -429,9 +448,9 @@ cdef class Vocab: return self.vectors.to_bytes() getters = OrderedDict(( - ('strings', lambda: self.strings.to_bytes()), - ('lexemes', lambda: self.lexemes_to_bytes()), - ('vectors', deserialize_vectors) + ("strings", lambda: self.strings.to_bytes()), + ("lexemes", lambda: self.lexemes_to_bytes()), + ("vectors", deserialize_vectors) )) return util.to_bytes(getters, exclude) @@ -441,6 +460,8 @@ cdef class Vocab: bytes_data (bytes): The data to load from. **exclude: Named attributes to prevent from being loaded. RETURNS (Vocab): The `Vocab` object. + + DOCS: https://spacy.io/api/vocab#from_bytes """ def serialize_vectors(b): if self.vectors is None: @@ -448,9 +469,9 @@ cdef class Vocab: else: return self.vectors.from_bytes(b) setters = OrderedDict(( - ('strings', lambda b: self.strings.from_bytes(b)), - ('lexemes', lambda b: self.lexemes_from_bytes(b)), - ('vectors', lambda b: serialize_vectors(b)) + ("strings", lambda b: self.strings.from_bytes(b)), + ("lexemes", lambda b: self.lexemes_from_bytes(b)), + ("vectors", lambda b: serialize_vectors(b)) )) util.from_bytes(bytes_data, setters, exclude) if self.vectors.name is not None: @@ -467,7 +488,7 @@ cdef class Vocab: if addr == 0: continue size += sizeof(lex_data.data) - byte_string = b'\0' * size + byte_string = b"\0" * size byte_ptr = byte_string cdef int j cdef int i = 0 diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index ca3725647..13ae320cc 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -1,7 +1,7 @@ --- title: DependencyParser tag: class -source: spacy/pipeline.pyx +source: spacy/pipeline/pipes.pyx --- This class is a subclass of `Pipe` and follows the same API. The pipeline @@ -211,7 +211,7 @@ Modify the pipe's model, to use the given parameter values. > ```python > parser = DependencyParser(nlp.vocab) > with parser.use_params(): -> parser.to_disk('/best_model') +> parser.to_disk("/best_model") > ``` | Name | Type | Description | @@ -226,7 +226,7 @@ Add a new label to the pipe. > > ```python > parser = DependencyParser(nlp.vocab) -> parser.add_label('MY_LABEL') +> parser.add_label("MY_LABEL") > ``` | Name | Type | Description | @@ -241,7 +241,7 @@ Serialize the pipe to disk. > > ```python > parser = DependencyParser(nlp.vocab) -> parser.to_disk('/path/to/parser') +> parser.to_disk("/path/to/parser") > ``` | Name | Type | Description | @@ -256,7 +256,7 @@ Load the pipe from disk. Modifies the object in place and returns it. > > ```python > parser = DependencyParser(nlp.vocab) -> parser.from_disk('/path/to/parser') +> parser.from_disk("/path/to/parser") > ``` | Name | Type | Description | @@ -266,7 +266,7 @@ Load the pipe from disk. Modifies the object in place and returns it. ## DependencyParser.to_bytes {#to_bytes tag="method"} -> #### example +> #### Example > > ```python > parser = DependencyParser(nlp.vocab) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 2a760658a..267d8f711 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -127,6 +127,7 @@ details, see the documentation on | `method` | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. 
| | `setter` | callable | Setter function that takes the `Doc` and a value, and modifies the object. Is called when the user writes to the `Doc._` attribute. | +| `force` | bool | Force overwriting existing attribute. | ## Doc.get_extension {#get_extension tag="classmethod" new="2"} @@ -263,6 +264,46 @@ ancestor is found, e.g. if span excludes a necessary ancestor. | ----------- | -------------------------------------- | ----------------------------------------------- | | **RETURNS** | `numpy.ndarray[ndim=2, dtype='int32']` | The lowest common ancestor matrix of the `Doc`. | +## Doc.to_json {#to_json, tag="method" new="2.1"} + +Convert a Doc to JSON. The format it produces will be the new format for the +[`spacy train`](/api/cli#train) command (not implemented yet). If custom +underscore attributes are specified, their values need to be JSON-serializable. +They'll be added to an `"_"` key in the data, e.g. `"_": {"foo": "bar"}`. + +> #### Example +> +> ```python +> doc = nlp(u"Hello") +> json_doc = doc.to_json() +> ``` +> +> #### Result +> +> ```python +> { +> "text": "Hello", +> "ents": [], +> "sents": [{"start": 0, "end": 5}], +> "tokens": [{"id": 0, "start": 0, "end": 5, "pos": "INTJ", "tag": "UH", "dep": "ROOT", "head": 0} +> ] +> } +> ``` + +| Name | Type | Description | +| ------------ | ---- | ------------------------------------------------------------------------------ | +| `underscore` | list | Optional list of string names of custom JSON-serializable `doc._.` attributes. | +| **RETURNS** | dict | The JSON-formatted data. | + + + +spaCy previously implemented a `Doc.print_tree` method that returned a similar +JSON-formatted representation of a `Doc`. As of v2.1, this method is deprecated +in favor of `Doc.to_json`. If you need more complex nested representations, you +might want to write your own function to extract the data. + + + ## Doc.to_array {#to_array tag="method"} Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence @@ -310,7 +351,7 @@ array of attributes. | Name | Type | Description | | ----------- | -------------------------------------- | ----------------------------- | -| `attrs` | ints | A list of attribute ID ints. | +| `attrs` | list | A list of attribute ID ints. | | `array` | `numpy.ndarray[ndim=2, dtype='int32']` | The attribute values to load. | | **RETURNS** | `Doc` | Itself. | @@ -429,14 +470,16 @@ to specify how the new subtokens should be integrated into the dependency tree. The list of per-token heads can either be a token in the original document, e.g. `doc[2]`, or a tuple consisting of the token in the original document and its subtoken index. For example, `(doc[3], 1)` will attach the subtoken to the -second subtoken of `doc[3]`. This mechanism allows attaching subtokens to other -newly created subtokens, without having to keep track of the changing token -indices. If the specified head token will be split within the retokenizer block -and no subtoken index is specified, it will default to `0`. Attributes to set on -subtokens can be provided as a list of values. They'll be applied to the -resulting token (if they're context-dependent token attributes like `LEMMA` or -`DEP`) or to the underlying lexeme (if they're context-independent lexical -attributes like `LOWER` or `IS_STOP`). +second subtoken of `doc[3]`. + +This mechanism allows attaching subtokens to other newly created subtokens, +without having to keep track of the changing token indices. 
If the specified +head token will be split within the retokenizer block and no subtoken index is +specified, it will default to `0`. Attributes to set on subtokens can be +provided as a list of values. They'll be applied to the resulting token (if +they're context-dependent token attributes like `LEMMA` or `DEP`) or to the +underlying lexeme (if they're context-independent lexical attributes like +`LOWER` or `IS_STOP`). > #### Example > @@ -487,8 +530,8 @@ and end token boundaries, the document remains unchanged. ## Doc.ents {#ents tag="property" model="NER"} -Iterate over the entities in the document. Yields named-entity `Span` objects, -if the entity recognizer has been applied to the document. +The named entities in the document. Returns a tuple of named entity `Span` +objects, if the entity recognizer has been applied. > #### Example > @@ -500,9 +543,9 @@ if the entity recognizer has been applied to the document. > assert ents[0].text == u"Mr. Best" > ``` -| Name | Type | Description | -| ---------- | ------ | ------------------------- | -| **YIELDS** | `Span` | Entities in the document. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------ | +| **RETURNS** | tuple | Entities in the document, one `Span` per entity. | ## Doc.noun_chunks {#noun_chunks tag="property" model="parser"} @@ -541,9 +584,9 @@ will be unavailable. > assert [s.root.text for s in sents] == [u"is", u"'s"] > ``` -| Name | Type | Description | -| ---------- | ---------------------------------- | ----------- | -| **YIELDS** | `Span | Sentences in the document. | +| Name | Type | Description | +| ---------- | ------ | -------------------------- | +| **YIELDS** | `Span` | Sentences in the document. | ## Doc.has_vector {#has_vector tag="property" model="vectors"} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 8f71005bc..c9db2c409 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -1,7 +1,7 @@ --- title: EntityRecognizer tag: class -source: spacy/pipeline.pyx +source: spacy/pipeline/pipes.pyx --- This class is a subclass of `Pipe` and follows the same API. The pipeline @@ -211,7 +211,7 @@ Modify the pipe's model, to use the given parameter values. > ```python > ner = EntityRecognizer(nlp.vocab) > with ner.use_params(): -> ner.to_disk('/best_model') +> ner.to_disk("/best_model") > ``` | Name | Type | Description | @@ -226,7 +226,7 @@ Add a new label to the pipe. > > ```python > ner = EntityRecognizer(nlp.vocab) -> ner.add_label('MY_LABEL') +> ner.add_label("MY_LABEL") > ``` | Name | Type | Description | @@ -241,7 +241,7 @@ Serialize the pipe to disk. > > ```python > ner = EntityRecognizer(nlp.vocab) -> ner.to_disk('/path/to/ner') +> ner.to_disk("/path/to/ner") > ``` | Name | Type | Description | @@ -256,7 +256,7 @@ Load the pipe from disk. Modifies the object in place and returns it. > > ```python > ner = EntityRecognizer(nlp.vocab) -> ner.from_disk('/path/to/ner') +> ner.from_disk("/path/to/ner") > ``` | Name | Type | Description | @@ -266,7 +266,7 @@ Load the pipe from disk. Modifies the object in place and returns it. 
## EntityRecognizer.to_bytes {#to_bytes tag="method"} -> #### example +> #### Example > > ```python > ner = EntityRecognizer(nlp.vocab) diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index df2db1a23..45c4756f2 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -1,7 +1,7 @@ --- title: EntityRuler tag: class -source: spacy/pipeline.pyx +source: spacy/pipeline/entityruler.py new: 2.1 --- @@ -128,7 +128,7 @@ newline-delimited JSON (JSONL). > > ```python > ruler = EntityRuler(nlp) -> ruler.to_disk('/path/to/rules.jsonl') +> ruler.to_disk("/path/to/rules.jsonl") > ``` | Name | Type | Description | @@ -144,7 +144,7 @@ JSON (JSONL) with one entry per line. > > ```python > ruler = EntityRuler(nlp) -> ruler.from_disk('/path/to/rules.jsonl') +> ruler.from_disk("/path/to/rules.jsonl") > ``` | Name | Type | Description | diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md index 9c0e6ad97..9059491a1 100644 --- a/website/docs/api/pipeline-functions.md +++ b/website/docs/api/pipeline-functions.md @@ -1,7 +1,7 @@ --- title: Pipeline Functions teaser: Other built-in pipeline components and helpers -source: spacy/pipeline.pyx +source: spacy/pipeline/functions.py menu: - ['merge_noun_chunks', 'merge_noun_chunks'] - ['merge_entities', 'merge_entities'] @@ -73,10 +73,10 @@ components to the end of the pipeline and after all other components. | `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | | **RETURNS** | `Doc` | The modified `Doc` with merged entities. | -## merge_subtokens {#merge_entities tag="function" new="2.1"} +## merge_subtokens {#merge_subtokens tag="function" new="2.1"} Merge subtokens into a single token. Also available via the string name -`"merge_entities"`. After initialization, the component is typically added to +`"merge_subtokens"`. After initialization, the component is typically added to the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe). As of v2.1, the parser is able to predict "subtokens" that should be merged into diff --git a/website/docs/api/sentencesegmenter.md b/website/docs/api/sentencesegmenter.md index 8090b060d..d4055536d 100644 --- a/website/docs/api/sentencesegmenter.md +++ b/website/docs/api/sentencesegmenter.md @@ -1,7 +1,7 @@ --- title: SentenceSegmenter tag: class -source: spacy/pipeline.pyx +source: spacy/pipeline/hooks.py --- A simple spaCy hook, to allow custom sentence boundary detection logic that diff --git a/website/docs/api/span.md b/website/docs/api/span.md index bf95bd6f2..033aa579c 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -260,8 +260,8 @@ Retokenize the document, such that the span is merged into a single token. ## Span.ents {#ents tag="property" new="2.0.12" model="ner"} -Iterate over the entities in the span. Yields named-entity `Span` objects, if -the entity recognizer has been applied to the parent document. +The named entities in the span. Returns a tuple of named entity `Span` objects, +if the entity recognizer has been applied. > #### Example > @@ -274,9 +274,9 @@ the entity recognizer has been applied to the parent document. > assert ents[0].text == u"Mr. Best" > ``` -| Name | Type | Description | -| ---------- | ------ | ------------------------- | -| **YIELDS** | `Span` | Entities in the document. 
| +| Name | Type | Description | +| ----------- | ----- | -------------------------------------------- | +| **RETURNS** | tuple | Entities in the span, one `Span` per entity. | ## Span.as_doc {#as_doc tag="method"} @@ -297,8 +297,9 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data. ## Span.root {#root tag="property" model="parser"} -The token within the span that's highest in the parse tree. If there's a tie, -the earliest is preferred. +The token with the shortest path to the root of the sentence (or the root +itself). If multiple tokens are equally high in the tree, the first token is +taken. > #### Example > diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 7b9581f9a..fa3eda993 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -1,7 +1,7 @@ --- title: Tagger tag: class -source: spacy/pipeline.pyx +source: spacy/pipeline/pipes.pyx --- This class is a subclass of `Pipe` and follows the same API. The pipeline @@ -209,7 +209,7 @@ Modify the pipe's model, to use the given parameter values. > ```python > tagger = Tagger(nlp.vocab) > with tagger.use_params(): -> tagger.to_disk('/best_model') +> tagger.to_disk("/best_model") > ``` | Name | Type | Description | @@ -225,7 +225,7 @@ Add a new label to the pipe. > ```python > from spacy.symbols import POS > tagger = Tagger(nlp.vocab) -> tagger.add_label('MY_LABEL', {POS: 'NOUN'}) +> tagger.add_label("MY_LABEL", {POS: 'NOUN'}) > ``` | Name | Type | Description | @@ -241,7 +241,7 @@ Serialize the pipe to disk. > > ```python > tagger = Tagger(nlp.vocab) -> tagger.to_disk('/path/to/tagger') +> tagger.to_disk("/path/to/tagger") > ``` | Name | Type | Description | @@ -256,7 +256,7 @@ Load the pipe from disk. Modifies the object in place and returns it. > > ```python > tagger = Tagger(nlp.vocab) -> tagger.from_disk('/path/to/tagger') +> tagger.from_disk("/path/to/tagger") > ``` | Name | Type | Description | @@ -266,7 +266,7 @@ Load the pipe from disk. Modifies the object in place and returns it. ## Tagger.to_bytes {#to_bytes tag="method"} -> #### example +> #### Example > > ```python > tagger = Tagger(nlp.vocab) diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index faeb45bc6..cb90aa271 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -1,7 +1,7 @@ --- title: TextCategorizer tag: class -source: spacy/pipeline.pyx +source: spacy/pipeline/pipes.pyx new: 2 --- @@ -227,7 +227,7 @@ Modify the pipe's model, to use the given parameter values. > ```python > textcat = TextCategorizer(nlp.vocab) > with textcat.use_params(): -> textcat.to_disk('/best_model') +> textcat.to_disk("/best_model") > ``` | Name | Type | Description | @@ -242,7 +242,7 @@ Add a new label to the pipe. > > ```python > textcat = TextCategorizer(nlp.vocab) -> textcat.add_label('MY_LABEL') +> textcat.add_label("MY_LABEL") > ``` | Name | Type | Description | @@ -257,7 +257,7 @@ Serialize the pipe to disk. > > ```python > textcat = TextCategorizer(nlp.vocab) -> textcat.to_disk('/path/to/textcat') +> textcat.to_disk("/path/to/textcat") > ``` | Name | Type | Description | @@ -272,7 +272,7 @@ Load the pipe from disk. Modifies the object in place and returns it. > > ```python > textcat = TextCategorizer(nlp.vocab) -> textcat.from_disk('/path/to/textcat') +> textcat.from_disk("/path/to/textcat") > ``` | Name | Type | Description | @@ -282,7 +282,7 @@ Load the pipe from disk. Modifies the object in place and returns it. 
## TextCategorizer.to_bytes {#to_bytes tag="method"} -> #### example +> #### Example > > ```python > textcat = TextCategorizer(nlp.vocab) diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 1089d2329..f30fd4639 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -324,7 +324,7 @@ A sequence containing the token and all the token's syntactic descendants. ## Token.is_sent_start {#is_sent_start tag="property" new="2"} A boolean value indicating whether the token starts a sentence. `None` if -unknown. Defaults to `True` for the first token in the `doc`. +unknown. Defaults to `True` for the first token in the `Doc`. > #### Example > diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index b4a1126e6..2f0cc0542 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -116,6 +116,72 @@ details and examples. | `string` | unicode | The string to specially tokenize. | | `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. | +## Tokenizer.to_disk {#to_disk tag="method"} + +Serialize the tokenizer to disk. + +> #### Example +> +> ```python +> tokenizer = Tokenizer(nlp.vocab) +> tokenizer.to_disk("/path/to/tokenizer") +> ``` + +| Name | Type | Description | +| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | + +## Tokenizer.from_disk {#from_disk tag="method"} + +Load the tokenizer from disk. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> tokenizer = Tokenizer(nlp.vocab) +> tokenizer.from_disk("/path/to/tokenizer") +> ``` + +| Name | Type | Description | +| ----------- | ---------------- | -------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | + +## Tokenizer.to_bytes {#to_bytes tag="method"} + +> #### Example +> +> ```python +> tokenizer = tokenizer(nlp.vocab) +> tokenizer_bytes = tokenizer.to_bytes() +> ``` + +Serialize the tokenizer to a bytestring. + +| Name | Type | Description | +| ----------- | ----- | -------------------------------------------------- | +| `**exclude` | - | Named attributes to prevent from being serialized. | +| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. | + +## Tokenizer.from_bytes {#from_bytes tag="method"} + +Load the tokenizer from a bytestring. Modifies the object in place and returns +it. + +> #### Example +> +> ```python +> tokenizer_bytes = tokenizer.to_bytes() +> tokenizer = Tokenizer(nlp.vocab) +> tokenizer.from_bytes(tokenizer_bytes) +> ``` + +| Name | Type | Description | +| ------------ | ----------- | ---------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| `**exclude` | - | Named attributes to prevent from being loaded. | +| **RETURNS** | `Tokenizer` | The `Tokenizer` object. 
| + ## Attributes {#attributes} | Name | Type | Description | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 79540592a..43f075b53 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -642,7 +642,7 @@ All Python code is written in an **intersection of Python 2 and Python 3**. This is easy in Cython, but somewhat ugly in Python. Logic that deals with Python or platform compatibility only lives in `spacy.compat`. To distinguish them from the builtin functions, replacement functions are suffixed with an underscore, -e.e `unicode_`. +e.g. `unicode_`. > #### Example > @@ -660,7 +660,7 @@ e.e `unicode_`. | `compat.input_` | `raw_input` | `input` | | `compat.path2str` | `str(path)` with `.decode('utf8')` | `str(path)` | -### compat.is_config {#is_config tag="function"} +### compat.is_config {#compat.is_config tag="function"} Check if a specific configuration of Python version and operating system matches the user's setup. Mostly used to display targeted error messages. diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index a73c4386d..719644a57 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -424,7 +424,7 @@ take a path to a JSON file containing the patterns. This lets you reuse the component with different patterns, depending on your application: ```python -html_merger = BadHTMLMerger(nlp, path='/path/to/patterns.json') +html_merger = BadHTMLMerger(nlp, path="/path/to/patterns.json") ``` From 76764fcf59dc661c0641515db78e63b4f9d9537e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 8 Mar 2019 23:15:23 +0100 Subject: [PATCH 04/55] =?UTF-8?q?=F0=9F=92=AB=20Improve=20converters=20and?= =?UTF-8?q?=20training=20data=20file=20formats=20(#3374)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Populate converter argument info automatically * Add conversion option for msgpack * Update docs * Allow reading training data from JSONL --- spacy/cli/convert.py | 18 ++++++++++--- spacy/errors.py | 2 +- spacy/gold.pyx | 5 +++- website/docs/api/cli.md | 58 ++++++++++++++++++++++++++++------------- 4 files changed, 59 insertions(+), 24 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index a909a4241..12a3d2698 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -23,15 +23,16 @@ CONVERTERS = { } # File types -FILE_TYPES = ("json", "jsonl") +FILE_TYPES = ("json", "jsonl", "msg") +FILE_TYPES_STDOUT = ("json", "jsonl") @plac.annotations( input_file=("Input file", "positional", None, str), - output_dir=("Output directory for converted file", "positional", None, str), - file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str), + output_dir=("Output directory. '-' for stdout.", "positional", None, str), + file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str), n_sents=("Number of sentences per doc", "option", "n", int), - converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str), + converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str), lang=("Language (if tokenizer required)", "option", "l", str), morphology=("Enable appending morphology to tags", "flag", "m", bool), ) @@ -58,6 +59,13 @@ def convert( "Supported file types: '{}'".format(", ".join(FILE_TYPES)), exits=1, ) + if file_type not in FILE_TYPES_STDOUT and output_dir == "-": + # TODO: support msgpack via stdout in srsly? 
+ msg.fail( + "Can't write .{} data to stdout.".format(file_type), + "Please specify an output directory.", + exits=1, + ) if not input_path.exists(): msg.fail("Input file not found", input_path, exits=1) if output_dir != "-" and not Path(output_dir).exists(): @@ -78,6 +86,8 @@ def convert( srsly.write_json(output_file, data) elif file_type == "jsonl": srsly.write_jsonl(output_file, data) + elif file_type == "msg": + srsly.write_msgpack(output_file, data) msg.good("Generated output file ({} documents)".format(len(data)), output_file) else: # Print to stdout diff --git a/spacy/errors.py b/spacy/errors.py index 13382d146..c409e5a0c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -342,7 +342,7 @@ class Errors(object): "equal to span length ({span_len}).") E122 = ("Cannot find token to be split. Did it get merged?") E123 = ("Cannot find head of token to be split. Did it get merged?") - E124 = ("Cannot read from file: {path}. Supported formats: .json, .msg") + E124 = ("Cannot read from file: {path}. Supported formats: {formats}") E125 = ("Unexpected value: {value}") E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. " "This is likely a bug in spaCy, so feel free to open an issue.") diff --git a/spacy/gold.pyx b/spacy/gold.pyx index d03d13d2d..02306c651 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -153,10 +153,13 @@ class GoldCorpus(object): loc = util.ensure_path(loc) if loc.parts[-1].endswith("json"): gold_tuples = read_json_file(loc) + elif loc.parts[-1].endswith("jsonl"): + gold_tuples = srsly.read_jsonl(loc) elif loc.parts[-1].endswith("msg"): gold_tuples = srsly.read_msgpack(loc) else: - raise ValueError(Errors.E124.format(path=path2str(loc))) + supported = ("json", "jsonl", "msg") + raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported)) for item in gold_tuples: yield item i += len(item[1]) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index ee4c3787b..2d3c13e37 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -134,28 +134,50 @@ converter can be specified on the command line, or chosen based on the file extension of the input file. ```bash -$ python -m spacy convert [input_file] [output_dir] [--converter] [--n-sents] -[--morphology] +$ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter] +[--n-sents] [--morphology] [--lang] ``` -| Argument | Type | Description | -| -------------------------------------------- | ---------- | ---------------------------------------------------------- | -| `input_file` | positional | Input file. | -| `output_dir` | positional | Output directory for converted JSON file. | -| `converter`, `-c` 2 | option | Name of converter to use (see below). | -| `--n-sents`, `-n` | option | Number of sentences per document. | -| `--morphology`, `-m` | option | Enable appending morphology to tags. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). | +| Argument | Type | Description | +| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------- | +| `input_file` | positional | Input file. | +| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. | +| `--file-type`, `-t` 2.1 | option | Type of file to create (see below). 
| +| `--converter`, `-c` 2 | option | Name of converter to use (see below). | +| `--n-sents`, `-n` | option | Number of sentences per document. | +| `--morphology`, `-m` | option | Enable appending morphology to tags. | +| `--lang`, `-l` 2.1 | option | Language code (if tokenizer required). | +| `--help`, `-h` | flag | Show help message and available arguments. | +| **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). | -The following file format converters are available: +### Output file types {new="2.1"} -| ID | Description | -| ----------------- | --------------------------------------------------------------- | -| `auto` | Automatically pick converter based on file extension (default). | -| `conllu`, `conll` | Universal Dependencies `.conllu` or `.conll` format. | -| `ner` | Tab-based named entity recognition format. | -| `iob` | IOB or IOB2 named entity recognition format. | +> #### Which format should I choose? +> +> If you're not sure, go with the default `jsonl`. Newline-delimited JSON means +> that there's one JSON object per line. Unlike a regular JSON file, it can also +> be read in line-by-line and you won't have to parse the _entire file_ first. +> This makes it a very convenient format for larger corpora. + +All output files generated by this command are compatible with +[`spacy train`](/api/cli#train). + +| ID | Description | +| ------- | --------------------------------- | +| `jsonl` | Newline-delimited JSON (default). | +| `json` | Regular JSON. | +| `msg` | Binary MessagePack format. | + +### Converter options + + + +| ID | Description | +| ------------------------------ | --------------------------------------------------------------- | +| `auto` | Automatically pick converter based on file extension (default). | +| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. | +| `ner` | Tab-based named entity recognition format. | +| `iob` | IOB or IOB2 named entity recognition format. 
| ## Train {#train} From ec93b423534463ac0b0f4edd367d2699d27a1fd4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 00:19:03 +0100 Subject: [PATCH 05/55] Set up CI with Azure Pipelines --- azure-pipelines.yml | 71 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 azure-pipelines.yml diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 000000000..7df776910 --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,71 @@ +trigger: + batch: true + branches: + include: + - '*' + exclude: + - 'spacy.io' + +jobs: + +- job: 'Test' + strategy: + matrix: + Python27Linux: + imageName: 'ubuntu-16.04' + python.version: '2.7' + Python27Mac: + imageName: 'macos-10.13' + python.version: '2.7' + Python35Linux: + imageName: 'ubuntu-16.04' + python.version: '3.5' + Python35Windows: + imageName: 'vs2017-win2016' + python.version: '3.5' + Python35Mac: + imageName: 'macos-10.13' + python.version: '3.5' + Python36Linux: + imageName: 'ubuntu-16.04' + python.version: '3.6' + Python36Windows: + imageName: 'vs2017-win2016' + python.version: '3.6' + Python36Mac: + imageName: 'macos-10.13' + python.version: '3.6' + Python37Linux: + imageName: 'ubuntu-16.04' + python.version: '3.7' + Python37Windows: + imageName: 'vs2017-win2016' + python.version: '3.7' + Python37Mac: + imageName: 'macos-10.13' + python.version: '3.7' + maxParallel: 4 + pool: + vmImage: $(imageName) + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + architecture: 'x64' + + - script: flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics + displayName: 'flake8' + + - script: | + python -m pip install --upgrade pip + pip install -r requirements.txt + displayName: 'Install dependencies' + + - script: + python setup.py build_ext --inplace + pip install -e . + displayName: 'Build and install' + + - script: python -m pytest --tb=native spacy + displayName: 'Run tests' From 086b267bdbf0debaca0eeb92bc69f225ef8dc5e1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 00:20:44 +0100 Subject: [PATCH 06/55] Update azure-pipelines.yml for Azure Pipelines --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 7df776910..056cfaff2 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -54,14 +54,14 @@ jobs: versionSpec: '$(python.version)' architecture: 'x64' - - script: flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics - displayName: 'flake8' - - script: | python -m pip install --upgrade pip pip install -r requirements.txt displayName: 'Install dependencies' + - script: flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics + displayName: 'flake8' + - script: python setup.py build_ext --inplace pip install -e . 
From 70511ba9657c2dd188619d054229d40467fefa07 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 00:24:20 +0100 Subject: [PATCH 07/55] Update .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index d9e75a229..2cb2d8bc0 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,8 @@ keys/ website/www/ website/_deploy.sh website/.gitignore +website/public +node_modules # Cython / C extensions cythonize.json From db9512f9e177067dde3a9d2d8c438c550344c121 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 00:24:40 +0100 Subject: [PATCH 08/55] Update azure-pipelines.yml --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 056cfaff2..0b7ecd018 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -59,11 +59,11 @@ jobs: pip install -r requirements.txt displayName: 'Install dependencies' - - script: flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics + - script: python -m flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics displayName: 'flake8' - script: - python setup.py build_ext --inplace + python setup.py build_ext --inplace pip install -e . displayName: 'Build and install' From 7342348fc736f8e2a700d14274d3da7cb8d4d842 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 00:26:54 +0100 Subject: [PATCH 09/55] Update azure-pipelines.yml --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0b7ecd018..5de5addaa 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -59,8 +59,8 @@ jobs: pip install -r requirements.txt displayName: 'Install dependencies' - - script: python -m flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics - displayName: 'flake8' + # - script: python -m flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics + # displayName: 'flake8' - script: python setup.py build_ext --inplace From 70da2097b4269003c99ad99b052c66b5faa22887 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 00:35:41 +0100 Subject: [PATCH 10/55] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5de5addaa..4096a7bba 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -62,7 +62,7 @@ jobs: # - script: python -m flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics # displayName: 'flake8' - - script: + - script: | python setup.py build_ext --inplace pip install -e . 
displayName: 'Build and install' From 5bb8f123cae0d47e4ffe9b156d23b32a04ce508b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 00:43:36 +0100 Subject: [PATCH 11/55] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4096a7bba..dd417febe 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -55,7 +55,7 @@ jobs: architecture: 'x64' - script: | - python -m pip install --upgrade pip + python -m pip install --upgrade pip wheel pip install -r requirements.txt displayName: 'Install dependencies' From 3d08bf9514fa605ef2de5230ed87fbd98bd163c3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 00:48:08 +0100 Subject: [PATCH 12/55] Update azure-pipelines.yml Try to work around "conflict with the backend dependencies: wheel==0.33.1 is incompatible with wheel<0.33.0,>=0.32.0" --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index dd417febe..9410d6a8b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -55,7 +55,7 @@ jobs: architecture: 'x64' - script: | - python -m pip install --upgrade pip wheel + python -m pip install --upgrade pip==18.1 pip install -r requirements.txt displayName: 'Install dependencies' From 400c9eecb64842b9389c928a32682a77cd4fd34a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 01:20:42 +0100 Subject: [PATCH 13/55] Re-add flake8 to CI --- .flake8 | 16 ++++++++++++++++ azure-pipelines.yml | 6 ++++-- requirements.txt | 1 + 3 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..4f6e9562b --- /dev/null +++ b/.flake8 @@ -0,0 +1,16 @@ + +15 lines (14 sloc) 290 Bytes +[flake8] +ignore = E203, E266, E501, E731, W503 +max-line-length = 80 +select = B,C,E,F,W,T4,B9 +exclude = + .env, + .git, + __pycache__, + lemmatizer.py, + lookup.py, + _tokenizer_exceptions_list.py, + spacy/lang/fr/lemmatizer, + spacy/lang/nb/lemmatizer + spacy/__init__.py diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9410d6a8b..9ffd60a87 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -54,13 +54,15 @@ jobs: versionSpec: '$(python.version)' architecture: 'x64' + # Downgrading pip is necessary to prevent a wheel version incompatiblity. + # Might be fixed in the future or some other way, so investigate again. - script: | python -m pip install --upgrade pip==18.1 pip install -r requirements.txt displayName: 'Install dependencies' - # - script: python -m flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics - # displayName: 'flake8' + - script: python -m flake8 . 
--count --select=E901,E999,F821,F822,F823 --show-source --statistics + displayName: 'flake8' - script: | python setup.py build_ext --inplace diff --git a/requirements.txt b/requirements.txt index a505dded1..567872472 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ requests>=2.13.0,<3.0.0 pytest>=4.0.0,<4.1.0 mock>=2.0.0,<3.0.0 pathlib==1.0.1; python_version < "3.4" +flake8>=3.5.0,<3.6.0 From 78aa663f79321ed5349e5a9c4d20af0bf98221f3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 01:30:40 +0100 Subject: [PATCH 14/55] Fix flake8 --- .flake8 | 2 -- azure-pipelines.yml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.flake8 b/.flake8 index 4f6e9562b..dfedc15df 100644 --- a/.flake8 +++ b/.flake8 @@ -1,5 +1,3 @@ - -15 lines (14 sloc) 290 Bytes [flake8] ignore = E203, E266, E501, E731, W503 max-line-length = 80 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9ffd60a87..5f205f298 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -61,7 +61,7 @@ jobs: pip install -r requirements.txt displayName: 'Install dependencies' - - script: python -m flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics + - script: python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics displayName: 'flake8' - script: | From b7f9cbdc834fdde3f7c2ebf93698d3918faa8929 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 01:35:36 +0100 Subject: [PATCH 15/55] Fix undefined names --- spacy/lang/ga/irish_morphology_helpers.py | 48 +++++++++++------------ 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py index 383e24efc..0221829b3 100644 --- a/spacy/lang/ga/irish_morphology_helpers.py +++ b/spacy/lang/ga/irish_morphology_helpers.py @@ -1,33 +1,31 @@ # coding: utf8 from __future__ import unicode_literals -class IrishMorph: - consonants = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z'] - broad_vowels = ['a', 'á', 'o', 'ó', 'u', 'ú'] - slender_vowels = ['e', 'é', 'i', 'í'] - vowels = broad_vowels + slender_vowels +consonants = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z'] +broad_vowels = ['a', 'á', 'o', 'ó', 'u', 'ú'] +slender_vowels = ['e', 'é', 'i', 'í'] +vowels = broad_vowels + slender_vowels - def ends_dentals(word): - if word != "" and word[-1] in ['d', 'n', 't', 's']: - return True - else: - return False +def ends_dentals(word): + if word != "" and word[-1] in ['d', 'n', 't', 's']: + return True + else: + return False - def devoice(word): - if len(word) > 2 and word[-2] == 's' and word[-1] == 'd': - return word[:-1] + 't' - else: - return word +def devoice(word): + if len(word) > 2 and word[-2] == 's' and word[-1] == 'd': + return word[:-1] + 't' + else: + return word - def ends_with_vowel(word): - return word != "" and word[-1] in vowels +def ends_with_vowel(word): + return word != "" and word[-1] in vowels - def starts_with_vowel(word): - return word != "" and word[0] in vowels - - def deduplicate(word): - if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants: - return word[:-1] - else: - return word +def starts_with_vowel(word): + return word != "" and word[0] in vowels +def deduplicate(word): + if len(word) > 2 and word[-2] == word[-1] and word[-1] in consonants: + return word[:-1] + else: + return word From 47bf549f95d4d929a37121a5c241fee458b3b07c Mon 
Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 01:36:22 +0100 Subject: [PATCH 16/55] Update azure-pipelines.yml --- azure-pipelines.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5f205f298..ce200cc11 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -61,6 +61,8 @@ jobs: pip install -r requirements.txt displayName: 'Install dependencies' + # Perform basic checks for most important errors (syntax etc.) Uses the config + # defined in .flake8 and overwrites the selected codes. - script: python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics displayName: 'flake8' From 14a9b9753e4c5de91d36946914e3016d52a02b2a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 01:42:17 +0100 Subject: [PATCH 17/55] Update README.rst --- README.rst | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/README.rst b/README.rst index 8b372e166..93dd4e8c4 100644 --- a/README.rst +++ b/README.rst @@ -12,13 +12,9 @@ integration. It's commercial open-source software, released under the MIT licens 💫 **Version 2.0 out now!** `Check out the release notes here. `_ -.. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis - :target: https://travis-ci.org/explosion/spaCy - :alt: Build Status - -.. image:: https://img.shields.io/appveyor/ci/explosion/spaCy/master.svg?style=flat-square&logo=appveyor - :target: https://ci.appveyor.com/project/explosion/spaCy - :alt: Appveyor Build Status +.. image:: https://img.shields.io/azure-devops/build/explosion-ai/public/8.svg?logo=azure-devops&style=flat-square + :target: https://dev.azure.com/explosion-ai/public/_build?definitionId=8 + :alt: Azure Pipelines .. image:: https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square :target: https://github.com/explosion/spaCy/releases From 9b42e2d5dda713beec02f47ec69e66fc6db4eed7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 02:05:26 +0100 Subject: [PATCH 18/55] Experiment with escaping hyphens --- spacy/lang/char_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index cb2e817d5..4a5ee6d67 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -215,7 +215,7 @@ _punct = ( r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪" ) _quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' -_hyphens = "- – — -- --- —— ~" +_hyphens = r"\- – — \-\- \-\-\- —— ~" # Various symbols like dingbats, but also emoji # Details: https://www.compart.com/en/unicode/category/So From 65402c3d026fa66a8045c63df9496586a7ebca42 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 02:13:00 +0100 Subject: [PATCH 19/55] Revert "Experiment with escaping hyphens" This reverts commit 9b42e2d5dda713beec02f47ec69e66fc6db4eed7. --- spacy/lang/char_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 4a5ee6d67..cb2e817d5 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -215,7 +215,7 @@ _punct = ( r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! 
, 、 ; : ~ · । ، ؛ ٪" ) _quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' -_hyphens = r"\- – — \-\- \-\-\- —— ~" +_hyphens = "- – — -- --- —— ~" # Various symbols like dingbats, but also emoji # Details: https://www.compart.com/en/unicode/category/So From 5a2e2b9db7b3b486773e64bdf7e3dcd20194cf83 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 02:13:34 +0100 Subject: [PATCH 20/55] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 93dd4e8c4..db51e0dcc 100644 --- a/README.rst +++ b/README.rst @@ -12,7 +12,7 @@ integration. It's commercial open-source software, released under the MIT licens 💫 **Version 2.0 out now!** `Check out the release notes here. `_ -.. image:: https://img.shields.io/azure-devops/build/explosion-ai/public/8.svg?logo=azure-devops&style=flat-square +.. image:: https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-devops&style=flat-square :target: https://dev.azure.com/explosion-ai/public/_build?definitionId=8 :alt: Azure Pipelines From b28de881dac770af6138eebbef6ae0a07ab34058 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 02:27:13 +0100 Subject: [PATCH 21/55] Update azure-pipelines.yml --- azure-pipelines.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index ce200cc11..48fb7bf56 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -12,13 +12,13 @@ jobs: strategy: matrix: Python27Linux: - imageName: 'ubuntu-16.04' + imageName: 'ubuntu-18.04' python.version: '2.7' Python27Mac: imageName: 'macos-10.13' python.version: '2.7' Python35Linux: - imageName: 'ubuntu-16.04' + imageName: 'ubuntu-18.04' python.version: '3.5' Python35Windows: imageName: 'vs2017-win2016' @@ -27,7 +27,7 @@ jobs: imageName: 'macos-10.13' python.version: '3.5' Python36Linux: - imageName: 'ubuntu-16.04' + imageName: 'ubuntu-18.04' python.version: '3.6' Python36Windows: imageName: 'vs2017-win2016' @@ -36,7 +36,7 @@ jobs: imageName: 'macos-10.13' python.version: '3.6' Python37Linux: - imageName: 'ubuntu-16.04' + imageName: 'ubuntu-18.04' python.version: '3.7' Python37Windows: imageName: 'vs2017-win2016' From fe0c8e147c6caffd0d24f0465f0b04e03ec31878 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 02:28:21 +0100 Subject: [PATCH 22/55] Update azure-pipelines.yml --- azure-pipelines.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 48fb7bf56..ce200cc11 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -12,13 +12,13 @@ jobs: strategy: matrix: Python27Linux: - imageName: 'ubuntu-18.04' + imageName: 'ubuntu-16.04' python.version: '2.7' Python27Mac: imageName: 'macos-10.13' python.version: '2.7' Python35Linux: - imageName: 'ubuntu-18.04' + imageName: 'ubuntu-16.04' python.version: '3.5' Python35Windows: imageName: 'vs2017-win2016' @@ -27,7 +27,7 @@ jobs: imageName: 'macos-10.13' python.version: '3.5' Python36Linux: - imageName: 'ubuntu-18.04' + imageName: 'ubuntu-16.04' python.version: '3.6' Python36Windows: imageName: 'vs2017-win2016' @@ -36,7 +36,7 @@ jobs: imageName: 'macos-10.13' python.version: '3.6' Python37Linux: - imageName: 'ubuntu-18.04' + imageName: 'ubuntu-16.04' python.version: '3.7' Python37Windows: imageName: 'vs2017-win2016' From d957d7a6972c112270ab34cb203c495e0d0d3058 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 02:37:41 +0100 Subject: 
[PATCH 23/55] Auto-format --- spacy/lang/fr/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 9b4385dc7..997a81534 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -379,7 +379,7 @@ _regular_exp = [ _regular_exp += [ "^{prefix}[{hyphen}][{al}][{hyphen}{al}{elision}]*$".format( prefix=p, - hyphen=HYPHENS, # putting the - first in the [] range avoids having to use a backslash + hyphen=HYPHENS, # putting the - first in the [] range avoids having to use a backslash elision=ELISION, al=ALPHA_LOWER, ) From ae09b6a6cf4fc47b9ea6385a9b62bc73fad1e46a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 02:37:50 +0100 Subject: [PATCH 24/55] Try fixing unicode inconsistencies on Python 2 --- spacy/lang/fr/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 997a81534..b0f188278 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -423,5 +423,5 @@ _regular_exp.append(URL_PATTERN) TOKENIZER_EXCEPTIONS = _exc TOKEN_MATCH = re.compile( - "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE + "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE, re.UNICODE ).match From b9c71fc0f050b29d34ab86c71e7f2a4bd88ac473 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 02:46:04 +0100 Subject: [PATCH 25/55] Fix flags --- spacy/lang/fr/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index b0f188278..4b3b2c908 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -423,5 +423,5 @@ _regular_exp.append(URL_PATTERN) TOKENIZER_EXCEPTIONS = _exc TOKEN_MATCH = re.compile( - "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE, re.UNICODE + "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE ).match From 95312138468f9841673d23e1d2186453f6439915 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 02:56:08 +0100 Subject: [PATCH 26/55] Remove other CI --- .appveyor.yml | 51 --------------------------------------------------- .travis.yml | 36 ------------------------------------ travis.sh | 32 -------------------------------- 3 files changed, 119 deletions(-) delete mode 100644 .appveyor.yml delete mode 100644 .travis.yml delete mode 100755 travis.sh diff --git a/.appveyor.yml b/.appveyor.yml deleted file mode 100644 index c4c405531..000000000 --- a/.appveyor.yml +++ /dev/null @@ -1,51 +0,0 @@ -environment: - - matrix: - - # For Python versions available on Appveyor, see - # http://www.appveyor.com/docs/installed-software#python - - #- PYTHON: "C:\\Python27" - #- PYTHON: "C:\\Python34" - #- PYTHON: "C:\\Python35" - - PYTHON: "C:\\Python27-x64" - #- DISTUTILS_USE_SDK: "1" - #- PYTHON: "C:\\Python34-x64" - #- DISTUTILS_USE_SDK: "1" - #- PYTHON: "C:\\Python35-x64" - - PYTHON: "C:\\Python36-x64" - -install: - # We need wheel installed to build wheels - - "%PYTHON%\\python.exe -m pip install wheel" - - "%PYTHON%\\python.exe -m pip install cython" - - "%PYTHON%\\python.exe -m pip install -r requirements.txt" - - "%PYTHON%\\python.exe -m pip install -e ." - -build: off - -test_script: - # Put your test command here. 
- # If you don't need to build C extensions on 64-bit Python 3.4, - # you can remove "build.cmd" from the front of the command, as it's - # only needed to support those cases. - # Note that you must use the environment variable %PYTHON% to refer to - # the interpreter you're using - Appveyor does not do anything special - # to put the Python version you want to use on PATH. - - "%PYTHON%\\python.exe -m pytest spacy/" - -after_test: - # This step builds your wheels. - # Again, you only need build.cmd if you're building C extensions for - # 64-bit Python 3.4. And you need to use %PYTHON% to get the correct - # interpreter - - "%PYTHON%\\python.exe setup.py bdist_wheel" - -artifacts: - # bdist_wheel puts your built wheel in the dist directory - - path: dist\* - -#on_success: -# You can use this step to upload your artifacts to a public website. -# See Appveyor's documentation for more details. Or you can simply -# access your wheels from the Appveyor "artifacts" tab for your build. diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index bc6cd7b87..000000000 --- a/.travis.yml +++ /dev/null @@ -1,36 +0,0 @@ -language: python - -sudo: false -dist: trusty -group: edge - -python: - - "2.7" - - "3.5" - - "3.6" - -os: - - linux - -env: - - VIA=compile - - VIA=flake8 - #- VIA=pypi_nightly - -install: - - "./travis.sh" - - pip install flake8 - -script: - - "pip install pytest-timeout" - - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi - - if [[ "${VIA}" == "flake8" ]]; then flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics; fi - - if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi - - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi - -notifications: - slack: - secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ= - email: false - -cache: pip diff --git a/travis.sh b/travis.sh deleted file mode 100755 index eed6a96f2..000000000 --- a/travis.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -if [ "${VIA}" == "pypi" ]; then - rm -rf * - pip install spacy-nightly - python -m spacy download en -fi - -if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then - rm -rf * - pip uninstall spacy - wget https://api.explosion.ai/build/spacy/sdist/$TRAVIS_COMMIT - mv $TRAVIS_COMMIT sdist.tgz - pip install -U sdist.tgz -fi - - -if [ "${VIA}" == "compile" ]; then - pip install -r requirements.txt - python setup.py build_ext --inplace - pip install -e . 
-fi - -# mkdir -p corpora/en -# cd corpora/en -# wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz -# tar -xzf WordNet-3.0.tar.gz -# mv WordNet-3.0 wordnet -# cd ../../ -# mkdir models/ -# python bin/init_model.py en lang_data/ corpora/ models/en -#fi From 40def86fdf0817e7a051e8ef1cc35f676a6782a6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 02:56:20 +0100 Subject: [PATCH 27/55] Try running flake8 first --- azure-pipelines.yml | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index ce200cc11..822606065 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -8,7 +8,17 @@ trigger: jobs: +# Perform basic checks for most important errors (syntax etc.) Uses the config +# defined in .flake8 and overwrites the selected codes. +- job: 'Validate' + steps: + - script: | + pip install flake8 + python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics + displayName: 'flake8' + - job: 'Test' + dependsOn: 'Validate' strategy: matrix: Python27Linux: @@ -61,11 +71,6 @@ jobs: pip install -r requirements.txt displayName: 'Install dependencies' - # Perform basic checks for most important errors (syntax etc.) Uses the config - # defined in .flake8 and overwrites the selected codes. - - script: python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics - displayName: 'flake8' - - script: | python setup.py build_ext --inplace pip install -e . From db0355828895a73a03a20752493aeaa066de8ce8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 02:59:29 +0100 Subject: [PATCH 28/55] Fix flake8 --- azure-pipelines.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 822606065..3deca85ce 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -11,7 +11,12 @@ jobs: # Perform basic checks for most important errors (syntax etc.) Uses the config # defined in .flake8 and overwrites the selected codes. - job: 'Validate' + pool: + vmImage: 'ubuntu-16.04' steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.7' - script: | pip install flake8 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics From a145bfe627f023d24aefcaeec6c1aea670551b00 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 03:06:50 +0100 Subject: [PATCH 29/55] Try escaping hyphens again --- spacy/lang/char_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index cb2e817d5..4a5ee6d67 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -215,7 +215,7 @@ _punct = ( r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! 
, 、 ; : ~ · । ، ؛ ٪" ) _quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' -_hyphens = "- – — -- --- —— ~" +_hyphens = r"\- – — \-\- \-\-\- —— ~" # Various symbols like dingbats, but also emoji # Details: https://www.compart.com/en/unicode/category/So From d59f8cff29fcee2f185f389cdb88699ad89c6147 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 03:18:11 +0100 Subject: [PATCH 30/55] Re-add missing travis.sh for now --- travis.sh | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 travis.sh diff --git a/travis.sh b/travis.sh new file mode 100644 index 000000000..b16dce8f3 --- /dev/null +++ b/travis.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +if [ "${VIA}" == "pypi" ]; then + rm -rf * + pip install spacy-nightly + python -m spacy download en +fi + +if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then + rm -rf * + pip uninstall spacy + wget https://api.explosion.ai/build/spacy/sdist/$TRAVIS_COMMIT + mv $TRAVIS_COMMIT sdist.tgz + pip install -U sdist.tgz +fi + + +if [ "${VIA}" == "compile" ]; then + pip install -r requirements.txt + python setup.py build_ext --inplace + pip install -e . +fi From b8db2198500cab7c334a5f72801432b86fc2ba9d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 12:40:58 +0100 Subject: [PATCH 31/55] Auto-format --- spacy/lang/fr/punctuation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py index 298e41906..1422b4194 100644 --- a/spacy/lang/fr/punctuation.py +++ b/spacy/lang/fr/punctuation.py @@ -21,7 +21,9 @@ _suffixes = ( r"(?<=[0-9])%", # 4% -> ["4", "%"] r"(?<=[0-9])(?:{c})".format(c=CURRENCY), r"(?<=[0-9])(?:{u})".format(u=UNITS), - r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES), + r"(?<=[0-9{al}{e}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES + ), r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), ] ) From bbabb6aaaeb332ec12b9a436d88d38e72a283009 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 12:41:05 +0100 Subject: [PATCH 32/55] Escape more hyphens --- spacy/lang/fr/punctuation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py index 1422b4194..eda4c1593 100644 --- a/spacy/lang/fr/punctuation.py +++ b/spacy/lang/fr/punctuation.py @@ -7,7 +7,7 @@ from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") -HYPHENS = r"- – — ‐ ‑".strip().replace(" ", "").replace("\n", "") +HYPHENS = r"\- – — ‐ ‑".strip().replace(" ", "").replace("\n", "") _suffixes = ( From 610fb306bd0afb27be3f23c2eb7923e107eac805 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 12:51:53 +0100 Subject: [PATCH 33/55] Revert hyphens --- spacy/lang/char_classes.py | 2 +- spacy/lang/fr/punctuation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 4a5ee6d67..cb2e817d5 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -215,7 +215,7 @@ _punct = ( r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! 
, 、 ; : ~ · । ، ؛ ٪" ) _quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' -_hyphens = r"\- – — \-\- \-\-\- —— ~" +_hyphens = "- – — -- --- —— ~" # Various symbols like dingbats, but also emoji # Details: https://www.compart.com/en/unicode/category/So diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py index eda4c1593..1422b4194 100644 --- a/spacy/lang/fr/punctuation.py +++ b/spacy/lang/fr/punctuation.py @@ -7,7 +7,7 @@ from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") -HYPHENS = r"\- – — ‐ ‑".strip().replace(" ", "").replace("\n", "") +HYPHENS = r"- – — ‐ ‑".strip().replace(" ", "").replace("\n", "") _suffixes = ( From b11ca720f87e496bb6599280f4e8aa0e656a6413 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 12:56:39 +0100 Subject: [PATCH 34/55] Update azure-pipelines.yml --- azure-pipelines.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3deca85ce..47bbf8367 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -5,6 +5,10 @@ trigger: - '*' exclude: - 'spacy.io' + paths: + exclude: + - 'website/*' + - '*.md' jobs: @@ -72,6 +76,7 @@ jobs: # Downgrading pip is necessary to prevent a wheel version incompatiblity. # Might be fixed in the future or some other way, so investigate again. - script: | + python -c "import sys; print('UNICODE:', sys.maxunicode)" python -m pip install --upgrade pip==18.1 pip install -r requirements.txt displayName: 'Install dependencies' From b65e2f554fc0fc6b83641affd46a13aa71c4138a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 13:06:18 +0100 Subject: [PATCH 35/55] Update CI Only keep Travis for Python 2.7 for now until we've sorted out unicode build in Azure Pipelines --- .appveyor.yml | 21 --------------------- .travis.yml | 13 +++---------- README.md | 3 +-- azure-pipelines.yml | 16 +++++++++------- travis.sh | 22 ---------------------- 5 files changed, 13 insertions(+), 62 deletions(-) delete mode 100644 .appveyor.yml delete mode 100644 travis.sh diff --git a/.appveyor.yml b/.appveyor.yml deleted file mode 100644 index daf4c2457..000000000 --- a/.appveyor.yml +++ /dev/null @@ -1,21 +0,0 @@ -environment: - matrix: - - PYTHON: "C:\\Python35-x64" - - PYTHON: "C:\\Python36-x64" - - PYTHON: "C:\\Python37-x64" -install: - # We need wheel installed to build wheels - - "%PYTHON%\\python.exe -m pip install wheel" - - "%PYTHON%\\python.exe -m pip install cython" - - "%PYTHON%\\python.exe -m pip install -r requirements.txt" - - "%PYTHON%\\python.exe -m pip install -e ." -build: off -test_script: - - "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs" -after_test: - - "%PYTHON%\\python.exe setup.py bdist_wheel" -artifacts: - - path: dist\* -branches: - except: - - spacy.io diff --git a/.travis.yml b/.travis.yml index dc6cfb4d0..c517744cb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,23 +5,16 @@ dist: trusty group: edge python: - "2.7" - - "3.5" - - "3.6" os: - linux -env: - - VIA=compile - - VIA=flake8 install: - - "./travis.sh" - - pip install flake8 + - "pip install -r requirements.txt" + - "python setup.py build_ext --inplace" + - "pip install -e ." script: - "cat /proc/cpuinfo | grep flags | head -n 1" - "pip install pytest pytest-timeout" - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi - - if [[ "${VIA}" == "flake8" ]]; then flake8 . 
--count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics; fi - - if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi - - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi branches: except: - spacy.io diff --git a/README.md b/README.md index a4670f6ec..1ceb36dd1 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,7 @@ released under the MIT license. 💫 **Version 2.1 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) -[![Travis Build Status](https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis)](https://travis-ci.org/explosion/spaCy) -[![Appveyor Build Status](https://img.shields.io/appveyor/ci/explosion/spaCy/master.svg?style=flat-square&logo=appveyor)](https://ci.appveyor.com/project/explosion/spaCy) +[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-devops&style=flat-square)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) [![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square)](https://github.com/explosion/spaCy/releases) [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square)](https://pypi.python.org/pypi/spacy) [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square)](https://anaconda.org/conda-forge/spacy) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 47bbf8367..c5fa563be 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -30,12 +30,15 @@ jobs: dependsOn: 'Validate' strategy: matrix: - Python27Linux: - imageName: 'ubuntu-16.04' - python.version: '2.7' - Python27Mac: - imageName: 'macos-10.13' - python.version: '2.7' + # Python 2.7 currently doesn't work because it seems to be a narrow + # unicode build, which causes problems with the regular expressions + + # Python27Linux: + # imageName: 'ubuntu-16.04' + # python.version: '2.7' + # Python27Mac: + # imageName: 'macos-10.13' + # python.version: '2.7' Python35Linux: imageName: 'ubuntu-16.04' python.version: '3.5' @@ -76,7 +79,6 @@ jobs: # Downgrading pip is necessary to prevent a wheel version incompatiblity. # Might be fixed in the future or some other way, so investigate again. - script: | - python -c "import sys; print('UNICODE:', sys.maxunicode)" python -m pip install --upgrade pip==18.1 pip install -r requirements.txt displayName: 'Install dependencies' diff --git a/travis.sh b/travis.sh deleted file mode 100644 index b16dce8f3..000000000 --- a/travis.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -if [ "${VIA}" == "pypi" ]; then - rm -rf * - pip install spacy-nightly - python -m spacy download en -fi - -if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then - rm -rf * - pip uninstall spacy - wget https://api.explosion.ai/build/spacy/sdist/$TRAVIS_COMMIT - mv $TRAVIS_COMMIT sdist.tgz - pip install -U sdist.tgz -fi - - -if [ "${VIA}" == "compile" ]; then - pip install -r requirements.txt - python setup.py build_ext --inplace - pip install -e . 
-fi From d12af4c92ce6dcd00decf36e301ad9bdb83ddeb6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 13:07:35 +0100 Subject: [PATCH 36/55] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1ceb36dd1..218b2696d 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ released under the MIT license. 💫 **Version 2.1 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-devops&style=flat-square)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) +[![Travis Build Status](https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis)](https://travis-ci.org/explosion/spaCy) [![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square)](https://github.com/explosion/spaCy/releases) [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square)](https://pypi.python.org/pypi/spacy) [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square)](https://anaconda.org/conda-forge/spacy) From 16fa4d6b907330a4b63802e2a889fb16b45b0d3f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 9 Mar 2019 14:36:52 +0100 Subject: [PATCH 37/55] Update .travis.yml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index c517744cb..957112e92 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ install: script: - "cat /proc/cpuinfo | grep flags | head -n 1" - "pip install pytest pytest-timeout" - - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi + - "python -m pytest --tb=native spacy" branches: except: - spacy.io From 28c26e212dcd17773209596bedb8d928b2583d84 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 9 Mar 2019 17:50:08 +0000 Subject: [PATCH 38/55] Fix textcat model for GPU --- spacy/_ml.py | 49 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index fdacc1eb8..e2a5acf13 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -84,16 +84,52 @@ def _zero_init(model): @layerize def _preprocess_doc(docs, drop=0.0): keys = [doc.to_array(LOWER) for doc in docs] - ops = Model.ops # The dtype here matches what thinc is expecting -- which differs per # platform (by int definition). This should be fixed once the problem # is fixed on Thinc's side. 
- lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_) - keys = ops.xp.concatenate(keys) - vals = ops.allocate(keys.shape) + 1.0 + lengths = numpy.array([arr.shape[0] for arr in keys], dtype=numpy.int_) + keys = numpy.concatenate(keys) + vals = numpy.zeros(keys.shape, dtype='f') return (keys, vals, lengths), None +def with_cpu(ops, model): + model.to_cpu() + def with_cpu_forward(inputs, drop=0.): + cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop) + gpu_outputs = _to_device(ops, cpu_outputs) + + def with_cpu_backprop(d_outputs, sgd=None): + cpu_d_outputs = _to_cpu(d_outputs) + return backprop(cpu_d_outputs, sgd=sgd) + + return gpu_outputs, with_cpu_backprop + + return wrap(with_cpu_forward, model) + + +def _to_cpu(X): + if isinstance(X, numpy.ndarray): + return X + elif isinstance(X, tuple): + return tuple([_to_cpu(x) for x in X]) + elif isinstance(X, list): + return [_to_cpu(x) for x in X] + elif hasattr(X, 'get'): + return X.get() + else: + return X + + +def _to_device(ops, X): + if isinstance(X, tuple): + return tuple([_to_device(ops, x) for x in X]) + elif isinstance(X, list): + return [_to_device(ops, x) for x in X] + else: + return ops.asarray(X) + + @layerize def _preprocess_doc_bigrams(docs, drop=0.0): unigrams = [doc.to_array(LOWER) for doc in docs] @@ -563,7 +599,10 @@ def build_text_classifier(nr_class, width=64, **cfg): >> zero_init(Affine(nr_class, width, drop_factor=0.0)) ) - linear_model = _preprocess_doc >> LinearModel(nr_class) + linear_model = ( + _preprocess_doc + >> with_cpu(Model.ops, LinearModel(nr_class)) + ) if cfg.get('exclusive_classes'): output_layer = Softmax(nr_class, nr_class * 2) else: From ce1fe8a5104fd48a9b27a3d5102aa686c82773d0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 9 Mar 2019 17:51:17 +0000 Subject: [PATCH 39/55] Add comment --- spacy/_ml.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/_ml.py b/spacy/_ml.py index e2a5acf13..168519bfe 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -94,6 +94,8 @@ def _preprocess_doc(docs, drop=0.0): def with_cpu(ops, model): + """Wrap a model that should run on CPU, transferring inputs and outputs + as necessary.""" model.to_cpu() def with_cpu_forward(inputs, drop=0.): cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop) From bdc77848f5eb9ada0f900ec07b471b6a6b72e30a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 10 Mar 2019 13:00:00 +0100 Subject: [PATCH 40/55] Add helper method to apply a transition in parser/NER --- spacy/syntax/transition_system.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 5ec254e04..4cc00828e 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -94,6 +94,13 @@ cdef class TransitionSystem: raise ValueError(Errors.E024) return history + def apply_transition(self, StateClass state, name): + if not self.is_valid(state, name): + raise ValueError( + "Cannot apply transition {name}: invalid for the current state.".format(name=name)) + action = self.lookup_transition(name) + action.do(state.c, action.label) + cdef int initialize_state(self, StateC* state) nogil: pass From 231bc7bb7bd7e373d4b4c9a3e33d6539d0637828 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 10 Mar 2019 13:00:15 +0100 Subject: [PATCH 41/55] Add xfailing test for #3345 --- spacy/tests/regression/test_issue3345.py | 27 ++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 
spacy/tests/regression/test_issue3345.py diff --git a/spacy/tests/regression/test_issue3345.py b/spacy/tests/regression/test_issue3345.py new file mode 100644 index 000000000..7b1f41fbf --- /dev/null +++ b/spacy/tests/regression/test_issue3345.py @@ -0,0 +1,27 @@ +"""Test interaction between preset entities and sentence boundaries in NER.""" +import spacy +from spacy.tokens import Doc +from spacy.pipeline import EntityRuler, EntityRecognizer + + +@pytest.mark.xfail +def test_issue3345(): + """Test case where preset entity crosses sentence boundary.""" + nlp = spacy.blank("en") + doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) + doc[4].is_sent_start = True + + ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) + ner = EntityRecognizer(doc.vocab) + # Add the OUT action. I wouldn't have thought this would be necessary... + ner.moves.add_action(5, "") + ner.add_label("GPE") + + doc = ruler(doc) + # Get into the state just before "New" + state = ner.moves.init_batch([doc])[0] + ner.moves.apply_transition(state, "O") + ner.moves.apply_transition(state, "O") + ner.moves.apply_transition(state, "O") + # Check that B-GPE is valid. + assert ner.moves.is_valid(state, "B-GPE") From 3fe5811fa7ec0039bc7898b94bf02b6bd143842e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 10 Mar 2019 13:02:24 +0100 Subject: [PATCH 42/55] Only link model after download if shortcut link (#3378) --- spacy/cli/download.py | 44 +++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index a2ec15fa4..66a47823c 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -41,24 +41,32 @@ def download(model, direct=False, *pip_args): dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args) if dl != 0: # if download subprocess doesn't return 0, exit sys.exit(dl) - try: - # Get package path here because link uses - # pip.get_installed_distributions() to check if model is a - # package, which fails if model was just installed via - # subprocess - package_path = get_package_path(model_name) - link(model_name, model, force=True, model_path=package_path) - except: # noqa: E722 - # Dirty, but since spacy.download and the auto-linking is - # mostly a convenience wrapper, it's best to show a success - # message and loading instructions, even if linking fails. - msg.warn( - "Download successful but linking failed", - "Creating a shortcut link for 'en' didn't work (maybe you " - "don't have admin permissions?), but you can still load the " - "model via its full package name: " - "nlp = spacy.load('{}')".format(model_name), - ) + msg.good( + "Download and installation successful", + "You can now load the model via spacy.load('{}')".format(model_name), + ) + # Only create symlink if the model is installed via a shortcut like 'en'. + # There's no real advantage over an additional symlink for en_core_web_sm + # and if anything, it's more error prone and causes more confusion. 
+ if model in shortcuts: + try: + # Get package path here because link uses + # pip.get_installed_distributions() to check if model is a + # package, which fails if model was just installed via + # subprocess + package_path = get_package_path(model_name) + link(model_name, model, force=True, model_path=package_path) + except: # noqa: E722 + # Dirty, but since spacy.download and the auto-linking is + # mostly a convenience wrapper, it's best to show a success + # message and loading instructions, even if linking fails. + msg.warn( + "Download successful but linking failed", + "Creating a shortcut link for '{}' didn't work (maybe you " + "don't have admin permissions?), but you can still load " + "the model via its full package name: " + "nlp = spacy.load('{}')".format(model, model_name), + ) def get_json(url, desc): From a5b1f6dcecf3180fe84b4903f63f13a2fc0fd7f1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 10 Mar 2019 14:53:03 +0100 Subject: [PATCH 43/55] Fix NER when preset entities cross sentence boundaries (#3379) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 💫 Fix NER when preset entities cross sentence boundaries --- spacy/syntax/_parser_model.pyx | 4 + spacy/syntax/_state.pxd | 6 ++ spacy/syntax/ner.pyx | 96 +++++++++++++++++------- spacy/syntax/nn_parser.pyx | 11 ++- spacy/tests/regression/test_issue3345.py | 9 ++- 5 files changed, 94 insertions(+), 32 deletions(-) diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 30d4b67d3..f664e6a2c 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -157,6 +157,10 @@ cdef void cpu_log_loss(float* d_scores, cdef double max_, gmax, Z, gZ best = arg_max_if_gold(scores, costs, is_valid, O) guess = arg_max_if_valid(scores, is_valid, O) + if best == -1 or guess == -1: + # These shouldn't happen, but if they do, we want to make sure we don't + # cause an OOB access. + return Z = 1e-10 gZ = 1e-10 max_ = scores[guess] diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index d082cee5c..204f723d8 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -323,6 +323,12 @@ cdef cppclass StateC: if this._s_i >= 1: this._s_i -= 1 + void force_final() nogil: + # This should only be used in desperate situations, as it may leave + # the analysis in an unexpected state. + this._s_i = 0 + this._b_i = this.length + void unshift() nogil: this._b_i -= 1 this._buffer[this._b_i] = this.S(0) diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index b43a879d4..804167b0e 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -257,30 +257,42 @@ cdef class Missing: cdef class Begin: @staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: - # Ensure we don't clobber preset entities. If no entity preset, - # ent_iob is 0 cdef int preset_ent_iob = st.B_(0).ent_iob - if preset_ent_iob == 1: + cdef int preset_ent_label = st.B_(0).ent_type + # If we're the last token of the input, we can't B -- must U or O. + if st.B(1) == -1: return False - elif preset_ent_iob == 2: + elif st.entity_is_open(): return False - elif preset_ent_iob == 3 and st.B_(0).ent_type != label: + elif label == 0: return False - # If the next word is B or O, we can't B now + elif preset_ent_iob == 1 or preset_ent_iob == 2: + # Ensure we don't clobber preset entities. If no entity preset, + # ent_iob is 0 + return False + elif preset_ent_iob == 3: + # Okay, we're in a preset entity. 
+ if label != preset_ent_label: + # If label isn't right, reject + return False + elif st.B_(1).ent_iob != 1: + # If next token isn't marked I, we need to make U, not B. + return False + else: + # Otherwise, force acceptance, even if we're across a sentence + # boundary or the token is whitespace. + return True elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3: + # If the next word is B or O, we can't B now return False - # If the current word is B, and the next word isn't I, the current word - # is really U - elif preset_ent_iob == 3 and st.B_(1).ent_iob != 1: - return False - # Don't allow entities to extend across sentence boundaries elif st.B_(1).sent_start == 1: + # Don't allow entities to extend across sentence boundaries return False # Don't allow entities to start on whitespace elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE): return False else: - return label != 0 and not st.entity_is_open() + return True @staticmethod cdef int transition(StateC* st, attr_t label) nogil: @@ -314,18 +326,27 @@ cdef class In: @staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef int preset_ent_iob = st.B_(0).ent_iob - if preset_ent_iob == 2: + if label == 0: + return False + elif st.E_(0).ent_type != label: + return False + elif not st.entity_is_open(): + return False + elif st.B(1) == -1: + # If we're at the end, we can't I. + return False + elif preset_ent_iob == 2: return False elif preset_ent_iob == 3: return False - # TODO: Is this quite right? I think it's supposed to be ensuring the - # gazetteer matches are maintained - elif st.B(1) != -1 and st.B_(1).ent_iob != preset_ent_iob: + elif st.B_(1).ent_iob == 2 or st.B_(1).ent_iob == 3: + # If we know the next word is B or O, we can't be I (must be L) return False - # Don't allow entities to extend across sentence boundaries elif st.B(1) != -1 and st.B_(1).sent_start == 1: + # Don't allow entities to extend across sentence boundaries return False - return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label + else: + return True @staticmethod cdef int transition(StateC* st, attr_t label) nogil: @@ -370,9 +391,17 @@ cdef class In: cdef class Last: @staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: - if st.B_(1).ent_iob == 1: + if label == 0: return False - return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label + elif not st.entity_is_open(): + return False + elif st.E_(0).ent_type != label: + return False + elif st.B_(1).ent_iob == 1: + # If a preset entity has I next, we can't L here. + return False + else: + return True @staticmethod cdef int transition(StateC* st, attr_t label) nogil: @@ -416,17 +445,29 @@ cdef class Unit: @staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef int preset_ent_iob = st.B_(0).ent_iob - if preset_ent_iob == 2: + cdef attr_t preset_ent_label = st.B_(0).ent_type + if label == 0: return False - elif preset_ent_iob == 1: + elif st.entity_is_open(): return False - elif preset_ent_iob == 3 and st.B_(0).ent_type != label: + elif preset_ent_iob == 2: + # Don't clobber preset O return False elif st.B_(1).ent_iob == 1: + # If next token is In, we can't be Unit -- must be Begin return False + elif preset_ent_iob == 3: + # Okay, there's a preset entity here + if label != preset_ent_label: + # Require labels to match + return False + else: + # Otherwise return True, ignoring the whitespace constraint. 
+ return True elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE): return False - return label != 0 and not st.entity_is_open() + else: + return True @staticmethod cdef int transition(StateC* st, attr_t label) nogil: @@ -461,11 +502,14 @@ cdef class Out: @staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef int preset_ent_iob = st.B_(0).ent_iob - if preset_ent_iob == 3: + if st.entity_is_open(): + return False + elif preset_ent_iob == 3: return False elif preset_ent_iob == 1: return False - return not st.entity_is_open() + else: + return True @staticmethod cdef int transition(StateC* st, attr_t label) nogil: diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index ee9d0ee7e..0009eba72 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -363,9 +363,14 @@ cdef class Parser: for i in range(batch_size): self.moves.set_valid(is_valid, states[i]) guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class) - action = self.moves.c[guess] - action.do(states[i], action.label) - states[i].push_hist(guess) + if guess == -1: + # This shouldn't happen, but it's hard to raise an error here, + # and we don't want to infinite loop. So, force to end state. + states[i].force_final() + else: + action = self.moves.c[guess] + action.do(states[i], action.label) + states[i].push_hist(guess) free(is_valid) def transition_beams(self, beams, float[:, ::1] scores): diff --git a/spacy/tests/regression/test_issue3345.py b/spacy/tests/regression/test_issue3345.py index 7b1f41fbf..8a7823d96 100644 --- a/spacy/tests/regression/test_issue3345.py +++ b/spacy/tests/regression/test_issue3345.py @@ -1,5 +1,8 @@ -"""Test interaction between preset entities and sentence boundaries in NER.""" -import spacy +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.en import English from spacy.tokens import Doc from spacy.pipeline import EntityRuler, EntityRecognizer @@ -7,7 +10,7 @@ from spacy.pipeline import EntityRuler, EntityRecognizer @pytest.mark.xfail def test_issue3345(): """Test case where preset entity crosses sentence boundary.""" - nlp = spacy.blank("en") + nlp = English() doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc[4].is_sent_start = True From 6bbf4ea309263913229686b42017cde83440e105 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 10 Mar 2019 15:05:56 +0100 Subject: [PATCH 44/55] Simplify tests and avoid tokenizing --- spacy/tests/doc/test_array.py | 44 ++++++++++++++++------------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index c18fce966..8a8ff2296 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -1,46 +1,43 @@ # coding: utf-8 from __future__ import unicode_literals +from spacy.tokens import Doc from spacy.attrs import ORTH, SHAPE, POS, DEP from ..util import get_doc -def test_doc_array_attr_of_token(en_tokenizer, en_vocab): - text = "An example sentence" - tokens = en_tokenizer(text) - example = tokens.vocab["example"] +def test_doc_array_attr_of_token(en_vocab): + doc = Doc(en_vocab, words=["An", "example", "sentence"]) + example = doc.vocab["example"] assert example.orth != example.shape - feats_array = tokens.to_array((ORTH, SHAPE)) + feats_array = doc.to_array((ORTH, SHAPE)) assert feats_array[0][0] != feats_array[0][1] assert feats_array[0][0] != feats_array[0][1] -def test_doc_stringy_array_attr_of_token(en_tokenizer, en_vocab): - text = "An example sentence" 
- tokens = en_tokenizer(text) - example = tokens.vocab["example"] +def test_doc_stringy_array_attr_of_token(en_vocab): + doc = Doc(en_vocab, words=["An", "example", "sentence"]) + example = doc.vocab["example"] assert example.orth != example.shape - feats_array = tokens.to_array((ORTH, SHAPE)) - feats_array_stringy = tokens.to_array(("ORTH", "SHAPE")) + feats_array = doc.to_array((ORTH, SHAPE)) + feats_array_stringy = doc.to_array(("ORTH", "SHAPE")) assert feats_array_stringy[0][0] == feats_array[0][0] assert feats_array_stringy[0][1] == feats_array[0][1] -def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab): - text = "An example sentence" - tokens = en_tokenizer(text) - example = tokens.vocab["example"] +def test_doc_scalar_attr_of_token(en_vocab): + doc = Doc(en_vocab, words=["An", "example", "sentence"]) + example = doc.vocab["example"] assert example.orth != example.shape - feats_array = tokens.to_array(ORTH) + feats_array = doc.to_array(ORTH) assert feats_array.shape == (3,) -def test_doc_array_tag(en_tokenizer): - text = "A nice sentence." +def test_doc_array_tag(en_vocab): + words = ["A", "nice", "sentence", "."] pos = ["DET", "ADJ", "NOUN", "PUNCT"] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], pos=pos) + doc = get_doc(en_vocab, words=words, pos=pos) assert doc[0].pos != doc[1].pos != doc[2].pos != doc[3].pos feats_array = doc.to_array((ORTH, POS)) assert feats_array[0][1] == doc[0].pos @@ -49,11 +46,10 @@ def test_doc_array_tag(en_tokenizer): assert feats_array[3][1] == doc[3].pos -def test_doc_array_dep(en_tokenizer): - text = "A nice sentence." +def test_doc_array_dep(en_vocab): + words = ["A", "nice", "sentence", "."] deps = ["det", "amod", "ROOT", "punct"] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) + doc = get_doc(en_vocab, words=words, deps=deps) feats_array = doc.to_array((ORTH, DEP)) assert feats_array[0][1] == doc[0].dep assert feats_array[1][1] == doc[1].dep From 798454395316e54053b7eb0277dab471ad97f489 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 10 Mar 2019 15:08:15 +0100 Subject: [PATCH 45/55] Add xfailing test for to_array/from_array string attrs --- spacy/tests/doc/test_array.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index 8a8ff2296..3cc5746b4 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import pytest from spacy.tokens import Doc from spacy.attrs import ORTH, SHAPE, POS, DEP @@ -55,3 +56,14 @@ def test_doc_array_dep(en_vocab): assert feats_array[1][1] == doc[1].dep assert feats_array[2][1] == doc[2].dep assert feats_array[3][1] == doc[3].dep + + +@pytest.mark.xfail +@pytest.mark.parametrize("attrs", [["ORTH", "SHAPE"], "IS_ALPHA"]) +def test_doc_array_to_from_string_attrs(en_vocab, attrs): + """Test that both Doc.to_array and Doc.from_array accept string attrs, + as well as single attrs and sequences of attrs. 
+ """ + words = ["An", "example", "sentence"] + doc = Doc(en_vocab, words=words) + Doc(en_vocab, words=words).from_array(attrs, doc.to_array(attrs)) From 0426689db872ff6be74bd8250bef2197b07c8b2b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 10 Mar 2019 15:24:34 +0100 Subject: [PATCH 46/55] =?UTF-8?q?=F0=9F=92=AB=20Improve=20Doc.to=5Fjson=20?= =?UTF-8?q?and=20add=20Doc.is=5Fnered=20(#3381)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Use default return instead of else * Add Doc.is_nered to indicate if entities have been set * Add properties in Doc.to_json if they were set, not if they're available This way, if a processed Doc exports "pos": None, it means that the tag was explicitly unset. If it exports "ents": [], it means that entity annotations are available but that this document doesn't contain any entities. Before, this would have been unclear and problematic for training. --- spacy/tests/doc/test_doc_api.py | 18 +++++++++++++++- spacy/tokens/doc.pyx | 26 +++++++++++++++-------- website/docs/api/doc.md | 37 +++++++++++++++++---------------- 3 files changed, 53 insertions(+), 28 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 1c3c948c3..8eed2c267 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -4,9 +4,10 @@ from __future__ import unicode_literals import pytest import numpy -from spacy.tokens import Doc +from spacy.tokens import Doc, Span from spacy.vocab import Vocab from spacy.errors import ModelsWarning +from spacy.attrs import ENT_TYPE, ENT_IOB from ..util import get_doc @@ -256,3 +257,18 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix): assert lca[1, 1] == 1 assert lca[0, 1] == 2 assert lca[1, 2] == 2 + + +def test_doc_is_nered(en_vocab): + words = ["I", "live", "in", "New", "York"] + doc = Doc(en_vocab, words=words) + assert not doc.is_nered + doc.ents = [Span(doc, 3, 5, label="GPE")] + assert doc.is_nered + # Test creating doc from array with unknown values + arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64") + doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr) + assert doc.is_nered + # Test serialization + new_doc = Doc(en_vocab).from_bytes(doc.to_bytes()) + assert new_doc.is_nered diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 1dfcd1687..ff38d825f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -240,8 +240,18 @@ cdef class Doc: for i in range(1, self.length): if self.c[i].sent_start == -1 or self.c[i].sent_start == 1: return True - else: - return False + return False + + @property + def is_nered(self): + """Check if the document has named entities set. Will return True if + *any* of the tokens has a named entity tag set (even if the others are + uknown values). + """ + for i in range(self.length): + if self.c[i].ent_iob != 0: + return True + return False def __getitem__(self, object i): """Get a `Token` or `Span` object. 
@@ -990,11 +1000,11 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#to_json """ data = {"text": self.text} - if self.ents: + if self.is_nered: data["ents"] = [{"start": ent.start_char, "end": ent.end_char, "label": ent.label_} for ent in self.ents] - sents = list(self.sents) - if sents: + if self.is_sentenced: + sents = list(self.sents) data["sents"] = [{"start": sent.start_char, "end": sent.end_char} for sent in sents] if self.cats: @@ -1002,13 +1012,11 @@ cdef class Doc: data["tokens"] = [] for token in self: token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)} - if token.pos_: + if self.is_tagged: token_data["pos"] = token.pos_ - if token.tag_: token_data["tag"] = token.tag_ - if token.dep_: + if self.is_parsed: token_data["dep"] = token.dep_ - if token.head: token_data["head"] = token.head.i data["tokens"].append(token_data) if underscore: diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 267d8f711..e53619cff 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -237,7 +237,7 @@ attribute ID. > from spacy.attrs import ORTH > doc = nlp(u"apple apple orange banana") > assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2} -> doc.to_array([attrs.ORTH]) +> doc.to_array([ORTH]) > # array([[11880], [11880], [7561], [12800]]) > ``` @@ -640,20 +640,21 @@ The L2 norm of the document's vector representation. ## Attributes {#attributes} -| Name | Type | Description | -| ----------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `text` | unicode | A unicode representation of the document text. | -| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | -| `mem` | `Pool` | The document's local memory heap, for all C data it owns. | -| `vocab` | `Vocab` | The store of lexical types. | -| `tensor` 2 | object | Container for dense vector representations. | -| `cats` 2 | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. | -| `user_data` | - | A generic storage area, for user custom data. | -| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. | -| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. | -| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. | -| `sentiment` | float | The document's positivity/negativity score, if available. | -| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. | -| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. | -| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. | -| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). 
| +| Name | Type | Description | +| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `text` | unicode | A unicode representation of the document text. | +| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | +| `mem` | `Pool` | The document's local memory heap, for all C data it owns. | +| `vocab` | `Vocab` | The store of lexical types. | +| `tensor` 2 | object | Container for dense vector representations. | +| `cats` 2 | dictionary | Maps either a label to a score for categories applied to whole document, or `(start_char, end_char, label)` to score for categories applied to spans. `start_char` and `end_char` should be character offsets, label can be either a string or an integer ID, and score should be a float. | +| `user_data` | - | A generic storage area, for user custom data. | +| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. | +| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. | +| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. | +| `is_nered` 2.1 | bool | A flag indicating that named entities have been set. Will return `True` if _any_ of the tokens has an entity tag set, even if the others are unknown. | +| `sentiment` | float | The document's positivity/negativity score, if available. | +| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. | +| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. | +| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. | +| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | From 4e80fc41ad16a4c9b1cb08168ff393cb80c5ad4c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 10 Mar 2019 15:50:48 +0100 Subject: [PATCH 47/55] Make doc.from_array() consistent with doc.to_array(). Closes #3382 --- spacy/tokens/doc.pyx | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ff38d825f..fce2a6e7c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -737,6 +737,18 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#from_array """ + # Handle scalar/list inputs of strings/ints for py_attr_ids + # See also #3064 + if isinstance(attrs, basestring_): + # Handle inputs like doc.to_array('ORTH') + attrs = [attrs] + elif not hasattr(attrs, "__iter__"): + # Handle inputs like doc.to_array(ORTH) + attrs = [attrs] + # Allow strings, e.g. 
'lemma' or 'LEMMA' + attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) + for id_ in attrs] + if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) cdef int i, col From 8a6272f84246588e6d1caabf0c1f2a83ca5ee908 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 10 Mar 2019 15:51:15 +0100 Subject: [PATCH 48/55] Un-xfail test --- spacy/tests/doc/test_array.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index 3cc5746b4..7b513cfab 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -58,7 +58,6 @@ def test_doc_array_dep(en_vocab): assert feats_array[3][1] == doc[3].dep -@pytest.mark.xfail @pytest.mark.parametrize("attrs", [["ORTH", "SHAPE"], "IS_ALPHA"]) def test_doc_array_to_from_string_attrs(en_vocab, attrs): """Test that both Doc.to_array and Doc.from_array accept string attrs, From 7461e5e055bafab323faa6d7f0160162b655b523 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 10 Mar 2019 16:01:34 +0100 Subject: [PATCH 49/55] Fix batch bug in issue #3344 --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 0009eba72..4e3141a41 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -221,7 +221,7 @@ cdef class Parser: for batch in util.minibatch(docs, size=batch_size): batch_in_order = list(batch) by_length = sorted(batch_in_order, key=lambda doc: len(doc)) - for subbatch in util.minibatch(by_length, size=batch_size//4): + for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)): subbatch = list(subbatch) parse_states = self.predict(subbatch, beam_width=beam_width, beam_density=beam_density) From 61e5ce02a47c159bb89c363f2143ccec9b43abd7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 10 Mar 2019 16:36:29 +0100 Subject: [PATCH 50/55] Add xfailing test for #2153 --- .../tests/serialize/test_serialize_vocab_strings.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index fc51ea930..88b469c0a 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -12,13 +12,12 @@ test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])] test_strings_attrs = [(["rats", "are", "cute"], "Hello")] -@pytest.mark.xfail @pytest.mark.parametrize("text", ["rat"]) def test_serialize_vocab(en_vocab, text): text_hash = en_vocab.strings.add(text) vocab_bytes = en_vocab.to_bytes() new_vocab = Vocab().from_bytes(vocab_bytes) - assert new_vocab.strings(text_hash) == text + assert new_vocab.strings[text_hash] == text @pytest.mark.parametrize("strings1,strings2", test_strings) @@ -69,6 +68,16 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr): assert vocab2[strings[0]].norm_ == lex_attr +@pytest.mark.xfail +@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) +def test_deserialize_vocab_seen_entries(strings, lex_attr): + # Reported in #2153 + vocab = Vocab(strings=strings) + length = len(vocab) + vocab.from_bytes(vocab.to_bytes()) + assert len(vocab) == length + + @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) def test_serialize_vocab_lex_attrs_disk(strings, lex_attr): vocab1 = Vocab(strings=strings) From d6eaa71afc90b2771aeefde1ab0f9a6ee37f807b Mon Sep 17 00:00:00 2001 From: 
Matthew Honnibal Date: Sun, 10 Mar 2019 16:54:03 +0100 Subject: [PATCH 51/55] Handle scalar values in doc.from_array() --- spacy/tokens/doc.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index fce2a6e7c..36c747396 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -747,7 +747,7 @@ cdef class Doc: attrs = [attrs] # Allow strings, e.g. 'lemma' or 'LEMMA' attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) - for id_ in attrs] + for id_ in attrs] if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) @@ -761,6 +761,8 @@ cdef class Doc: attr_ids = mem.alloc(n_attrs, sizeof(attr_id_t)) for i, attr_id in enumerate(attrs): attr_ids[i] = attr_id + if len(array.shape) == 1: + array = array.reshape((array.size, 1)) # Now load the data for i in range(self.length): token = &self.c[i] From 27dd820753e3c7130a75431b01813b8b18003ba2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 10 Mar 2019 17:21:19 +0100 Subject: [PATCH 52/55] Fix vocab deserialization when loading already present lexemes (#3383) * Fix vocab deserialization bug. Closes #2153 * Un-xfail test for #2153 --- spacy/tests/serialize/test_serialize_vocab_strings.py | 1 - spacy/vocab.pyx | 6 +++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 88b469c0a..378dcb245 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -68,7 +68,6 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr): assert vocab2[strings[0]].norm_ == lex_attr -@pytest.mark.xfail @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) def test_deserialize_vocab_seen_entries(strings, lex_attr): # Reported in #2153 diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 44a69351f..8895e3a7c 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -1,6 +1,7 @@ # coding: utf8 # cython: profile=True from __future__ import unicode_literals +from libc.string cimport memcpy import numpy import srsly @@ -518,7 +519,10 @@ cdef class Vocab: for j in range(sizeof(lex_data.data)): lex_data.data[j] = bytes_ptr[i+j] Lexeme.c_from_bytes(lexeme, lex_data) - + prev_entry = self._by_orth.get(lexeme.orth) + if prev_entry != NULL: + memcpy(prev_entry, lexeme, sizeof(LexemeC)) + continue ptr = self.strings._map.get(lexeme.orth) if ptr == NULL: continue From 67e38690d4c688de9779e50819d91e733884da6b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 10 Mar 2019 18:42:16 +0100 Subject: [PATCH 53/55] Un-xfail passing tests and tidy up --- spacy/tests/doc/test_retokenize_merge.py | 1 - spacy/tests/regression/test_issue3345.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index 4d4a70e30..b62e69f6c 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -69,7 +69,6 @@ def test_doc_retokenize_retokenizer_attrs(en_tokenizer): assert doc[4].ent_type_ == "ORG" -@pytest.mark.xfail def test_doc_retokenize_lex_attrs(en_tokenizer): """Test that lexical attributes can be changed (see #2390).""" doc = en_tokenizer("WKRO played beach boys songs") diff --git a/spacy/tests/regression/test_issue3345.py b/spacy/tests/regression/test_issue3345.py index 8a7823d96..c358fd7bc 100644 --- a/spacy/tests/regression/test_issue3345.py +++ 
b/spacy/tests/regression/test_issue3345.py @@ -1,25 +1,21 @@ # coding: utf8 from __future__ import unicode_literals -import pytest from spacy.lang.en import English from spacy.tokens import Doc from spacy.pipeline import EntityRuler, EntityRecognizer -@pytest.mark.xfail def test_issue3345(): """Test case where preset entity crosses sentence boundary.""" nlp = English() doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc[4].is_sent_start = True - ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) ner = EntityRecognizer(doc.vocab) # Add the OUT action. I wouldn't have thought this would be necessary... ner.moves.add_action(5, "") ner.add_label("GPE") - doc = ruler(doc) # Get into the state just before "New" state = ner.moves.init_batch([doc])[0] From 9a8f169e5c940fd1d2a50d99fc41d11726b9bf65 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 10 Mar 2019 18:58:51 +0100 Subject: [PATCH 54/55] Update v2-1.md --- website/docs/usage/v2-1.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/website/docs/usage/v2-1.md b/website/docs/usage/v2-1.md index f97d9d283..42adcb657 100644 --- a/website/docs/usage/v2-1.md +++ b/website/docs/usage/v2-1.md @@ -237,6 +237,19 @@ if all of your models are up to date, you can run the + retokenizer.merge(doc[6:8]) ``` +- The serialization methods `to_disk`, `from_disk`, `to_bytes` and `from_bytes` + now support a single `exclude` argument to provide a list of string names to + exclude. The docs have been updated to list the available serialization fields + for each class. The `disable` argument on the [`Language`](/api/language) + serialization methods has been renamed to `exclude` for consistency. + + ```diff + - nlp.to_disk("/path", disable=["parser", "ner"]) + + nlp.to_disk("/path", exclude=["parser", "ner"]) + - data = nlp.tokenizer.to_bytes(vocab=False) + + data = nlp.tokenizer.to_bytes(exclude=["vocab"]) + ``` + - For better compatibility with the Universal Dependencies data, the lemmatizer now preserves capitalization, e.g. for proper nouns. See [this issue](https://github.com/explosion/spaCy/issues/3256) for details. 
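The usage notes added to `v2-1.md` above describe the unified `exclude` argument that the final patch below rolls out across the serialization methods. As a minimal illustrative sketch only (the blank English pipeline, the excluded names, and the `"foo"` user-data key are arbitrary examples chosen here, not part of any patch in this series), the new API is expected to be used roughly like this:

```python
from spacy.lang.en import English
from spacy.tokens import Doc

# Blank English pipeline, used purely for illustration (an assumption,
# not taken from the patches themselves).
nlp = English()
doc = Doc(nlp.vocab, words=["hello", "world"])
doc.user_data["foo"] = "bar"

# Components and serialization fields are excluded by string name.
nlp_bytes = nlp.to_bytes(exclude=["tokenizer"])
tok_bytes = nlp.tokenizer.to_bytes(exclude=["vocab"])

# The same argument works on Doc: here the user data is dropped on export.
new_doc = Doc(nlp.vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
assert not new_doc.user_data
```

The older keyword-argument style (e.g. `to_bytes(vocab=False)`) and the `disable` argument on the `Language` serialization methods are deprecated in favour of this single `exclude` list, as shown in the documentation diff above and implemented in the patch that follows.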
From 7ba3a5d95cbfbb4381dc925da56aa063e4e50235 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 10 Mar 2019 19:16:45 +0100 Subject: [PATCH 55/55] =?UTF-8?q?=F0=9F=92=AB=20Make=20serialization=20met?= =?UTF-8?q?hods=20consistent=20(#3385)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Make serialization methods consistent exclude keyword argument instead of random named keyword arguments and deprecation handling * Update docs and add section on serialization fields --- spacy/errors.py | 11 +- spacy/language.py | 134 ++++++++---------- spacy/pipeline/pipes.pyx | 60 ++++---- spacy/strings.pyx | 6 +- spacy/syntax/nn_parser.pyx | 26 ++-- spacy/syntax/transition_system.pyx | 14 +- spacy/tests/doc/test_doc_api.py | 4 +- spacy/tests/serialize/test_serialize_doc.py | 16 +++ .../serialize/test_serialize_language.py | 16 +++ .../serialize/test_serialize_pipeline.py | 26 +++- spacy/tokenizer.pyx | 25 ++-- spacy/tokens/doc.pyx | 32 +++-- spacy/util.py | 39 +++-- spacy/vectors.pyx | 23 +-- spacy/vocab.pyx | 53 ++++--- website/docs/api/dependencyparser.md | 44 ++++-- website/docs/api/doc.md | 57 ++++++-- website/docs/api/entityrecognizer.md | 44 ++++-- website/docs/api/language.md | 61 +++++--- website/docs/api/stringstore.md | 16 +-- website/docs/api/tagger.md | 45 ++++-- website/docs/api/textcategorizer.md | 44 ++++-- website/docs/api/tokenizer.md | 48 +++++-- website/docs/api/vectors.md | 23 ++- website/docs/api/vocab.md | 45 ++++-- 25 files changed, 598 insertions(+), 314 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index c409e5a0c..f12c73c69 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -70,6 +70,12 @@ class Warnings(object): W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more " "efficient and less error-prone Doc.retokenize context manager " "instead.") + W014 = ("As of v2.1.0, the `disable` keyword argument on the serialization " + "methods is and should be replaced with `exclude`. This makes it " + "consistent with the other objects serializable.") + W015 = ("As of v2.1.0, the use of keyword arguments to exclude fields from " + "being serialized or deserialized is deprecated. Please use the " + "`exclude` argument instead. For example: exclude=['{arg}'].") @add_codes @@ -348,7 +354,10 @@ class Errors(object): "This is likely a bug in spaCy, so feel free to open an issue.") E127 = ("Cannot create phrase pattern representation for length 0. This " "is likely a bug in spaCy.") - + E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword " + "arguments to exclude fields from being serialized or deserialized " + "is now deprecated. Please use the `exclude` argument instead. " + "For example: exclude=['{arg}'].") @add_codes diff --git a/spacy/language.py b/spacy/language.py index 723c49ef7..6fb30e46d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -28,7 +28,7 @@ from .lang.punctuation import TOKENIZER_INFIXES from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .lang.lex_attrs import LEX_ATTRS, is_stop -from .errors import Errors +from .errors import Errors, Warnings, deprecation_warning from . import util from . import about @@ -699,124 +699,114 @@ class Language(object): self.tokenizer._reset_cache(keys) nr_seen = 0 - def to_disk(self, path, disable=tuple()): + def to_disk(self, path, exclude=tuple(), disable=None): """Save the current state to a directory. If a model is loaded, this will include the model. 
- path (unicode or Path): A path to a directory, which will be created if - it doesn't exist. Paths may be strings or `Path`-like objects. - disable (list): Names of pipeline components to disable and prevent - from being saved. + path (unicode or Path): Path to a directory, which will be created if + it doesn't exist. + exclude (list): Names of components or serialization fields to exclude. - EXAMPLE: - >>> nlp.to_disk('/path/to/models') + DOCS: https://spacy.io/api/language#to_disk """ + if disable is not None: + deprecation_warning(Warnings.W014) + exclude = disable path = util.ensure_path(path) - serializers = OrderedDict( - ( - ("tokenizer", lambda p: self.tokenizer.to_disk(p, vocab=False)), - ("meta.json", lambda p: p.open("w").write(srsly.json_dumps(self.meta))), - ) - ) + serializers = OrderedDict() + serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(p, exclude=["vocab"]) + serializers["meta.json"] = lambda p: p.open("w").write(srsly.json_dumps(self.meta)) for name, proc in self.pipeline: if not hasattr(proc, "name"): continue - if name in disable: + if name in exclude: continue if not hasattr(proc, "to_disk"): continue - serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) + serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"]) serializers["vocab"] = lambda p: self.vocab.to_disk(p) - util.to_disk(path, serializers, {p: False for p in disable}) + util.to_disk(path, serializers, exclude) - def from_disk(self, path, disable=tuple()): + def from_disk(self, path, exclude=tuple(), disable=None): """Loads state from a directory. Modifies the object in place and returns it. If the saved `Language` object contains a model, the model will be loaded. - path (unicode or Path): A path to a directory. Paths may be either - strings or `Path`-like objects. - disable (list): Names of the pipeline components to disable. + path (unicode or Path): A path to a directory. + exclude (list): Names of components or serialization fields to exclude. RETURNS (Language): The modified `Language` object. 
- EXAMPLE: - >>> from spacy.language import Language - >>> nlp = Language().from_disk('/path/to/models') + DOCS: https://spacy.io/api/language#from_disk """ + if disable is not None: + deprecation_warning(Warnings.W014) + exclude = disable path = util.ensure_path(path) - deserializers = OrderedDict( - ( - ("meta.json", lambda p: self.meta.update(srsly.read_json(p))), - ( - "vocab", - lambda p: ( - self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self) - ), - ), - ("tokenizer", lambda p: self.tokenizer.from_disk(p, vocab=False)), - ) - ) + deserializers = OrderedDict() + deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p)) + deserializers["vocab"] = lambda p: self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self) + deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(p, exclude=["vocab"]) for name, proc in self.pipeline: - if name in disable: + if name in exclude: continue if not hasattr(proc, "from_disk"): continue - deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) - exclude = {p: False for p in disable} - if not (path / "vocab").exists(): - exclude["vocab"] = True + deserializers[name] = lambda p, proc=proc: proc.from_disk(p, exclude=["vocab"]) + if not (path / "vocab").exists() and "vocab" not in exclude: + # Convert to list here in case exclude is (default) tuple + exclude = list(exclude) + ["vocab"] util.from_disk(path, deserializers, exclude) self._path = path return self - def to_bytes(self, disable=[], **exclude): + def to_bytes(self, exclude=tuple(), disable=None, **kwargs): """Serialize the current state to a binary string. - disable (list): Nameds of pipeline components to disable and prevent - from being serialized. + exclude (list): Names of components or serialization fields to exclude. RETURNS (bytes): The serialized form of the `Language` object. + + DOCS: https://spacy.io/api/language#to_bytes """ - serializers = OrderedDict( - ( - ("vocab", lambda: self.vocab.to_bytes()), - ("tokenizer", lambda: self.tokenizer.to_bytes(vocab=False)), - ("meta", lambda: srsly.json_dumps(self.meta)), - ) - ) - for i, (name, proc) in enumerate(self.pipeline): - if name in disable: + if disable is not None: + deprecation_warning(Warnings.W014) + exclude = disable + serializers = OrderedDict() + serializers["vocab"] = lambda: self.vocab.to_bytes() + serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) + serializers["meta.json"] = lambda: srsly.json_dumps(self.meta) + for name, proc in self.pipeline: + if name in exclude: continue if not hasattr(proc, "to_bytes"): continue - serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False) + serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"]) + exclude = util.get_serialization_exclude(serializers, exclude, kwargs) return util.to_bytes(serializers, exclude) - def from_bytes(self, bytes_data, disable=[]): + def from_bytes(self, bytes_data, exclude=tuple(), disable=None, **kwargs): """Load state from a binary string. bytes_data (bytes): The data to load from. - disable (list): Names of the pipeline components to disable. + exclude (list): Names of components or serialization fields to exclude. RETURNS (Language): The `Language` object. 
+ + DOCS: https://spacy.io/api/language#from_bytes """ - deserializers = OrderedDict( - ( - ("meta", lambda b: self.meta.update(srsly.json_loads(b))), - ( - "vocab", - lambda b: ( - self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self) - ), - ), - ("tokenizer", lambda b: self.tokenizer.from_bytes(b, vocab=False)), - ) - ) - for i, (name, proc) in enumerate(self.pipeline): - if name in disable: + if disable is not None: + deprecation_warning(Warnings.W014) + exclude = disable + deserializers = OrderedDict() + deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b)) + deserializers["vocab"] = lambda b: self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self) + deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(b, exclude=["vocab"]) + for name, proc in self.pipeline: + if name in exclude: continue if not hasattr(proc, "from_bytes"): continue - deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False) - util.from_bytes(bytes_data, deserializers, {}) + deserializers[name] = lambda b, proc=proc: proc.from_bytes(b, exclude=["vocab"]) + exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) + util.from_bytes(bytes_data, deserializers, exclude) return self diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index b3c3db04d..4167bf7cb 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -141,16 +141,21 @@ class Pipe(object): with self.model.use_params(params): yield - def to_bytes(self, **exclude): - """Serialize the pipe to a bytestring.""" + def to_bytes(self, exclude=tuple(), **kwargs): + """Serialize the pipe to a bytestring. + + exclude (list): String names of serialization fields to exclude. + RETURNS (bytes): The serialized object. + """ serialize = OrderedDict() serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) if self.model not in (True, False, None): serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, **exclude): + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): """Load the pipe from a bytestring.""" def load_model(b): @@ -161,26 +166,25 @@ class Pipe(object): self.model = self.Model(**self.cfg) self.model.from_bytes(b) - deserialize = OrderedDict( - ( - ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))), - ("vocab", lambda b: self.vocab.from_bytes(b)), - ("model", load_model), - ) - ) + deserialize = OrderedDict() + deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) + deserialize["model"] = load_model + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, **exclude): + def to_disk(self, path, exclude=tuple(), **kwargs): """Serialize the pipe to disk.""" serialize = OrderedDict() serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) if self.model not in (None, True, False): serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) - def from_disk(self, path, **exclude): + def from_disk(self, path, exclude=tuple(), **kwargs): """Load the pipe from disk.""" def load_model(p): @@ -191,13 +195,11 @@ class Pipe(object): 
self.model = self.Model(**self.cfg) self.model.from_bytes(p.open("rb").read()) - deserialize = OrderedDict( - ( - ("cfg", lambda p: self.cfg.update(_load_cfg(p))), - ("vocab", lambda p: self.vocab.from_disk(p)), - ("model", load_model), - ) - ) + deserialize = OrderedDict() + deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) + deserialize["vocab"] = lambda p: self.vocab.from_disk(p) + deserialize["model"] = load_model + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) return self @@ -537,7 +539,7 @@ class Tagger(Pipe): with self.model.use_params(params): yield - def to_bytes(self, **exclude): + def to_bytes(self, exclude=tuple(), **kwargs): serialize = OrderedDict() if self.model not in (None, True, False): serialize["model"] = self.model.to_bytes @@ -545,9 +547,10 @@ class Tagger(Pipe): serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map) + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, **exclude): + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): def load_model(b): # TODO: Remove this once we don't have to handle previous models if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: @@ -572,20 +575,22 @@ class Tagger(Pipe): ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))), ("model", lambda b: load_model(b)), )) + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, **exclude): + def to_disk(self, path, exclude=tuple(), **kwargs): tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) serialize = OrderedDict(( - ('vocab', lambda p: self.vocab.to_disk(p)), - ('tag_map', lambda p: srsly.write_msgpack(p, tag_map)), - ('model', lambda p: p.open("wb").write(self.model.to_bytes())), - ('cfg', lambda p: srsly.write_json(p, self.cfg)) + ("vocab", lambda p: self.vocab.to_disk(p)), + ("tag_map", lambda p: srsly.write_msgpack(p, tag_map)), + ("model", lambda p: p.open("wb").write(self.model.to_bytes())), + ("cfg", lambda p: srsly.write_json(p, self.cfg)) )) + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) - def from_disk(self, path, **exclude): + def from_disk(self, path, exclude=tuple(), **kwargs): def load_model(p): # TODO: Remove this once we don't have to handle previous models if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: @@ -608,6 +613,7 @@ class Tagger(Pipe): ("tag_map", load_tag_map), ("model", load_model), )) + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) return self diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 433b30e8b..df86f8ac7 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -236,19 +236,17 @@ cdef class StringStore: self.add(word) return self - def to_bytes(self, **exclude): + def to_bytes(self, **kwargs): """Serialize the current state to a binary string. - **exclude: Named attributes to prevent from being serialized. RETURNS (bytes): The serialized form of the `StringStore` object. """ return srsly.json_dumps(list(self)) - def from_bytes(self, bytes_data, **exclude): + def from_bytes(self, bytes_data, **kwargs): """Load state from a binary string. 
bytes_data (bytes): The data to load from. - **exclude: Named attributes to prevent from being loaded. RETURNS (StringStore): The `StringStore` object. """ strings = srsly.json_loads(bytes_data) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 4e3141a41..cbeef756d 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -228,7 +228,7 @@ cdef class Parser: self.set_annotations(subbatch, parse_states, tensors=None) for doc in batch_in_order: yield doc - + def require_model(self): """Raise an error if the component's model is not initialized.""" if getattr(self, 'model', None) in (None, True, False): @@ -272,7 +272,7 @@ cdef class Parser: beams = self.moves.init_beams(docs, beam_width, beam_density=beam_density) # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. We therefore have to check whether we need to - # expand our model output. + # expand our model output. self.model.resize_output(self.moves.n_moves) model = self.model(docs) token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature), @@ -442,7 +442,7 @@ cdef class Parser: if self._rehearsal_model is None: return None losses.setdefault(self.name, 0.) - + states = self.moves.init_batch(docs) # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. We therefore have to check whether we need to @@ -603,22 +603,24 @@ cdef class Parser: self.cfg.update(cfg) return sgd - def to_disk(self, path, **exclude): + def to_disk(self, path, exclude=tuple(), **kwargs): serializers = { 'model': lambda p: (self.model.to_disk(p) if self.model is not True else True), 'vocab': lambda p: self.vocab.to_disk(p), - 'moves': lambda p: self.moves.to_disk(p, strings=False), + 'moves': lambda p: self.moves.to_disk(p, exclude=["strings"]), 'cfg': lambda p: srsly.write_json(p, self.cfg) } + exclude = util.get_serialization_exclude(serializers, exclude, kwargs) util.to_disk(path, serializers, exclude) - def from_disk(self, path, **exclude): + def from_disk(self, path, exclude=tuple(), **kwargs): deserializers = { 'vocab': lambda p: self.vocab.from_disk(p), - 'moves': lambda p: self.moves.from_disk(p, strings=False), + 'moves': lambda p: self.moves.from_disk(p, exclude=["strings"]), 'cfg': lambda p: self.cfg.update(srsly.read_json(p)), 'model': lambda p: None } + exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) util.from_disk(path, deserializers, exclude) if 'model' not in exclude: path = util.ensure_path(path) @@ -632,22 +634,24 @@ cdef class Parser: self.cfg.update(cfg) return self - def to_bytes(self, **exclude): + def to_bytes(self, exclude=tuple(), **kwargs): serializers = OrderedDict(( ('model', lambda: (self.model.to_bytes() if self.model is not True else True)), ('vocab', lambda: self.vocab.to_bytes()), - ('moves', lambda: self.moves.to_bytes(strings=False)), + ('moves', lambda: self.moves.to_bytes(exclude=["strings"])), ('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)) )) + exclude = util.get_serialization_exclude(serializers, exclude, kwargs) return util.to_bytes(serializers, exclude) - def from_bytes(self, bytes_data, **exclude): + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): deserializers = OrderedDict(( ('vocab', lambda b: self.vocab.from_bytes(b)), - ('moves', lambda b: self.moves.from_bytes(b, strings=False)), + ('moves', lambda b: self.moves.from_bytes(b, exclude=["strings"])), ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))), ('model', lambda b: None) )) + 
exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) if 'model' not in exclude: # TODO: Remove this once we don't have to handle previous models diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 4cc00828e..523cd6699 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -208,30 +208,32 @@ cdef class TransitionSystem: self.labels[action][label_name] = new_freq-1 return 1 - def to_disk(self, path, **exclude): + def to_disk(self, path, **kwargs): with path.open('wb') as file_: - file_.write(self.to_bytes(**exclude)) + file_.write(self.to_bytes(**kwargs)) - def from_disk(self, path, **exclude): + def from_disk(self, path, **kwargs): with path.open('rb') as file_: byte_data = file_.read() - self.from_bytes(byte_data, **exclude) + self.from_bytes(byte_data, **kwargs) return self - def to_bytes(self, **exclude): + def to_bytes(self, exclude=tuple(), **kwargs): transitions = [] serializers = { 'moves': lambda: srsly.json_dumps(self.labels), 'strings': lambda: self.strings.to_bytes() } + exclude = util.get_serialization_exclude(serializers, exclude, kwargs) return util.to_bytes(serializers, exclude) - def from_bytes(self, bytes_data, **exclude): + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): labels = {} deserializers = { 'moves': lambda b: labels.update(srsly.json_loads(b)), 'strings': lambda b: self.strings.from_bytes(b) } + exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) self.initialize_actions(labels) return self diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 8eed2c267..4069e018a 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -113,14 +113,14 @@ def test_doc_api_serialize(en_tokenizer, text): assert [t.orth for t in tokens] == [t.orth for t in new_tokens] new_tokens = Doc(tokens.vocab).from_bytes( - tokens.to_bytes(tensor=False), tensor=False + tokens.to_bytes(exclude=["tensor"]), exclude=["tensor"] ) assert tokens.text == new_tokens.text assert [t.text for t in tokens] == [t.text for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens] new_tokens = Doc(tokens.vocab).from_bytes( - tokens.to_bytes(sentiment=False), sentiment=False + tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"] ) assert tokens.text == new_tokens.text assert [t.text for t in tokens] == [t.text for t in new_tokens] diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index 77d6e6833..b109ca0b2 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import pytest from spacy.tokens import Doc from spacy.compat import path2str @@ -41,3 +42,18 @@ def test_serialize_doc_roundtrip_disk_str_path(en_vocab): doc.to_disk(file_path) doc_d = Doc(en_vocab).from_disk(file_path) assert doc.to_bytes() == doc_d.to_bytes() + + +def test_serialize_doc_exclude(en_vocab): + doc = Doc(en_vocab, words=["hello", "world"]) + doc.user_data["foo"] = "bar" + new_doc = Doc(en_vocab).from_bytes(doc.to_bytes()) + assert new_doc.user_data["foo"] == "bar" + new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(), exclude=["user_data"]) + assert not new_doc.user_data + new_doc = 
Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"])) + assert not new_doc.user_data + with pytest.raises(ValueError): + doc.to_bytes(user_data=False) + with pytest.raises(ValueError): + Doc(en_vocab).from_bytes(doc.to_bytes(), tensor=False) diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py index edc5d125d..efc5d181c 100644 --- a/spacy/tests/serialize/test_serialize_language.py +++ b/spacy/tests/serialize/test_serialize_language.py @@ -52,3 +52,19 @@ def test_serialize_with_custom_tokenizer(): nlp.tokenizer = custom_tokenizer(nlp) with make_tempdir() as d: nlp.to_disk(d) + + +def test_serialize_language_exclude(meta_data): + name = "name-in-fixture" + nlp = Language(meta=meta_data) + assert nlp.meta["name"] == name + new_nlp = Language().from_bytes(nlp.to_bytes()) + assert nlp.meta["name"] == name + new_nlp = Language().from_bytes(nlp.to_bytes(), exclude=["meta"]) + assert not new_nlp.meta["name"] == name + new_nlp = Language().from_bytes(nlp.to_bytes(exclude=["meta"])) + assert not new_nlp.meta["name"] == name + with pytest.raises(ValueError): + nlp.to_bytes(meta=False) + with pytest.raises(ValueError): + Language().from_bytes(nlp.to_bytes(), meta=False) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 680df2288..68378e612 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -55,7 +55,9 @@ def test_serialize_parser_roundtrip_disk(en_vocab, Parser): parser_d = Parser(en_vocab) parser_d.model, _ = parser_d.Model(0) parser_d = parser_d.from_disk(file_path) - assert parser.to_bytes(model=False) == parser_d.to_bytes(model=False) + parser_bytes = parser.to_bytes(exclude=["model"]) + parser_d_bytes = parser_d.to_bytes(exclude=["model"]) + assert parser_bytes == parser_d_bytes def test_to_from_bytes(parser, blank_parser): @@ -114,3 +116,25 @@ def test_serialize_textcat_empty(en_vocab): # See issue #1105 textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"]) textcat.to_bytes() + + +@pytest.mark.parametrize("Parser", test_parsers) +def test_serialize_pipe_exclude(en_vocab, Parser): + def get_new_parser(): + new_parser = Parser(en_vocab) + new_parser.model, _ = new_parser.Model(0) + return new_parser + + parser = Parser(en_vocab) + parser.model, _ = parser.Model(0) + parser.cfg["foo"] = "bar" + new_parser = get_new_parser().from_bytes(parser.to_bytes()) + assert "foo" in new_parser.cfg + new_parser = get_new_parser().from_bytes(parser.to_bytes(), exclude=["cfg"]) + assert "foo" not in new_parser.cfg + new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["cfg"])) + assert "foo" not in new_parser.cfg + with pytest.raises(ValueError): + parser.to_bytes(cfg=False) + with pytest.raises(ValueError): + get_new_parser().from_bytes(parser.to_bytes(), cfg=False) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4e0d49b59..98aba149d 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -360,36 +360,37 @@ cdef class Tokenizer: self._cache.set(key, cached) self._rules[string] = substrings - def to_disk(self, path, **exclude): + def to_disk(self, path, **kwargs): """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if - it doesn't exist. Paths may be either strings or Path-like objects. + it doesn't exist. + exclude (list): String names of serialization fields to exclude. 
DOCS: https://spacy.io/api/tokenizer#to_disk """ with path.open("wb") as file_: - file_.write(self.to_bytes(**exclude)) + file_.write(self.to_bytes(**kwargs)) - def from_disk(self, path, **exclude): + def from_disk(self, path, **kwargs): """Loads state from a directory. Modifies the object in place and returns it. - path (unicode or Path): A path to a directory. Paths may be either - strings or `Path`-like objects. + path (unicode or Path): A path to a directory. + exclude (list): String names of serialization fields to exclude. RETURNS (Tokenizer): The modified `Tokenizer` object. DOCS: https://spacy.io/api/tokenizer#from_disk """ with path.open("rb") as file_: bytes_data = file_.read() - self.from_bytes(bytes_data, **exclude) + self.from_bytes(bytes_data, **kwargs) return self - def to_bytes(self, **exclude): + def to_bytes(self, exclude=tuple(), **kwargs): """Serialize the current state to a binary string. - **exclude: Named attributes to prevent from being serialized. + exclude (list): String names of serialization fields to exclude. RETURNS (bytes): The serialized form of the `Tokenizer` object. DOCS: https://spacy.io/api/tokenizer#to_bytes @@ -402,13 +403,14 @@ cdef class Tokenizer: ("token_match", lambda: _get_regex_pattern(self.token_match)), ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) )) + exclude = util.get_serialization_exclude(serializers, exclude, kwargs) return util.to_bytes(serializers, exclude) - def from_bytes(self, bytes_data, **exclude): + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): """Load state from a binary string. bytes_data (bytes): The data to load from. - **exclude: Named attributes to prevent from being loaded. + exclude (list): String names of serialization fields to exclude. RETURNS (Tokenizer): The `Tokenizer` object. DOCS: https://spacy.io/api/tokenizer#from_bytes @@ -422,6 +424,7 @@ cdef class Tokenizer: ("token_match", lambda b: data.setdefault("token_match", b)), ("exceptions", lambda b: data.setdefault("rules", b)) )) + exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) if data.get("prefix_search"): self.prefix_search = re.compile(data["prefix_search"]).search diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 36c747396..483fa6a10 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -794,24 +794,26 @@ cdef class Doc: """ return numpy.asarray(_get_lca_matrix(self, 0, len(self))) - def to_disk(self, path, **exclude): + def to_disk(self, path, **kwargs): """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or Path-like objects. + exclude (list): String names of serialization fields to exclude. DOCS: https://spacy.io/api/doc#to_disk """ path = util.ensure_path(path) with path.open("wb") as file_: - file_.write(self.to_bytes(**exclude)) + file_.write(self.to_bytes(**kwargs)) - def from_disk(self, path, **exclude): + def from_disk(self, path, **kwargs): """Loads state from a directory. Modifies the object in place and returns it. path (unicode or Path): A path to a directory. Paths may be either strings or `Path`-like objects. + exclude (list): String names of serialization fields to exclude. RETURNS (Doc): The modified `Doc` object. 
DOCS: https://spacy.io/api/doc#from_disk @@ -819,11 +821,12 @@ cdef class Doc: path = util.ensure_path(path) with path.open("rb") as file_: bytes_data = file_.read() - return self.from_bytes(bytes_data, **exclude) + return self.from_bytes(bytes_data, **kwargs) - def to_bytes(self, **exclude): + def to_bytes(self, exclude=tuple(), **kwargs): """Serialize, i.e. export the document contents to a binary string. + exclude (list): String names of serialization fields to exclude. RETURNS (bytes): A losslessly serialized copy of the `Doc`, including all annotations. @@ -849,16 +852,22 @@ cdef class Doc: "sentiment": lambda: self.sentiment, "tensor": lambda: self.tensor, } + for key in kwargs: + if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"): + raise ValueError(Errors.E128.format(arg=key)) if "user_data" not in exclude and self.user_data: user_data_keys, user_data_values = list(zip(*self.user_data.items())) - serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys) - serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) + if "user_data_keys" not in exclude: + serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys) + if "user_data_values" not in exclude: + serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) return util.to_bytes(serializers, exclude) - def from_bytes(self, bytes_data, **exclude): + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): """Deserialize, i.e. import the document contents from a binary string. data (bytes): The string to load from. + exclude (list): String names of serialization fields to exclude. RETURNS (Doc): Itself. DOCS: https://spacy.io/api/doc#from_bytes @@ -874,6 +883,9 @@ cdef class Doc: "user_data_keys": lambda b: None, "user_data_values": lambda b: None, } + for key in kwargs: + if key in deserializers or key in ("user_data",): + raise ValueError(Errors.E128.format(arg=key)) msg = util.from_bytes(bytes_data, deserializers, exclude) # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. 
As a best guess, we *know* that within @@ -1170,7 +1182,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): def pickle_doc(doc): - bytes_data = doc.to_bytes(vocab=False, user_data=False) + bytes_data = doc.to_bytes(exclude=["vocab", "user_data"]) hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks, doc.user_token_hooks) return (unpickle_doc, (doc.vocab, srsly.pickle_dumps(hooks_and_data), bytes_data)) @@ -1179,7 +1191,7 @@ def pickle_doc(doc): def unpickle_doc(vocab, hooks_and_data, bytes_data): user_data, doc_hooks, span_hooks, token_hooks = srsly.pickle_loads(hooks_and_data) - doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude="user_data") + doc = Doc(vocab, user_data=user_data).from_bytes(bytes_data, exclude=["user_data"]) doc.user_hooks.update(doc_hooks) doc.user_span_hooks.update(span_hooks) doc.user_token_hooks.update(token_hooks) diff --git a/spacy/util.py b/spacy/util.py index bff00b585..0066b196d 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -25,7 +25,7 @@ except ImportError: from .symbols import ORTH from .compat import cupy, CudaStream, path2str, basestring_, unicode_ from .compat import import_file -from .errors import Errors +from .errors import Errors, Warnings, deprecation_warning LANGUAGES = {} @@ -565,7 +565,8 @@ def itershuffle(iterable, bufsize=1000): def to_bytes(getters, exclude): serialized = OrderedDict() for key, getter in getters.items(): - if key not in exclude: + # Split to support file names like meta.json + if key.split(".")[0] not in exclude: serialized[key] = getter() return srsly.msgpack_dumps(serialized) @@ -573,7 +574,8 @@ def to_bytes(getters, exclude): def from_bytes(bytes_data, setters, exclude): msg = srsly.msgpack_loads(bytes_data) for key, setter in setters.items(): - if key not in exclude and key in msg: + # Split to support file names like meta.json + if key.split(".")[0] not in exclude and key in msg: setter(msg[key]) return msg @@ -583,7 +585,8 @@ def to_disk(path, writers, exclude): if not path.exists(): path.mkdir() for key, writer in writers.items(): - if key not in exclude: + # Split to support file names like meta.json + if key.split(".")[0] not in exclude: writer(path / key) return path @@ -591,7 +594,8 @@ def to_disk(path, writers, exclude): def from_disk(path, readers, exclude): path = ensure_path(path) for key, reader in readers.items(): - if key not in exclude: + # Split to support file names like meta.json + if key.split(".")[0] not in exclude: reader(path / key) return path @@ -677,6 +681,23 @@ def validate_json(data, validator): return errors +def get_serialization_exclude(serializers, exclude, kwargs): + """Helper function to validate serialization args and manage transition from + keyword arguments (pre v2.1) to exclude argument. + """ + exclude = list(exclude) + # Split to support file names like meta.json + options = [name.split(".")[0] for name in serializers] + for key, value in kwargs.items(): + if key in ("vocab",) and value is False: + deprecation_warning(Warnings.W015.format(arg=key)) + exclude.append(key) + elif key.split(".")[0] in options: + raise ValueError(Errors.E128.format(arg=key)) + # TODO: user warning? 
+    return exclude
+
+
 class SimpleFrozenDict(dict):
     """Simplified implementation of a frozen dict, mainly used as default
     function or method argument (for arguments that should default to empty
@@ -696,14 +717,14 @@ class SimpleFrozenDict(dict):
 class DummyTokenizer(object):
     # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
     # allow serialization (see #1557)
-    def to_bytes(self, **exclude):
+    def to_bytes(self, **kwargs):
         return b""

-    def from_bytes(self, _bytes_data, **exclude):
+    def from_bytes(self, _bytes_data, **kwargs):
         return self

-    def to_disk(self, _path, **exclude):
+    def to_disk(self, _path, **kwargs):
         return None

-    def from_disk(self, _path, **exclude):
+    def from_disk(self, _path, **kwargs):
         return self
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 311bb9634..690ad33bd 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -377,11 +377,11 @@ cdef class Vectors:
             self.add(key, row=i)
         return strings

-    def to_disk(self, path, **exclude):
+    def to_disk(self, path, **kwargs):
         """Save the current state to a directory.

         path (unicode / Path): A path to a directory, which will be created if
-            it doesn't exists. Either a string or a Path-like object.
+            it doesn't exist.

         DOCS: https://spacy.io/api/vectors#to_disk
         """
@@ -394,9 +394,9 @@ cdef class Vectors:
             ("vectors", lambda p: save_array(self.data, p.open("wb"))),
             ("key2row", lambda p: srsly.write_msgpack(p, self.key2row))
         ))
-        return util.to_disk(path, serializers, exclude)
+        return util.to_disk(path, serializers, [])

-    def from_disk(self, path, **exclude):
+    def from_disk(self, path, **kwargs):
         """Loads state from a directory. Modifies the object in place and
         returns it.

@@ -428,13 +428,13 @@ cdef class Vectors:
             ("keys", load_keys),
             ("vectors", load_vectors),
         ))
-        util.from_disk(path, serializers, exclude)
+        util.from_disk(path, serializers, [])
         return self

-    def to_bytes(self, **exclude):
+    def to_bytes(self, **kwargs):
         """Serialize the current state to a binary string.

-        **exclude: Named attributes to prevent from being serialized.
+        exclude (list): String names of serialization fields to exclude.
         RETURNS (bytes): The serialized form of the `Vectors` object.

         DOCS: https://spacy.io/api/vectors#to_bytes
@@ -444,17 +444,18 @@ cdef class Vectors:
             return self.data.to_bytes()
         else:
             return srsly.msgpack_dumps(self.data)
+
         serializers = OrderedDict((
             ("key2row", lambda: srsly.msgpack_dumps(self.key2row)),
             ("vectors", serialize_weights)
         ))
-        return util.to_bytes(serializers, exclude)
+        return util.to_bytes(serializers, [])

-    def from_bytes(self, data, **exclude):
+    def from_bytes(self, data, **kwargs):
         """Load state from a binary string.

         data (bytes): The data to load from.
-        **exclude: Named attributes to prevent from being loaded.
+        exclude (list): String names of serialization fields to exclude.
         RETURNS (Vectors): The `Vectors` object.

         DOCS: https://spacy.io/api/vectors#from_bytes
@@ -469,5 +470,5 @@ cdef class Vectors:
             ("key2row", lambda b: self.key2row.update(srsly.msgpack_loads(b))),
             ("vectors", deserialize_weights)
         ))
-        util.from_bytes(data, deserializers, exclude)
+        util.from_bytes(data, deserializers, [])
         return self
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 8895e3a7c..0923f977a 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -397,47 +397,57 @@ cdef class Vocab:
         orth = self.strings.add(orth)
         return orth in self.vectors

-    def to_disk(self, path, **exclude):
+    def to_disk(self, path, exclude=tuple(), **kwargs):
         """Save the current state to a directory.

         path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist. Paths may be either strings or Path-like objects.
+            it doesn't exist.
+        exclude (list): String names of serialization fields to exclude.

         DOCS: https://spacy.io/api/vocab#to_disk
         """
         path = util.ensure_path(path)
         if not path.exists():
             path.mkdir()
-        self.strings.to_disk(path / "strings.json")
-        with (path / "lexemes.bin").open('wb') as file_:
-            file_.write(self.lexemes_to_bytes())
-        if self.vectors is not None:
+        setters = ["strings", "lexemes", "vectors"]
+        exclude = util.get_serialization_exclude(setters, exclude, kwargs)
+        if "strings" not in exclude:
+            self.strings.to_disk(path / "strings.json")
+        if "lexemes" not in exclude:
+            with (path / "lexemes.bin").open("wb") as file_:
+                file_.write(self.lexemes_to_bytes())
+        if "vectors" not in exclude and self.vectors is not None:
             self.vectors.to_disk(path)

-    def from_disk(self, path, **exclude):
+    def from_disk(self, path, exclude=tuple(), **kwargs):
         """Loads state from a directory. Modifies the object in place and
         returns it.

-        path (unicode or Path): A path to a directory. Paths may be either
-            strings or `Path`-like objects.
+        path (unicode or Path): A path to a directory.
+        exclude (list): String names of serialization fields to exclude.
         RETURNS (Vocab): The modified `Vocab` object.

         DOCS: https://spacy.io/api/vocab#to_disk
         """
         path = util.ensure_path(path)
-        self.strings.from_disk(path / "strings.json")
-        with (path / "lexemes.bin").open("rb") as file_:
-            self.lexemes_from_bytes(file_.read())
-        if self.vectors is not None:
-            self.vectors.from_disk(path, exclude="strings.json")
-        if self.vectors.name is not None:
-            link_vectors_to_models(self)
+        getters = ["strings", "lexemes", "vectors"]
+        exclude = util.get_serialization_exclude(getters, exclude, kwargs)
+        if "strings" not in exclude:
+            self.strings.from_disk(path / "strings.json")  # TODO: add exclude?
+        if "lexemes" not in exclude:
+            with (path / "lexemes.bin").open("rb") as file_:
+                self.lexemes_from_bytes(file_.read())
+        if "vectors" not in exclude:
+            if self.vectors is not None:
+                self.vectors.from_disk(path, exclude=["strings"])
+            if self.vectors.name is not None:
+                link_vectors_to_models(self)
         return self

-    def to_bytes(self, **exclude):
+    def to_bytes(self, exclude=tuple(), **kwargs):
         """Serialize the current state to a binary string.

-        **exclude: Named attributes to prevent from being serialized.
+        exclude (list): String names of serialization fields to exclude.
         RETURNS (bytes): The serialized form of the `Vocab` object.

         DOCS: https://spacy.io/api/vocab#to_bytes
@@ -453,13 +463,14 @@ cdef class Vocab:
             ("lexemes", lambda: self.lexemes_to_bytes()),
             ("vectors", deserialize_vectors)
         ))
+        exclude = util.get_serialization_exclude(getters, exclude, kwargs)
         return util.to_bytes(getters, exclude)

-    def from_bytes(self, bytes_data, **exclude):
+    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
         """Load state from a binary string.

         bytes_data (bytes): The data to load from.
-        **exclude: Named attributes to prevent from being loaded.
+        exclude (list): String names of serialization fields to exclude.
         RETURNS (Vocab): The `Vocab` object.
DOCS: https://spacy.io/api/vocab#from_bytes @@ -469,11 +480,13 @@ cdef class Vocab: return None else: return self.vectors.from_bytes(b) + setters = OrderedDict(( ("strings", lambda b: self.strings.from_bytes(b)), ("lexemes", lambda b: self.lexemes_from_bytes(b)), ("vectors", lambda b: serialize_vectors(b)) )) + exclude = util.get_serialization_exclude(setters, exclude, kwargs) util.from_bytes(bytes_data, setters, exclude) if self.vectors.name is not None: link_vectors_to_models(self) diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 13ae320cc..329f96ead 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -244,9 +244,10 @@ Serialize the pipe to disk. > parser.to_disk("/path/to/parser") > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## DependencyParser.from_disk {#from_disk tag="method"} @@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ------------------ | -------------------------------------------------------------------------- | | `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. | ## DependencyParser.to_bytes {#to_bytes tag="method"} @@ -275,10 +277,10 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | ----- | ----------------------------------------------------- | -| `**exclude` | - | Named attributes to prevent from being serialized. | -| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------------------- | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. | ## DependencyParser.from_bytes {#from_bytes tag="method"} @@ -292,11 +294,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > parser.from_bytes(parser_bytes) > ``` -| Name | Type | Description | -| ------------ | ------------------ | ---------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `**exclude` | - | Named attributes to prevent from being loaded. | -| **RETURNS** | `DependencyParser` | The `DependencyParser` object. 
| +| Name | Type | Description | +| ------------ | ------------------ | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `DependencyParser` | The `DependencyParser` object. | ## DependencyParser.labels {#labels tag="property"} @@ -312,3 +314,21 @@ The labels currently added to the component. | Name | Type | Description | | ----------- | ----- | ---------------------------------- | | **RETURNS** | tuple | The labels added to the component. | + +## Serialization fields {#serialization-fields} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. + +> #### Example +> +> ```python +> data = parser.to_disk("/path", exclude=["vocab"]) +> ``` + +| Name | Description | +| ------- | -------------------------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `cfg` | The config file. You usually don't want to exclude this. | +| `model` | The binary model data. You usually don't want to exclude this. | diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index e53619cff..953a31c2d 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -349,11 +349,12 @@ array of attributes. > assert doc[0].pos_ == doc2[0].pos_ > ``` -| Name | Type | Description | -| ----------- | -------------------------------------- | ----------------------------- | -| `attrs` | list | A list of attribute ID ints. | -| `array` | `numpy.ndarray[ndim=2, dtype='int32']` | The attribute values to load. | -| **RETURNS** | `Doc` | Itself. | +| Name | Type | Description | +| ----------- | -------------------------------------- | ------------------------------------------------------------------------- | +| `attrs` | list | A list of attribute ID ints. | +| `array` | `numpy.ndarray[ndim=2, dtype='int32']` | The attribute values to load. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Doc` | Itself. | ## Doc.to_disk {#to_disk tag="method" new="2"} @@ -365,9 +366,10 @@ Save the current state to a directory. > doc.to_disk("/path/to/doc") > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Doc.from_disk {#from_disk tag="method" new="2"} @@ -384,6 +386,7 @@ Loads state from a directory. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ---------------- | -------------------------------------------------------------------------- | | `path` | unicode / `Path` | A path to a directory. 
Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `Doc` | The modified `Doc` object. | ## Doc.to_bytes {#to_bytes tag="method"} @@ -397,9 +400,10 @@ Serialize, i.e. export the document contents to a binary string. > doc_bytes = doc.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | --------------------------------------------------------------------- | -| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------------------- | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. | ## Doc.from_bytes {#from_bytes tag="method"} @@ -416,10 +420,11 @@ Deserialize, i.e. import the document contents from a binary string. > assert doc.text == doc2.text > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------ | -| `data` | bytes | The string to load from. | -| **RETURNS** | `Doc` | The `Doc` object. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------------------- | +| `data` | bytes | The string to load from. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Doc` | The `Doc` object. | ## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"} @@ -658,3 +663,25 @@ The L2 norm of the document's vector representation. | `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. | | `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. | | `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | + +## Serialization fields {#serialization-fields} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. + +> #### Example +> +> ```python +> data = doc.to_bytes(exclude=["text", "tensor"]) +> doc.from_disk("./doc.bin", exclude=["user_data"]) +> ``` + +| Name | Description | +| ------------------ | --------------------------------------------- | +| `text` | The value of the `Doc.text` attribute. | +| `sentiment` | The value of the `Doc.sentiment` attribute. | +| `tensor` | The value of the `Doc.tensor` attribute. | +| `user_data` | The value of the `Doc.user_data` dictionary. | +| `user_data_keys` | The keys of the `Doc.user_data` dictionary. | +| `user_data_values` | The values of the `Doc.user_data` dictionary. | diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index c9db2c409..7279a7f77 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -244,9 +244,10 @@ Serialize the pipe to disk. > ner.to_disk("/path/to/ner") > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. 
Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## EntityRecognizer.from_disk {#from_disk tag="method"} @@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ------------------ | -------------------------------------------------------------------------- | | `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. | ## EntityRecognizer.to_bytes {#to_bytes tag="method"} @@ -275,10 +277,10 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | ----- | ----------------------------------------------------- | -| `**exclude` | - | Named attributes to prevent from being serialized. | -| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------------------- | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. | ## EntityRecognizer.from_bytes {#from_bytes tag="method"} @@ -292,11 +294,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > ner.from_bytes(ner_bytes) > ``` -| Name | Type | Description | -| ------------ | ------------------ | ---------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `**exclude` | - | Named attributes to prevent from being loaded. | -| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. | +| Name | Type | Description | +| ------------ | ------------------ | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. | ## EntityRecognizer.labels {#labels tag="property"} @@ -312,3 +314,21 @@ The labels currently added to the component. | Name | Type | Description | | ----------- | ----- | ---------------------------------- | | **RETURNS** | tuple | The labels added to the component. | + +## Serialization fields {#serialization-fields} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. + +> #### Example +> +> ```python +> data = ner.to_disk("/path", exclude=["vocab"]) +> ``` + +| Name | Description | +| ------- | -------------------------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `cfg` | The config file. You usually don't want to exclude this. 
| +| `model` | The binary model data. You usually don't want to exclude this. | diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 990a7f962..34d14ec01 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -327,7 +327,7 @@ the model**. | Name | Type | Description | | --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling) and prevent from being saved. | +| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | ## Language.from_disk {#from_disk tag="method" new="2"} @@ -349,22 +349,22 @@ loaded object. > nlp = English().from_disk("/path/to/en_model") > ``` -| Name | Type | Description | -| ----------- | ---------------- | --------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| **RETURNS** | `Language` | The modified `Language` object. | +| Name | Type | Description | +| ----------- | ---------------- | ----------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Language` | The modified `Language` object. | As of spaCy v2.0, the `save_to_directory` method has been renamed to `to_disk`, to improve consistency across classes. Pipeline components to prevent from being -loaded can now be added as a list to `disable`, instead of specifying one -keyword argument per component. +loaded can now be added as a list to `disable` (v2.0) or `exclude` (v2.1), +instead of specifying one keyword argument per component. ```diff - nlp = spacy.load("en", tagger=False, entity=False) -+ nlp = English().from_disk("/model", disable=["tagger', 'ner"]) ++ nlp = English().from_disk("/model", exclude=["tagger", "ner"]) ``` @@ -379,10 +379,10 @@ Serialize the current state to a binary string. > nlp_bytes = nlp.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------------------------------------------------- | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling) and prevent from being serialized. | -| **RETURNS** | bytes | The serialized form of the `Language` object. | +| Name | Type | Description | +| ----------- | ----- | ----------------------------------------------------------------------------------------- | +| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Language` object. | ## Language.from_bytes {#from_bytes tag="method"} @@ -400,20 +400,21 @@ available to the loaded object. 
> nlp2.from_bytes(nlp_bytes) > ``` -| Name | Type | Description | -| ------------ | ---------- | --------------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| **RETURNS** | `Language` | The `Language` object. | +| Name | Type | Description | +| ------------ | ---------- | ----------------------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Language` | The `Language` object. | Pipeline components to prevent from being loaded can now be added as a list to -`disable`, instead of specifying one keyword argument per component. +`disable` (v2.0) or `exclude` (v2.1), instead of specifying one keyword argument +per component. ```diff - nlp = English().from_bytes(bytes, tagger=False, entity=False) -+ nlp = English().from_bytes(bytes, disable=["tagger", "ner"]) ++ nlp = English().from_bytes(bytes, exclude=["tagger", "ner"]) ``` @@ -437,3 +438,23 @@ Pipeline components to prevent from being loaded can now be added as a list to | `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. | | `lang` | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). | | `factories` 2 | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. | + +## Serialization fields {#serialization-fields} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. + +> #### Example +> +> ```python +> data = nlp.to_bytes(exclude=["tokenizer", "vocab"]) +> nlp.from_disk("./model-data", exclude=["ner"]) +> ``` + +| Name | Description | +| ----------- | -------------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `tokenizer` | Tokenization rules and exceptions. | +| `meta` | The meta data, available as `Language.meta`. | +| ... | String names of pipeline components, e.g. `"ner"`. | diff --git a/website/docs/api/stringstore.md b/website/docs/api/stringstore.md index e7184ea95..40d27a62a 100644 --- a/website/docs/api/stringstore.md +++ b/website/docs/api/stringstore.md @@ -151,10 +151,9 @@ Serialize the current state to a binary string. > store_bytes = stringstore.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------- | -| `**exclude` | - | Named attributes to prevent from being serialized. | -| **RETURNS** | bytes | The serialized form of the `StringStore` object. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------ | +| **RETURNS** | bytes | The serialized form of the `StringStore` object. | ## StringStore.from_bytes {#from_bytes tag="method"} @@ -168,11 +167,10 @@ Load state from a binary string. > new_store = StringStore().from_bytes(store_bytes) > ``` -| Name | Type | Description | -| ------------ | ------------- | ---------------------------------------------- | -| `bytes_data` | bytes | The data to load from. 
| -| `**exclude` | - | Named attributes to prevent from being loaded. | -| **RETURNS** | `StringStore` | The `StringStore` object. | +| Name | Type | Description | +| ------------ | ------------- | ------------------------- | +| `bytes_data` | bytes | The data to load from. | +| **RETURNS** | `StringStore` | The `StringStore` object. | ## Utilities {#util} diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index fa3eda993..a1d921b41 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -244,9 +244,10 @@ Serialize the pipe to disk. > tagger.to_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Tagger.from_disk {#from_disk tag="method"} @@ -262,6 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ---------------- | -------------------------------------------------------------------------- | | `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `Tagger` | The modified `Tagger` object. | ## Tagger.to_bytes {#to_bytes tag="method"} @@ -275,10 +277,10 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------- | -| `**exclude` | - | Named attributes to prevent from being serialized. | -| **RETURNS** | bytes | The serialized form of the `Tagger` object. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------------------- | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Tagger` object. | ## Tagger.from_bytes {#from_bytes tag="method"} @@ -292,11 +294,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > tagger.from_bytes(tagger_bytes) > ``` -| Name | Type | Description | -| ------------ | -------- | ---------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `**exclude` | - | Named attributes to prevent from being loaded. | -| **RETURNS** | `Tagger` | The `Tagger` object. | +| Name | Type | Description | +| ------------ | -------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tagger` | The `Tagger` object. | ## Tagger.labels {#labels tag="property"} @@ -314,3 +316,22 @@ tags by default, e.g. 
`VERB`, `NOUN` and so on. | Name | Type | Description | | ----------- | ----- | ---------------------------------- | | **RETURNS** | tuple | The labels added to the component. | + +## Serialization fields {#serialization-fields} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. + +> #### Example +> +> ```python +> data = tagger.to_disk("/path", exclude=["vocab"]) +> ``` + +| Name | Description | +| --------- | ------------------------------------------------------------------------------------------ | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `cfg` | The config file. You usually don't want to exclude this. | +| `model` | The binary model data. You usually don't want to exclude this. | +| `tag_map` | The [tag map](/usage/adding-languages#tag-map) mapping fine-grained to coarse-grained tag. | diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index cb90aa271..b307d4507 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -260,9 +260,10 @@ Serialize the pipe to disk. > textcat.to_disk("/path/to/textcat") > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## TextCategorizer.from_disk {#from_disk tag="method"} @@ -278,6 +279,7 @@ Load the pipe from disk. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ----------------- | -------------------------------------------------------------------------- | | `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. | ## TextCategorizer.to_bytes {#to_bytes tag="method"} @@ -291,10 +293,10 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | ----- | ---------------------------------------------------- | -| `**exclude` | - | Named attributes to prevent from being serialized. | -| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------------------- | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. | ## TextCategorizer.from_bytes {#from_bytes tag="method"} @@ -308,11 +310,11 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. 
> textcat.from_bytes(textcat_bytes) > ``` -| Name | Type | Description | -| ------------ | ----------------- | ---------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `**exclude` | - | Named attributes to prevent from being loaded. | -| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. | +| Name | Type | Description | +| ------------ | ----------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. | ## TextCategorizer.labels {#labels tag="property"} @@ -328,3 +330,21 @@ The labels currently added to the component. | Name | Type | Description | | ----------- | ----- | ---------------------------------- | | **RETURNS** | tuple | The labels added to the component. | + +## Serialization fields {#serialization-fields} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. + +> #### Example +> +> ```python +> data = textcat.to_disk("/path", exclude=["vocab"]) +> ``` + +| Name | Description | +| ------- | -------------------------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `cfg` | The config file. You usually don't want to exclude this. | +| `model` | The binary model data. You usually don't want to exclude this. | diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 2f0cc0542..50f4fceae 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -127,9 +127,10 @@ Serialize the tokenizer to disk. > tokenizer.to_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Tokenizer.from_disk {#from_disk tag="method"} @@ -145,6 +146,7 @@ Load the tokenizer from disk. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ---------------- | -------------------------------------------------------------------------- | | `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | ## Tokenizer.to_bytes {#to_bytes tag="method"} @@ -158,10 +160,10 @@ Load the tokenizer from disk. Modifies the object in place and returns it. Serialize the tokenizer to a bytestring. 
-| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------- | -| `**exclude` | - | Named attributes to prevent from being serialized. | -| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------------------- | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. | ## Tokenizer.from_bytes {#from_bytes tag="method"} @@ -176,11 +178,11 @@ it. > tokenizer.from_bytes(tokenizer_bytes) > ``` -| Name | Type | Description | -| ------------ | ----------- | ---------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `**exclude` | - | Named attributes to prevent from being loaded. | -| **RETURNS** | `Tokenizer` | The `Tokenizer` object. | +| Name | Type | Description | +| ------------ | ----------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tokenizer` | The `Tokenizer` object. | ## Attributes {#attributes} @@ -190,3 +192,25 @@ it. | `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | | `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | | `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | + +## Serialization fields {#serialization-fields} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. + +> #### Example +> +> ```python +> data = tokenizer.to_bytes(exclude=["vocab", "exceptions"]) +> tokenizer.from_disk("./data", exclude=["token_match"]) +> ``` + +| Name | Description | +| ---------------- | --------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `prefix_search` | The prefix rules. | +| `suffix_search` | The suffix rules. | +| `infix_finditer` | The infix rules. | +| `token_match` | The token match expression. | +| `exceptions` | The tokenizer exception rules. | diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 62e91edc7..a9c9f93d6 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -311,10 +311,9 @@ Save the current state to a directory. > > ``` -| Name | Type | Description | -| ----------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `**exclude` | - | Named attributes to prevent from being saved. | +| Name | Type | Description | +| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. 
Paths may be either strings or `Path`-like objects. | ## Vectors.from_disk {#from_disk tag="method"} @@ -342,10 +341,9 @@ Serialize the current state to a binary string. > vectors_bytes = vectors.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------- | -| `**exclude` | - | Named attributes to prevent from being serialized. | -| **RETURNS** | bytes | The serialized form of the `Vectors` object. | +| Name | Type | Description | +| ----------- | ----- | -------------------------------------------- | +| **RETURNS** | bytes | The serialized form of the `Vectors` object. | ## Vectors.from_bytes {#from_bytes tag="method"} @@ -360,11 +358,10 @@ Load state from a binary string. > new_vectors.from_bytes(vectors_bytes) > ``` -| Name | Type | Description | -| ----------- | --------- | ---------------------------------------------- | -| `data` | bytes | The data to load from. | -| `**exclude` | - | Named attributes to prevent from being loaded. | -| **RETURNS** | `Vectors` | The `Vectors` object. | +| Name | Type | Description | +| ----------- | --------- | ---------------------- | +| `data` | bytes | The data to load from. | +| **RETURNS** | `Vectors` | The `Vectors` object. | ## Attributes {#attributes} diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index 1e962ac74..64e153331 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -221,9 +221,10 @@ Save the current state to a directory. > nlp.vocab.to_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | ## Vocab.from_disk {#from_disk tag="method" new="2"} @@ -239,6 +240,7 @@ Loads state from a directory. Modifies the object in place and returns it. | Name | Type | Description | | ----------- | ---------------- | -------------------------------------------------------------------------- | | `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `Vocab` | The modified `Vocab` object. | ## Vocab.to_bytes {#to_bytes tag="method"} @@ -251,10 +253,10 @@ Serialize the current state to a binary string. > vocab_bytes = nlp.vocab.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------- | -| `**exclude` | - | Named attributes to prevent from being serialized. | -| **RETURNS** | bytes | The serialized form of the `Vocab` object. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------------------- | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. 
| +| **RETURNS** | bytes | The serialized form of the `Vocab` object. | ## Vocab.from_bytes {#from_bytes tag="method"} @@ -269,11 +271,11 @@ Load state from a binary string. > vocab.from_bytes(vocab_bytes) > ``` -| Name | Type | Description | -| ------------ | ------- | ---------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `**exclude` | - | Named attributes to prevent from being loaded. | -| **RETURNS** | `Vocab` | The `Vocab` object. | +| Name | Type | Description | +| ------------ | ------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Vocab` | The `Vocab` object. | ## Attributes {#attributes} @@ -291,3 +293,22 @@ Load state from a binary string. | `strings` | `StringStore` | A table managing the string-to-int mapping. | | `vectors` 2 | `Vectors` | A table associating word IDs to word vectors. | | `vectors_length` | int | Number of dimensions for each word vector. | + +## Serialization fields {#serialization-fields} + +During serialization, spaCy will export several data fields used to restore +different aspects of the object. If needed, you can exclude them from +serialization by passing in the string names via the `exclude` argument. + +> #### Example +> +> ```python +> data = vocab.to_bytes(exclude=["strings", "vectors"]) +> vocab.from_disk("./vocab", exclude=["strings"]) +> ``` + +| Name | Description | +| --------- | ----------------------------------------------------- | +| `strings` | The strings in the [`StringStore`](/api/stringstore). | +| `lexemes` | The lexeme data. | +| `vectors` | The word vectors, if available. |
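
A minimal usage sketch of the `exclude`-based serialization API introduced by this patch, for reviewers who want to try it end to end. It assumes spaCy v2.1 with these changes applied; the blank `English` pipeline, the sample text and the chosen field names are arbitrary illustration choices, not part of the patch itself.

```python
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()  # blank pipeline, no pretrained model needed
doc = nlp("This is a test.")

# Doc: skip user data and the tensor when serializing.
doc_bytes = doc.to_bytes(exclude=["user_data", "tensor"])
doc2 = Doc(nlp.vocab).from_bytes(doc_bytes)

# Vocab: skip the (potentially large) word vectors.
vocab_bytes = nlp.vocab.to_bytes(exclude=["vectors"])

# Language: exclude pipeline components or serialization fields by name.
nlp_bytes = nlp.to_bytes(exclude=["tokenizer"])

# Old-style keyword arguments for known fields now raise a ValueError
# (Errors.E128). Objects that route kwargs through the new
# util.get_serialization_exclude helper still accept the legacy
# vocab=False keyword, but emit a W015 deprecation warning instead.
try:
    doc.to_bytes(user_data=False)
except ValueError:
    print("Use doc.to_bytes(exclude=['user_data']) instead")
```

The field names used above correspond to the "Serialization fields" tables added to the API docs in this patch.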