diff --git a/.github/contributors/juliensalinas.md b/.github/contributors/juliensalinas.md new file mode 100644 index 000000000..0062426ba --- /dev/null +++ b/.github/contributors/juliensalinas.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +| ----------------------------- | ------------------- | +| Name | Julien Salinas | +| Company name (if applicable) | NLP Cloud | +| Title or role (if applicable) | Founder and CTO | +| Date | May 14th 2021 | +| GitHub username | juliensalinas | +| Website (optional) | https://nlpcloud.io | diff --git a/azure-pipelines.yml b/azure-pipelines.yml index bea65cae2..5840b916b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -82,18 +82,18 @@ jobs: python_version: '$(python.version)' architecture: 'x64' - - job: "TestGPU" - dependsOn: "Validate" - strategy: - matrix: - Python38LinuxX64_GPU: - python.version: '3.8' - pool: - name: "LinuxX64_GPU" - steps: - - template: .github/azure-steps.yml - parameters: - python_version: '$(python.version)' - architecture: 'x64' - gpu: true - num_build_jobs: 24 +# - job: "TestGPU" +# dependsOn: "Validate" +# strategy: +# matrix: +# Python38LinuxX64_GPU: +# python.version: '3.8' +# pool: +# name: "LinuxX64_GPU" +# steps: +# - template: .github/azure-steps.yml +# parameters: +# python_version: '$(python.version)' +# architecture: 'x64' +# gpu: true +# num_build_jobs: 24 diff --git a/requirements.txt b/requirements.txt index 09d1cabda..dda9c7773 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ pathy>=0.3.5 numpy>=1.15.0 requests>=2.13.0,<3.0.0 tqdm>=4.38.0,<5.0.0 -pydantic>=1.7.1,<1.8.0 +pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0 jinja2 # Official Python utilities setuptools diff --git a/setup.cfg b/setup.cfg index 5cda00fb2..846ccf4b3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -52,7 +52,7 @@ install_requires = tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 requests>=2.13.0,<3.0.0 - pydantic>=1.7.1,<1.8.0 + pydantic>=1.7.4,!=1.8,!=1.8.1,<1.9.0 jinja2 # Official Python utilities setuptools diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 1ebf65957..b4119abdf 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py
@@ -174,7 +174,8 @@ def debug_data( n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values()) msg.warn( "{} words in training data without vectors ({:.0f}%)".format( - n_missing_vectors, 100 * (n_missing_vectors / gold_train_data["n_words"]) + n_missing_vectors, + 100 * (n_missing_vectors / gold_train_data["n_words"]), ), ) msg.text( @@ -282,42 +283,7 @@ def debug_data( labels = _get_labels_from_model(nlp, "textcat") msg.info(f"Text Classification: {len(labels)} label(s)") msg.text(f"Labels: {_format_labels(labels)}", show=verbose) - labels_with_counts = _format_labels( - gold_train_data["cats"].most_common(), counts=True - ) - msg.text(f"Labels in train data: {labels_with_counts}", show=verbose) - missing_labels = labels - set(gold_train_data["cats"].keys()) - if missing_labels: - msg.warn( - "Some model labels are not present in the train data. The " - "model performance may be degraded for these labels after " - f"training: {_format_labels(missing_labels)}." - ) - if gold_train_data["n_cats_multilabel"] > 0: - # Note: you should never get here because you run into E895 on - # initialization first. - msg.warn( - "The train data contains instances without " - "mutually-exclusive classes. Use the component " - "'textcat_multilabel' instead of 'textcat'." - ) - if gold_dev_data["n_cats_multilabel"] > 0: - msg.fail( - "Train/dev mismatch: the dev data contains instances " - "without mutually-exclusive classes while the train data " - "contains only instances with mutually-exclusive classes." - ) - - if "textcat_multilabel" in factory_names: - msg.divider("Text Classification (Multilabel)") - labels = _get_labels_from_model(nlp, "textcat_multilabel") - msg.info(f"Text Classification: {len(labels)} label(s)") - msg.text(f"Labels: {_format_labels(labels)}", show=verbose) - labels_with_counts = _format_labels( - gold_train_data["cats"].most_common(), counts=True - ) - msg.text(f"Labels in train data: {labels_with_counts}", show=verbose) - missing_labels = labels - set(gold_train_data["cats"].keys()) + missing_labels = labels - set(gold_train_data["cats"]) if missing_labels: msg.warn( "Some model labels are not present in the train data. The " @@ -325,17 +291,76 @@ def debug_data( f"training: {_format_labels(missing_labels)}." ) if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]): - msg.fail( - f"The train and dev labels are not the same. " + msg.warn( + "Potential train/dev mismatch: the train and dev labels are " + "not the same. " f"Train labels: {_format_labels(gold_train_data['cats'])}. " f"Dev labels: {_format_labels(gold_dev_data['cats'])}." ) + if len(labels) < 2: + msg.fail( + "The model does not have enough labels. 'textcat' requires at " + "least two labels due to mutually-exclusive classes, e.g. " + "LABEL/NOT_LABEL or POSITIVE/NEGATIVE for a binary " + "classification task." + ) + if ( + gold_train_data["n_cats_bad_values"] > 0 + or gold_dev_data["n_cats_bad_values"] > 0 + ): + msg.fail( + "Unsupported values for cats: the supported values are " + "1.0/True and 0.0/False." + ) + if gold_train_data["n_cats_multilabel"] > 0: + # Note: you should never get here because you run into E895 on + # initialization first. + msg.fail( + "The train data contains instances without mutually-exclusive " + "classes. Use the component 'textcat_multilabel' instead of " + "'textcat'." + ) + if gold_dev_data["n_cats_multilabel"] > 0: + msg.fail( + "The dev data contains instances without mutually-exclusive " + "classes. 
Use the component 'textcat_multilabel' instead of " + "'textcat'." + ) + + if "textcat_multilabel" in factory_names: + msg.divider("Text Classification (Multilabel)") + labels = _get_labels_from_model(nlp, "textcat_multilabel") + msg.info(f"Text Classification: {len(labels)} label(s)") + msg.text(f"Labels: {_format_labels(labels)}", show=verbose) + missing_labels = labels - set(gold_train_data["cats"]) + if missing_labels: + msg.warn( + "Some model labels are not present in the train data. The " + "model performance may be degraded for these labels after " + f"training: {_format_labels(missing_labels)}." + ) + if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]): + msg.warn( + "Potential train/dev mismatch: the train and dev labels are " + "not the same. " + f"Train labels: {_format_labels(gold_train_data['cats'])}. " + f"Dev labels: {_format_labels(gold_dev_data['cats'])}." + ) + if ( + gold_train_data["n_cats_bad_values"] > 0 + or gold_dev_data["n_cats_bad_values"] > 0 + ): + msg.fail( + "Unsupported values for cats: the supported values are " + "1.0/True and 0.0/False." + ) if gold_train_data["n_cats_multilabel"] > 0: if gold_dev_data["n_cats_multilabel"] == 0: msg.warn( "Potential train/dev mismatch: the train data contains " "instances without mutually-exclusive classes while the " - "dev data does not." + "dev data contains only instances with mutually-exclusive " + "classes." ) else: msg.warn( @@ -556,6 +581,7 @@ def _compile_gold( "n_nonproj": 0, "n_cycles": 0, "n_cats_multilabel": 0, + "n_cats_bad_values": 0, "texts": set(), } for eg in examples: @@ -599,7 +625,9 @@ def _compile_gold( data["ner"]["-"] += 1 if "textcat" in factory_names or "textcat_multilabel" in factory_names: data["cats"].update(gold.cats) - if list(gold.cats.values()).count(1.0) != 1: + if any(val not in (0, 1) for val in gold.cats.values()): + data["n_cats_bad_values"] += 1 + if list(gold.cats.values()).count(1) != 1: data["n_cats_multilabel"] += 1 if "tagger" in factory_names: tags = eg.get_aligned("TAG", as_string=True) diff --git a/spacy/errors.py b/spacy/errors.py index 7cf9e54e4..7be118503 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -375,21 +375,10 @@ class Errors: E125 = ("Unexpected value: {value}") E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. " "This is likely a bug in spaCy, so feel free to open an issue.") - E129 = ("Cannot write the label of an existing Span object because a Span " - "is a read-only view of the underlying Token objects stored in the " - "Doc. Instead, create a new Span object and specify the `label` " - "keyword argument, for example:\nfrom spacy.tokens import Span\n" - "span = Span(doc, start={start}, end={end}, label='{label}')") E130 = ("You are running a narrow unicode build, which is incompatible " "with spacy >= 2.1.0. To fix this, reinstall Python and use a wide " "unicode build instead. You can also rebuild Python and set the " "`--enable-unicode=ucs4 flag`.") - E131 = ("Cannot write the kb_id of an existing Span object because a Span " - "is a read-only view of the underlying Token objects stored in " - "the Doc. 
Instead, create a new Span object and specify the " - "`kb_id` keyword argument, for example:\nfrom spacy.tokens " - "import Span\nspan = Span(doc, start={start}, end={end}, " - "label='{label}', kb_id='{kb_id}')") E132 = ("The vectors for entities and probabilities for alias '{alias}' " "should have equal length, but found {entities_length} and " "{probabilities_length} respectively.") @@ -501,6 +490,12 @@ class Errors: E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.") # New errors added in v3.x + E870 = ("Could not serialize the DocBin because it is too large. Consider " + "splitting up your documents into several doc bins and serializing " + "each separately. spacy.Corpus.v1 will search recursively for all " + "*.spacy files if you provide a directory instead of a filename as " + "the 'path'.") + E871 = ("Error encountered in nlp.pipe with multiprocessing:\n\n{error}") E872 = ("Unable to copy tokenizer from base model due to different " 'tokenizer settings: current tokenizer config "{curr_config}" ' 'vs. base model "{base_config}"') diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 4d02b89d0..d8514b54c 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -93,6 +93,15 @@ cdef class KnowledgeBase: self.vocab = vocab self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) + def initialize_entities(self, int64_t nr_entities): + self._entry_index = PreshMap(nr_entities + 1) + self._entries = entry_vec(nr_entities + 1) + self._vectors_table = float_matrix(nr_entities + 1) + + def initialize_aliases(self, int64_t nr_aliases): + self._alias_index = PreshMap(nr_aliases + 1) + self._aliases_table = alias_vec(nr_aliases + 1) + @property def entity_vector_length(self): """RETURNS (uint64): length of the entity vectors""" @@ -144,8 +153,7 @@ cdef class KnowledgeBase: raise ValueError(Errors.E140) nr_entities = len(set(entity_list)) - self._entry_index = PreshMap(nr_entities+1) - self._entries = entry_vec(nr_entities+1) + self.initialize_entities(nr_entities) i = 0 cdef KBEntryC entry @@ -325,6 +333,102 @@ cdef class KnowledgeBase: return 0.0 + def to_bytes(self, **kwargs): + """Serialize the current state to a binary string. 
+ """ + def serialize_header(): + header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length) + return srsly.json_dumps(header) + + def serialize_entries(): + i = 1 + tuples = [] + for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]): + entry = self._entries[entry_index] + assert entry.entity_hash == entry_hash + assert entry_index == i + tuples.append((entry.entity_hash, entry.freq, entry.vector_index)) + i = i + 1 + return srsly.json_dumps(tuples) + + def serialize_aliases(): + i = 1 + headers = [] + indices_lists = [] + probs_lists = [] + for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]): + alias = self._aliases_table[alias_index] + assert alias_index == i + candidate_length = len(alias.entry_indices) + headers.append((alias_hash, candidate_length)) + indices_lists.append(alias.entry_indices) + probs_lists.append(alias.probs) + i = i + 1 + headers_dump = srsly.json_dumps(headers) + indices_dump = srsly.json_dumps(indices_lists) + probs_dump = srsly.json_dumps(probs_lists) + return srsly.json_dumps((headers_dump, indices_dump, probs_dump)) + + serializers = { + "header": serialize_header, + "entity_vectors": lambda: srsly.json_dumps(self._vectors_table), + "entries": serialize_entries, + "aliases": serialize_aliases, + } + return util.to_bytes(serializers, []) + + def from_bytes(self, bytes_data, *, exclude=tuple()): + """Load state from a binary string. + """ + def deserialize_header(b): + header = srsly.json_loads(b) + nr_entities = header[0] + nr_aliases = header[1] + entity_vector_length = header[2] + self.initialize_entities(nr_entities) + self.initialize_aliases(nr_aliases) + self.entity_vector_length = entity_vector_length + + def deserialize_vectors(b): + self._vectors_table = srsly.json_loads(b) + + def deserialize_entries(b): + cdef KBEntryC entry + tuples = srsly.json_loads(b) + i = 1 + for (entity_hash, freq, vector_index) in tuples: + entry.entity_hash = entity_hash + entry.freq = freq + entry.vector_index = vector_index + entry.feats_row = -1 # Features table currently not implemented + self._entries[i] = entry + self._entry_index[entity_hash] = i + i += 1 + + def deserialize_aliases(b): + cdef AliasC alias + i = 1 + all_data = srsly.json_loads(b) + headers = srsly.json_loads(all_data[0]) + indices = srsly.json_loads(all_data[1]) + probs = srsly.json_loads(all_data[2]) + for header, indices, probs in zip(headers, indices, probs): + alias_hash, candidate_length = header + alias.entry_indices = indices + alias.probs = probs + self._aliases_table[i] = alias + self._alias_index[alias_hash] = i + i += 1 + + setters = { + "header": deserialize_header, + "entity_vectors": deserialize_vectors, + "entries": deserialize_entries, + "aliases": deserialize_aliases, + } + util.from_bytes(bytes_data, setters, exclude) + return self + def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): path = ensure_path(path) if not path.exists(): @@ -404,10 +508,8 @@ cdef class KnowledgeBase: cdef int64_t entity_vector_length reader.read_header(&nr_entities, &entity_vector_length) + self.initialize_entities(nr_entities) self.entity_vector_length = entity_vector_length - self._entry_index = PreshMap(nr_entities+1) - self._entries = entry_vec(nr_entities+1) - self._vectors_table = float_matrix(nr_entities+1) # STEP 1: load entity vectors cdef int i = 0 @@ -445,8 +547,7 @@ cdef class KnowledgeBase: # STEP 3: load aliases cdef int64_t nr_aliases reader.read_alias_length(&nr_aliases) - 
self._alias_index = PreshMap(nr_aliases+1) - self._aliases_table = alias_vec(nr_aliases+1) + self.initialize_aliases(nr_aliases) cdef int64_t nr_candidates cdef vector[int64_t] entry_indices diff --git a/spacy/language.py b/spacy/language.py index dd68f8c8f..6c2feaa72 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -13,6 +13,7 @@ import srsly import multiprocessing as mp from itertools import chain, cycle from timeit import default_timer as timer +import traceback from .tokens.underscore import Underscore from .vocab import Vocab, create_vocab @@ -1538,11 +1539,15 @@ class Language: # Cycle channels not to break the order of docs. # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable. - byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch)) - docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs) + byte_tuples = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch)) try: - for i, (_, doc) in enumerate(zip(raw_texts, docs), 1): - yield doc + for i, (_, (byte_doc, byte_error)) in enumerate(zip(raw_texts, byte_tuples), 1): + if byte_doc is not None: + doc = Doc(self.vocab).from_bytes(byte_doc) + yield doc + elif byte_error is not None: + error = srsly.msgpack_loads(byte_error) + self.default_error_handler(None, None, None, ValueError(Errors.E871.format(error=error))) if i % batch_size == 0: # tell `sender` that one batch was consumed. sender.step() @@ -2036,12 +2041,19 @@ def _apply_pipes( """ Underscore.load_state(underscore_state) while True: - texts = receiver.get() - docs = (make_doc(text) for text in texts) - for pipe in pipes: - docs = pipe(docs) - # Connection does not accept unpickable objects, so send list. - sender.send([doc.to_bytes() for doc in docs]) + try: + texts = receiver.get() + docs = (make_doc(text) for text in texts) + for pipe in pipes: + docs = pipe(docs) + # Connection does not accept unpickable objects, so send list. + byte_docs = [(doc.to_bytes(), None) for doc in docs] + padding = [(None, None)] * (len(texts) - len(byte_docs)) + sender.send(byte_docs + padding) + except Exception: + error_msg = [(None, srsly.msgpack_dumps(traceback.format_exc()))] + padding = [(None, None)] * (len(texts) - 1) + sender.send(error_msg + padding) class _Sender: diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 66070916e..002ea71a7 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -408,6 +408,48 @@ class EntityLinker(TrainablePipe): validate_examples(examples, "EntityLinker.score") return Scorer.score_links(examples, negative_labels=[self.NIL]) + def to_bytes(self, *, exclude=tuple()): + """Serialize the pipe to a bytestring. + + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (bytes): The serialized object. + + DOCS: https://spacy.io/api/entitylinker#to_bytes + """ + self._validate_serialization_attrs() + serialize = {} + if hasattr(self, "cfg") and self.cfg is not None: + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + serialize["vocab"] = self.vocab.to_bytes + serialize["kb"] = self.kb.to_bytes + serialize["model"] = self.model.to_bytes + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, *, exclude=tuple()): + """Load the pipe from a bytestring. + + exclude (Iterable[str]): String names of serialization fields to exclude. + RETURNS (TrainablePipe): The loaded object. 
+ + DOCS: https://spacy.io/api/entitylinker#from_bytes + """ + self._validate_serialization_attrs() + + def load_model(b): + try: + self.model.from_bytes(b) + except AttributeError: + raise ValueError(Errors.E149) from None + + deserialize = {} + if hasattr(self, "cfg") and self.cfg is not None: + deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) + deserialize["kb"] = lambda b: self.kb.from_bytes(b) + deserialize["model"] = load_model + util.from_bytes(bytes_data, deserialize, exclude) + return self + def to_disk( self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> None: diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 45d9c9aa0..17997279d 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -1,4 +1,6 @@ import pytest +import numpy +from numpy.testing import assert_array_equal from spacy.attrs import ORTH, LENGTH from spacy.tokens import Doc, Span, Token from spacy.vocab import Vocab @@ -120,6 +122,17 @@ def test_spans_lca_matrix(en_tokenizer): assert lca[1, 0] == 1 # slept & dog -> slept assert lca[1, 1] == 1 # slept & slept -> slept + # example from Span API docs + tokens = en_tokenizer("I like New York in Autumn") + doc = Doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=[1, 1, 3, 1, 3, 4], + deps=["dep"] * len(tokens), + ) + lca = doc[1:4].get_lca_matrix() + assert_array_equal(lca, numpy.asarray([[0, 0, 0], [0, 1, 2], [0, 2, 2]])) + def test_span_similarity_match(): doc = Doc(Vocab(), words=["a", "b", "a", "b"]) @@ -266,16 +279,10 @@ def test_span_string_label_kb_id(doc): assert span.kb_id == doc.vocab.strings["Q342"] -def test_span_label_readonly(doc): +def test_span_attrs_writable(doc): span = Span(doc, 0, 1) - with pytest.raises(NotImplementedError): - span.label_ = "hello" - - -def test_span_kb_id_readonly(doc): - span = Span(doc, 0, 1) - with pytest.raises(NotImplementedError): - span.kb_id_ = "Q342" + span.label_ = "label" + span.kb_id_ = "kb_id" def test_span_ents_property(doc): diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 4883cceb8..a7f9364e9 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -2,7 +2,7 @@ from typing import Callable, Iterable import pytest from numpy.testing import assert_equal from spacy.attrs import ENT_KB_ID - +from spacy.compat import pickle from spacy.kb import KnowledgeBase, get_candidates, Candidate from spacy.vocab import Vocab @@ -11,7 +11,7 @@ from spacy.ml import load_kb from spacy.scorer import Scorer from spacy.training import Example from spacy.lang.en import English -from spacy.tests.util import make_tempdir +from spacy.tests.util import make_tempdir, make_tempfile from spacy.tokens import Span @@ -290,6 +290,9 @@ def test_vocab_serialization(nlp): assert candidates[0].alias == adam_hash assert candidates[0].alias_ == "adam" + assert kb_new_vocab.get_vector("Q2") == [2] + assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) + def test_append_alias(nlp): """Test that we can append additional alias-entity pairs""" @@ -546,6 +549,98 @@ def test_kb_serialization(): assert "RandomWord" in nlp2.vocab.strings +@pytest.mark.xfail(reason="Needs fixing") +def test_kb_pickle(): + # Test that the KB can be pickled + nlp = English() + kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3) + kb_1.add_entity(entity="Q2146908", freq=12, 
entity_vector=[6, -4, 3]) + assert not kb_1.contains_alias("Russ Cochran") + kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) + assert kb_1.contains_alias("Russ Cochran") + data = pickle.dumps(kb_1) + kb_2 = pickle.loads(data) + assert kb_2.contains_alias("Russ Cochran") + + +@pytest.mark.xfail(reason="Needs fixing") +def test_nel_pickle(): + # Test that a pipeline with an EL component can be pickled + def create_kb(vocab): + kb = KnowledgeBase(vocab, entity_vector_length=3) + kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) + return kb + + nlp_1 = English() + nlp_1.add_pipe("ner") + entity_linker_1 = nlp_1.add_pipe("entity_linker", last=True) + entity_linker_1.set_kb(create_kb) + assert nlp_1.pipe_names == ["ner", "entity_linker"] + assert entity_linker_1.kb.contains_alias("Russ Cochran") + + data = pickle.dumps(nlp_1) + nlp_2 = pickle.loads(data) + assert nlp_2.pipe_names == ["ner", "entity_linker"] + entity_linker_2 = nlp_2.get_pipe("entity_linker") + assert entity_linker_2.kb.contains_alias("Russ Cochran") + + +def test_kb_to_bytes(): + # Test that the KB's to_bytes method works correctly + nlp = English() + kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3) + kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3]) + kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) + kb_1.add_alias(alias="Boeing", entities=["Q66"], probabilities=[0.5]) + kb_1.add_alias(alias="Randomness", entities=["Q66", "Q2146908"], probabilities=[0.1, 0.2]) + assert kb_1.contains_alias("Russ Cochran") + kb_bytes = kb_1.to_bytes() + kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3) + assert not kb_2.contains_alias("Russ Cochran") + kb_2 = kb_2.from_bytes(kb_bytes) + # check that both KBs are exactly the same + assert kb_1.get_size_entities() == kb_2.get_size_entities() + assert kb_1.entity_vector_length == kb_2.entity_vector_length + assert kb_1.get_entity_strings() == kb_2.get_entity_strings() + assert kb_1.get_vector("Q2146908") == kb_2.get_vector("Q2146908") + assert kb_1.get_vector("Q66") == kb_2.get_vector("Q66") + assert kb_2.contains_alias("Russ Cochran") + assert kb_1.get_size_aliases() == kb_2.get_size_aliases() + assert kb_1.get_alias_strings() == kb_2.get_alias_strings() + assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(kb_2.get_alias_candidates("Russ Cochran")) + assert len(kb_1.get_alias_candidates("Randomness")) == len(kb_2.get_alias_candidates("Randomness")) + + +def test_nel_to_bytes(): + # Test that a pipeline with an EL component can be converted to bytes + def create_kb(vocab): + kb = KnowledgeBase(vocab, entity_vector_length=3) + kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]) + return kb + + nlp_1 = English() + nlp_1.add_pipe("ner") + entity_linker_1 = nlp_1.add_pipe("entity_linker", last=True) + entity_linker_1.set_kb(create_kb) + assert entity_linker_1.kb.contains_alias("Russ Cochran") + assert nlp_1.pipe_names == ["ner", "entity_linker"] + + nlp_bytes = nlp_1.to_bytes() + nlp_2 = English() + nlp_2.add_pipe("ner") + nlp_2.add_pipe("entity_linker", last=True) + assert nlp_2.pipe_names == ["ner", "entity_linker"] + assert not nlp_2.get_pipe("entity_linker").kb.contains_alias("Russ Cochran") + nlp_2 = nlp_2.from_bytes(nlp_bytes) + kb_2 
= nlp_2.get_pipe("entity_linker").kb + assert kb_2.contains_alias("Russ Cochran") + assert kb_2.get_vector("Q2146908") == [6, -4, 3] + assert_almost_equal(kb_2.get_prior_prob(entity="Q2146908", alias="Russ Cochran"), 0.8) + + def test_scorer_links(): train_examples = [] nlp = English() diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index 837c128af..5ce2549aa 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -64,13 +64,15 @@ def test_serialize_doc_span_groups(en_vocab): def test_serialize_doc_bin(): - doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True) + doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True) texts = ["Some text", "Lots of texts...", "..."] cats = {"A": 0.5} nlp = English() for doc in nlp.pipe(texts): doc.cats = cats doc.spans["start"] = [doc[0:2]] + doc[0].norm_ = "UNUSUAL_TOKEN_NORM" + doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID" doc_bin.add(doc) bytes_data = doc_bin.to_bytes() @@ -82,6 +84,8 @@ def test_serialize_doc_bin(): assert doc.text == texts[i] assert doc.cats == cats assert len(doc.spans) == 1 + assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM" + assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID" def test_serialize_doc_bin_unknown_spaces(en_vocab): diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 7fb03da0c..86cce5f9e 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -8,13 +8,36 @@ from spacy.vocab import Vocab from spacy.training import Example from spacy.lang.en import English from spacy.lang.de import German -from spacy.util import registry, ignore_error, raise_error +from spacy.util import registry, ignore_error, raise_error, logger import spacy from thinc.api import NumpyOps, get_current_ops from .util import add_vecs_to_vocab, assert_docs_equal +def evil_component(doc): + if "2" in doc.text: + raise ValueError("no dice") + return doc + + +def perhaps_set_sentences(doc): + if not doc.text.startswith("4"): + doc[-1].is_sent_start = True + return doc + + +def assert_sents_error(doc): + if not doc.has_annotation("SENT_START"): + raise ValueError("no sents") + return doc + + +def warn_error(proc_name, proc, docs, e): + logger = logging.getLogger("spacy") + logger.warning(f"Trouble with component {proc_name}.") + + @pytest.fixture def nlp(): nlp = Language(Vocab()) @@ -93,19 +116,16 @@ def test_evaluate_no_pipe(nlp): nlp.evaluate([Example.from_dict(doc, annots)]) -@Language.component("test_language_vector_modification_pipe") def vector_modification_pipe(doc): doc.vector += 1 return doc -@Language.component("test_language_userdata_pipe") def userdata_pipe(doc): doc.user_data["foo"] = "bar" return doc -@Language.component("test_language_ner_pipe") def ner_pipe(doc): span = Span(doc, 0, 1, label="FIRST") doc.ents += (span,) @@ -123,6 +143,9 @@ def sample_vectors(): @pytest.fixture def nlp2(nlp, sample_vectors): + Language.component("test_language_vector_modification_pipe", func=vector_modification_pipe) + Language.component("test_language_userdata_pipe", func=userdata_pipe) + Language.component("test_language_ner_pipe", func=ner_pipe) add_vecs_to_vocab(nlp.vocab, sample_vectors) nlp.add_pipe("test_language_vector_modification_pipe") nlp.add_pipe("test_language_ner_pipe") @@ -168,82 +191,115 @@ def test_language_pipe_stream(nlp2, n_process, texts): assert_docs_equal(doc, expected_doc) -def test_language_pipe_error_handler(): 
+@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler(n_process): """Test that the error handling of nlp.pipe works well""" - nlp = English() - nlp.add_pipe("merge_subtokens") - nlp.initialize() - texts = ["Curious to see what will happen to this text.", "And this one."] - # the pipeline fails because there's no parser - with pytest.raises(ValueError): + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + nlp = English() + nlp.add_pipe("merge_subtokens") + nlp.initialize() + texts = ["Curious to see what will happen to this text.", "And this one."] + # the pipeline fails because there's no parser + with pytest.raises(ValueError): + nlp(texts[0]) + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + nlp.set_error_handler(raise_error) + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + # set explicitely to ignoring + nlp.set_error_handler(ignore_error) + docs = list(nlp.pipe(texts, n_process=n_process)) + assert len(docs) == 0 nlp(texts[0]) - with pytest.raises(ValueError): - list(nlp.pipe(texts)) - nlp.set_error_handler(raise_error) - with pytest.raises(ValueError): - list(nlp.pipe(texts)) - # set explicitely to ignoring - nlp.set_error_handler(ignore_error) - docs = list(nlp.pipe(texts)) - assert len(docs) == 0 - nlp(texts[0]) -def test_language_pipe_error_handler_custom(en_vocab): +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler_custom(en_vocab, n_process): """Test the error handling of a custom component that has no pipe method""" + Language.component("my_evil_component", func=evil_component) + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + nlp = English() + nlp.add_pipe("my_evil_component") + texts = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"] + with pytest.raises(ValueError): + # the evil custom component throws an error + list(nlp.pipe(texts)) - @Language.component("my_evil_component") - def evil_component(doc): - if "2" in doc.text: - raise ValueError("no dice") - return doc - - def warn_error(proc_name, proc, docs, e): - from spacy.util import logger - - logger.warning(f"Trouble with component {proc_name}.") - - nlp = English() - nlp.add_pipe("my_evil_component") - nlp.initialize() - texts = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"] - with pytest.raises(ValueError): - # the evil custom component throws an error - list(nlp.pipe(texts)) - - nlp.set_error_handler(warn_error) - logger = logging.getLogger("spacy") - with mock.patch.object(logger, "warning") as mock_warning: - # the errors by the evil custom component raise a warning for each bad batch - docs = list(nlp.pipe(texts)) - mock_warning.assert_called() - assert mock_warning.call_count == 2 - assert len(docs) + mock_warning.call_count == len(texts) - assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"] + nlp.set_error_handler(warn_error) + logger = logging.getLogger("spacy") + with mock.patch.object(logger, "warning") as mock_warning: + # the errors by the evil custom component raise a warning for each + # bad doc + docs = list(nlp.pipe(texts, n_process=n_process)) + # HACK/TODO? 
the warnings in child processes don't seem to be + # detected by the mock logger + if n_process == 1: + mock_warning.assert_called() + assert mock_warning.call_count == 2 + assert len(docs) + mock_warning.call_count == len(texts) + assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"] -def test_language_pipe_error_handler_pipe(en_vocab): +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler_pipe(en_vocab, n_process): """Test the error handling of a component's pipe method""" + Language.component("my_perhaps_sentences", func=perhaps_set_sentences) + Language.component("assert_sents_error", func=assert_sents_error) + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + texts = [f"{str(i)} is enough. Done" for i in range(100)] + nlp = English() + nlp.add_pipe("my_perhaps_sentences") + nlp.add_pipe("assert_sents_error") + nlp.initialize() + with pytest.raises(ValueError): + # assert_sents_error requires sentence boundaries, will throw an error otherwise + docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10)) + nlp.set_error_handler(ignore_error) + docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10)) + # we lose/ignore the failing 4,40-49 docs + assert len(docs) == 89 - @Language.component("my_sentences") - def perhaps_set_sentences(doc): - if not doc.text.startswith("4"): - doc[-1].is_sent_start = True - return doc - texts = [f"{str(i)} is enough. Done" for i in range(100)] - nlp = English() - nlp.add_pipe("my_sentences") - entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 3}) - entity_linker.kb.add_entity(entity="Q1", freq=12, entity_vector=[1, 2, 3]) - nlp.initialize() - with pytest.raises(ValueError): - # the entity linker requires sentence boundaries, will throw an error otherwise - docs = list(nlp.pipe(texts, batch_size=10)) - nlp.set_error_handler(ignore_error) - docs = list(nlp.pipe(texts, batch_size=10)) - # we lose/ignore the failing 0-9 and 40-49 batches - assert len(docs) == 80 +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler_make_doc_actual(n_process): + """Test the error handling for make_doc""" + # TODO: fix so that the following test is the actual behavior + + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + nlp = English() + nlp.max_length = 10 + texts = ["12345678901234567890", "12345"] * 10 + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + nlp.default_error_handler = ignore_error + if n_process == 1: + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + else: + docs = list(nlp.pipe(texts, n_process=n_process)) + assert len(docs) == 0 + + +@pytest.mark.xfail +@pytest.mark.parametrize("n_process", [1, 2]) +def test_language_pipe_error_handler_make_doc_preferred(n_process): + """Test the error handling for make_doc""" + + ops = get_current_ops() + if isinstance(ops, NumpyOps) or n_process < 2: + nlp = English() + nlp.max_length = 10 + texts = ["12345678901234567890", "12345"] * 10 + with pytest.raises(ValueError): + list(nlp.pipe(texts, n_process=n_process)) + nlp.default_error_handler = ignore_error + docs = list(nlp.pipe(texts, n_process=n_process)) + assert len(docs) == 0 def test_language_from_config_before_after_init(): diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index d5b4e4ff7..868eb3eab 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -103,10 +103,12 @@ class DocBin: 
self.strings.add(token.text) self.strings.add(token.tag_) self.strings.add(token.lemma_) + self.strings.add(token.norm_) self.strings.add(str(token.morph)) self.strings.add(token.dep_) self.strings.add(token.ent_type_) self.strings.add(token.ent_kb_id_) + self.strings.add(token.ent_id_) self.cats.append(doc.cats) self.user_data.append(srsly.msgpack_dumps(doc.user_data)) self.span_groups.append(doc.spans.to_bytes()) @@ -244,7 +246,10 @@ class DocBin: """ path = ensure_path(path) with path.open("wb") as file_: - file_.write(self.to_bytes()) + try: + file_.write(self.to_bytes()) + except ValueError: + raise ValueError(Errors.E870) def from_disk(self, path: Union[str, Path]) -> "DocBin": """Load the DocBin from a file (typically called .spacy). diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4858ad9dd..4ab8562c3 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1673,7 +1673,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): j_idx_in_sent = start + j - sent_start n_missing_tokens_in_sent = len(sent) - j_idx_in_sent # make sure we do not go past `end`, in cases where `end` < sent.end - max_range = min(j + n_missing_tokens_in_sent, end) + max_range = min(j + n_missing_tokens_in_sent, end - start) for k in range(j + 1, max_range): lca = _get_tokens_lca(token_j, doc[start + k]) # if lca is outside of span, we set it to -1 diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 05bbb8cc5..ef29ecd3c 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -740,7 +740,7 @@ cdef class Span: def __get__(self): return self.root.ent_id_ - def __set__(self, hash_t key): + def __set__(self, unicode key): raise NotImplementedError(Errors.E200.format(attr="ent_id_")) @property @@ -762,9 +762,7 @@ cdef class Span: return self.doc.vocab.strings[self.label] def __set__(self, unicode label_): - if not label_: - label_ = '' - raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_)) + self.label = self.doc.vocab.strings.add(label_) property kb_id_: """RETURNS (str): The named entity's KB ID.""" @@ -772,13 +770,7 @@ cdef class Span: return self.doc.vocab.strings[self.kb_id] def __set__(self, unicode kb_id_): - if not kb_id_: - kb_id_ = '' - current_label = self.label_ - if not current_label: - current_label = '' - raise NotImplementedError(Errors.E131.format(start=self.start, end=self.end, - label=current_label, kb_id=kb_id_)) + self.kb_id = self.doc.vocab.strings.add(kb_id_) cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py index c54242eae..e9fa86c83 100644 --- a/spacy/training/batchers.py +++ b/spacy/training/batchers.py @@ -66,7 +66,7 @@ def configure_minibatch_by_words( """ optionals = {"get_length": get_length} if get_length is not None else {} return partial( - minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals + minibatch_by_words, size=size, tolerance=tolerance, discard_oversize=discard_oversize, **optionals ) diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index 0e8e7eed0..42dae8fc4 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -71,6 +71,8 @@ def offsets_to_biluo_tags( entities (iterable): A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. + missing (str): The label used for missing values, e.g. 
if tokenization + doesn’t align with the entity offsets. Defaults to "O". RETURNS (list): A list of unicode strings, describing the tags. Each tag string will be of the form either "", "O" or "{action}-{label}", where action is one of "B", "I", "L", "U". The missing label is used where the @@ -150,7 +152,7 @@ def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]: to overwrite the doc.ents. doc (Doc): The document that the BILUO tags refer to. - entities (iterable): A sequence of BILUO tags with each tag describing one + tags (iterable): A sequence of BILUO tags with each tag describing one token. Each tag string will be of the form of either "", "O" or "{action}-{label}", where action is one of "B", "I", "L", "U". RETURNS (list): A sequence of Span objects. Each token with a missing IOB @@ -170,7 +172,7 @@ def biluo_tags_to_offsets( """Encode per-token tags following the BILUO scheme into entity offsets. doc (Doc): The document that the BILUO tags refer to. - entities (iterable): A sequence of BILUO tags with each tag describing one + tags (iterable): A sequence of BILUO tags with each tag describing one token. Each tags string will be of the form of either "", "O" or "{action}-{label}", where action is one of "B", "I", "L", "U". RETURNS (list): A sequence of `(start, end, label)` triples. `start` and diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index b3a1054fc..2994d934b 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -213,10 +213,10 @@ if there is no prediction. > kb_ids = entity_linker.predict([doc1, doc2]) > ``` -| Name | Description | -| ----------- | ------------------------------------------- | -| `docs` | The documents to predict. ~~Iterable[Doc]~~ | -| **RETURNS** | `List[str]` | The predicted KB identifiers for the entities in the `docs`. ~~List[str]~~ | +| Name | Description | +| ----------- | -------------------------------------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The predicted KB identifiers for the entities in the `docs`. ~~List[str]~~ | ## EntityLinker.set_annotations {#set_annotations tag="method"} @@ -341,6 +341,42 @@ Load the pipe from disk. Modifies the object in place and returns it. | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | | **RETURNS** | The modified `EntityLinker` object. ~~EntityLinker~~ | +## EntityLinker.to_bytes {#to_bytes tag="method"} + +> #### Example +> +> ```python +> entity_linker = nlp.add_pipe("entity_linker") +> entity_linker_bytes = entity_linker.to_bytes() +> ``` + +Serialize the pipe to a bytestring, including the `KnowledgeBase`. + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `EntityLinker` object. ~~bytes~~ | + +## EntityLinker.from_bytes {#from_bytes tag="method"} + +Load the pipe from a bytestring. Modifies the object in place and returns it. 
+ +> #### Example +> +> ```python +> entity_linker_bytes = entity_linker.to_bytes() +> entity_linker = nlp.add_pipe("entity_linker") +> entity_linker.from_bytes(entity_linker_bytes) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `EntityLinker` object. ~~EntityLinker~~ | + ## Serialization fields {#serialization-fields} During serialization, spaCy will export several data fields used to restore diff --git a/website/docs/api/language.md b/website/docs/api/language.md index ca87cbb16..b09ae1aa2 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -426,7 +426,8 @@ component, adds it to the pipeline and returns it. > ```python > @Language.component("component") > def component_func(doc): -> # modify Doc and return it return doc +> # modify Doc and return it +> return doc > > nlp.add_pipe("component", before="ner") > component = nlp.add_pipe("component", name="custom_name", last=True) diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index cfaa75bff..8190d9f78 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -879,7 +879,7 @@ This method was previously available as `spacy.gold.offsets_from_biluo_tags`. | Name | Description | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `doc` | The document that the BILUO tags refer to. ~~Doc~~ | -| `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | +| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | | **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ | ### training.biluo_tags_to_spans {#biluo_tags_to_spans tag="function" new="2.1"} @@ -908,7 +908,7 @@ This method was previously available as `spacy.gold.spans_from_biluo_tags`. | Name | Description | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `doc` | The document that the BILUO tags refer to. ~~Doc~~ | -| `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | +| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. 
Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | | **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~ | ## Utility functions {#util source="spacy/util.py"} diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 1b345050c..18f3a3ed2 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -45,6 +45,14 @@ you generate a starter config with the **recommended settings** for your specific use case. It's also available in spaCy as the [`init config`](/api/cli#init-config) command. + + +Upgrade to the [latest version of spaCy](/usage) to use the quickstart widget. +For earlier releases, follow the CLI instructions to generate a compatible +config. + + + > #### Instructions: widget > > 1. Select your requirements and settings. diff --git a/website/meta/universe.json b/website/meta/universe.json index 87328074a..02f814c8b 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,5 +1,31 @@ { "resources": [ + { + "id": "nlpcloud", + "title": "NLPCloud.io", + "slogan": "Production-ready API for spaCy models in production", + "description": "A highly-available hosted API to easily deploy and use spaCy models in production. Supports NER, POS tagging, dependency parsing, and tokenization.", + "github": "nlpcloud", + "pip": "nlpcloud", + "code_example": [ + "import nlpcloud", + "", + "client = nlpcloud.Client('en_core_web_lg', '4eC39HqLyjWDarjtT1zdp7dc')", + "client.entities('John Doe is a Go Developer at Google')", + "# [{'end': 8, 'start': 0, 'text': 'John Doe', 'type': 'PERSON'}, {'end': 25, 'start': 13, 'text': 'Go Developer', 'type': 'POSITION'}, {'end': 35,'start': 30, 'text': 'Google', 'type': 'ORG'}]" + ], + "thumb":"https://avatars.githubusercontent.com/u/77671902", + "image":"https://nlpcloud.io/assets/images/logo.svg", + "code_language": "python", + "author": "NLPCloud.io", + "author_links": { + "github": "nlpcloud", + "twitter": "cloud_nlp", + "website": "https://nlpcloud.io" + }, + "category": ["apis", "nonpython", "standalone"], + "tags": ["api", "deploy", "production"] + }, { "id": "denomme", "title": "denomme : Multilingual Name Detector", diff --git a/website/src/widgets/quickstart-training.js b/website/src/widgets/quickstart-training.js index 8481d2048..2d3a0e679 100644 --- a/website/src/widgets/quickstart-training.js +++ b/website/src/widgets/quickstart-training.js @@ -10,7 +10,7 @@ const DEFAULT_LANG = 'en' const DEFAULT_HARDWARE = 'cpu' const DEFAULT_OPT = 'efficiency' const DEFAULT_TEXTCAT_EXCLUSIVE = true -const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat'] +const COMPONENTS = ['tagger', 'morphologizer', 'parser', 'ner', 'textcat'] const COMMENT = `# This is an auto-generated partial config. To use it with 'spacy train' # you can run spacy init fill-config to auto-fill all default settings: # python -m spacy init fill-config ./base_config.cfg ./config.cfg`
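Usage sketch (illustrative only, not part of the patch itself): the snippet below strings together the user-facing APIs this diff introduces, i.e. KnowledgeBase.to_bytes/from_bytes and the now-writable Span.label_ and Span.kb_id_, reusing the entity and alias values from the tests in spacy/tests/pipeline/test_entity_linker.py. The text passed to nlp() is an arbitrary example.

from spacy.kb import KnowledgeBase
from spacy.lang.en import English
from spacy.tokens import Span

nlp = English()

# Round-trip a small KB through bytes, as exercised by test_kb_to_bytes() above.
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb2.from_bytes(kb.to_bytes())
assert kb2.contains_alias("Russ Cochran")

# Span.label_ and Span.kb_id_ can now be assigned directly instead of raising
# NotImplementedError (former E129/E131), per the spacy/tokens/span.pyx change.
doc = nlp("Russ Cochran published the comics.")
span = Span(doc, 0, 2)
span.label_ = "PERSON"
span.kb_id_ = "Q2146908"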