From 12c1965070a1a8bbe80efaae5755116633d94886 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 10 Jun 2020 10:46:12 +0200 Subject: [PATCH 01/11] set delay to 7 days --- .github/workflows/issue-manager.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index b789494a2..b52095fe8 100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -21,7 +21,7 @@ jobs: config: > { "answered": { - "delay": "P3D", + "delay": "P7D", "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.", "remove_label": true } From 28db7dd5d9aaf53a3c4e9b13048415502d998aae Mon Sep 17 00:00:00 2001 From: Jones Martins Date: Wed, 10 Jun 2020 13:47:04 -0300 Subject: [PATCH 02/11] Add missing pronoums/determiners (#5569) * Add missing pronoums/determiners * Add test for missing pronoums * Add contributor file --- .github/contributors/jonesmartins.md | 106 +++++++++++++++++++++++++ spacy/lang/en/tokenizer_exceptions.py | 2 +- spacy/tests/lang/en/test_exceptions.py | 2 +- 3 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 .github/contributors/jonesmartins.md diff --git a/.github/contributors/jonesmartins.md b/.github/contributors/jonesmartins.md new file mode 100644 index 000000000..5663f6193 --- /dev/null +++ b/.github/contributors/jonesmartins.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). 
The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Jones Martins | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-06-10 | +| GitHub username | jonesmartins | +| Website (optional) | | diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 6a553052b..f8367c0f5 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -139,7 +139,7 @@ for pron in ["he", "she", "it"]: # W-words, relative pronouns, prepositions etc. 
-for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: +for word in ["who", "what", "when", "where", "why", "how", "there", "that", "this", "these", "those"]: for orth in [word, word.title()]: _exc[orth + "'s"] = [ {ORTH: orth, LEMMA: word, NORM: word}, diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index a78e1815f..1ff64eff2 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -46,7 +46,7 @@ def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text): assert tokens[0].text == text -@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll"]) +@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll", "this'll", "those'll"]) def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 From bab30e4ad2ad35d7133b7f8027a3558a02e018e4 Mon Sep 17 00:00:00 2001 From: Jones Martins Date: Wed, 10 Jun 2020 16:54:06 -0300 Subject: [PATCH 03/11] Add "c'mon" token exception (#5570) * Add "c'mon" exception * Fix typo in "C'mon" exception --- spacy/lang/en/tokenizer_exceptions.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index f8367c0f5..964a714ae 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -399,6 +399,14 @@ _other_exc = { {ORTH: "Let", LEMMA: "let", NORM: "let"}, {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}, ], + "c'mon": [ + {ORTH: "c'm", NORM: "come", LEMMA: "come"}, + {ORTH: "on"} + ], + "C'mon": [ + {ORTH: "C'm", NORM: "come", LEMMA: "come"}, + {ORTH: "on"} + ] } _exc.update(_other_exc) From fe167fcf7d23ee6c73877a11351984221a9aacd5 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 11 Jun 2020 10:23:50 +0200 Subject: [PATCH 04/11] Update pytest conf for sudachipy with Japanese (#5574) --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 63bbf2e0a..1f13da5d6 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -140,7 +140,7 @@ def it_tokenizer(): @pytest.fixture(scope="session") def ja_tokenizer(): - pytest.importorskip("fugashi") + pytest.importorskip("sudachipy") return get_lang_class("ja").Defaults.create_tokenizer() From 556895177edbc5d7dc64e0f95e36273a2fb16478 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 11 Jun 2020 13:47:37 +0200 Subject: [PATCH 05/11] Expand Japanese requirements warning (#5572) Include explicit install instructions in Japanese requirements warning. --- spacy/lang/ja/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 371cc0f98..a7ad0846e 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -46,7 +46,10 @@ def try_sudachi_import(split_mode="A"): return tok except ImportError: raise ImportError( - "Japanese support requires SudachiPy: " "https://github.com/WorksApplications/SudachiPy" + "Japanese support requires SudachiPy and SudachiDict-core " + "(https://github.com/WorksApplications/SudachiPy). " + "Install with `pip install sudachipy sudachidict_core` or " + "install spaCy with `pip install spacy[ja]`." 
) From 18c6dc8093df4e075f6168b98afd500a73a384e6 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 11 Jun 2020 14:09:40 +0200 Subject: [PATCH 06/11] removing label both on comment and on close --- .github/workflows/issue-manager.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/issue-manager.yml b/.github/workflows/issue-manager.yml index b52095fe8..3fb42ed01 100644 --- a/.github/workflows/issue-manager.yml +++ b/.github/workflows/issue-manager.yml @@ -20,9 +20,10 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} config: > { - "answered": { + "resolved": { "delay": "P7D", "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.", - "remove_label": true + "remove_label_on_comment": true, + "remove_label_on_close": true } } From fa46e0bef2226d1ba673537d2097d92f151304c5 Mon Sep 17 00:00:00 2001 From: theudas Date: Fri, 12 Jun 2020 02:03:23 +0200 Subject: [PATCH 07/11] Added Parameter to NEL to take n sentences into account (#5548) * added setting for neighbour sentence in NEL * added spaCy contributor agreement * added multi sentence also for training * made the try-except block smaller --- .github/contributors/theudas.md | 106 ++++++++++++++++++++++++++ spacy/pipeline/pipes.pyx | 131 ++++++++++++++++++++------------ 2 files changed, 189 insertions(+), 48 deletions(-) create mode 100644 .github/contributors/theudas.md diff --git a/.github/contributors/theudas.md b/.github/contributors/theudas.md new file mode 100644 index 000000000..3d8a2bd95 --- /dev/null +++ b/.github/contributors/theudas.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). 
+The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Philipp Sodmann | +| Company name (if applicable) | Empolis | +| Title or role (if applicable) | | +| Date | 2017-05-06 | +| GitHub username | theudas | +| Website (optional) | | diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 105ce00e6..01472a6d0 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1170,6 +1170,9 @@ class EntityLinker(Pipe): self.model = True self.kb = None self.cfg = dict(cfg) + + # how many neighbour sentences to take into account + self.n_sents = cfg.get("n_sents", 0) def set_kb(self, kb): self.kb = kb @@ -1218,6 +1221,9 @@ class EntityLinker(Pipe): for doc, gold in zip(docs, golds): ents_by_offset = dict() + + sentences = [s for s in doc.sents] + for ent in doc.ents: ents_by_offset[(ent.start_char, ent.end_char)] = ent @@ -1228,17 +1234,34 @@ class EntityLinker(Pipe): # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt if not (start, end) in ents_by_offset: raise RuntimeError(Errors.E188) + ent = ents_by_offset[(start, end)] for kb_id, value in kb_dict.items(): # Currently only training on the positive instances if value: try: - sentence_docs.append(ent.sent.as_doc()) + # find the sentence in the list of sentences. 
+ sent_index = sentences.index(ent.sent) + except AttributeError: # Catch the exception when ent.sent is None and provide a user-friendly warning raise RuntimeError(Errors.E030) + # get n previous sentences, if there are any + start_sentence = max(0, sent_index - self.n_sents) + + # get n posterior sentences, or as many < n as there are + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) + + # get token positions + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end + + # append that span as a doc to training + sent_doc = doc[start_token:end_token].as_doc() + sentence_docs.append(sent_doc) + sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop) loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None) bp_context(d_scores, sgd=sgd) @@ -1309,69 +1332,81 @@ class EntityLinker(Pipe): if isinstance(docs, Doc): docs = [docs] + for i, doc in enumerate(docs): + sentences = [s for s in doc.sents] + if len(doc) > 0: # Looping through each sentence and each entity # This may go wrong if there are entities across sentences - which shouldn't happen normally. 
- for sent in doc.sents: - sent_doc = sent.as_doc() - # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model([sent_doc])[0] - xp = get_array_module(sentence_encoding) - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) + for sent_index, sent in enumerate(sentences): + if sent.ents: + # get n_neighbour sentences, clipped to the length of the document + start_sentence = max(0, sent_index - self.n_sents) + end_sentence = min(len(sentences) -1, sent_index + self.n_sents) - for ent in sent_doc.ents: - entity_count += 1 + start_token = sentences[start_sentence].start + end_token = sentences[end_sentence].end - to_discard = self.cfg.get("labels_discard", []) - if to_discard and ent.label_ in to_discard: - # ignoring this entity - setting to NIL - final_kb_ids.append(self.NIL) - final_tensors.append(sentence_encoding) + sent_doc = doc[start_token:end_token].as_doc() - else: - candidates = self.kb.get_candidates(ent.text) - if not candidates: - # no prediction possible for this entity - setting to NIL + # currently, the context is the same for each entity in a sentence (should be refined) + sentence_encoding = self.model([sent_doc])[0] + xp = get_array_module(sentence_encoding) + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) + + for ent in sent.ents: + entity_count += 1 + + to_discard = self.cfg.get("labels_discard", []) + if to_discard and ent.label_ in to_discard: + # ignoring this entity - setting to NIL final_kb_ids.append(self.NIL) final_tensors.append(sentence_encoding) - elif len(candidates) == 1: - # shortcut for efficiency reasons: take the 1 candidate - - # TODO: thresholding - final_kb_ids.append(candidates[0].entity_) - final_tensors.append(sentence_encoding) - else: - random.shuffle(candidates) + candidates = self.kb.get_candidates(ent.text) + if not candidates: + # no prediction possible for this 
entity - setting to NIL + final_kb_ids.append(self.NIL) + final_tensors.append(sentence_encoding) - # this will set all prior probabilities to 0 if they should be excluded from the model - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.cfg.get("incl_prior", True): - prior_probs = xp.asarray([0.0 for c in candidates]) - scores = prior_probs + elif len(candidates) == 1: + # shortcut for efficiency reasons: take the 1 candidate - # add in similarity from the context - if self.cfg.get("incl_context", True): - entity_encodings = xp.asarray([c.entity_vector for c in candidates]) - entity_norm = xp.linalg.norm(entity_encodings, axis=1) + # TODO: thresholding + final_kb_ids.append(candidates[0].entity_) + final_tensors.append(sentence_encoding) - if len(entity_encodings) != len(prior_probs): - raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + else: + random.shuffle(candidates) - # cosine similarity - sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = prior_probs + sims - (prior_probs*sims) + # this will set all prior probabilities to 0 if they should be excluded from the model + prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.cfg.get("incl_prior", True): + prior_probs = xp.asarray([0.0 for c in candidates]) + scores = prior_probs - # TODO: thresholding - best_index = scores.argmax() - best_candidate = candidates[best_index] - final_kb_ids.append(best_candidate.entity_) - final_tensors.append(sentence_encoding) + # add in similarity from the context + if self.cfg.get("incl_context", True): + entity_encodings = xp.asarray([c.entity_vector for c in candidates]) + entity_norm = xp.linalg.norm(entity_encodings, axis=1) + + if len(entity_encodings) != len(prior_probs): + raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) + + # 
cosine similarity + sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) + if sims.shape != prior_probs.shape: + raise ValueError(Errors.E161) + scores = prior_probs + sims - (prior_probs*sims) + + # TODO: thresholding + best_index = scores.argmax() + best_candidate = candidates[best_index] + final_kb_ids.append(best_candidate.entity_) + final_tensors.append(sentence_encoding) if not (len(final_tensors) == len(final_kb_ids) == entity_count): raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length")) From aa5b40fa6423916ae79bf6e750a17c50020f4078 Mon Sep 17 00:00:00 2001 From: Arvind Srinivasan Date: Sat, 13 Jun 2020 19:26:26 +0530 Subject: [PATCH 08/11] Added Tamil Example Sentences (#5583) * Added Examples for Tamil Sentences #### Description This PR add example sentences for the Tamil language which were missing as per issue #1107 #### Type of Change This is an enhancement. * Accepting spaCy Contributor Agreement * Signed on my behalf as an individual --- .github/contributors/Arvindcheenu.md | 106 +++++++++++++++++++++++++++ spacy/lang/ta/examples.py | 5 ++ 2 files changed, 111 insertions(+) create mode 100644 .github/contributors/Arvindcheenu.md diff --git a/.github/contributors/Arvindcheenu.md b/.github/contributors/Arvindcheenu.md new file mode 100644 index 000000000..707a9821d --- /dev/null +++ b/.github/contributors/Arvindcheenu.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Arvind Srinivasan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2020-06-13 | +| GitHub username | arvindcheenu | +| Website (optional) | | diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py index 3ce3c3544..c34e77129 100644 --- a/spacy/lang/ta/examples.py +++ b/spacy/lang/ta/examples.py @@ -18,4 +18,9 @@ sentences = [ "இந்த ஃபோனுடன் சுமார் ரூ.2,990 மதிப்புள்ள போட் ராக்கர்ஸ் நிறுவனத்தின் ஸ்போர்ட் புளூடூத் ஹெட்போன்ஸ் இலவசமாக வழங்கப்படவுள்ளது.", "மட்டக்களப்பில் பல இடங்களில் வீட்டுத் திட்டங்களுக்கு இன்று அடிக்கல் நாட்டல்", "ஐ போன்க்கு முகத்தை வைத்து அன்லாக் செய்யும் முறை மற்றும் விரலால் தொட்டு அன்லாக் செய்யும் முறையை வாட்ஸ் ஆப் நிறுவனம் இதற்கு முன் கண்டுபிடித்தது", + "இது ஒரு வாக்கியம்.", + "ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது", + "தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன", + "நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது", + "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்." ] From c482f20778f3464fefbc7aa57782de5fe713a77f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 15 Jun 2020 14:56:04 +0200 Subject: [PATCH 09/11] Fix and add warnings related to spacy-lookups-data (#5588) * Fix warning message for lemmatization tables * Add a warning when the `lexeme_norm` table is empty. (Given the relatively lang-specific loading for `Lookups`, it seemed like too much overhead to dynamically extract the list of languages, so for now it's hard-coded.) 
--- spacy/errors.py | 13 ++++++++++--- spacy/pipeline/pipes.pyx | 2 ++ spacy/syntax/nn_parser.pyx | 5 ++++- spacy/tests/parser/test_ner.py | 17 +++++++++++++++++ spacy/tests/test_lemmatizer.py | 6 +++--- 5 files changed, 36 insertions(+), 7 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index baed574f8..a25661a20 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -92,9 +92,9 @@ class Warnings(object): W022 = ("Training a new part-of-speech tagger using a model with no " "lemmatization rules or data. This means that the trained model " "may not be able to lemmatize correctly. If this is intentional " - "or the language you're using doesn't have lemmatization data. " - "If this is surprising, make sure you have the spacy-lookups-data " - "package installed.") + "or the language you're using doesn't have lemmatization data, " + "please ignore this warning. If this is surprising, make sure you " + "have the spacy-lookups-data package installed.") W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. " "'n_process' will be set to 1.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " @@ -127,6 +127,13 @@ class Warnings(object): "this, download a newer compatible model or retrain your custom " "model with the current spaCy version. For more details and " "available updates, run: python -m spacy validate") + W033 = ("Training a new {model} using a model with no lexeme normalization " + "table. This may degrade the performance of the model to some " + "degree. If this is intentional or the language you're using " + "doesn't have a normalization table, please ignore this warning. " + "If this is surprising, make sure you have the spacy-lookups-data " + "package installed. 
The languages with lexeme normalization tables " + "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.") @add_codes diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 01472a6d0..3f40cb545 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -516,6 +516,8 @@ class Tagger(Pipe): lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] if not any(table in self.vocab.lookups for table in lemma_tables): warnings.warn(Warnings.W022) + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="part-of-speech tagger")) orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = OrderedDict() for raw_text, annots_brackets in get_gold_tuples(): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index d5c6bf2a8..6944e9113 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -26,6 +26,7 @@ from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module from thinc.linalg cimport Vec, VecVec import srsly +import warnings from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid @@ -37,7 +38,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse -from ..errors import Errors, TempErrors +from ..errors import Errors, TempErrors, Warnings from .. 
import util from .stateclass cimport StateClass from ._state cimport StateC @@ -601,6 +602,8 @@ cdef class Parser: **self.cfg.get('optimizer', {})) def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg): + if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: + warnings.warn(Warnings.W033.format(model="parser or NER")) if 'model' in cfg: self.model = cfg['model'] if not hasattr(get_gold_tuples, '__call__'): diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 244e9fa25..dd623e07f 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -4,6 +4,8 @@ from __future__ import unicode_literals import pytest from spacy.lang.en import English +from spacy.language import Language +from spacy.lookups import Lookups from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.vocab import Vocab from spacy.syntax.ner import BiluoPushDown @@ -305,6 +307,21 @@ def test_change_number_features(): nlp("hello world") +def test_ner_warns_no_lookups(): + nlp = Language() + nlp.vocab.lookups = Lookups() + assert not len(nlp.vocab.lookups) + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + with pytest.warns(UserWarning): + nlp.begin_training() + nlp.vocab.lookups.add_table("lexeme_norm") + nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" + with pytest.warns(None) as record: + nlp.begin_training() + assert not record.list + + class BlockerComponent1(object): name = "my_blocker" diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index bcda2999a..fce3772c4 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -33,17 +33,17 @@ def test_lemmatizer_reflects_lookups_changes(): assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world" -def test_tagger_warns_no_lemma_lookups(): +def test_tagger_warns_no_lookups(): nlp = Language() nlp.vocab.lookups = Lookups() assert not len(nlp.vocab.lookups) tagger = nlp.create_pipe("tagger") - with 
pytest.warns(UserWarning): - tagger.begin_training() nlp.add_pipe(tagger) with pytest.warns(UserWarning): nlp.begin_training() nlp.vocab.lookups.add_table("lemma_lookup") + nlp.vocab.lookups.add_table("lexeme_norm") + nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" with pytest.warns(None) as record: nlp.begin_training() assert not record.list From c94f7d0e75e9e4ce25b719edee3adb4ecd74ee50 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 15 Jun 2020 14:56:51 +0200 Subject: [PATCH 10/11] Updates to docstrings (#5589) --- spacy/gold.pyx | 1 + spacy/vocab.pyx | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index cf67a2ac7..e69ff5933 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -640,6 +640,7 @@ cdef class GoldParse: representing the external IDs in a knowledge base (KB) mapped to either 1.0 or 0.0, indicating positive and negative examples respectively. + make_projective (bool): Whether to projectivize the dependency tree. RETURNS (GoldParse): The newly constructed object. """ self.mem = Pool() diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 68f0ac0db..1b1b04e13 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -46,7 +46,8 @@ cdef class Vocab: vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. lookups_extra (Lookups): Container for optional lookup tables and dictionaries. - name (unicode): Optional name to identify the vectors table. + oov_prob (float): Default OOV probability. + vectors_name (unicode): Optional name to identify the vectors table. RETURNS (Vocab): The newly constructed object. 
""" lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} From bb54f54369be830651658191807c4e8625abb48c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 16 Jun 2020 16:10:12 +0200 Subject: [PATCH 11/11] Fix model accuracy table [ci skip] --- website/src/templates/models.js | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 845fec65d..3c5e9d2a4 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -1,4 +1,4 @@ -import React, { useEffect, useState, useMemo } from 'react' +import React, { useEffect, useState, useMemo, Fragment } from 'react' import { StaticQuery, graphql } from 'gatsby' import { window } from 'browser-monads' @@ -83,15 +83,24 @@ function formatVectors(data) { function formatAccuracy(data) { if (!data) return null - const labels = { tags_acc: 'POS', ents_f: 'NER F', ents_p: 'NER P', ents_r: 'NER R' } + const labels = { + las: 'LAS', + uas: 'UAS', + tags_acc: 'TAG', + ents_f: 'NER F', + ents_p: 'NER P', + ents_r: 'NER R', + } const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key) const isNer = key => key.startsWith('ents_') - return Object.keys(data).map(key => ({ - label: labels[key] || key.toUpperCase(), - value: data[key].toFixed(2), - help: MODEL_META[key], - type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null, - })) + return Object.keys(data) + .filter(key => labels[key]) + .map(key => ({ + label: labels[key], + value: data[key].toFixed(2), + help: MODEL_META[key], + type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null, + })) } function formatModelMeta(data) { @@ -115,11 +124,11 @@ function formatModelMeta(data) { function formatSources(data = []) { const sources = data.map(s => (isString(s) ? { name: s } : s)) return sources.map(({ name, url, author }, i) => ( - <> + {i > 0 &&
} {name && url ? {name} : name} {author && ` (${author})`} - +
)) } @@ -308,12 +317,12 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl {labelNames.map((label, i) => ( - <> + {i > 0 && ', '} {label} - + ))}