From a6e521cd7919ed16b6bcc089aadbac8b5d160fd1 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 29 Apr 2020 12:53:16 +0200 Subject: [PATCH] Add is_sent_end token property (#5375) Reconstruction of the original PR #4697 by @MiniLau. Removes unused `SENT_END` symbol and `IS_SENT_END` from `Matcher` schema because the Matcher is only going to be able to support `IS_SENT_START`. --- .github/contributors/MiniLau.md | 106 +++++++++++++++++++++++ spacy/attrs.pxd | 1 + spacy/attrs.pyx | 1 + spacy/errors.py | 2 + spacy/structs.pxd | 2 +- spacy/symbols.pxd | 2 +- spacy/tests/doc/test_token_api.py | 14 +++ spacy/tests/pipeline/test_sentencizer.py | 17 +++- spacy/tokens/token.pyx | 22 +++++ website/docs/api/token.md | 17 +++- 10 files changed, 177 insertions(+), 7 deletions(-) create mode 100644 .github/contributors/MiniLau.md diff --git a/.github/contributors/MiniLau.md b/.github/contributors/MiniLau.md new file mode 100644 index 000000000..14d6fe328 --- /dev/null +++ b/.github/contributors/MiniLau.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. 
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Desausoi Laurent | +| Company name (if applicable) | / | +| Title or role (if applicable) | / | +| Date | 22 November 2019 | +| GitHub username | MiniLau | +| Website (optional) | / | diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 4638fcb82..8f583b3a3 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -94,3 +94,4 @@ cdef enum attr_id_t: ENT_ID = symbols.ENT_ID IDX + SENT_END \ No newline at end of file diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index f14cd6ddc..2187f3c65 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -88,6 +88,7 @@ IDS = { "ENT_KB_ID": ENT_KB_ID, "HEAD": HEAD, "SENT_START": SENT_START, + "SENT_END": SENT_END, "SPACY": SPACY, "PROB": PROB, "LANG": LANG, diff --git a/spacy/errors.py b/spacy/errors.py index e52241be1..6191570ee 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -559,6 +559,8 @@ class Errors(object): "({curr_dim}).") E194 = ("Unable to aligned mismatched text '{text}' and words '{words}'.") E195 = ("Matcher can be called on {good} only, got {got}.") + E196 = ("Refusing to write to token.is_sent_end. 
Sentence boundaries can " + "only be fixed with token.is_sent_start.") @add_codes diff --git a/spacy/structs.pxd b/spacy/structs.pxd index b3878db3f..b8e63a725 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -84,7 +84,7 @@ cdef struct TokenC: cdef struct MorphAnalysisC: univ_pos_t pos int length - + attr_t abbr attr_t adp_type attr_t adv_type diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index b24891fdd..9229c9970 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -464,4 +464,4 @@ cdef enum symbol_t: ENT_KB_ID ENT_ID - IDX \ No newline at end of file + IDX diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 8c749b26d..1c2253dfa 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -181,6 +181,14 @@ def test_is_sent_start(en_tokenizer): doc.is_parsed = True assert len(list(doc.sents)) == 2 +def test_is_sent_end(en_tokenizer): + doc = en_tokenizer("This is a sentence. This is another.") + assert doc[4].is_sent_end is None + doc[5].is_sent_start = True + assert doc[4].is_sent_end is True + doc.is_parsed = True + assert len(list(doc.sents)) == 2 + def test_set_pos(): doc = Doc(Vocab(), words=["hello", "world"]) @@ -205,6 +213,12 @@ def test_token0_has_sent_start_true(): assert doc[1].is_sent_start is None assert not doc.is_sentenced +def test_tokenlast_has_sent_end_true(): + doc = Doc(Vocab(), words=["hello", "world"]) + assert doc[0].is_sent_end is None + assert doc[1].is_sent_end is True + assert not doc.is_sentenced + def test_token_api_conjuncts_chain(en_vocab): words = "The boy and the girl and the man went .".split() diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index d690958cc..7e58b3e98 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -14,7 +14,9 @@ def test_sentencizer(en_vocab): doc = sentencizer(doc) assert doc.is_sentenced sent_starts = [t.is_sent_start for t 
in doc] + sent_ends = [t.is_sent_end for t in doc] assert sent_starts == [True, False, True, False, False, False, False] + assert sent_ends == [False, True, False, False, False, False, True] assert len(list(doc.sents)) == 2 @@ -46,13 +48,14 @@ def test_sentencizer_empty_docs(): @pytest.mark.parametrize( - "words,sent_starts,n_sents", + "words,sent_starts,sent_ends,n_sents", [ # The expected result here is that the duplicate punctuation gets merged # onto the same sentence and no one-token sentence is created for them. ( ["Hello", "!", ".", "Test", ".", ".", "ok"], [True, False, False, True, False, False, True], + [False, False, True, False, False, True, True], 3, ), # We also want to make sure ¡ and ¿ aren't treated as sentence end @@ -60,32 +63,36 @@ def test_sentencizer_empty_docs(): ( ["¡", "Buen", "día", "!", "Hola", ",", "¿", "qué", "tal", "?"], [True, False, False, False, True, False, False, False, False, False], + [False, False, False, True, False, False, False, False, False, True], 2, ), # The Token.is_punct check ensures that quotes are handled as well ( ['"', "Nice", "!", '"', "I", "am", "happy", "."], [True, False, False, False, True, False, False, False], + [False, False, False, True, False, False, False, True], 2, ), ], ) -def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents): +def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents): doc = Doc(en_vocab, words=words) sentencizer = Sentencizer() doc = sentencizer(doc) assert doc.is_sentenced assert [t.is_sent_start for t in doc] == sent_starts + assert [t.is_sent_end for t in doc] == sent_ends assert len(list(doc.sents)) == n_sents @pytest.mark.parametrize( - "punct_chars,words,sent_starts,n_sents", + "punct_chars,words,sent_starts,sent_ends,n_sents", [ ( ["~", "?"], ["Hello", "world", "~", "A", ".", "B", "."], [True, False, False, True, False, False, False], + [False, False, True, False, False, False, True], 2, ), # Even thought it's not common, the punct_chars 
should be able to @@ -94,16 +101,18 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents): [".", "ö"], ["Hello", ".", "Test", "ö", "Ok", "."], [True, False, True, False, True, False], + [False, True, False, True, False, True], 3, ), ], ) -def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_sents): +def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, sent_ends, n_sents): doc = Doc(en_vocab, words=words) sentencizer = Sentencizer(punct_chars=punct_chars) doc = sentencizer(doc) assert doc.is_sentenced assert [t.is_sent_start for t in doc] == sent_starts + assert [t.is_sent_end for t in doc] == sent_ends assert len(list(doc.sents)) == n_sents diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 8019e3b4f..194f16c5a 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -493,6 +493,28 @@ cdef class Token: else: raise ValueError(Errors.E044.format(value=value)) + property is_sent_end: + """A boolean value indicating whether the token ends a sentence. + `None` if unknown. Defaults to `True` for the last token in the `Doc`. + + RETURNS (bool / None): Whether the token ends a sentence. + None if unknown. + + DOCS: https://spacy.io/api/token#is_sent_end + """ + def __get__(self): + if self.i + 1 == len(self.doc): + return True + elif self.doc[self.i+1].is_sent_start == None: + return None + elif self.doc[self.i+1].is_sent_start == True: + return True + else: + return False + + def __set__(self, value): + raise ValueError(Errors.E196) + @property def lefts(self): """The leftward immediate children of the word, in the syntactic diff --git a/website/docs/api/token.md b/website/docs/api/token.md index c30c01c20..7280ac796 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -352,7 +352,22 @@ property to `0` for the first word of the document. 
+ assert doc[4].is_sent_start == True ``` - +## Token.is_sent_end {#is_sent_end tag="property" new="2"} + +A boolean value indicating whether the token ends a sentence. `None` if +unknown. Defaults to `True` for the last token in the `Doc`. + +> #### Example +> +> ```python +> doc = nlp("Give it back! He pleaded.") +> assert doc[3].is_sent_end +> assert not doc[4].is_sent_end +> ``` + +| Name | Type | Description | +| ----------- | ---- | ------------------------------------ | +| **RETURNS** | bool | Whether the token ends a sentence. | ## Token.has_vector {#has_vector tag="property" model="vectors"}