diff --git a/.github/contributors/grivaz.md b/.github/contributors/grivaz.md
new file mode 100644
index 000000000..0ebdbcca9
--- /dev/null
+++ b/.github/contributors/grivaz.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+  * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+  * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+  * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+  * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+  * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+  * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+  * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+  * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+  * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+  * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+  * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+  * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | C. Grivaz            |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 08.22.2018           |
+| GitHub username                | grivaz               |
+| Website (optional)             |                      |
diff --git a/spacy/errors.py b/spacy/errors.py
index 6f80e917d..7c0f0efd3 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -249,6 +249,7 @@ class Errors(object):
             "error. Are you writing to a default function argument?")
     E096 = ("Invalid object passed to displaCy: Can only visualize Doc or "
             "Span objects, or dicts if set to manual=True.")
+    E097 = ("Can't merge non-disjoint spans. '{token}' is already part of tokens to merge")


 @add_codes
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index d9db0916b..f00668d83 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -5,6 +5,7 @@
 from ..util import get_doc
 from ...tokens import Doc
 from ...vocab import Vocab
 from ...attrs import LEMMA
+from ...tokens import Span
 import pytest
 import numpy
@@ -156,6 +157,23 @@ def test_doc_api_merge(en_tokenizer):
     assert doc[7].text == 'all night'
     assert doc[7].text_with_ws == 'all night'

+    # merge both with bulk merge
+    doc = en_tokenizer(text)
+    assert len(doc) == 9
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[4:7], attrs={'tag': 'NAMED', 'lemma': 'LEMMA',
+                                           'ent_type': 'TYPE'})
+        retokenizer.merge(doc[7:9], attrs={'tag': 'NAMED', 'lemma': 'LEMMA',
+                                           'ent_type': 'TYPE'})
+
+    assert len(doc) == 6
+    assert doc[4].text == 'the beach boys'
+    assert doc[4].text_with_ws == 'the beach boys '
+    assert doc[4].tag_ == 'NAMED'
+    assert doc[5].text == 'all night'
+    assert doc[5].text_with_ws == 'all night'
+    assert doc[5].tag_ == 'NAMED'
+

 def test_doc_api_merge_children(en_tokenizer):
     """Test that attachments work correctly after merging."""
diff --git a/spacy/tests/doc/test_span_merge.py b/spacy/tests/doc/test_span_merge.py
index ae1f4f4a1..baa683452 100644
--- a/spacy/tests/doc/test_span_merge.py
+++ b/spacy/tests/doc/test_span_merge.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from ..util import get_doc
 from ...vocab import Vocab
 from ...tokens import Doc
+from ...tokens import Span

 import pytest

@@ -16,16 +17,8 @@ def test_spans_merge_tokens(en_tokenizer):
     assert len(doc) == 4
     assert doc[0].head.text == 'Angeles'
     assert doc[1].head.text == 'start'
-    doc.merge(0, len('Los Angeles'), tag='NNP', lemma='Los Angeles', ent_type='GPE')
-    assert len(doc) == 3
-    assert doc[0].text == 'Los Angeles'
-    assert doc[0].head.text == 'start'
-
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
-    assert len(doc) == 4
-    assert doc[0].head.text == 'Angeles'
-    assert doc[1].head.text == 'start'
-    doc.merge(0, len('Los Angeles'), tag='NNP', lemma='Los Angeles', label='GPE')
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:2], attrs={'tag': 'NNP', 'lemma': 'Los Angeles', 'ent_type': 'GPE'})
     assert len(doc) == 3
     assert doc[0].text == 'Los Angeles'
     assert doc[0].head.text == 'start'
@@ -38,8 +31,8 @@ def test_spans_merge_heads(en_tokenizer):

     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
     assert len(doc) == 8
-    doc.merge(doc[3].idx, doc[4].idx + len(doc[4]), tag=doc[4].tag_,
-              lemma='pilates class', ent_type='O')
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:5], attrs={'tag': doc[4].tag_, 'lemma': 'pilates class', 'ent_type': 'O'})
     assert len(doc) == 7
     assert doc[0].head.i == 1
     assert doc[1].head.i == 1
@@ -48,6 +41,14 @@
     assert doc[4].head.i in [1, 3]
     assert doc[5].head.i == 4

+def test_spans_merge_non_disjoint(en_tokenizer):
+    text = "Los Angeles start."
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens])
+    with pytest.raises(ValueError):
+        with doc.retokenize() as retokenizer:
+            retokenizer.merge(doc[0:2], attrs={'tag': 'NNP', 'lemma': 'Los Angeles', 'ent_type': 'GPE'})
+            retokenizer.merge(doc[0:1], attrs={'tag': 'NNP', 'lemma': 'Los Angeles', 'ent_type': 'GPE'})

 def test_span_np_merges(en_tokenizer):
     text = "displaCy is a parse tool built with Javascript"
@@ -111,6 +112,22 @@ def test_spans_entity_merge_iob():
     assert doc[0].ent_iob_ == "B"
     assert doc[1].ent_iob_ == "I"

+    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
+    doc = Doc(Vocab(), words=words)
+    doc.ents = [(doc.vocab.strings.add('ent-de'), 3, 5),
+                (doc.vocab.strings.add('ent-fg'), 5, 7)]
+    assert doc[3].ent_iob_ == "B"
+    assert doc[4].ent_iob_ == "I"
+    assert doc[5].ent_iob_ == "B"
+    assert doc[6].ent_iob_ == "I"
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[2:4])
+        retokenizer.merge(doc[4:6])
+        retokenizer.merge(doc[7:9])
+    assert len(doc) == 6
+    assert doc[3].ent_iob_ == "B"
+    assert doc[4].ent_iob_ == "I"
+

 def test_spans_sentence_update_after_merge(en_tokenizer):
     text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index b405dd000..60ed63ee7 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -5,6 +5,9 @@
 from __future__ import unicode_literals

 from libc.string cimport memcpy, memset
+from libc.stdlib cimport malloc, free
+
+from cymem.cymem cimport Pool

 from .doc cimport Doc, set_children_from_heads, token_by_start, token_by_end
 from .span cimport Span
@@ -14,24 +17,31 @@ from ..structs cimport LexemeC, TokenC
 from ..attrs cimport TAG
 from ..attrs import intify_attrs
 from ..util import SimpleFrozenDict
-
+from ..errors import Errors

 cdef class Retokenizer:
     """Helper class for doc.retokenize() context manager."""
     cdef Doc doc
     cdef list merges
     cdef list splits
+    cdef set tokens_to_merge
     def __init__(self, doc):
         self.doc = doc
         self.merges = []
         self.splits = []
+        self.tokens_to_merge = set()

     def merge(self, Span span, attrs=SimpleFrozenDict()):
         """Mark a span for merging. The attrs will be applied to the resulting
         token.
         """
+        for token in span:
+            if token.i in self.tokens_to_merge:
+                raise ValueError(Errors.E097.format(token=repr(token)))
+            self.tokens_to_merge.add(token.i)
+
         attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
-        self.merges.append((span.start_char, span.end_char, attrs))
+        self.merges.append((span, attrs))

     def split(self, Token token, orths, attrs=SimpleFrozenDict()):
         """Mark a Token for splitting, into the specified orths. The attrs
@@ -47,20 +57,22 @@ cdef class Retokenizer:

     def __exit__(self, *args):
         # Do the actual merging here
-        for start_char, end_char, attrs in self.merges:
-            start = token_by_start(self.doc.c, self.doc.length, start_char)
-            end = token_by_end(self.doc.c, self.doc.length, end_char)
-            _merge(self.doc, start, end+1, attrs)
+        if len(self.merges) > 1:
+            _bulk_merge(self.doc, self.merges)
+        elif len(self.merges) == 1:
+            (span, attrs) = self.merges[0]
+            start = span.start
+            end = span.end
+            _merge(self.doc, start, end, attrs)
+
         for start_char, orths, attrs in self.splits:
             raise NotImplementedError

-
 def _merge(Doc doc, int start, int end, attributes):
     """Retokenize the document, such that the span at
     `doc.text[start_idx : end_idx]` is merged into a single token. If
     `start_idx` and `end_idx `do not mark start and end token boundaries,
     the document remains unchanged.
-
     start_idx (int): Character index of the start of the slice to merge.
     end_idx (int): Character index after the end of the slice to merge.
     **attributes: Attributes to assign to the merged token. By default,
@@ -131,3 +143,139 @@ def _merge(Doc doc, int start, int end, attributes):
     # Clear the cached Python objects
     # Return the merged Python object
     return doc[start]
+
+def _bulk_merge(Doc doc, merges):
+    """Retokenize the document, such that the spans described in 'merges'
+    are each merged into a single token. This method assumes that the merges
+    are in the same order in which they appear in the doc, and that merges
+    do not intersect each other in any way.
+
+    merges: List of (span, attributes) tuples with the spans to merge and the
+        attributes to assign to each merged token. By default, attributes are
+        inherited from the syntactic root of the span.
+    RETURNS (Token): The first newly merged token.
+    """
+    cdef Span span
+    cdef const LexemeC* lex
+    cdef Pool mem = Pool()
+    tokens = <TokenC**>mem.alloc(len(merges), sizeof(TokenC*))
+    spans = []
+
+    def _get_start(merge):
+        return merge[0].start
+    merges.sort(key=_get_start)
+
+    for merge_index, (span, attributes) in enumerate(merges):
+        start = span.start
+        end = span.end
+        spans.append(span)
+
+        # House the new merged token where it starts
+        token = &doc.c[start]
+
+        tokens[merge_index] = token
+
+        # Assign attributes
+        for attr_name, attr_value in attributes.items():
+            if attr_name == TAG:
+                doc.vocab.morphology.assign_tag(token, attr_value)
+            else:
+                Token.set_struct_attr(token, attr_name, attr_value)
+
+
+    # Memorize span roots and set dependencies of the newly merged
+    # tokens to the dependencies of their roots.
+    span_roots = []
+    for i, span in enumerate(spans):
+        span_roots.append(span.root.i)
+        tokens[i].dep = span.root.dep
+
+    # We update token.lex after keeping span root and dep, since
+    # setting token.lex will change span.start and span.end properties
+    # as it modifies the character offsets in the doc
+    for token_index in range(len(merges)):
+        new_orth = ''.join([t.text_with_ws for t in spans[token_index]])
+        if spans[token_index][-1].whitespace_:
+            new_orth = new_orth[:-len(spans[token_index][-1].whitespace_)]
+        lex = doc.vocab.get(doc.mem, new_orth)
+        tokens[token_index].lex = lex
+        # We set trailing space here too
+        tokens[token_index].spacy = doc.c[spans[token_index].end - 1].spacy
+
+    # Begin by setting all the head indices to absolute token positions
+    # This is easier to work with for now than the offsets
+    # Before thinking of something simpler, beware the case where a
+    # dependency bridges over the entity. Here the alignment of the
+    # tokens changes.
+    for i in range(doc.length):
+        doc.c[i].head += i
+
+    # Set the head of the merged token from the Span
+    for i in range(len(merges)):
+        tokens[i].head = doc.c[span_roots[i]].head
+
+    # Adjust deps before shrinking tokens
+    # Tokens which point into the merged token should now point to it
+    # Subtract the offset from all tokens which point to >= end
+    offsets = []
+    current_span_index = 0
+    current_offset = 0
+    for i in range(doc.length):
+        if current_span_index < len(spans) and i == spans[current_span_index].end:
+            # last token was the last of the span
+            current_offset += (spans[current_span_index].end - spans[current_span_index].start) - 1
+            current_span_index += 1
+
+        if current_span_index < len(spans) and \
+                spans[current_span_index].start <= i < spans[current_span_index].end:
+            offsets.append(spans[current_span_index].start - current_offset)
+        else:
+            offsets.append(i - current_offset)
+
+    for i in range(doc.length):
+        doc.c[i].head = offsets[doc.c[i].head]
+
+    # Now compress the token array
+    offset = 0
+    in_span = False
+    span_index = 0
+    for i in range(doc.length):
+        if in_span and i == spans[span_index].end:
+            # First token after a span
+            in_span = False
+            span_index += 1
+        if span_index < len(spans) and i == spans[span_index].start:
+            # First token in a span
+            doc.c[i - offset] = doc.c[i]  # move token to its place
+            offset += (spans[span_index].end - spans[span_index].start) - 1
+            in_span = True
+        if not in_span:
+            doc.c[i - offset] = doc.c[i]  # move token to its place
+
+    for i in range(doc.length - offset, doc.length):
+        memset(&doc.c[i], 0, sizeof(TokenC))
+        doc.c[i].lex = &EMPTY_LEXEME
+    doc.length -= offset
+
+    # ...And, set heads back to a relative position
+    for i in range(doc.length):
+        doc.c[i].head -= i
+
+    # Set the left/right children, left/right edges
+    set_children_from_heads(doc.c, doc.length)
+
+    # Make sure ent_iob remains consistent
+    for (span, _) in merges:
+        if span.end < len(offsets):
+            # if it's not the last span
+            token_after_span_position = offsets[span.end]
+            if doc.c[token_after_span_position].ent_iob == 1 \
+                    and doc.c[token_after_span_position - 1].ent_iob in (0, 2):
+                if doc.c[token_after_span_position - 1].ent_type == doc.c[token_after_span_position].ent_type:
+                    doc.c[token_after_span_position - 1].ent_iob = 3
+                else:
+                    # If they're not the same entity type, let them be two entities
+                    doc.c[token_after_span_position].ent_iob = 3
+
+    # Return the merged Python object
+    return doc[spans[0].start]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 9f6d5f0a3..83c226297 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -884,6 +884,28 @@ cdef class Doc:
         '''
         return Retokenizer(self)

+    def _bulk_merge(self, spans, attributes):
+        """Retokenize the document, such that the spans given as arguments
+        are merged into single tokens. The spans need to be in document
+        order, and no span intersection is allowed.
+
+        spans (list of Span): Spans to merge, in document order, with all span
+            intersections empty. Cannot be empty.
+        attributes (list of dict): Attributes to assign to the merged tokens.
+            Must be the same length as spans; empty dictionaries are allowed.
+            By default, attributes are inherited from the syntactic root of the span.
+        RETURNS (Token): The first newly merged token.
+ """ + cdef unicode tag, lemma, ent_type + + assert len(attributes) == len(spans), "attribute length should be equal to span length" + str(len(attributes)) +\ + str(len(spans)) + with self.retokenize() as retokenizer: + for i, span in enumerate(spans): + fix_attributes(self, attributes[i]) + remove_label_if_necessary(attributes[i]) + retokenizer.merge(span, attributes[i]) + def merge(self, int start_idx, int end_idx, *args, **attributes): """Retokenize the document, such that the span at `doc.text[start_idx : end_idx]` is merged into a single token. If @@ -905,20 +927,12 @@ cdef class Doc: attributes[LEMMA] = lemma attributes[ENT_TYPE] = ent_type elif not args: - if 'label' in attributes and 'ent_type' not in attributes: - if isinstance(attributes['label'], int): - attributes[ENT_TYPE] = attributes['label'] - else: - attributes[ENT_TYPE] = self.vocab.strings[attributes['label']] - if 'ent_type' in attributes: - attributes[ENT_TYPE] = attributes['ent_type'] + fix_attributes(self, attributes) elif args: raise ValueError(Errors.E034.format(n_args=len(args), args=repr(args), kwargs=repr(attributes))) - # More deprecated attribute handling =/ - if 'label' in attributes: - attributes['ent_type'] = attributes.pop('label') + remove_label_if_necessary(attributes) attributes = intify_attrs(attributes, strings_map=self.vocab.strings) @@ -1034,3 +1048,17 @@ def unpickle_doc(vocab, hooks_and_data, bytes_data): copy_reg.pickle(Doc, pickle_doc, unpickle_doc) + +def remove_label_if_necessary(attributes): + # More deprecated attribute handling =/ + if 'label' in attributes: + attributes['ent_type'] = attributes.pop('label') + +def fix_attributes(doc, attributes): + if 'label' in attributes and 'ent_type' not in attributes: + if isinstance(attributes['label'], int): + attributes[ENT_TYPE] = attributes['label'] + else: + attributes[ENT_TYPE] = doc.vocab.strings[attributes['label']] + if 'ent_type' in attributes: + attributes[ENT_TYPE] = attributes['ent_type']