diff --git a/.github/contributors/kabirkhan.md b/.github/contributors/kabirkhan.md new file mode 100644 index 000000000..4dfb3dfa1 --- /dev/null +++ b/.github/contributors/kabirkhan.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Kabir Khan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2019-04-08 | +| GitHub username | kabirkhan | +| Website (optional) | | diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index cd399a4fe..54fd4a062 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -48,6 +48,7 @@ class EntityRuler(object): self.phrase_patterns = defaultdict(list) self.matcher = Matcher(nlp.vocab) self.phrase_matcher = PhraseMatcher(nlp.vocab) + self.ent_id_sep = cfg.get("ent_id_sep", "||") patterns = cfg.get("patterns") if patterns is not None: self.add_patterns(patterns) @@ -84,7 +85,16 @@ class EntityRuler(object): continue # check for end - 1 here because boundaries are inclusive if start not in seen_tokens and end - 1 not in seen_tokens: - new_entities.append(Span(doc, start, end, label=match_id)) + if self.ent_ids: + label_ = self.nlp.vocab.strings[match_id] + ent_label, ent_id = self._split_label(label_) + span = Span(doc, start, end, label=ent_label) + if ent_id: + for token in span: + token.ent_id_ = ent_id + else: + span = Span(doc, start, end, label=match_id) + new_entities.append(span) entities = [ e for e in entities if not (e.start < end and e.end > start) ] @@ -104,6 +114,21 @@ class EntityRuler(object): all_labels.update(self.phrase_patterns.keys()) return tuple(all_labels) + @property + def ent_ids(self): + """All entity ids present in the match patterns `id` properties. + + RETURNS (tuple): The string entity ids. + + DOCS: https://spacy.io/api/entityruler#ent_ids + """ + all_ent_ids = set() + for l in self.labels: + if self.ent_id_sep in l: + _, ent_id = self._split_label(l) + all_ent_ids.add(ent_id) + return tuple(all_ent_ids) + @property def patterns(self): """Get all patterns that were added to the entity ruler. 
@@ -115,10 +140,19 @@ class EntityRuler(object): all_patterns = [] for label, patterns in self.token_patterns.items(): for pattern in patterns: - all_patterns.append({"label": label, "pattern": pattern}) + ent_label, ent_id = self._split_label(label) + p = {"label": ent_label, "pattern": pattern} + if ent_id: + p["id"] = ent_id + all_patterns.append(p) for label, patterns in self.phrase_patterns.items(): for pattern in patterns: - all_patterns.append({"label": label, "pattern": pattern.text}) + ent_label, ent_id = self._split_label(label) + p = {"label": ent_label, "pattern": pattern.text} + if ent_id: + p["id"] = ent_id + all_patterns.append(p) + return all_patterns def add_patterns(self, patterns): @@ -133,6 +167,8 @@ class EntityRuler(object): """ for entry in patterns: label = entry["label"] + if "id" in entry: + label = self._create_label(label, entry["id"]) pattern = entry["pattern"] if isinstance(pattern, basestring_): self.phrase_patterns[label].append(self.nlp(pattern)) @@ -145,6 +181,28 @@ class EntityRuler(object): for label, patterns in self.phrase_patterns.items(): self.phrase_matcher.add(label, None, *patterns) + def _split_label(self, label): + """Split an entity label into ent_label and ent_id at the last occurrence of self.ent_id_sep. + + RETURNS (tuple): ent_label, ent_id (ent_id is None if the label does not contain the separator). + """ + if self.ent_id_sep in label: + ent_label, ent_id = label.rsplit(self.ent_id_sep, 1) + else: + ent_label = label + ent_id = None + + return ent_label, ent_id + + def _create_label(self, label, ent_id): + """Join the entity label and ent_id with self.ent_id_sep; an ent_id that is not a basestring_ is ignored. + + RETURNS (str): The label joined with ent_id via `ent_id_sep`, or the unchanged label. + """ + if isinstance(ent_id, basestring_): + label = "{}{}{}".format(label, self.ent_id_sep, ent_id) + return label + def from_bytes(self, patterns_bytes, **kwargs): """Load the entity ruler from a bytestring. 
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 8757c9af5..040d5ff22 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -19,6 +19,7 @@ def patterns(): {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, ] @@ -34,7 +35,7 @@ def add_ent(): def test_entity_ruler_init(nlp, patterns): ruler = EntityRuler(nlp, patterns=patterns) assert len(ruler) == len(patterns) - assert len(ruler.labels) == 3 + assert len(ruler.labels) == 4 assert "HELLO" in ruler assert "BYE" in ruler nlp.add_pipe(ruler) @@ -77,14 +78,33 @@ def test_entity_ruler_existing_complex(nlp, patterns, add_ent): assert len(doc.ents[1]) == 2 +def test_entity_ruler_entity_id(nlp, patterns): + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + nlp.add_pipe(ruler) + doc = nlp("Apple is a technology company") + assert len(doc.ents) == 1 + assert doc.ents[0].label_ == "TECH_ORG" + assert doc.ents[0].ent_id_ == "a1" + + +def test_entity_ruler_cfg_ent_id_sep(nlp, patterns): + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True, ent_id_sep="**") + assert "TECH_ORG**a1" in ruler.phrase_patterns + nlp.add_pipe(ruler) + doc = nlp("Apple is a technology company") + assert len(doc.ents) == 1 + assert doc.ents[0].label_ == "TECH_ORG" + assert doc.ents[0].ent_id_ == "a1" + + def test_entity_ruler_serialize_bytes(nlp, patterns): ruler = EntityRuler(nlp, patterns=patterns) assert len(ruler) == len(patterns) - assert len(ruler.labels) == 3 + assert len(ruler.labels) == 4 ruler_bytes = ruler.to_bytes() new_ruler = EntityRuler(nlp) assert len(new_ruler) == 0 assert len(new_ruler.labels) == 0 new_ruler = new_ruler.from_bytes(ruler_bytes) - assert len(ruler) == len(patterns) - assert len(ruler.labels) == 3 + assert len(new_ruler) == len(patterns) + 
assert len(new_ruler.labels) == 4