mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	Merge pull request #6089 from adrianeboyd/feature/doc-ents-v3-2
This commit is contained in:
		
						commit
						58dde293ce
					
				|  | @ -696,6 +696,12 @@ class Errors: | ||||||
|     E1009 = ("String for hash '{val}' not found in StringStore. Set the value " |     E1009 = ("String for hash '{val}' not found in StringStore. Set the value " | ||||||
|              "through token.morph_ instead or add the string to the " |              "through token.morph_ instead or add the string to the " | ||||||
|              "StringStore with `nlp.vocab.strings.add(string)`.") |              "StringStore with `nlp.vocab.strings.add(string)`.") | ||||||
|  |     E1010 = ("Unable to set entity information for token {i} which is included " | ||||||
|  |              "in more than one span in entities, blocked, missing or outside.") | ||||||
|  |     E1011 = ("Unsupported default '{default}' in doc.set_ents. Available " | ||||||
|  |              "options: {modes}") | ||||||
|  |     E1012 = ("Entity spans and blocked/missing/outside spans should be " | ||||||
|  |              "provided to doc.set_ents as lists of `Span` objects.") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @add_codes | @add_codes | ||||||
|  |  | ||||||
|  | @ -29,10 +29,10 @@ def test_doc_add_entities_set_ents_iob(en_vocab): | ||||||
|     ner.begin_training(lambda: [_ner_example(ner)]) |     ner.begin_training(lambda: [_ner_example(ner)]) | ||||||
|     ner(doc) |     ner(doc) | ||||||
| 
 | 
 | ||||||
|     doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)] |     doc.ents = [("ANIMAL", 3, 4)] | ||||||
|     assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"] |     assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"] | ||||||
| 
 | 
 | ||||||
|     doc.ents = [(doc.vocab.strings["WORD"], 0, 2)] |     doc.ents = [("WORD", 0, 2)] | ||||||
|     assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"] |     assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -152,7 +152,7 @@ def test_doc_api_set_ents(en_tokenizer): | ||||||
|     assert len(tokens.ents) == 0 |     assert len(tokens.ents) == 0 | ||||||
|     tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)] |     tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)] | ||||||
|     assert len(list(tokens.ents)) == 1 |     assert len(list(tokens.ents)) == 1 | ||||||
|     assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0] |     assert [t.ent_iob for t in tokens] == [2, 2, 3, 1, 2, 2, 2, 2] | ||||||
|     assert tokens.ents[0].label_ == "PRODUCT" |     assert tokens.ents[0].label_ == "PRODUCT" | ||||||
|     assert tokens.ents[0].start == 2 |     assert tokens.ents[0].start == 2 | ||||||
|     assert tokens.ents[0].end == 4 |     assert tokens.ents[0].end == 4 | ||||||
|  | @ -427,7 +427,7 @@ def test_has_annotation(en_vocab): | ||||||
|     doc[0].lemma_ = "a" |     doc[0].lemma_ = "a" | ||||||
|     doc[0].dep_ = "dep" |     doc[0].dep_ = "dep" | ||||||
|     doc[0].head = doc[1] |     doc[0].head = doc[1] | ||||||
|     doc.ents = [Span(doc, 0, 1, label="HELLO")] |     doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing") | ||||||
| 
 | 
 | ||||||
|     for attr in attrs: |     for attr in attrs: | ||||||
|         assert doc.has_annotation(attr) |         assert doc.has_annotation(attr) | ||||||
|  | @ -457,7 +457,74 @@ def test_is_flags_deprecated(en_tokenizer): | ||||||
|         doc.is_sentenced |         doc.is_sentenced | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_doc_set_ents(): | def test_doc_set_ents(en_tokenizer): | ||||||
|  |     # set ents | ||||||
|  |     doc = en_tokenizer("a b c d e") | ||||||
|  |     doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)]) | ||||||
|  |     assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 2] | ||||||
|  |     assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0] | ||||||
|  | 
 | ||||||
|  |     # add ents, invalid IOB repaired | ||||||
|  |     doc = en_tokenizer("a b c d e") | ||||||
|  |     doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)]) | ||||||
|  |     doc.set_ents([Span(doc, 0, 2, 12)], default="unmodified") | ||||||
|  |     assert [t.ent_iob for t in doc] == [3, 1, 3, 2, 2] | ||||||
|  |     assert [t.ent_type for t in doc] == [12, 12, 11, 0, 0] | ||||||
|  | 
 | ||||||
|  |     # missing ents | ||||||
|  |     doc = en_tokenizer("a b c d e") | ||||||
|  |     doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], missing=[doc[4:5]]) | ||||||
|  |     assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 0] | ||||||
|  |     assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0] | ||||||
|  | 
 | ||||||
|  |     # outside ents | ||||||
|  |     doc = en_tokenizer("a b c d e") | ||||||
|  |     doc.set_ents( | ||||||
|  |         [Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], | ||||||
|  |         outside=[doc[4:5]], | ||||||
|  |         default="missing", | ||||||
|  |     ) | ||||||
|  |     assert [t.ent_iob for t in doc] == [3, 3, 1, 0, 2] | ||||||
|  |     assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0] | ||||||
|  | 
 | ||||||
|  |     # blocked ents | ||||||
|  |     doc = en_tokenizer("a b c d e") | ||||||
|  |     doc.set_ents([], blocked=[doc[1:2], doc[3:5]], default="unmodified") | ||||||
|  |     assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3] | ||||||
|  |     assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0] | ||||||
|  |     assert doc.ents == tuple() | ||||||
|  | 
 | ||||||
|  |     # invalid IOB repaired after blocked | ||||||
|  |     doc.ents = [Span(doc, 3, 5, "ENT")] | ||||||
|  |     assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1] | ||||||
|  |     doc.set_ents([], blocked=[doc[3:4]], default="unmodified") | ||||||
|  |     assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3] | ||||||
|  | 
 | ||||||
|  |     # all types | ||||||
|  |     doc = en_tokenizer("a b c d e") | ||||||
|  |     doc.set_ents( | ||||||
|  |         [Span(doc, 0, 1, 10)], | ||||||
|  |         blocked=[doc[1:2]], | ||||||
|  |         missing=[doc[2:3]], | ||||||
|  |         outside=[doc[3:4]], | ||||||
|  |         default="unmodified", | ||||||
|  |     ) | ||||||
|  |     assert [t.ent_iob for t in doc] == [3, 3, 0, 2, 0] | ||||||
|  |     assert [t.ent_type for t in doc] == [10, 0, 0, 0, 0] | ||||||
|  | 
 | ||||||
|  |     doc = en_tokenizer("a b c d e") | ||||||
|  |     # single span instead of a list | ||||||
|  |     with pytest.raises(ValueError): | ||||||
|  |         doc.set_ents([], missing=doc[1:2]) | ||||||
|  |     # invalid default mode | ||||||
|  |     with pytest.raises(ValueError): | ||||||
|  |         doc.set_ents([], missing=[doc[1:2]], default="none") | ||||||
|  |     # conflicting/overlapping specifications | ||||||
|  |     with pytest.raises(ValueError): | ||||||
|  |         doc.set_ents([], missing=[doc[1:2]], outside=[doc[1:2]]) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_doc_ents_setter(): | ||||||
|     """Test that both strings and integers can be used to set entities in |     """Test that both strings and integers can be used to set entities in | ||||||
|     tuple format via doc.ents.""" |     tuple format via doc.ents.""" | ||||||
|     words = ["a", "b", "c", "d", "e"] |     words = ["a", "b", "c", "d", "e"] | ||||||
|  |  | ||||||
|  | @ -168,7 +168,7 @@ def test_accept_blocked_token(): | ||||||
|     ner2 = nlp2.create_pipe("ner", config=config) |     ner2 = nlp2.create_pipe("ner", config=config) | ||||||
| 
 | 
 | ||||||
|     # set "New York" to a blocked entity |     # set "New York" to a blocked entity | ||||||
|     doc2.ents = [(0, 3, 5)] |     doc2.set_ents([], blocked=[doc2[3:5]], default="unmodified") | ||||||
|     assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"] |     assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"] | ||||||
|     assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""] |     assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""] | ||||||
| 
 | 
 | ||||||
|  | @ -358,5 +358,5 @@ class BlockerComponent1: | ||||||
|         self.name = name |         self.name = name | ||||||
| 
 | 
 | ||||||
|     def __call__(self, doc): |     def __call__(self, doc): | ||||||
|         doc.ents = [(0, self.start, self.end)] |         doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified") | ||||||
|         return doc |         return doc | ||||||
|  |  | ||||||
|  | @ -7,6 +7,8 @@ from libc.stdint cimport int32_t, uint64_t | ||||||
| 
 | 
 | ||||||
| import copy | import copy | ||||||
| from collections import Counter | from collections import Counter | ||||||
|  | from enum import Enum | ||||||
|  | import itertools | ||||||
| import numpy | import numpy | ||||||
| import srsly | import srsly | ||||||
| from thinc.api import get_array_module | from thinc.api import get_array_module | ||||||
|  | @ -86,6 +88,17 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) | ||||||
|         return get_token_attr(token, feat_name) |         return get_token_attr(token, feat_name) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | class SetEntsDefault(str, Enum): | ||||||
|  |     blocked = "blocked" | ||||||
|  |     missing = "missing" | ||||||
|  |     outside = "outside" | ||||||
|  |     unmodified = "unmodified" | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def values(cls): | ||||||
|  |         return list(cls.__members__.keys()) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| cdef class Doc: | cdef class Doc: | ||||||
|     """A sequence of Token objects. Access sentences and named entities, export |     """A sequence of Token objects. Access sentences and named entities, export | ||||||
|     annotations to numpy arrays, losslessly serialize to compressed binary |     annotations to numpy arrays, losslessly serialize to compressed binary | ||||||
|  | @ -660,50 +673,100 @@ cdef class Doc: | ||||||
|             # TODO: |             # TODO: | ||||||
|             # 1. Test basic data-driven ORTH gazetteer |             # 1. Test basic data-driven ORTH gazetteer | ||||||
|             # 2. Test more nuanced date and currency regex |             # 2. Test more nuanced date and currency regex | ||||||
|             tokens_in_ents = {} |             cdef attr_t entity_type, kb_id | ||||||
|             cdef attr_t entity_type |             cdef int ent_start, ent_end | ||||||
|             cdef attr_t kb_id |             ent_spans = [] | ||||||
|             cdef int ent_start, ent_end, token_index |  | ||||||
|             for ent_info in ents: |             for ent_info in ents: | ||||||
|                 entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info) |                 entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info) | ||||||
|                 if isinstance(entity_type_, str): |                 if isinstance(entity_type_, str): | ||||||
|                     self.vocab.strings.add(entity_type_) |                     self.vocab.strings.add(entity_type_) | ||||||
|                 entity_type = self.vocab.strings.as_int(entity_type_) |                 span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id) | ||||||
|                 for token_index in range(ent_start, ent_end): |                 ent_spans.append(span) | ||||||
|                     if token_index in tokens_in_ents: |             self.set_ents(ent_spans, default=SetEntsDefault.outside) | ||||||
|                         raise ValueError(Errors.E103.format( | 
 | ||||||
|                             span1=(tokens_in_ents[token_index][0], |     def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside): | ||||||
|                                    tokens_in_ents[token_index][1], |         """Set entity annotation. | ||||||
|                                    self.vocab.strings[tokens_in_ents[token_index][2]]), | 
 | ||||||
|                             span2=(ent_start, ent_end, self.vocab.strings[entity_type]))) |         entities (List[Span]): Spans with labels to set as entities. | ||||||
|                     tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id) |         blocked (Optional[List[Span]]): Spans to set as 'blocked' (never an | ||||||
|  |             entity) for spacy's built-in NER component. Other components may | ||||||
|  |             ignore this setting. | ||||||
|  |         missing (Optional[List[Span]]): Spans with missing/unknown entity | ||||||
|  |             information. | ||||||
|  |         outside (Optional[List[Span]]): Spans outside of entities (O in IOB). | ||||||
|  |         default (str): How to set entity annotation for tokens outside of any | ||||||
|  |             provided spans. Options: "blocked", "missing", "outside" and | ||||||
|  |             "unmodified" (preserve current state). Defaults to "outside". | ||||||
|  |         """ | ||||||
|  |         if default not in SetEntsDefault.values(): | ||||||
|  |             raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault))) | ||||||
|  | 
 | ||||||
|  |         # Ignore spans with missing labels | ||||||
|  |         entities = [ent for ent in entities if ent.label > 0] | ||||||
|  | 
 | ||||||
|  |         if blocked is None: | ||||||
|  |             blocked = tuple() | ||||||
|  |         if missing is None: | ||||||
|  |             missing = tuple() | ||||||
|  |         if outside is None: | ||||||
|  |             outside = tuple() | ||||||
|  | 
 | ||||||
|  |         # Find all tokens covered by spans and check that none are overlapping | ||||||
|         cdef int i |         cdef int i | ||||||
|             for i in range(self.length): |         seen_tokens = set() | ||||||
|                 # default values |         for span in itertools.chain.from_iterable([entities, blocked, missing, outside]): | ||||||
|                 entity_type = 0 |             if not isinstance(span, Span): | ||||||
|                 kb_id = 0 |                 raise ValueError(Errors.E1012.format(span=span)) | ||||||
|  |             for i in range(span.start, span.end): | ||||||
|  |                 if i in seen_tokens: | ||||||
|  |                     raise ValueError(Errors.E1010.format(i=i)) | ||||||
|  |                 seen_tokens.add(i) | ||||||
| 
 | 
 | ||||||
|                 # Set ent_iob to Missing (0) by default unless this token was nered before |         # Set all specified entity information | ||||||
|                 ent_iob = 0 |         for span in entities: | ||||||
|                 if self.c[i].ent_iob != 0: |             for i in range(span.start, span.end): | ||||||
|                     ent_iob = 2 |                 if i == span.start: | ||||||
| 
 |                     self.c[i].ent_iob = 3 | ||||||
|                 # overwrite if the token was part of a specified entity |  | ||||||
|                 if i in tokens_in_ents.keys(): |  | ||||||
|                     ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i] |  | ||||||
|                     if entity_type is None or entity_type <= 0: |  | ||||||
|                         # Blocking this token from being overwritten by downstream NER |  | ||||||
|                         ent_iob = 3 |  | ||||||
|                     elif ent_start == i: |  | ||||||
|                         # Marking the start of an entity |  | ||||||
|                         ent_iob = 3 |  | ||||||
|                 else: |                 else: | ||||||
|                         # Marking the inside of an entity |                     self.c[i].ent_iob = 1 | ||||||
|                         ent_iob = 1 |                 self.c[i].ent_type = span.label | ||||||
|  |                 self.c[i].ent_kb_id = span.kb_id | ||||||
|  |         for span in blocked: | ||||||
|  |             for i in range(span.start, span.end): | ||||||
|  |                 self.c[i].ent_iob = 3 | ||||||
|  |                 self.c[i].ent_type = 0 | ||||||
|  |         for span in missing: | ||||||
|  |             for i in range(span.start, span.end): | ||||||
|  |                 self.c[i].ent_iob = 0 | ||||||
|  |                 self.c[i].ent_type = 0 | ||||||
|  |         for span in outside: | ||||||
|  |             for i in range(span.start, span.end): | ||||||
|  |                 self.c[i].ent_iob = 2 | ||||||
|  |                 self.c[i].ent_type = 0 | ||||||
| 
 | 
 | ||||||
|                 self.c[i].ent_type = entity_type |         # Set tokens outside of all provided spans | ||||||
|                 self.c[i].ent_kb_id = kb_id |         if default != SetEntsDefault.unmodified: | ||||||
|                 self.c[i].ent_iob = ent_iob |             for i in range(self.length): | ||||||
|  |                 if i not in seen_tokens: | ||||||
|  |                     self.c[i].ent_type = 0 | ||||||
|  |                     if default == SetEntsDefault.outside: | ||||||
|  |                         self.c[i].ent_iob = 2 | ||||||
|  |                     elif default == SetEntsDefault.missing: | ||||||
|  |                         self.c[i].ent_iob = 0 | ||||||
|  |                     elif default == SetEntsDefault.blocked: | ||||||
|  |                         self.c[i].ent_iob = 3 | ||||||
|  | 
 | ||||||
|  |         # Fix any resulting inconsistent annotation | ||||||
|  |         for i in range(self.length - 1): | ||||||
|  |             # I must follow B or I: convert I to B | ||||||
|  |             if (self.c[i].ent_iob == 0 or self.c[i].ent_iob == 2) and \ | ||||||
|  |                     self.c[i+1].ent_iob == 1: | ||||||
|  |                 self.c[i+1].ent_iob = 3 | ||||||
|  |             # Change of type with BI or II: convert second I to B | ||||||
|  |             if self.c[i].ent_type != self.c[i+1].ent_type and \ | ||||||
|  |                     (self.c[i].ent_iob == 3 or self.c[i].ent_iob == 1) and \ | ||||||
|  |                     self.c[i+1].ent_iob == 1: | ||||||
|  |                 self.c[i+1].ent_iob = 3 | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def noun_chunks(self): |     def noun_chunks(self): | ||||||
|  |  | ||||||
|  | @ -288,6 +288,7 @@ def _annot2array(vocab, tok_annot, doc_annot): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _add_entities_to_doc(doc, ner_data): | def _add_entities_to_doc(doc, ner_data): | ||||||
|  |     print(ner_data) | ||||||
|     if ner_data is None: |     if ner_data is None: | ||||||
|         return |         return | ||||||
|     elif ner_data == []: |     elif ner_data == []: | ||||||
|  | @ -303,9 +304,14 @@ def _add_entities_to_doc(doc, ner_data): | ||||||
|             biluo_tags_to_spans(doc, ner_data) |             biluo_tags_to_spans(doc, ner_data) | ||||||
|         ) |         ) | ||||||
|     elif isinstance(ner_data[0], Span): |     elif isinstance(ner_data[0], Span): | ||||||
|         # Ugh, this is super messy. Really hard to set O entities |         entities = [] | ||||||
|         doc.ents = ner_data |         missing = [] | ||||||
|         doc.ents = [span for span in ner_data if span.label_] |         for span in ner_data: | ||||||
|  |             if span.label: | ||||||
|  |                 entities.append(span) | ||||||
|  |             else: | ||||||
|  |                 missing.append(span) | ||||||
|  |         doc.set_ents(entities, missing=missing) | ||||||
|     else: |     else: | ||||||
|         raise ValueError(Errors.E973) |         raise ValueError(Errors.E973) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -151,9 +151,10 @@ def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]: | ||||||
| 
 | 
 | ||||||
|     doc (Doc): The document that the BILUO tags refer to. |     doc (Doc): The document that the BILUO tags refer to. | ||||||
|     entities (iterable): A sequence of BILUO tags with each tag describing one |     entities (iterable): A sequence of BILUO tags with each tag describing one | ||||||
|         token. Each tags string will be of the form of either "", "O" or |         token. Each tag string will be of the form of either "", "O" or | ||||||
|         "{action}-{label}", where action is one of "B", "I", "L", "U". |         "{action}-{label}", where action is one of "B", "I", "L", "U". | ||||||
|     RETURNS (list): A sequence of Span objects. |     RETURNS (list): A sequence of Span objects. Each token with a missing IOB | ||||||
|  |         tag is returned as a Span with an empty label. | ||||||
|     """ |     """ | ||||||
|     token_offsets = tags_to_entities(tags) |     token_offsets = tags_to_entities(tags) | ||||||
|     spans = [] |     spans = [] | ||||||
|  | @ -186,22 +187,18 @@ def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]: | ||||||
|     entities = [] |     entities = [] | ||||||
|     start = None |     start = None | ||||||
|     for i, tag in enumerate(tags): |     for i, tag in enumerate(tags): | ||||||
|         if tag is None: |         if tag is None or tag.startswith("-"): | ||||||
|             continue |  | ||||||
|         if tag.startswith("O"): |  | ||||||
|             # TODO: We shouldn't be getting these malformed inputs. Fix this. |             # TODO: We shouldn't be getting these malformed inputs. Fix this. | ||||||
|             if start is not None: |             if start is not None: | ||||||
|                 start = None |                 start = None | ||||||
|             else: |             else: | ||||||
|                 entities.append(("", i, i)) |                 entities.append(("", i, i)) | ||||||
|             continue |         elif tag.startswith("O"): | ||||||
|         elif tag == "-": |             pass | ||||||
|             continue |  | ||||||
|         elif tag.startswith("I"): |         elif tag.startswith("I"): | ||||||
|             if start is None: |             if start is None: | ||||||
|                 raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1])) |                 raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1])) | ||||||
|             continue |         elif tag.startswith("U"): | ||||||
|         if tag.startswith("U"): |  | ||||||
|             entities.append((tag[2:], i, i)) |             entities.append((tag[2:], i, i)) | ||||||
|         elif tag.startswith("B"): |         elif tag.startswith("B"): | ||||||
|             start = i |             start = i | ||||||
|  |  | ||||||
|  | @ -219,6 +219,30 @@ alignment mode `"strict". | ||||||
| | `alignment_mode`                     | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | | `alignment_mode`                     | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | ||||||
| | **RETURNS**                          | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   | | | **RETURNS**                          | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   | | ||||||
| 
 | 
 | ||||||
|  | ## Doc.set_ents {#set_ents tag="method" new="3"} | ||||||
|  | 
 | ||||||
|  | Set the named entities in the document. | ||||||
|  | 
 | ||||||
|  | > #### Example | ||||||
|  | > | ||||||
|  | > ```python | ||||||
|  | > from spacy.tokens import Span | ||||||
|  | > doc = nlp("Mr. Best flew to New York on Saturday morning.") | ||||||
|  | > doc.set_ents([Span(doc, 0, 2, "PERSON")]) | ||||||
|  | > ents = list(doc.ents) | ||||||
|  | > assert ents[0].label_ == "PERSON" | ||||||
|  | > assert ents[0].text == "Mr. Best" | ||||||
|  | > ``` | ||||||
|  | 
 | ||||||
|  | | Name           | Description                                                                                                                                                                               | | ||||||
|  | | -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
|  | | entities       | Spans with labels to set as entities. ~~List[Span]~~                                                                                                                                      | | ||||||
|  | | _keyword-only_ |                                                                                                                                                                                           | | ||||||
|  | | blocked        | Spans to set as "blocked" (never an entity) for spaCy's built-in NER component. Other components may ignore this setting. ~~Optional[List[Span]]~~                                        | | ||||||
|  | | missing        | Spans with missing/unknown entity information. ~~Optional[List[Span]]~~                                                                                                                   | | ||||||
|  | | outside        | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~                                                                                                                            | | ||||||
|  | | default        | How to set entity annotation for tokens outside of any provided spans. Options: "blocked", "missing", "outside" and "unmodified" (preserve current state). Defaults to "outside". ~~str~~ | | ||||||
|  | 
 | ||||||
| ## Doc.similarity {#similarity tag="method" model="vectors"} | ## Doc.similarity {#similarity tag="method" model="vectors"} | ||||||
| 
 | 
 | ||||||
| Make a semantic similarity estimate. The default estimate is cosine similarity | Make a semantic similarity estimate. The default estimate is cosine similarity | ||||||
|  | @ -542,7 +566,6 @@ objects, if the entity recognizer has been applied. | ||||||
| > ```python | > ```python | ||||||
| > doc = nlp("Mr. Best flew to New York on Saturday morning.") | > doc = nlp("Mr. Best flew to New York on Saturday morning.") | ||||||
| > ents = list(doc.ents) | > ents = list(doc.ents) | ||||||
| > assert ents[0].label == 346 |  | ||||||
| > assert ents[0].label_ == "PERSON" | > assert ents[0].label_ == "PERSON" | ||||||
| > assert ents[0].text == "Mr. Best" | > assert ents[0].text == "Mr. Best" | ||||||
| > ``` | > ``` | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user