diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index fd76c6e4d..40ea2bf9c 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -599,18 +599,27 @@ ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
 print('Before', ents)
 # The model didn't recognize "fb" as an entity :(
 
-fb_ent = Span(doc, 0, 1, label="ORG") # create a Span for the new entity
+# Create a span for the new entity
+fb_ent = Span(doc, 0, 1, label="ORG")
+
+# Option 1: Modify the provided entity spans, leaving the rest unmodified
+doc.set_ents([fb_ent], default="unmodified")
+
+# Option 2: Assign a complete list of ents to doc.ents
 doc.ents = list(doc.ents) + [fb_ent]
 
-ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
+ents = [(e.text, e.start, e.end, e.label_) for e in doc.ents]
 print('After', ents)
-# [('fb', 0, 2, 'ORG')] 🎉
+# [('fb', 0, 1, 'ORG')] 🎉
 ```
 
-Keep in mind that you need to create a `Span` with the start and end index of
-the **token**, not the start and end index of the entity in the document. In
-this case, "fb" is token `(0, 1)` – but at the document level, the entity will
-have the start and end indices `(0, 2)`.
+Keep in mind that `Span` is initialized with the start and end **token**
+indices, not the character offsets. To create a span from character offsets, use
+[`Doc.char_span`](/api/doc#char_span):
+
+```python
+fb_ent = doc.char_span(0, 2, label="ORG")
+```
 
 #### Setting entity annotations from array {#setting-from-array}
 
@@ -645,9 +654,10 @@ write efficient native code.
 
 ```python
 # cython: infer_types=True
+from spacy.typedefs cimport attr_t
 from spacy.tokens.doc cimport Doc
 
-cpdef set_entity(Doc doc, int start, int end, int ent_type):
+cpdef set_entity(Doc doc, int start, int end, attr_t ent_type):
     for i in range(start, end):
         doc.c[i].ent_type = ent_type
     doc.c[start].ent_iob = 3