diff --git a/setup.cfg b/setup.cfg
index 4d0a88c35..bcb85eef3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -53,7 +53,7 @@ install_requires =
 
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=0.0.4<0.2.0
+    spacy_lookups_data>=0.0.5<0.2.0
 cuda =
     thinc_gpu_ops>=0.0.1,<0.1.0
     cupy>=5.0.0b4
diff --git a/spacy/errors.py b/spacy/errors.py
index 2ef5d1ce4..51565ade6 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -324,7 +324,9 @@ class Errors(object):
     E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
             "have been declared in previous edges.")
     E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
-            "tokens to merge.")
+            "tokens to merge. If you want to find the longest non-overlapping "
+            "spans, you can use the util.filter_spans helper:\n"
+            "https://spacy.io/api/top-level#util.filter_spans")
     E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
             "token can only be part of one entity, so make sure the entities "
             "you're setting don't overlap.")
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 4128fa73f..e593731d4 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1086,6 +1086,14 @@ with doc.retokenize() as retokenizer:
 print("After:", [token.text for token in doc])
 ```
 
+> #### Tip: merging entities and noun phrases
+>
+> If you need to merge named entities or noun chunks, check out the built-in
+> [`merge_entities`](/api/pipeline-functions#merge_entities) and
+> [`merge_noun_chunks`](/api/pipeline-functions#merge_noun_chunks) pipeline
+> components. When added to your pipeline using `nlp.add_pipe`, they'll take
+> care of merging the spans automatically.
+
 If an attribute in the `attrs` is a context-dependent token attribute, it will
 be applied to the underlying [`Token`](/api/token). For example `LEMMA`, `POS`
 or `DEP` only apply to a word in context, so they're token attributes. If an
@@ -1094,16 +1102,24 @@ underlying [`Lexeme`](/api/lexeme), the entry in the vocabulary. For example,
 `LOWER` or `IS_STOP` apply to all words of the same spelling, regardless of the
 context.
 
-
+
 
-If you need to merge named entities or noun chunks, check out the built-in
-[`merge_entities`](/api/pipeline-functions#merge_entities) and
-[`merge_noun_chunks`](/api/pipeline-functions#merge_noun_chunks) pipeline
-components. When added to your pipeline using `nlp.add_pipe`, they'll take care
-of merging the spans automatically.
+If you're trying to merge spans that overlap, spaCy will raise an error because
+it's unclear how the result should look. Depending on the application, you may
+want to match the shortest or longest possible span, so it's up to you to filter
+them. If you're looking for the longest non-overlapping span, you can use the
+[`util.filter_spans`](/api/top-level#util.filter_spans) helper:
+
+```python
+doc = nlp("I live in Berlin Kreuzberg")
+spans = [doc[3:5], doc[3:4], doc[4:5]]
+filtered_spans = filter_spans(spans)
+```
 
 
 
+### Splitting tokens
+
 The [`retokenizer.split`](/api/doc#retokenizer.split) method allows splitting
 one token into two or more tokens. This can be useful for cases where
 tokenization rules alone aren't sufficient. For example, you might want to split
@@ -1168,7 +1184,7 @@ with doc.retokenize() as retokenizer:
 
 
 When splitting tokens, the subtoken texts always have to match the original
-token text – or, put differently `''.join(subtokens) == token.text` always needs
+token text – or, put differently `"".join(subtokens) == token.text` always needs
 to hold true. If this wasn't the case, splitting tokens could easily end up
 producing confusing and unexpected results that would contradict spaCy's
 non-destructive tokenization policy.
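
For reference, a minimal sketch (not part of the patch) of the behaviour the updated E102 message and docs describe: merging overlapping spans directly raises the error, while `util.filter_spans` keeps only the longest non-overlapping spans. It assumes spaCy v2.1.4 or later, where `spacy.util.filter_spans` is available; the blank pipeline and variable names are illustrative.

```python
# Illustrative sketch: resolving overlapping spans with util.filter_spans
# before merging (assumes spaCy v2.1.4+, where filter_spans is available).
import spacy
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp("I live in Berlin Kreuzberg")

# Overlapping candidates: "Berlin Kreuzberg", "Berlin", "Kreuzberg".
spans = [doc[3:5], doc[3:4], doc[4:5]]

# Merging these directly would raise E102, since the spans are not disjoint.
# filter_spans keeps the longest spans and drops any that overlap with them.
filtered_spans = filter_spans(spans)  # -> [Berlin Kreuzberg]

with doc.retokenize() as retokenizer:
    for span in filtered_spans:
        retokenizer.merge(span)

print([token.text for token in doc])  # ['I', 'live', 'in', 'Berlin Kreuzberg']
```

Note that `filter_spans` prefers longer spans and, for spans of equal length, the one that starts earlier.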