diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 4317bdeb4..4a40e28c2 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -23,29 +23,25 @@ def noun_chunks(doclike): conj = doc.vocab.strings.add("conj") nmod = doc.vocab.strings.add("nmod") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue flag = False if word.pos == NOUN: # check for patterns such as γραμμή παραγωγής for potential_nmod in word.rights: if potential_nmod.dep == nmod: - seen.update( - j for j in range(word.left_edge.i, potential_nmod.i + 1) - ) + prev_end = potential_nmod.i yield word.left_edge.i, potential_nmod.i + 1, np_label flag = True break if flag is False: - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: # covers the case: έχει όμορφα και έξυπνα παιδιά @@ -54,9 +50,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 6d366ec90..0f2b28b58 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -28,17 +28,15 @@ def noun_chunks(doclike): np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(doclike): if word.pos not 
in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,9 +44,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index 6d366ec90..0f2b28b58 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -28,17 +28,15 @@ def noun_chunks(doclike): np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,9 +44,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.i + 1)) + prev_end = word.i yield word.left_edge.i, word.i + 1, np_label diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 
2ed2c1b35..d6c12e69f 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -27,17 +27,15 @@ def noun_chunks(doclike): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -45,9 +43,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 2ed2c1b35..d6c12e69f 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -27,17 +27,15 @@ def noun_chunks(doclike): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label 
elif word.dep == conj: head = word.head @@ -45,9 +43,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 2ed2c1b35..d6c12e69f 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -27,17 +27,15 @@ def noun_chunks(doclike): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -45,9 +43,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 84493ae79..84d295f96 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -28,17 +28,15 @@ def noun_chunks(doclike): np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") np_label = 
doc.vocab.strings.add("NP") - seen = set() + prev_end = -1 for i, word in enumerate(doclike): if word.pos not in (NOUN, PROPN, PRON): continue # Prevent nested chunks from being produced - if word.i in seen: + if word.left_edge.i <= prev_end: continue if word.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label elif word.dep == conj: head = word.head @@ -46,9 +44,7 @@ def noun_chunks(doclike): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: - if any(w.i in seen for w in word.subtree): - continue - seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1)) + prev_end = word.right_edge.i yield word.left_edge.i, word.right_edge.i + 1, np_label diff --git a/spacy/language.py b/spacy/language.py index dae7d96a2..e6f8bb8e0 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -417,7 +417,7 @@ class Language(object): def __call__(self, text, disable=[], component_cfg=None): """Apply the pipeline to some text. The text can span multiple sentences, - and can contain arbtrary whitespace. Alignment into the original string + and can contain arbitrary whitespace. Alignment into the original string is preserved. text (unicode): The text to be processed. 
diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py new file mode 100644 index 000000000..33281c858 --- /dev/null +++ b/spacy/tests/regression/test_issue5458.py @@ -0,0 +1,21 @@ +from spacy.lang.en import English +from spacy.lang.en.syntax_iterators import noun_chunks +from spacy.tests.util import get_doc +from spacy.vocab import Vocab + + +def test_issue5458(): +    # Test that the noun chunker does not generate overlapping spans +    words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] +    vocab = Vocab(strings=words) +    dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] +    pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] +    heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10] + +    en_doc = get_doc(vocab, words, pos_tags, heads, dependencies) +    en_doc.noun_chunks_iterator = noun_chunks + +    # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans" +    nlp = English() +    merge_nps = nlp.create_pipe("merge_noun_chunks") +    merge_nps(en_doc)