Merge pull request #5470 from svlandeg/bugfix/noun-chunks

Bugfix in noun chunks
Matthew Honnibal 2020-05-21 20:51:31 +02:00 committed by GitHub
commit 5ce02c1b17
9 changed files with 51 additions and 60 deletions

View File

@@ -23,29 +23,25 @@ def noun_chunks(doclike):
     conj = doc.vocab.strings.add("conj")
     nmod = doc.vocab.strings.add("nmod")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
             flag = False
             if word.pos == NOUN:
                 # check for patterns such as γραμμή παραγωγής
                 for potential_nmod in word.rights:
                     if potential_nmod.dep == nmod:
-                        seen.update(
-                            j for j in range(word.left_edge.i, potential_nmod.i + 1)
-                        )
+                        prev_end = potential_nmod.i
                         yield word.left_edge.i, potential_nmod.i + 1, np_label
                         flag = True
                         break
             if flag is False:
-                seen.update(j for j in range(word.left_edge.i, word.i + 1))
+                prev_end = word.i
                 yield word.left_edge.i, word.i + 1, np_label
         elif word.dep == conj:
             # covers the case: έχει όμορφα και έξυπνα παιδιά
@@ -54,9 +50,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.i + 1))
+                prev_end = word.i
                 yield word.left_edge.i, word.i + 1, np_label
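The core of the fix is the same in every language module in this commit: the old code tracked every consumed token index in a `seen` set, while the new code keeps only `prev_end`, the index of the last token yielded so far. Because candidates are visited left to right, comparing each candidate's `word.left_edge.i` against that single cursor is enough to reject nested or overlapping chunks. A minimal standalone sketch of the idea in plain Python (not spaCy's API; the helper name is made up for illustration):

def non_overlapping(spans):
    """Filter sorted (start, end) candidate spans down to a disjoint set."""
    prev_end = -1
    for start, end in spans:
        if start <= prev_end:  # span starts inside the previous chunk: skip it
            continue
        prev_end = end - 1  # remember the index of the chunk's last token
        yield start, end

# The nested candidate (1, 3) is dropped; the disjoint (4, 6) survives.
assert list(non_overlapping([(0, 3), (1, 3), (4, 6)])) == [(0, 3), (4, 6)]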

View File

@@ -28,17 +28,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.i + 1))
+            prev_end = word.i
             yield word.left_edge.i, word.i + 1, np_label
         elif word.dep == conj:
             head = word.head
@@ -46,9 +44,7 @@ def noun_chunks(doclike):
                 head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.i + 1))
+                prev_end = word.i
                 yield word.left_edge.i, word.i + 1, np_label

View File

@@ -28,17 +28,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.i + 1))
+            prev_end = word.i
             yield word.left_edge.i, word.i + 1, np_label
         elif word.dep == conj:
             head = word.head
@@ -46,9 +44,7 @@ def noun_chunks(doclike):
                 head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.i + 1))
+                prev_end = word.i
                 yield word.left_edge.i, word.i + 1, np_label

View File

@@ -27,17 +27,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head
@@ -45,9 +43,7 @@ def noun_chunks(doclike):
                 head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+                prev_end = word.right_edge.i
                 yield word.left_edge.i, word.right_edge.i + 1, np_label
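Note the difference from the previous two modules: this iterator (and the two that follow) ends each chunk at `word.right_edge.i`, the last token of the noun's whole subtree, rather than at the noun itself (`word.i`), so `prev_end` advances to the subtree's right edge. A hedged illustration of the three edge attributes (the model name is an assumption, not part of this diff):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model; any parser will do
doc = nlp("the dog with the loud bark")
dog = doc[1]
# For a typical parse that attaches the prepositional phrase to "dog":
# dog.left_edge.i  -> 0 ("the"),  leftmost token of the subtree
# dog.i            -> 1 ("dog"),  the noun itself
# dog.right_edge.i -> 5 ("bark"), rightmost token of the subtree
print(dog.left_edge.i, dog.i, dog.right_edge.i)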

View File

@@ -27,17 +27,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head
@@ -45,9 +43,7 @@ def noun_chunks(doclike):
                 head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+                prev_end = word.right_edge.i
                 yield word.left_edge.i, word.right_edge.i + 1, np_label

View File

@@ -27,17 +27,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head
@@ -45,9 +43,7 @@ def noun_chunks(doclike):
                 head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+                prev_end = word.right_edge.i
                 yield word.left_edge.i, word.right_edge.i + 1, np_label

View File

@@ -28,17 +28,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head
@@ -46,9 +44,7 @@ def noun_chunks(doclike):
                 head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+                prev_end = word.right_edge.i
                 yield word.left_edge.i, word.right_edge.i + 1, np_label
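With every iterator now maintaining the `prev_end` cursor, the spans coming out of `doc.noun_chunks` are guaranteed disjoint, which is exactly what the `merge_noun_chunks` pipe relies on. A hedged usage sketch (again assuming an installed English model, which is not part of this diff):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model name
doc = nlp("In an era where markets have brought prosperity and empowerment.")
for chunk in doc.noun_chunks:
    # chunk starts are strictly increasing and the spans never overlap
    print(chunk.start, chunk.end, chunk.text)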

View File

@@ -417,7 +417,7 @@ class Language(object):
     def __call__(self, text, disable=[], component_cfg=None):
         """Apply the pipeline to some text. The text can span multiple sentences,
-        and can contain arbtrary whitespace. Alignment into the original string
+        and can contain arbitrary whitespace. Alignment into the original string
         is preserved.

         text (unicode): The text to be processed.

View File

@@ -0,0 +1,21 @@
+from spacy.lang.en import English
+from spacy.lang.en.syntax_iterators import noun_chunks
+from spacy.tests.util import get_doc
+from spacy.vocab import Vocab
+
+
+def test_issue5458():
+    # Test that the noun chunker does not generate overlapping spans
+    words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
+    vocab = Vocab(strings=words)
+    dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
+    pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
+    heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10]
+
+    en_doc = get_doc(vocab, words, pos_tags, heads, dependencies)
+    en_doc.noun_chunks_iterator = noun_chunks
+
+    # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans"
+    nlp = English()
+    merge_nps = nlp.create_pipe("merge_noun_chunks")
+    merge_nps(en_doc)
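One way to extend this regression test (a sketch, not part of the committed file) would be to assert the non-overlap invariant directly, before the merge step, instead of relying on merge_noun_chunks to raise E102:

# Illustrative only: insert before the merge_nps(en_doc) call above.
spans = list(en_doc.noun_chunks)
for prev, cur in zip(spans, spans[1:]):
    assert prev.end <= cur.start  # each chunk starts after the previous one ends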