Merge pull request #5470 from svlandeg/bugfix/noun-chunks

Bugfix in noun chunks
This commit is contained in:
Matthew Honnibal 2020-05-21 20:51:31 +02:00 committed by GitHub
commit 5ce02c1b17
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 51 additions and 60 deletions

View File

@ -23,29 +23,25 @@ def noun_chunks(doclike):
conj = doc.vocab.strings.add("conj")
nmod = doc.vocab.strings.add("nmod")
np_label = doc.vocab.strings.add("NP")
seen = set()
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
flag = False
if word.pos == NOUN:
# check for patterns such as γραμμή παραγωγής
for potential_nmod in word.rights:
if potential_nmod.dep == nmod:
seen.update(
j for j in range(word.left_edge.i, potential_nmod.i + 1)
)
prev_end = potential_nmod.i
yield word.left_edge.i, potential_nmod.i + 1, np_label
flag = True
break
if flag is False:
seen.update(j for j in range(word.left_edge.i, word.i + 1))
prev_end = word.i
yield word.left_edge.i, word.i + 1, np_label
elif word.dep == conj:
# covers the case: έχει όμορφα και έξυπνα παιδιά
@ -54,9 +50,7 @@ def noun_chunks(doclike):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.i + 1))
prev_end = word.i
yield word.left_edge.i, word.i + 1, np_label

View File

@ -28,17 +28,15 @@ def noun_chunks(doclike):
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.i + 1))
prev_end = word.i
yield word.left_edge.i, word.i + 1, np_label
elif word.dep == conj:
head = word.head
@ -46,9 +44,7 @@ def noun_chunks(doclike):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.i + 1))
prev_end = word.i
yield word.left_edge.i, word.i + 1, np_label

View File

@ -28,17 +28,15 @@ def noun_chunks(doclike):
np_deps = [doc.vocab.strings.add(label) for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.i + 1))
prev_end = word.i
yield word.left_edge.i, word.i + 1, np_label
elif word.dep == conj:
head = word.head
@ -46,9 +44,7 @@ def noun_chunks(doclike):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.i + 1))
prev_end = word.i
yield word.left_edge.i, word.i + 1, np_label

View File

@ -27,17 +27,15 @@ def noun_chunks(doclike):
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@ -45,9 +43,7 @@ def noun_chunks(doclike):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label

View File

@ -27,17 +27,15 @@ def noun_chunks(doclike):
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@ -45,9 +43,7 @@ def noun_chunks(doclike):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label

View File

@ -27,17 +27,15 @@ def noun_chunks(doclike):
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@ -45,9 +43,7 @@ def noun_chunks(doclike):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label

View File

@ -28,17 +28,15 @@ def noun_chunks(doclike):
np_deps = [doc.vocab.strings[label] for label in labels]
conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP")
seen = set()
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON):
continue
# Prevent nested chunks from being produced
if word.i in seen:
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label
elif word.dep == conj:
head = word.head
@ -46,9 +44,7 @@ def noun_chunks(doclike):
head = head.head
# If the head is an NP, and we're coordinated to it, we're an NP
if head.dep in np_deps:
if any(w.i in seen for w in word.subtree):
continue
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
prev_end = word.right_edge.i
yield word.left_edge.i, word.right_edge.i + 1, np_label

View File

@ -417,7 +417,7 @@ class Language(object):
def __call__(self, text, disable=[], component_cfg=None):
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbtrary whitespace. Alignment into the original string
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
text (unicode): The text to be processed.

View File

@ -0,0 +1,21 @@
from spacy.lang.en import English
from spacy.lang.en.syntax_iterators import noun_chunks
from spacy.tests.util import get_doc
from spacy.vocab import Vocab
def test_issue5458():
    """Regression test for issue 5458: the noun chunker must not yield overlapping spans."""
    # One row per token: (text, coarse POS tag, dependency label, head value).
    # NOTE(review): head values are handed to get_doc unchanged; presumably
    # they are offsets relative to the token's own position -- confirm
    # against get_doc.
    annotated_tokens = [
        ("In", "ADP", "ROOT", 0),
        ("an", "DET", "det", 1),
        ("era", "NOUN", "pobj", -2),
        ("where", "ADV", "advmod", 6),
        ("markets", "NOUN", "nsubj", 2),
        ("have", "AUX", "aux", 1),
        ("brought", "VERB", "relcl", -4),
        ("prosperity", "NOUN", "dobj", -1),
        ("and", "CCONJ", "cc", -1),
        ("empowerment", "NOUN", "conj", -2),
        (".", "PUNCT", "punct", -10),
    ]
    # Transpose the table into the parallel per-attribute lists get_doc takes.
    texts, pos_tags, dep_labels, head_values = (
        list(column) for column in zip(*annotated_tokens)
    )
    doc = get_doc(Vocab(strings=texts), texts, pos_tags, head_values, dep_labels)
    doc.noun_chunks_iterator = noun_chunks
    # There is no explicit assert: if the chunker produced overlapping spans,
    # merging them fails with an E102 error "Can't merge non-disjoint spans",
    # so finishing without an exception is the pass condition.
    merger = English().create_pipe("merge_noun_chunks")
    merger(doc)