Mirror of https://github.com/explosion/spaCy.git

Merge pull request #5470 from svlandeg/bugfix/noun-chunks

Bugfix in noun chunks

Commit 5ce02c1b17
@@ -23,29 +23,25 @@ def noun_chunks(doclike):
     conj = doc.vocab.strings.add("conj")
     nmod = doc.vocab.strings.add("nmod")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
             flag = False
             if word.pos == NOUN:
                 # check for patterns such as γραμμή παραγωγής
                 for potential_nmod in word.rights:
                     if potential_nmod.dep == nmod:
-                        seen.update(
-                            j for j in range(word.left_edge.i, potential_nmod.i + 1)
-                        )
+                        prev_end = potential_nmod.i
                         yield word.left_edge.i, potential_nmod.i + 1, np_label
                         flag = True
                         break
             if flag is False:
-                seen.update(j for j in range(word.left_edge.i, word.i + 1))
+                prev_end = word.i
                 yield word.left_edge.i, word.i + 1, np_label
         elif word.dep == conj:
             # covers the case: έχει όμορφα και έξυπνα παιδιά

@@ -54,9 +50,7 @@ def noun_chunks(doclike):
             head = head.head
         # If the head is an NP, and we're coordinated to it, we're an NP
         if head.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.i + 1))
+            prev_end = word.i
             yield word.left_edge.i, word.i + 1, np_label
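Across every iterator in this PR, the fix follows the same pattern: the `seen` set of token indices is replaced by a single `prev_end` index, and a candidate chunk is skipped whenever its left edge falls inside the most recently yielded chunk. A minimal sketch of that guard, using hypothetical (left_edge, end) index pairs instead of real spaCy tokens:

    def non_overlapping(candidates):
        # Yield (start, end) spans in order, skipping any candidate whose
        # left edge falls inside the previously yielded span.
        prev_end = -1
        for left_edge, end in candidates:
            if left_edge <= prev_end:
                continue  # would overlap the last chunk
            prev_end = end - 1  # index of the last token in the chunk
            yield left_edge, end

    # Candidates (2, 5) and (4, 7) overlap; only the first survives.
    print(list(non_overlapping([(0, 1), (2, 5), (4, 7), (8, 10)])))
    # -> [(0, 1), (2, 5), (8, 10)]

Since the iterators yield chunks in document order, remembering only the end of the last chunk is enough to rule out any overlap.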
@@ -28,17 +28,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.i + 1))
+            prev_end = word.i
             yield word.left_edge.i, word.i + 1, np_label
         elif word.dep == conj:
             head = word.head

@@ -46,9 +44,7 @@ def noun_chunks(doclike):
             head = head.head
         # If the head is an NP, and we're coordinated to it, we're an NP
         if head.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.i + 1))
+            prev_end = word.i
             yield word.left_edge.i, word.i + 1, np_label
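Why the old guard was not enough: it checked whether the candidate's own index or subtree intersected `seen`, but a conjunct's left edge can reach back across earlier chunks through a token that was never added to `seen` (in the issue #5458 test sentence further down, "where" attaches to "empowerment"). A toy reproduction with hand-built (left_edge, word_i, subtree) triples rather than a parsed Doc:

    # Toy reproduction of the failure mode fixed here.
    candidates = [
        (1, 2, {1, 2}),     # "an era"
        (4, 4, {4}),        # "markets"
        (7, 7, {7, 8, 9}),  # "prosperity and empowerment"
        (3, 9, {3, 9}),     # conjunct whose left edge reaches back to 3
    ]

    def old_chunks(cands):
        seen = set()
        for left, i, subtree in cands:
            if i in seen or any(j in seen for j in subtree):
                continue
            seen.update(range(left, i + 1))
            yield left, i + 1

    def new_chunks(cands):
        prev_end = -1
        for left, i, subtree in cands:
            if left <= prev_end:
                continue
            prev_end = i
            yield left, i + 1

    print(list(old_chunks(candidates)))  # [(1, 3), (4, 5), (7, 8), (3, 10)] <- overlaps!
    print(list(new_chunks(candidates)))  # [(1, 3), (4, 5), (7, 8)]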
@@ -28,17 +28,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
    for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.i + 1))
+            prev_end = word.i
             yield word.left_edge.i, word.i + 1, np_label
         elif word.dep == conj:
             head = word.head

@@ -46,9 +44,7 @@ def noun_chunks(doclike):
             head = head.head
         # If the head is an NP, and we're coordinated to it, we're an NP
         if head.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.i + 1))
+            prev_end = word.i
             yield word.left_edge.i, word.i + 1, np_label
@@ -27,17 +27,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head

@@ -45,9 +43,7 @@ def noun_chunks(doclike):
             head = head.head
         # If the head is an NP, and we're coordinated to it, we're an NP
         if head.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
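This variant, and the three after it, yield chunks that extend to `word.right_edge` rather than ending at `word.i`, so the fix records `word.right_edge.i` in `prev_end`. The invariant is that `prev_end` always points at the last token of the most recently yielded chunk; a quick illustration with hypothetical indices of why recording `word.i` here would be wrong:

    # Hypothetical token indices for a chunk spanning left_edge..right_edge.
    chunk_left, word_i, right_edge = 2, 3, 6

    prev_end_wrong = word_i       # 3: tokens 4..6 would still look "free"
    prev_end_right = right_edge   # 6: the whole yielded chunk is guarded

    next_left_edge = 5            # a later candidate starting inside the tail
    print(next_left_edge <= prev_end_wrong)  # False -> overlap would slip through
    print(next_left_edge <= prev_end_right)  # True  -> correctly skipped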
@@ -27,17 +27,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head

@@ -45,9 +43,7 @@ def noun_chunks(doclike):
             head = head.head
         # If the head is an NP, and we're coordinated to it, we're an NP
         if head.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
@@ -27,17 +27,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head

@@ -45,9 +43,7 @@ def noun_chunks(doclike):
             head = head.head
         # If the head is an NP, and we're coordinated to it, we're an NP
         if head.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
@@ -28,17 +28,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head

@@ -46,9 +44,7 @@ def noun_chunks(doclike):
             head = head.head
         # If the head is an NP, and we're coordinated to it, we're an NP
         if head.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
@@ -417,7 +417,7 @@ class Language(object):
 
     def __call__(self, text, disable=[], component_cfg=None):
         """Apply the pipeline to some text. The text can span multiple sentences,
-        and can contain arbtrary whitespace. Alignment into the original string
+        and can contain arbitrary whitespace. Alignment into the original string
         is preserved.
 
         text (unicode): The text to be processed.
spacy/tests/regression/test_issue5458.py (new file, 21 lines)

@@ -0,0 +1,21 @@
+from spacy.lang.en import English
+from spacy.lang.en.syntax_iterators import noun_chunks
+from spacy.tests.util import get_doc
+from spacy.vocab import Vocab
+
+
+def test_issue5458():
+    # Test that the noun chunker does not generate overlapping spans
+    words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
+    vocab = Vocab(strings=words)
+    dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
+    pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
+    heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10]
+
+    en_doc = get_doc(vocab, words, pos_tags, heads, dependencies)
+    en_doc.noun_chunks_iterator = noun_chunks
+
+    # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans"
+    nlp = English()
+    merge_nps = nlp.create_pipe("merge_noun_chunks")
+    merge_nps(en_doc)