mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Merge pull request #5470 from svlandeg/bugfix/noun-chunks
Bugfix in noun chunks
This commit is contained in:
commit
5ce02c1b17
|
@ -23,29 +23,25 @@ def noun_chunks(doclike):
|
|||
conj = doc.vocab.strings.add("conj")
|
||||
nmod = doc.vocab.strings.add("nmod")
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
seen = set()
|
||||
prev_end = -1
|
||||
for i, word in enumerate(doclike):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.i in seen:
|
||||
if word.left_edge.i <= prev_end:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
flag = False
|
||||
if word.pos == NOUN:
|
||||
# check for patterns such as γραμμή παραγωγής
|
||||
for potential_nmod in word.rights:
|
||||
if potential_nmod.dep == nmod:
|
||||
seen.update(
|
||||
j for j in range(word.left_edge.i, potential_nmod.i + 1)
|
||||
)
|
||||
prev_end = potential_nmod.i
|
||||
yield word.left_edge.i, potential_nmod.i + 1, np_label
|
||||
flag = True
|
||||
break
|
||||
if flag is False:
|
||||
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
||||
prev_end = word.i
|
||||
yield word.left_edge.i, word.i + 1, np_label
|
||||
elif word.dep == conj:
|
||||
# covers the case: έχει όμορφα και έξυπνα παιδιά
|
||||
|
@ -54,9 +50,7 @@ def noun_chunks(doclike):
|
|||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
||||
prev_end = word.i
|
||||
yield word.left_edge.i, word.i + 1, np_label
|
||||
|
||||
|
||||
|
|
|
@ -28,17 +28,15 @@ def noun_chunks(doclike):
|
|||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
seen = set()
|
||||
prev_end = -1
|
||||
for i, word in enumerate(doclike):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.i in seen:
|
||||
if word.left_edge.i <= prev_end:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
||||
prev_end = word.i
|
||||
yield word.left_edge.i, word.i + 1, np_label
|
||||
elif word.dep == conj:
|
||||
head = word.head
|
||||
|
@ -46,9 +44,7 @@ def noun_chunks(doclike):
|
|||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
||||
prev_end = word.i
|
||||
yield word.left_edge.i, word.i + 1, np_label
|
||||
|
||||
|
||||
|
|
|
@ -28,17 +28,15 @@ def noun_chunks(doclike):
|
|||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
seen = set()
|
||||
prev_end = -1
|
||||
for i, word in enumerate(doclike):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.i in seen:
|
||||
if word.left_edge.i <= prev_end:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
||||
prev_end = word.i
|
||||
yield word.left_edge.i, word.i + 1, np_label
|
||||
elif word.dep == conj:
|
||||
head = word.head
|
||||
|
@ -46,9 +44,7 @@ def noun_chunks(doclike):
|
|||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.i + 1))
|
||||
prev_end = word.i
|
||||
yield word.left_edge.i, word.i + 1, np_label
|
||||
|
||||
|
||||
|
|
|
@ -27,17 +27,15 @@ def noun_chunks(doclike):
|
|||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
seen = set()
|
||||
prev_end = -1
|
||||
for i, word in enumerate(doclike):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.i in seen:
|
||||
if word.left_edge.i <= prev_end:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
|
||||
prev_end = word.right_edge.i
|
||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||
elif word.dep == conj:
|
||||
head = word.head
|
||||
|
@ -45,9 +43,7 @@ def noun_chunks(doclike):
|
|||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
|
||||
prev_end = word.right_edge.i
|
||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||
|
||||
|
||||
|
|
|
@ -27,17 +27,15 @@ def noun_chunks(doclike):
|
|||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
seen = set()
|
||||
prev_end = -1
|
||||
for i, word in enumerate(doclike):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.i in seen:
|
||||
if word.left_edge.i <= prev_end:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
|
||||
prev_end = word.right_edge.i
|
||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||
elif word.dep == conj:
|
||||
head = word.head
|
||||
|
@ -45,9 +43,7 @@ def noun_chunks(doclike):
|
|||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
|
||||
prev_end = word.right_edge.i
|
||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||
|
||||
|
||||
|
|
|
@ -27,17 +27,15 @@ def noun_chunks(doclike):
|
|||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
seen = set()
|
||||
prev_end = -1
|
||||
for i, word in enumerate(doclike):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.i in seen:
|
||||
if word.left_edge.i <= prev_end:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
|
||||
prev_end = word.right_edge.i
|
||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||
elif word.dep == conj:
|
||||
head = word.head
|
||||
|
@ -45,9 +43,7 @@ def noun_chunks(doclike):
|
|||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
|
||||
prev_end = word.right_edge.i
|
||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||
|
||||
|
||||
|
|
|
@ -28,17 +28,15 @@ def noun_chunks(doclike):
|
|||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
seen = set()
|
||||
prev_end = -1
|
||||
for i, word in enumerate(doclike):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.i in seen:
|
||||
if word.left_edge.i <= prev_end:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
|
||||
prev_end = word.right_edge.i
|
||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||
elif word.dep == conj:
|
||||
head = word.head
|
||||
|
@ -46,9 +44,7 @@ def noun_chunks(doclike):
|
|||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
if any(w.i in seen for w in word.subtree):
|
||||
continue
|
||||
seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
|
||||
prev_end = word.right_edge.i
|
||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||
|
||||
|
||||
|
|
|
@ -417,7 +417,7 @@ class Language(object):
|
|||
|
||||
def __call__(self, text, disable=[], component_cfg=None):
|
||||
"""Apply the pipeline to some text. The text can span multiple sentences,
|
||||
and can contain arbtrary whitespace. Alignment into the original string
|
||||
and can contain arbitrary whitespace. Alignment into the original string
|
||||
is preserved.
|
||||
|
||||
text (unicode): The text to be processed.
|
||||
|
|
21
spacy/tests/regression/test_issue5458.py
Normal file
21
spacy/tests/regression/test_issue5458.py
Normal file
|
@ -0,0 +1,21 @@
|
|||
from spacy.lang.en import English
|
||||
from spacy.lang.en.syntax_iterators import noun_chunks
|
||||
from spacy.tests.util import get_doc
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
|
||||
def test_issue5458():
|
||||
# Test that the noun chuncker does not generate overlapping spans
|
||||
words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
|
||||
vocab = Vocab(strings=words)
|
||||
dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
|
||||
pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
|
||||
heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10]
|
||||
|
||||
en_doc = get_doc(vocab, words, pos_tags, heads, dependencies)
|
||||
en_doc.noun_chunks_iterator = noun_chunks
|
||||
|
||||
# if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans"
|
||||
nlp = English()
|
||||
merge_nps = nlp.create_pipe("merge_noun_chunks")
|
||||
merge_nps(en_doc)
|
Loading…
Reference in New Issue
Block a user