Mirror of https://github.com/explosion/spaCy.git
	Merge pull request #5470 from svlandeg/bugfix/noun-chunks
Bugfix in noun chunks
This commit is contained in: commit 5ce02c1b17
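Every affected noun_chunks iterator below gets the same treatment: the per-token seen set is replaced by a single prev_end index, and a candidate chunk is skipped whenever its left edge reaches back into the chunk that was yielded last. Since chunks are yielded left to right, this makes overlapping spans impossible by construction. A minimal sketch of the guard, using plain (start, end) pairs rather than spaCy tokens:

def non_overlapping(candidates):
    # candidates: (start, end) pairs in document order, end exclusive,
    # mirroring the (word.left_edge.i, word.i + 1) values in the diffs below
    prev_end = -1
    for start, end in candidates:
        if start <= prev_end:  # left edge falls inside the previous chunk
            continue
        prev_end = end - 1  # index of the last token in the chunk just yielded
        yield start, end

# (0, 3) and (5, 7) survive; (2, 4) overlaps the first chunk and is dropped
print(list(non_overlapping([(0, 3), (2, 4), (5, 7)])))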
@@ -23,29 +23,25 @@ def noun_chunks(doclike):
     conj = doc.vocab.strings.add("conj")
     nmod = doc.vocab.strings.add("nmod")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
             flag = False
             if word.pos == NOUN:
                 #  check for patterns such as γραμμή παραγωγής
                 for potential_nmod in word.rights:
                     if potential_nmod.dep == nmod:
-                        seen.update(
-                            j for j in range(word.left_edge.i, potential_nmod.i + 1)
-                        )
+                        prev_end = potential_nmod.i
                         yield word.left_edge.i, potential_nmod.i + 1, np_label
                         flag = True
                         break
             if flag is False:
-                seen.update(j for j in range(word.left_edge.i, word.i + 1))
+                prev_end = word.i
                 yield word.left_edge.i, word.i + 1, np_label
         elif word.dep == conj:
             # covers the case: έχει όμορφα και έξυπνα παιδιά

@@ -54,9 +50,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.i + 1))
+                prev_end = word.i
                 yield word.left_edge.i, word.i + 1, np_label
 
 
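The hunk above (the iterator with the Greek example comment) also keeps the nmod extension working: when a NOUN has a right-hand child attached as nmod (patterns such as γραμμή παραγωγής, "production line"), the chunk runs through that child, so prev_end must advance to the child's index rather than the head's. A rough sketch of that bookkeeping with a hypothetical helper (the tuple layout is invented for illustration):

def chunk_end(head_i, right_children):
    # right_children: (index, dep_label) pairs for the head's right children,
    # standing in for word.rights in the diff above (hypothetical layout)
    for child_i, dep in right_children:
        if dep == "nmod":
            return child_i  # chunk and prev_end extend through the nmod child
    return head_i  # no nmod child: the chunk ends at the head noun

print(chunk_end(3, [(4, "nmod")]))  # 4: the span becomes (left_edge, 5)
print(chunk_end(3, []))             # 3: the span becomes (left_edge, 4)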
@@ -28,17 +28,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.i + 1))
+            prev_end = word.i
             yield word.left_edge.i, word.i + 1, np_label
         elif word.dep == conj:
             head = word.head

@@ -46,9 +44,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.i + 1))
+                prev_end = word.i
                 yield word.left_edge.i, word.i + 1, np_label
 
 
@@ -28,17 +28,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.i + 1))
+            prev_end = word.i
             yield word.left_edge.i, word.i + 1, np_label
         elif word.dep == conj:
             head = word.head

@@ -46,9 +44,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.i + 1))
+                prev_end = word.i
                 yield word.left_edge.i, word.i + 1, np_label
 
 
@@ -27,17 +27,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head

@@ -45,9 +43,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+                prev_end = word.right_edge.i
                 yield word.left_edge.i, word.right_edge.i + 1, np_label
 
 
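The hunk above and the remaining syntax_iterators hunks apply the same fix to iterators that yield the head's whole subtree, (word.left_edge.i, word.right_edge.i + 1), so prev_end has to advance to word.right_edge.i instead of word.i. A small sketch of the two span conventions, with a plain dict standing in for a token (the field names mirror the attributes used in the diffs):

def chunk_bounds(word, whole_subtree):
    # returns (span start, exclusive span end, new prev_end)
    last = word["right_edge"] if whole_subtree else word["i"]
    return word["left_edge"], last + 1, last

word = {"left_edge": 2, "i": 4, "right_edge": 6}
print(chunk_bounds(word, whole_subtree=False))  # (2, 5, 4): chunk ends at the head
print(chunk_bounds(word, whole_subtree=True))   # (2, 7, 6): chunk covers the subtree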
@@ -27,17 +27,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head

@@ -45,9 +43,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+                prev_end = word.right_edge.i
                 yield word.left_edge.i, word.right_edge.i + 1, np_label
 
 
@@ -27,17 +27,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head

@@ -45,9 +43,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+                prev_end = word.right_edge.i
                 yield word.left_edge.i, word.right_edge.i + 1, np_label
 
 
@@ -28,17 +28,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head

@@ -46,9 +44,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+                prev_end = word.right_edge.i
                 yield word.left_edge.i, word.right_edge.i + 1, np_label
 
 
@@ -417,7 +417,7 @@ class Language(object):
 
     def __call__(self, text, disable=[], component_cfg=None):
         """Apply the pipeline to some text. The text can span multiple sentences,
-        and can contain arbtrary whitespace. Alignment into the original string
+        and can contain arbitrary whitespace. Alignment into the original string
         is preserved.
 
         text (unicode): The text to be processed.
spacy/tests/regression/test_issue5458.py (new file, +21 lines)
@@ -0,0 +1,21 @@
+from spacy.lang.en import English
+from spacy.lang.en.syntax_iterators import noun_chunks
+from spacy.tests.util import get_doc
+from spacy.vocab import Vocab
+
+
+def test_issue5458():
+    # Test that the noun chunker does not generate overlapping spans
+    words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
+    vocab = Vocab(strings=words)
+    dependencies = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
+    pos_tags = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
+    heads = [0, 1, -2, 6, 2, 1, -4, -1, -1, -2, -10]
+
+    en_doc = get_doc(vocab, words, pos_tags, heads, dependencies)
+    en_doc.noun_chunks_iterator = noun_chunks
+
+    # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans"
+    nlp = English()
+    merge_nps = nlp.create_pipe("merge_noun_chunks")
+    merge_nps(en_doc)
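The new test drives merge_noun_chunks over a parse that used to yield overlapping chunks. The failure mode it guards against can also be reproduced directly with the retokenizer; a hedged sketch, assuming a spaCy v2.x install (the sentence and span indices here are arbitrary):

import spacy

nlp = spacy.blank("en")
doc = nlp("markets have brought prosperity and empowerment")
try:
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:3])
        retokenizer.merge(doc[2:5])  # overlaps the first span
except ValueError as err:
    print(err)  # "[E102] Can't merge non-disjoint spans. ..."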