spaCy/spacy/syntax/iterators.pyx

# coding: utf-8
from __future__ import unicode_literals

from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX


def english_noun_chunks(obj):
    """
    Yield base noun phrases found in an English dependency parse.
    Accepts either a Doc or a Span.

    Only tokens tagged NOUN, PROPN or PRON can head a chunk. A token heads a
    chunk when its own dependency label is one of the noun-phrase labels, or
    when it is a 'conj' whose chain of preceding 'conj' heads ends at a token
    carrying one of those labels. Each chunk runs from the head's left_edge
    through the head token itself.

    Yields (start, end, label) triples: start/end are token indices (end is
    exclusive) and label is the string-store ID of 'NP'. Chunks never overlap
    a previously yielded chunk.
    """
    doc = obj.doc  # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    np_deps = {strings.add(name) for name in ('nsubj', 'dobj', 'nsubjpass',
                                              'pcomp', 'pobj', 'attr', 'ROOT')}
    conj = strings.add('conj')
    np_label = strings.add('NP')
    covered = set()  # doc indices already claimed by a yielded chunk
    for token in obj:
        if token.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if token.i in covered:
            continue
        if token.dep in np_deps:
            heads_chunk = True
        elif token.dep == conj:
            # Climb leftwards through the coordination to its first conjunct.
            anchor = token.head
            while anchor.dep == conj and anchor.head.i < anchor.i:
                anchor = anchor.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            heads_chunk = anchor.dep in np_deps
        else:
            heads_chunk = False
        if not heads_chunk:
            continue
        if any(t.i in covered for t in token.subtree):
            continue
        start = token.left_edge.i
        end = token.i + 1
        covered.update(range(start, end))
        yield start, end, np_label


# this iterator extracts spans headed by NOUNs starting from the left-most
# syntactic dependent until the NOUN itself
# for close apposition and measurement construction, the span is sometimes
# extended to the right of the NOUN
# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not
# just "eine Tasse", same for "das Thema Familie"
def german_noun_chunks(obj):
    """
    Detect base noun phrases from a German dependency parse.
    Works on both Doc and Span.

    A chunk is headed by a NOUN, PROPN or PRON token whose dependency label
    is one of the noun-phrase labels below. It starts at the head's left_edge
    and ends after the head, or after the last right-hand NOUN/PROPN
    dependent attached with the 'nk' label (close apposition / measurement
    constructions such as "eine Tasse Tee").

    Yields (start, end, label) triples: start/end are doc-level token indices
    (end is exclusive) and label is the string-store ID of 'NP'.
    """
    labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
    doc = obj.doc  # Ensure works on both Doc and Span.
    np_label = doc.vocab.strings.add('NP')
    np_deps = set(doc.vocab.strings.add(label) for label in labels)
    close_app = doc.vocab.strings.add('nk')

    # Doc-level index one past the end of the most recently yielded chunk.
    rbracket = 0
    for word in obj:
        # rbracket is derived from word.i (an index into doc), so it must be
        # compared with word.i and not with the token's position inside obj:
        # the two differ whenever obj is a Span that does not start at the
        # first token of the document.
        if word.i < rbracket:
            continue
        if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
            rbracket = word.i + 1
            # try to extend the span to the right
            # to capture close apposition/measurement constructions
            for rdep in word.rights:
                if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:
                    rbracket = rdep.i + 1
            yield word.left_edge.i, rbracket, np_label


def es_noun_chunks(obj):
    """
    Detect base noun phrases from a Spanish dependency parse.

    Walks the document token by token. Every NOUN, PROPN or PRON token found
    becomes the root of a chunk whose bounds are computed by noun_bounds();
    scanning then resumes after the chunk's right bound.

    Yields (start, end, label) triples: start/end are doc-level token indices
    (end is exclusive) and label is the string-store ID of 'NP'. An empty
    document yields nothing.

    NOTE(review): the scan always starts at doc[0] and covers the whole
    parent document, so passing a Span does not restrict the result to the
    Span's own tokens -- confirm whether callers rely on this.
    """
    doc = obj.doc
    # doc[0] below would raise on an empty document; there is nothing to do.
    if not len(doc):
        return
    strings = doc.vocab.strings
    # Use add() like the other chunkers so that 'NP' is guaranteed to be
    # registered in the string store and can be resolved back to text.
    np_label = strings.add('NP')
    left_labels = ['det', 'fixed', 'neg']
    right_labels = ['flat', 'fixed', 'compound', 'neg']
    stop_labels = ['punct']
    np_left_deps = {strings.add(label) for label in left_labels}
    np_right_deps = {strings.add(label) for label in right_labels}
    stop_deps = {strings.add(label) for label in stop_labels}

    def next_token(token):
        """Return the token following `token`, or None at the end of doc."""
        try:
            return token.nbor()
        except IndexError:
            return None

    def noun_bounds(root):
        """Return the (leftmost, rightmost) tokens of the chunk rooted at root."""
        # Scanning the left children right-to-left and overwriting on every
        # match leaves left_bound on the left-most matching child.
        left_bound = root
        for token in reversed(list(root.lefts)):
            if token.dep in np_left_deps:
                left_bound = token
        right_bound = root
        for token in root.rights:
            if token.dep in np_right_deps:
                right = noun_bounds(token)[1]
                # Stop extending as soon as a verb/auxiliary or a stop
                # dependency (punctuation) lies between the current left
                # bound and the candidate right bound.
                if any(t.pos in (VERB, AUX) or t.dep in stop_deps
                       for t in doc[left_bound.i: right.i]):
                    break
                right_bound = right
        return left_bound, right_bound

    token = doc[0]
    # Compare against None explicitly: a token's truth value is not a
    # reliable "exists" check.
    while token is not None:
        if token.pos in (PROPN, NOUN, PRON):
            left, right = noun_bounds(token)
            yield left.i, right.i + 1, np_label
            # Resume scanning after the chunk that was just produced.
            token = right
        token = next_token(token)


def french_noun_chunks(obj):
    """
    Yield noun phrases found in a French dependency parse.
    Accepts either a Doc or a Span.

    Only tokens tagged NOUN, PROPN or PRON can head a chunk. A token heads a
    chunk when its own dependency label is one of the noun-phrase labels, or
    when it is a 'conj' whose chain of preceding 'conj' heads ends at a token
    carrying one of those labels. Each chunk runs from the head's left_edge
    through its right_edge.

    Yields (start, end, label) triples: start/end are token indices (end is
    exclusive) and label is the string-store ID of 'NP'. Chunks never overlap
    a previously yielded chunk.
    """
    doc = obj.doc  # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    np_deps = {strings[name] for name in ('nsubj', 'nsubj:pass', 'obj', 'iobj',
                                          'ROOT', 'appos', 'nmod', 'nmod:poss')}
    conj = strings.add('conj')
    np_label = strings.add('NP')
    covered = set()  # doc indices already claimed by a yielded chunk
    for token in obj:
        if token.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if token.i in covered:
            continue
        if token.dep in np_deps:
            heads_chunk = True
        elif token.dep == conj:
            # Climb leftwards through the coordination to its first conjunct.
            anchor = token.head
            while anchor.dep == conj and anchor.head.i < anchor.i:
                anchor = anchor.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            heads_chunk = anchor.dep in np_deps
        else:
            heads_chunk = False
        if not heads_chunk:
            continue
        if any(t.i in covered for t in token.subtree):
            continue
        start = token.left_edge.i
        end = token.right_edge.i + 1
        covered.update(range(start, end))
        yield start, end, np_label


# Registry of the noun-chunk iterator functions defined in this module,
# keyed by the short language identifier each one was written for.
CHUNKERS = {
    'en': english_noun_chunks,
    'de': german_noun_chunks,
    'es': es_noun_chunks,
    'fr': french_noun_chunks,
}
Tidy up and fix formatting and imports 2017-04-15 14:05:15 +03:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

Port over Spanish changes from #1096 2017-06-02 20:09:58 +03:00			`from ..parts_of_speech cimport NOUN, PROPN, PRON, VERB, AUX`
add baseclass DocIterator for iterators over documents add classes for English and German noun chunks the respective iterators are set for the document when created by the parser as they depend on the annotation scheme of the parsing model 2016-03-16 17:53:35 +03:00

Add noun_chunks to Span 2016-11-24 13:47:20 +03:00			`def english_noun_chunks(obj):`
Tidy up and fix formatting and imports 2017-04-15 14:05:15 +03:00			`"""`
			`Detect base noun phrases from a dependency parse.`
			`Works on both Doc and Span.`
			`"""`
* Refactor noun chunk iterators, so that they're simple functions. Install the iterator when the Doc is created, but allow users to write to the noun_chunk_iterator attribute. The iterator functions accept an object and yield (int start, int end, int label) triples. 2016-05-02 15:25:10 +03:00			`labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',`
Ensure noun chunks can't be nested. Closes #955 2017-04-23 18:56:39 +03:00			`'attr', 'ROOT']`
Allow German noun chunks to work on Span Update the German noun chunks iterator, so that it also works on Span objects. 2016-11-24 15:30:15 +03:00			`doc = obj.doc # Ensure works on both Doc and Span.`
Fix noun chunks iterator for new stringstore 2017-05-28 21:12:10 +03:00			`np_deps = [doc.vocab.strings.add(label) for label in labels]`
			`conj = doc.vocab.strings.add('conj')`
			`np_label = doc.vocab.strings.add('NP')`
Ensure noun chunks can't be nested. Closes #955 2017-04-23 18:56:39 +03:00			`seen = set()`
Add noun_chunks to Span 2016-11-24 13:47:20 +03:00			`for i, word in enumerate(obj):`
Fix noun_chunk rules around coordination Closes #693. 2017-04-07 18:06:40 +03:00			`if word.pos not in (NOUN, PROPN, PRON):`
			`continue`
Ensure noun chunks can't be nested. Closes #955 2017-04-23 18:56:39 +03:00			`# Prevent nested chunks from being produced`
			`if word.i in seen:`
			`continue`
Fix noun_chunk rules around coordination Closes #693. 2017-04-07 18:06:40 +03:00			`if word.dep in np_deps:`
Ensure noun chunks can't be nested. Closes #955 2017-04-23 18:56:39 +03:00			`if any(w.i in seen for w in word.subtree):`
			`continue`
			`seen.update(j for j in range(word.left_edge.i, word.i+1))`
* Refactor noun chunk iterators, so that they're simple functions. Install the iterator when the Doc is created, but allow users to write to the noun_chunk_iterator attribute. The iterator functions accept an object and yield (int start, int end, int label) triples. 2016-05-02 15:25:10 +03:00			`yield word.left_edge.i, word.i+1, np_label`
Fix noun_chunk rules around coordination Closes #693. 2017-04-07 18:06:40 +03:00			`elif word.dep == conj:`
* Refactor noun chunk iterators, so that they're simple functions. Install the iterator when the Doc is created, but allow users to write to the noun_chunk_iterator attribute. The iterator functions accept an object and yield (int start, int end, int label) triples. 2016-05-02 15:25:10 +03:00			`head = word.head`
			`while head.dep == conj and head.head.i < head.i:`
			`head = head.head`
			`# If the head is an NP, and we're coordinated to it, we're an NP`
			`if head.dep in np_deps:`
Ensure noun chunks can't be nested. Closes #955 2017-04-23 18:56:39 +03:00			`if any(w.i in seen for w in word.subtree):`
			`continue`
			`seen.update(j for j in range(word.left_edge.i, word.i+1))`
* Refactor noun chunk iterators, so that they're simple functions. Install the iterator when the Doc is created, but allow users to write to the noun_chunk_iterator attribute. The iterator functions accept an object and yield (int start, int end, int label) triples. 2016-05-02 15:25:10 +03:00			`yield word.left_edge.i, word.i+1, np_label`
add baseclass DocIterator for iterators over documents add classes for English and German noun chunks the respective iterators are set for the document when created by the parser as they depend on the annotation scheme of the parsing model 2016-03-16 17:53:35 +03:00

			`# this iterator extracts spans headed by NOUNs starting from the left-most`
			`# syntactic dependent until the NOUN itself`
			`# for close apposition and measurement construction, the span is sometimes`
			`# extended to the right of the NOUN`
			`# example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee" and not`
			`# just "eine Tasse", same for "das Thema Familie"`
Allow German noun chunks to work on Span Update the German noun chunks iterator, so that it also works on Span objects. 2016-11-24 15:30:15 +03:00			`def german_noun_chunks(obj):`
Fix Issue #469: Incorrectly cased root label in noun chunk iterator 2016-09-27 14:13:01 +03:00			`labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']`
Allow German noun chunks to work on Span Update the German noun chunks iterator, so that it also works on Span objects. 2016-11-24 15:30:15 +03:00			`doc = obj.doc # Ensure works on both Doc and Span.`
Fix german noun chunks iterator 2017-05-28 21:13:03 +03:00			`np_label = doc.vocab.strings.add('NP')`
			`np_deps = set(doc.vocab.strings.add(label) for label in labels)`
			`close_app = doc.vocab.strings.add('nk')`
* Refactor noun chunk iterators, so that they're simple functions. Install the iterator when the Doc is created, but allow users to write to the noun_chunk_iterator attribute. The iterator functions accept an object and yield (int start, int end, int label) triples. 2016-05-02 15:25:10 +03:00
make the code less cryptic 2016-05-03 18:19:05 +03:00			`rbracket = 0`
Allow German noun chunks to work on Span Update the German noun chunks iterator, so that it also works on Span objects. 2016-11-24 15:30:15 +03:00			`for i, word in enumerate(obj):`
make the code less cryptic 2016-05-03 18:19:05 +03:00			`if i < rbracket:`
			`continue`
add fix for German noun chunk iterator (issue #365) 2016-05-06 02:41:26 +03:00			`if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:`
* Refactor noun chunk iterators, so that they're simple functions. Install the iterator when the Doc is created, but allow users to write to the noun_chunk_iterator attribute. The iterator functions accept an object and yield (int start, int end, int label) triples. 2016-05-02 15:25:10 +03:00			`rbracket = word.i+1`
			`# try to extend the span to the right`
			`# to capture close apposition/measurement constructions`
			`for rdep in doc[word.i].rights:`
add fix for German noun chunk iterator (issue #365) 2016-05-06 02:41:26 +03:00			`if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:`
* Refactor noun chunk iterators, so that they're simple functions. Install the iterator when the Doc is created, but allow users to write to the noun_chunk_iterator attribute. The iterator functions accept an object and yield (int start, int end, int label) triples. 2016-05-02 15:25:10 +03:00			`rbracket = rdep.i+1`
fix whitespace 2016-05-04 08:40:38 +03:00			`yield word.left_edge.i, rbracket, np_label`

* Refactor noun chunk iterators, so that they're simple functions. Install the iterator when the Doc is created, but allow users to write to the noun_chunk_iterator attribute. The iterator functions accept an object and yield (int start, int end, int label) triples. 2016-05-02 15:25:10 +03:00
Port over Spanish changes from #1096 2017-06-02 20:09:58 +03:00			`def es_noun_chunks(obj):`
			`doc = obj.doc`
			`np_label = doc.vocab.strings['NP']`
			`left_labels = ['det', 'fixed', 'neg'] #['nunmod', 'det', 'appos', 'fixed']`
			`right_labels = ['flat', 'fixed', 'compound', 'neg']`
			`stop_labels = ['punct']`
			`np_left_deps = [doc.vocab.strings[label] for label in left_labels]`
			`np_right_deps = [doc.vocab.strings[label] for label in right_labels]`
			`stop_deps = [doc.vocab.strings[label] for label in stop_labels]`

			`def next_token(token):`
			`try:`
			`return token.nbor()`
			`except:`
			`return None`

			`def noun_bounds(root):`
			`def is_verb_token(token):`
			`return token.pos in [VERB, AUX]`

			`left_bound = root`
			`for token in reversed(list(root.lefts)):`
			`if token.dep in np_left_deps:`
			`left_bound = token`
			`right_bound = root`
			`for token in root.rights:`
			`if (token.dep in np_right_deps):`
			`left, right = noun_bounds(token)`
			`if list(filter(lambda t: is_verb_token(t) or t.dep in stop_deps,`
			`doc[left_bound.i: right.i])):`
			`break`
			`else:`
			`right_bound = right`
			`return left_bound, right_bound`

			`token = doc[0]`
			`while token and token.i < len(doc):`
			`if token.pos in [PROPN, NOUN, PRON]:`
			`left, right = noun_bounds(token)`
			`yield left.i, right.i+1, np_label`
			`token = right`
			`token = next_token(token)`


Adds function to extract french noun chunks 2017-06-12 16:20:49 +03:00			`def french_noun_chunks(obj):`
			`labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']`
			`doc = obj.doc # Ensure works on both Doc and Span.`
			`np_deps = [doc.vocab.strings[label] for label in labels]`
			`conj = doc.vocab.strings.add('conj')`
			`np_label = doc.vocab.strings.add('NP')`
			`seen = set()`
			`for i, word in enumerate(obj):`
			`if word.pos not in (NOUN, PROPN, PRON):`
			`continue`
			`# Prevent nested chunks from being produced`
			`if word.i in seen:`
			`continue`
			`if word.dep in np_deps:`
			`if any(w.i in seen for w in word.subtree):`
			`continue`
			`seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))`
			`yield word.left_edge.i, word.right_edge.i+1, np_label`
			`elif word.dep == conj:`
			`head = word.head`
			`while head.dep == conj and head.head.i < head.i:`
			`head = head.head`
			`# If the head is an NP, and we're coordinated to it, we're an NP`
			`if head.dep in np_deps:`
			`if any(w.i in seen for w in word.subtree):`
			`continue`
			`seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))`
			`yield word.left_edge.i, word.right_edge.i+1, np_label`


Port over Spanish changes from #1096 2017-06-02 20:09:58 +03:00			`CHUNKERS = {'en': english_noun_chunks, 'de': german_noun_chunks,`
Adds function to extract french noun chunks 2017-06-12 16:20:49 +03:00			`'es': es_noun_chunks, 'fr': french_noun_chunks}`