mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 10:26:35 +03:00
7e4cd7575c
* Refactor Docs.is_ flags * Add derived `Doc.has_annotation` method * `Doc.has_annotation(attr)` returns `True` for partial annotation * `Doc.has_annotation(attr, require_complete=True)` returns `True` for complete annotation * Add deprecation warnings to `is_tagged`, `is_parsed`, `is_sentenced` and `is_nered` * Add `Doc._get_array_attrs()`, which returns a full list of `Doc` attrs for use with `Doc.to_array`, `Doc.to_bytes` and `Doc.from_docs`. The list is the `DocBin` attributes list plus `SPACY` and `LENGTH`. Notes on `Doc.has_annotation`: * `HEAD` is converted to `DEP` because heads don't have an unset state * Accept `IS_SENT_START` as a synonym of `SENT_START` Additional changes: * Add `NORM`, `ENT_ID` and `SENT_START` to default attributes for `DocBin` * In `Doc.from_array()` the presence of `DEP` causes `HEAD` to override `SENT_START` * In `Doc.from_array()` using `attrs` other than `Doc._get_array_attrs()` (i.e., a user's custom list rather than our default internal list) with both `HEAD` and `SENT_START` shows a warning that `HEAD` will override `SENT_START` * `set_children_from_heads` does not require dependency labels to set sentence boundaries and sets `sent_start` for all non-sentence starts to `-1` * Fix call to set_children_form_heads Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
39 lines
1.6 KiB
Python
39 lines
1.6 KiB
Python
from typing import Union, Iterator
|
|
|
|
from ...symbols import NOUN, PROPN, PRON
|
|
from ...errors import Errors
|
|
from ...tokens import Doc, Span
|
|
|
|
|
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    """Detect base noun phrases from a dependency parse. Works on Doc and Span.

    A chunk is headed by a NOUN, PROPN or PRON token whose dependency label is
    one of the labels listed in the body. It starts at the head's left edge
    and normally ends at the head itself.

    doclike (Union[Doc, Span]): The object whose tokens are scanned.
    YIELDS (Tuple[int, int, int]): One `(start, end, label)` triple per chunk.
        `start` and `end` are token indices into `doclike.doc`, with `end`
        being one past the last token of the chunk; `label` is the value
        returned by `doc.vocab.strings.add("NP")`.
    RAISES (ValueError): With `Errors.E029` if the doc has no "DEP" annotation.

    NOTE(review): the return annotation says `Iterator[Span]`, but the
    generator yields plain tuples. The annotation is left as-is so that the
    signature and the module imports stay unchanged.
    """
    # This iterator extracts spans headed by NOUNs, starting from the
    # left-most syntactic dependent and running until the NOUN itself. For
    # close apposition and measurement constructions, the span is sometimes
    # extended to the right of the NOUN.
    # Example: "eine Tasse Tee" (a cup (of) tea) returns "eine Tasse Tee"
    # and not just "eine Tasse", same for "das Thema Familie".
    # fmt: off
    labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_label = doc.vocab.strings.add("NP")
    np_deps = {doc.vocab.strings.add(label) for label in labels}
    close_app = doc.vocab.strings.add("nk")
    # Doc-level index one past the end of the most recently emitted chunk.
    rbracket = 0
    for word in doclike:
        # Skip tokens already covered by the previous chunk. The comparison
        # must use the token's doc-level index: `rbracket` is derived from
        # `word.i` / `rdep.i`, whereas a position counter over `doclike` is
        # relative to its start and would skip too many tokens for a Span
        # that does not begin at token 0.
        if word.i < rbracket:
            continue
        if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
            rbracket = word.i + 1
            # try to extend the span to the right
            # to capture close apposition/measurement constructions
            for rdep in doc[word.i].rights:
                if rdep.pos in (NOUN, PROPN) and rdep.dep == close_app:
                    rbracket = rdep.i + 1
            yield word.left_edge.i, rbracket, np_label
|
|
|
|
|
|
# Maps each syntax iterator's name to the function in this module that
# implements it.
SYNTAX_ITERATORS = {
    "noun_chunks": noun_chunks,
}
|