spaCy/spacy/lang/es/syntax_iterators.py
Adriane Boyd 7e4cd7575c
Refactor Docs.is_ flags (#6044)
* Refactor Docs.is_ flags

* Add derived `Doc.has_annotation` method

  * `Doc.has_annotation(attr)` returns `True` for partial annotation

  * `Doc.has_annotation(attr, require_complete=True)` returns `True` for
    complete annotation

* Add deprecation warnings to `is_tagged`, `is_parsed`, `is_sentenced`
and `is_nered`

* Add `Doc._get_array_attrs()`, which returns a full list of `Doc` attrs
for use with `Doc.to_array`, `Doc.to_bytes` and `Doc.from_docs`. The
list is the `DocBin` attributes list plus `SPACY` and `LENGTH`.

Notes on `Doc.has_annotation`:

* `HEAD` is converted to `DEP` because heads don't have an unset state

* Accept `IS_SENT_START` as a synonym of `SENT_START`

Additional changes:

* Add `NORM`, `ENT_ID` and `SENT_START` to default attributes for
`DocBin`

* In `Doc.from_array()` the presence of `DEP` causes `HEAD` to override
`SENT_START`

* In `Doc.from_array()` using `attrs` other than
`Doc._get_array_attrs()` (i.e., a user's custom list rather than our
default internal list) with both `HEAD` and `SENT_START` shows a warning
that `HEAD` will override `SENT_START`

* `set_children_from_heads` does not require dependency labels to set
sentence boundaries and sets `sent_start` for all non-sentence starts to
`-1`

* Fix call to set_children_form_heads

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-09-17 00:14:01 +02:00

69 lines
2.2 KiB
Python

from typing import Union, Iterator, Optional, List, Tuple
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from ...errors import Errors
from ...tokens import Doc, Span, Token
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
doc = doclike.doc
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
if not len(doc):
return
np_label = doc.vocab.strings.add("NP")
left_labels = ["det", "fixed", "neg"] # ['nunmod', 'det', 'appos', 'fixed']
right_labels = ["flat", "fixed", "compound", "neg"]
stop_labels = ["punct"]
np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
for token in doclike:
if token.pos in [PROPN, NOUN, PRON]:
left, right = noun_bounds(
doc, token, np_left_deps, np_right_deps, stop_deps
)
yield left.i, right.i + 1, np_label
token = right
token = next_token(token)
def is_verb_token(token: Token) -> bool:
return token.pos in [VERB, AUX]
def next_token(token: Token) -> Optional[Token]:
try:
return token.nbor()
except IndexError:
return None
def noun_bounds(
doc: Doc,
root: Token,
np_left_deps: List[str],
np_right_deps: List[str],
stop_deps: List[str],
) -> Tuple[Token, Token]:
left_bound = root
for token in reversed(list(root.lefts)):
if token.dep in np_left_deps:
left_bound = token
right_bound = root
for token in root.rights:
if token.dep in np_right_deps:
left, right = noun_bounds(
doc, token, np_left_deps, np_right_deps, stop_deps
)
filter_func = lambda t: is_verb_token(t) or t.dep in stop_deps
if list(filter(filter_func, doc[left_bound.i : right.i],)):
break
else:
right_bound = right
return left_bound, right_bound
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}