Rename `obj` / `doc_or_span` arguments to `doclike` (noun_chunks iterators and Matcher)

This commit is contained in:
Ines Montani 2020-05-21 14:19:58 +02:00
parent 69fb4bedf2
commit b1f45c9da3
10 changed files with 38 additions and 38 deletions

View File

@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
def noun_chunks(obj): def noun_chunks(doclike):
""" """
Detect base noun phrases from a dependency parse. Works on both Doc and Span. Detect base noun phrases from a dependency parse. Works on both Doc and Span.
""" """
@ -28,7 +28,7 @@ def noun_chunks(obj):
"og", "og",
"app", "app",
] ]
doc = obj.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
@ -38,7 +38,7 @@ def noun_chunks(obj):
close_app = doc.vocab.strings.add("nk") close_app = doc.vocab.strings.add("nk")
rbracket = 0 rbracket = 0
for i, word in enumerate(obj): for i, word in enumerate(doclike):
if i < rbracket: if i < rbracket:
continue continue
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps: if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:

View File

@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
def noun_chunks(obj): def noun_chunks(doclike):
""" """
Detect base noun phrases. Works on both Doc and Span. Detect base noun phrases. Works on both Doc and Span.
""" """
@ -14,7 +14,7 @@ def noun_chunks(obj):
# obj tag corrects some DEP tagger mistakes. # obj tag corrects some DEP tagger mistakes.
# Further improvement of the models will eliminate the need for this tag. # Further improvement of the models will eliminate the need for this tag.
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"] labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
doc = obj.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
@ -24,7 +24,7 @@ def noun_chunks(obj):
nmod = doc.vocab.strings.add("nmod") nmod = doc.vocab.strings.add("nmod")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
seen = set() seen = set()
for i, word in enumerate(obj): for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON): if word.pos not in (NOUN, PROPN, PRON):
continue continue
# Prevent nested chunks from being produced # Prevent nested chunks from being produced

View File

@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
def noun_chunks(obj): def noun_chunks(doclike):
""" """
Detect base noun phrases from a dependency parse. Works on both Doc and Span. Detect base noun phrases from a dependency parse. Works on both Doc and Span.
""" """
@ -20,7 +20,7 @@ def noun_chunks(obj):
"attr", "attr",
"ROOT", "ROOT",
] ]
doc = obj.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
@ -29,7 +29,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
seen = set() seen = set()
for i, word in enumerate(obj): for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON): if word.pos not in (NOUN, PROPN, PRON):
continue continue
# Prevent nested chunks from being produced # Prevent nested chunks from being produced

View File

@ -5,8 +5,8 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from ...errors import Errors from ...errors import Errors
def noun_chunks(obj): def noun_chunks(doclike):
doc = obj.doc doc = doclike.doc
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)

View File

@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
def noun_chunks(obj): def noun_chunks(doclike):
""" """
Detect base noun phrases from a dependency parse. Works on both Doc and Span. Detect base noun phrases from a dependency parse. Works on both Doc and Span.
""" """
@ -20,7 +20,7 @@ def noun_chunks(obj):
"attr", "attr",
"ROOT", "ROOT",
] ]
doc = obj.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
@ -29,7 +29,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
seen = set() seen = set()
for i, word in enumerate(obj): for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON): if word.pos not in (NOUN, PROPN, PRON):
continue continue
# Prevent nested chunks from being produced # Prevent nested chunks from being produced

View File

@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
def noun_chunks(obj): def noun_chunks(doclike):
""" """
Detect base noun phrases from a dependency parse. Works on both Doc and Span. Detect base noun phrases from a dependency parse. Works on both Doc and Span.
""" """
@ -19,7 +19,7 @@ def noun_chunks(obj):
"nmod", "nmod",
"nmod:poss", "nmod:poss",
] ]
doc = obj.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
@ -28,7 +28,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
seen = set() seen = set()
for i, word in enumerate(obj): for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON): if word.pos not in (NOUN, PROPN, PRON):
continue continue
# Prevent nested chunks from being produced # Prevent nested chunks from being produced

View File

@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
def noun_chunks(obj): def noun_chunks(doclike):
""" """
Detect base noun phrases from a dependency parse. Works on both Doc and Span. Detect base noun phrases from a dependency parse. Works on both Doc and Span.
""" """
@ -19,7 +19,7 @@ def noun_chunks(obj):
"nmod", "nmod",
"nmod:poss", "nmod:poss",
] ]
doc = obj.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
@ -28,7 +28,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
seen = set() seen = set()
for i, word in enumerate(obj): for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON): if word.pos not in (NOUN, PROPN, PRON):
continue continue
# Prevent nested chunks from being produced # Prevent nested chunks from being produced

View File

@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
def noun_chunks(obj): def noun_chunks(doclike):
""" """
Detect base noun phrases from a dependency parse. Works on both Doc and Span. Detect base noun phrases from a dependency parse. Works on both Doc and Span.
""" """
@ -19,7 +19,7 @@ def noun_chunks(obj):
"nmod", "nmod",
"nmod:poss", "nmod:poss",
] ]
doc = obj.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
@ -28,7 +28,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
seen = set() seen = set()
for i, word in enumerate(obj): for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON): if word.pos not in (NOUN, PROPN, PRON):
continue continue
# Prevent nested chunks from being produced # Prevent nested chunks from being produced

View File

@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
def noun_chunks(obj): def noun_chunks(doclike):
""" """
Detect base noun phrases from a dependency parse. Works on both Doc and Span. Detect base noun phrases from a dependency parse. Works on both Doc and Span.
""" """
@ -20,7 +20,7 @@ def noun_chunks(obj):
"nmod", "nmod",
"nmod:poss", "nmod:poss",
] ]
doc = obj.doc # Ensure works on both Doc and Span. doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.is_parsed: if not doc.is_parsed:
raise ValueError(Errors.E029) raise ValueError(Errors.E029)
@ -29,7 +29,7 @@ def noun_chunks(obj):
conj = doc.vocab.strings.add("conj") conj = doc.vocab.strings.add("conj")
np_label = doc.vocab.strings.add("NP") np_label = doc.vocab.strings.add("NP")
seen = set() seen = set()
for i, word in enumerate(obj): for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN, PRON): if word.pos not in (NOUN, PROPN, PRON):
continue continue
# Prevent nested chunks from being produced # Prevent nested chunks from being produced

View File

@ -213,28 +213,28 @@ cdef class Matcher:
else: else:
yield doc yield doc
def __call__(self, object doc_or_span): def __call__(self, object doclike):
"""Find all token sequences matching the supplied pattern. """Find all token sequences matching the supplied pattern.
doc_or_span (Doc or Span): The document to match over. doclike (Doc or Span): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples, RETURNS (list): A list of `(key, start, end)` tuples,
describing the matches. A match tuple describes a span describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers. `doc[start:end]`. The `label_id` and `key` are both integers.
""" """
if isinstance(doc_or_span, Doc): if isinstance(doclike, Doc):
doc = doc_or_span doc = doclike
length = len(doc) length = len(doc)
elif isinstance(doc_or_span, Span): elif isinstance(doclike, Span):
doc = doc_or_span.doc doc = doclike.doc
length = doc_or_span.end - doc_or_span.start length = doclike.end - doclike.start
else: else:
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doc_or_span).__name__)) raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \ if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
and not doc.is_tagged: and not doc.is_tagged:
raise ValueError(Errors.E155.format()) raise ValueError(Errors.E155.format())
if DEP in self._seen_attrs and not doc.is_parsed: if DEP in self._seen_attrs and not doc.is_parsed:
raise ValueError(Errors.E156.format()) raise ValueError(Errors.E156.format())
matches = find_matches(&self.patterns[0], self.patterns.size(), doc_or_span, length, matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
extensions=self._extensions, predicates=self._extra_predicates) extensions=self._extensions, predicates=self._extra_predicates)
for i, (key, start, end) in enumerate(matches): for i, (key, start, end) in enumerate(matches):
on_match = self._callbacks.get(key, None) on_match = self._callbacks.get(key, None)
@ -257,7 +257,7 @@ def unpickle_matcher(vocab, patterns, callbacks):
return matcher return matcher
cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int length, extensions=None, predicates=tuple()): cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple()):
"""Find matches in a doc, with a compiled array of patterns. Matches are """Find matches in a doc, with a compiled array of patterns. Matches are
returned as a list of (id, start, end) tuples. returned as a list of (id, start, end) tuples.
@ -286,7 +286,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt
else: else:
nr_extra_attr = 0 nr_extra_attr = 0
extra_attr_values = <attr_t*>mem.alloc(length, sizeof(attr_t)) extra_attr_values = <attr_t*>mem.alloc(length, sizeof(attr_t))
for i, token in enumerate(doc_or_span): for i, token in enumerate(doclike):
for name, index in extensions.items(): for name, index in extensions.items():
value = token._.get(name) value = token._.get(name)
if isinstance(value, basestring): if isinstance(value, basestring):
@ -298,7 +298,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt
for j in range(n): for j in range(n):
states.push_back(PatternStateC(patterns[j], i, 0)) states.push_back(PatternStateC(patterns[j], i, 0))
transition_states(states, matches, predicate_cache, transition_states(states, matches, predicate_cache,
doc_or_span[i], extra_attr_values, predicates) doclike[i], extra_attr_values, predicates)
extra_attr_values += nr_extra_attr extra_attr_values += nr_extra_attr
predicate_cache += len(predicates) predicate_cache += len(predicates)
# Handle matches that end in 0-width patterns # Handle matches that end in 0-width patterns