mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
obj -> doclike
This commit is contained in:
parent
69fb4bedf2
commit
b1f45c9da3
|
@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
@ -28,7 +28,7 @@ def noun_chunks(obj):
|
||||||
"og",
|
"og",
|
||||||
"app",
|
"app",
|
||||||
]
|
]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
raise ValueError(Errors.E029)
|
raise ValueError(Errors.E029)
|
||||||
|
@ -38,7 +38,7 @@ def noun_chunks(obj):
|
||||||
close_app = doc.vocab.strings.add("nk")
|
close_app = doc.vocab.strings.add("nk")
|
||||||
|
|
||||||
rbracket = 0
|
rbracket = 0
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(doclike):
|
||||||
if i < rbracket:
|
if i < rbracket:
|
||||||
continue
|
continue
|
||||||
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
|
if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
|
||||||
|
|
|
@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases. Works on both Doc and Span.
|
Detect base noun phrases. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
@ -14,7 +14,7 @@ def noun_chunks(obj):
|
||||||
# obj tag corrects some DEP tagger mistakes.
|
# obj tag corrects some DEP tagger mistakes.
|
||||||
# Further improvement of the models will eliminate the need for this tag.
|
# Further improvement of the models will eliminate the need for this tag.
|
||||||
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
|
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
raise ValueError(Errors.E029)
|
raise ValueError(Errors.E029)
|
||||||
|
@ -24,7 +24,7 @@ def noun_chunks(obj):
|
||||||
nmod = doc.vocab.strings.add("nmod")
|
nmod = doc.vocab.strings.add("nmod")
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
seen = set()
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(doclike):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
continue
|
continue
|
||||||
# Prevent nested chunks from being produced
|
# Prevent nested chunks from being produced
|
||||||
|
|
|
@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
@ -20,7 +20,7 @@ def noun_chunks(obj):
|
||||||
"attr",
|
"attr",
|
||||||
"ROOT",
|
"ROOT",
|
||||||
]
|
]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
raise ValueError(Errors.E029)
|
raise ValueError(Errors.E029)
|
||||||
|
@ -29,7 +29,7 @@ def noun_chunks(obj):
|
||||||
conj = doc.vocab.strings.add("conj")
|
conj = doc.vocab.strings.add("conj")
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
seen = set()
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(doclike):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
continue
|
continue
|
||||||
# Prevent nested chunks from being produced
|
# Prevent nested chunks from being produced
|
||||||
|
|
|
@ -5,8 +5,8 @@ from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
doc = obj.doc
|
doc = doclike.doc
|
||||||
|
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
raise ValueError(Errors.E029)
|
raise ValueError(Errors.E029)
|
||||||
|
|
|
@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
@ -20,7 +20,7 @@ def noun_chunks(obj):
|
||||||
"attr",
|
"attr",
|
||||||
"ROOT",
|
"ROOT",
|
||||||
]
|
]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
raise ValueError(Errors.E029)
|
raise ValueError(Errors.E029)
|
||||||
|
@ -29,7 +29,7 @@ def noun_chunks(obj):
|
||||||
conj = doc.vocab.strings.add("conj")
|
conj = doc.vocab.strings.add("conj")
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
seen = set()
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(doclike):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
continue
|
continue
|
||||||
# Prevent nested chunks from being produced
|
# Prevent nested chunks from being produced
|
||||||
|
|
|
@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
@ -19,7 +19,7 @@ def noun_chunks(obj):
|
||||||
"nmod",
|
"nmod",
|
||||||
"nmod:poss",
|
"nmod:poss",
|
||||||
]
|
]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
raise ValueError(Errors.E029)
|
raise ValueError(Errors.E029)
|
||||||
|
@ -28,7 +28,7 @@ def noun_chunks(obj):
|
||||||
conj = doc.vocab.strings.add("conj")
|
conj = doc.vocab.strings.add("conj")
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
seen = set()
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(doclike):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
continue
|
continue
|
||||||
# Prevent nested chunks from being produced
|
# Prevent nested chunks from being produced
|
||||||
|
|
|
@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
@ -19,7 +19,7 @@ def noun_chunks(obj):
|
||||||
"nmod",
|
"nmod",
|
||||||
"nmod:poss",
|
"nmod:poss",
|
||||||
]
|
]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
raise ValueError(Errors.E029)
|
raise ValueError(Errors.E029)
|
||||||
|
@ -28,7 +28,7 @@ def noun_chunks(obj):
|
||||||
conj = doc.vocab.strings.add("conj")
|
conj = doc.vocab.strings.add("conj")
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
seen = set()
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(doclike):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
continue
|
continue
|
||||||
# Prevent nested chunks from being produced
|
# Prevent nested chunks from being produced
|
||||||
|
|
|
@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
@ -19,7 +19,7 @@ def noun_chunks(obj):
|
||||||
"nmod",
|
"nmod",
|
||||||
"nmod:poss",
|
"nmod:poss",
|
||||||
]
|
]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
raise ValueError(Errors.E029)
|
raise ValueError(Errors.E029)
|
||||||
|
@ -28,7 +28,7 @@ def noun_chunks(obj):
|
||||||
conj = doc.vocab.strings.add("conj")
|
conj = doc.vocab.strings.add("conj")
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
seen = set()
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(doclike):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
continue
|
continue
|
||||||
# Prevent nested chunks from being produced
|
# Prevent nested chunks from being produced
|
||||||
|
|
|
@ -5,7 +5,7 @@ from ...symbols import NOUN, PROPN, PRON
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(obj):
|
def noun_chunks(doclike):
|
||||||
"""
|
"""
|
||||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||||
"""
|
"""
|
||||||
|
@ -20,7 +20,7 @@ def noun_chunks(obj):
|
||||||
"nmod",
|
"nmod",
|
||||||
"nmod:poss",
|
"nmod:poss",
|
||||||
]
|
]
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||||
|
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
raise ValueError(Errors.E029)
|
raise ValueError(Errors.E029)
|
||||||
|
@ -29,7 +29,7 @@ def noun_chunks(obj):
|
||||||
conj = doc.vocab.strings.add("conj")
|
conj = doc.vocab.strings.add("conj")
|
||||||
np_label = doc.vocab.strings.add("NP")
|
np_label = doc.vocab.strings.add("NP")
|
||||||
seen = set()
|
seen = set()
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(doclike):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
continue
|
continue
|
||||||
# Prevent nested chunks from being produced
|
# Prevent nested chunks from being produced
|
||||||
|
|
|
@ -213,28 +213,28 @@ cdef class Matcher:
|
||||||
else:
|
else:
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
def __call__(self, object doc_or_span):
|
def __call__(self, object doclike):
|
||||||
"""Find all token sequences matching the supplied pattern.
|
"""Find all token sequences matching the supplied pattern.
|
||||||
|
|
||||||
doc_or_span (Doc or Span): The document to match over.
|
doclike (Doc or Span): The document to match over.
|
||||||
RETURNS (list): A list of `(key, start, end)` tuples,
|
RETURNS (list): A list of `(key, start, end)` tuples,
|
||||||
describing the matches. A match tuple describes a span
|
describing the matches. A match tuple describes a span
|
||||||
`doc[start:end]`. The `label_id` and `key` are both integers.
|
`doc[start:end]`. The `label_id` and `key` are both integers.
|
||||||
"""
|
"""
|
||||||
if isinstance(doc_or_span, Doc):
|
if isinstance(doclike, Doc):
|
||||||
doc = doc_or_span
|
doc = doclike
|
||||||
length = len(doc)
|
length = len(doc)
|
||||||
elif isinstance(doc_or_span, Span):
|
elif isinstance(doclike, Span):
|
||||||
doc = doc_or_span.doc
|
doc = doclike.doc
|
||||||
length = doc_or_span.end - doc_or_span.start
|
length = doclike.end - doclike.start
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doc_or_span).__name__))
|
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
|
||||||
if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
|
if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
|
||||||
and not doc.is_tagged:
|
and not doc.is_tagged:
|
||||||
raise ValueError(Errors.E155.format())
|
raise ValueError(Errors.E155.format())
|
||||||
if DEP in self._seen_attrs and not doc.is_parsed:
|
if DEP in self._seen_attrs and not doc.is_parsed:
|
||||||
raise ValueError(Errors.E156.format())
|
raise ValueError(Errors.E156.format())
|
||||||
matches = find_matches(&self.patterns[0], self.patterns.size(), doc_or_span, length,
|
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
|
||||||
extensions=self._extensions, predicates=self._extra_predicates)
|
extensions=self._extensions, predicates=self._extra_predicates)
|
||||||
for i, (key, start, end) in enumerate(matches):
|
for i, (key, start, end) in enumerate(matches):
|
||||||
on_match = self._callbacks.get(key, None)
|
on_match = self._callbacks.get(key, None)
|
||||||
|
@ -257,7 +257,7 @@ def unpickle_matcher(vocab, patterns, callbacks):
|
||||||
return matcher
|
return matcher
|
||||||
|
|
||||||
|
|
||||||
cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int length, extensions=None, predicates=tuple()):
|
cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple()):
|
||||||
"""Find matches in a doc, with a compiled array of patterns. Matches are
|
"""Find matches in a doc, with a compiled array of patterns. Matches are
|
||||||
returned as a list of (id, start, end) tuples.
|
returned as a list of (id, start, end) tuples.
|
||||||
|
|
||||||
|
@ -286,7 +286,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt
|
||||||
else:
|
else:
|
||||||
nr_extra_attr = 0
|
nr_extra_attr = 0
|
||||||
extra_attr_values = <attr_t*>mem.alloc(length, sizeof(attr_t))
|
extra_attr_values = <attr_t*>mem.alloc(length, sizeof(attr_t))
|
||||||
for i, token in enumerate(doc_or_span):
|
for i, token in enumerate(doclike):
|
||||||
for name, index in extensions.items():
|
for name, index in extensions.items():
|
||||||
value = token._.get(name)
|
value = token._.get(name)
|
||||||
if isinstance(value, basestring):
|
if isinstance(value, basestring):
|
||||||
|
@ -298,7 +298,7 @@ cdef find_matches(TokenPatternC** patterns, int n, object doc_or_span, int lengt
|
||||||
for j in range(n):
|
for j in range(n):
|
||||||
states.push_back(PatternStateC(patterns[j], i, 0))
|
states.push_back(PatternStateC(patterns[j], i, 0))
|
||||||
transition_states(states, matches, predicate_cache,
|
transition_states(states, matches, predicate_cache,
|
||||||
doc_or_span[i], extra_attr_values, predicates)
|
doclike[i], extra_attr_values, predicates)
|
||||||
extra_attr_values += nr_extra_attr
|
extra_attr_values += nr_extra_attr
|
||||||
predicate_cache += len(predicates)
|
predicate_cache += len(predicates)
|
||||||
# Handle matches that end in 0-width patterns
|
# Handle matches that end in 0-width patterns
|
||||||
|
|
Loading…
Reference in New Issue
Block a user