mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-07 21:54:54 +03:00
Removal of formatting changes
This commit is contained in:
parent
57ec153587
commit
1e1acf640a
|
@ -25,8 +25,10 @@ from ..errors import Errors, MatchPatternError, Warnings
|
|||
from ..strings import get_string_id
|
||||
from ..attrs import IDS
|
||||
|
||||
|
||||
DEF PADDING = 5
|
||||
|
||||
|
||||
cdef class Matcher:
|
||||
"""Match sequences of tokens, based on pattern rules.
|
||||
|
||||
|
@ -253,8 +255,7 @@ cdef class Matcher:
|
|||
matches = []
|
||||
else:
|
||||
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
|
||||
extensions=self._extensions, predicates=self._extra_predicates,
|
||||
with_alignments=with_alignments)
|
||||
extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
|
||||
final_matches = []
|
||||
pairs_by_id = {}
|
||||
# For each key, either add all matches, or only the filtered,
|
||||
|
@ -326,6 +327,7 @@ cdef class Matcher:
|
|||
else:
|
||||
return key
|
||||
|
||||
|
||||
def unpickle_matcher(vocab, patterns, callbacks):
|
||||
matcher = Matcher(vocab)
|
||||
for key, pattern in patterns.items():
|
||||
|
@ -333,8 +335,7 @@ def unpickle_matcher(vocab, patterns, callbacks):
|
|||
matcher.add(key, pattern, on_match=callback)
|
||||
return matcher
|
||||
|
||||
cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(),
|
||||
bint with_alignments=0):
|
||||
cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0):
|
||||
"""Find matches in a doc, with a compiled array of patterns. Matches are
|
||||
returned as a list of (id, start, end) tuples or (id, start, end, alignments) tuples (if with_alignments != 0)
|
||||
|
||||
|
@ -378,8 +379,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
|||
states.push_back(PatternStateC(patterns[j], i, 0))
|
||||
if with_alignments != 0:
|
||||
align_states.resize(states.size())
|
||||
transition_states(states, matches, align_states, align_matches, predicate_cache, doclike[i], extra_attr_values,
|
||||
predicates, with_alignments)
|
||||
transition_states(states, matches, align_states, align_matches, predicate_cache,
|
||||
doclike[i], extra_attr_values, predicates, with_alignments)
|
||||
extra_attr_values += nr_extra_attr
|
||||
predicate_cache += len(predicates)
|
||||
# Handle matches that end in 0-width patterns
|
||||
|
@ -404,19 +405,21 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
|
|||
seen.add(match)
|
||||
return output
|
||||
|
||||
|
||||
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
|
||||
vector[vector[MatchAlignmentC]]& align_states,
|
||||
vector[vector[MatchAlignmentC]]& align_matches,
|
||||
int8_t * cached_py_predicates, Token token,
|
||||
const attr_t * extra_attrs, py_predicates, bint with_alignments) except *:
|
||||
vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches,
|
||||
int8_t * cached_py_predicates,
|
||||
Token token, const attr_t * extra_attrs, py_predicates, bint with_alignments) except *:
|
||||
cdef int q = 0
|
||||
cdef vector[PatternStateC] new_states
|
||||
cdef vector[vector[MatchAlignmentC]] align_new_states
|
||||
cdef int nr_predicate = len(py_predicates)
|
||||
for i in range(states.size()):
|
||||
if states[i].pattern.nr_py >= 1:
|
||||
update_predicate_cache(cached_py_predicates, states[i].pattern, token, py_predicates)
|
||||
action = get_action(states[i], token.c, extra_attrs, cached_py_predicates)
|
||||
update_predicate_cache(cached_py_predicates,
|
||||
states[i].pattern, token, py_predicates)
|
||||
action = get_action(states[i], token.c, extra_attrs,
|
||||
cached_py_predicates)
|
||||
if action == REJECT:
|
||||
continue
|
||||
# Keep only a subset of states (the active ones). Index q is the
|
||||
|
@ -437,19 +440,23 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
|||
if action in [RETRY_EXTEND, RETRY_OR_EXTEND]:
|
||||
# This handles the 'extend'
|
||||
new_states.push_back(
|
||||
PatternStateC(pattern=states[q].pattern, start=state.start, length=state.length + 1))
|
||||
PatternStateC(pattern=states[q].pattern, start=state.start,
|
||||
length=state.length + 1))
|
||||
if with_alignments != 0:
|
||||
align_new_states.push_back(align_states[q])
|
||||
if action == RETRY_ADVANCE:
|
||||
# This handles the 'advance'
|
||||
new_states.push_back(
|
||||
PatternStateC(pattern=states[q].pattern + 1, start=state.start, length=state.length + 1))
|
||||
PatternStateC(pattern=states[q].pattern + 1, start=state.start,
|
||||
length=state.length + 1))
|
||||
if with_alignments != 0:
|
||||
align_new_states.push_back(align_states[q])
|
||||
states[q].pattern += 1
|
||||
if states[q].pattern.nr_py != 0:
|
||||
update_predicate_cache(cached_py_predicates, states[q].pattern, token, py_predicates)
|
||||
next_action = get_action(states[q], token.c, extra_attrs, cached_py_predicates)
|
||||
update_predicate_cache(cached_py_predicates,
|
||||
states[q].pattern, token, py_predicates)
|
||||
next_action = get_action(states[q], token.c, extra_attrs,
|
||||
cached_py_predicates)
|
||||
# To account for *? and +?
|
||||
if get_quantifier(state) == ZERO_MINUS:
|
||||
next_action = cast_to_non_greedy_action(action, next_action, new_states, align_new_states,
|
||||
|
@ -470,37 +477,49 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
|
|||
else:
|
||||
ent_id = get_ent_id(state.pattern)
|
||||
if action == MATCH:
|
||||
matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length + 1))
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length + 1))
|
||||
# `align_matches` always corresponds to `matches` 1:1
|
||||
if with_alignments != 0:
|
||||
align_matches.push_back(align_states[q])
|
||||
elif action == MATCH_DOUBLE:
|
||||
# push match without last token if length > 0
|
||||
if state.length > 0:
|
||||
matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length))
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length))
|
||||
# MATCH_DOUBLE emits matches twice,
|
||||
# add one more to align_matches in order to keep 1:1 relationship
|
||||
if with_alignments != 0:
|
||||
align_matches.push_back(align_states[q])
|
||||
# push match with last token
|
||||
matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length + 1))
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length + 1))
|
||||
# `align_matches` always corresponds to `matches` 1:1
|
||||
if with_alignments != 0:
|
||||
align_matches.push_back(align_states[q])
|
||||
elif action == MATCH_REJECT:
|
||||
matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length))
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length))
|
||||
# `align_matches` always corresponds to `matches` 1:1
|
||||
if with_alignments != 0:
|
||||
align_matches.push_back(align_states[q])
|
||||
elif action == MATCH_EXTEND:
|
||||
matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length))
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length))
|
||||
# `align_matches` always corresponds to `matches` 1:1
|
||||
if with_alignments != 0:
|
||||
align_matches.push_back(align_states[q])
|
||||
states[q].length += 1
|
||||
q += 1
|
||||
elif action == MATCH_ADVANCE:
|
||||
matches.push_back(MatchC(pattern_id=ent_id, start=state.start, length=state.length + 1))
|
||||
matches.push_back(
|
||||
MatchC(pattern_id=ent_id, start=state.start,
|
||||
length=state.length + 1))
|
||||
# `align_matches` always corresponds to `matches` 1:1
|
||||
if with_alignments != 0:
|
||||
align_matches.push_back(align_states[q])
|
||||
|
@ -535,6 +554,7 @@ cdef int update_predicate_cache(int8_t * cache,
|
|||
else:
|
||||
raise ValueError(Errors.E125.format(value=result))
|
||||
|
||||
|
||||
cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
|
||||
vector[vector[MatchAlignmentC]]& align_matches,
|
||||
vector[vector[MatchAlignmentC]]& align_states,
|
||||
|
@ -565,7 +585,8 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
|
|||
else:
|
||||
state.pattern += 1
|
||||
|
||||
cdef action_t get_action(PatternStateC state, const TokenC * token, const attr_t * extra_attrs,
|
||||
cdef action_t get_action(PatternStateC state,
|
||||
const TokenC* token, const attr_t* extra_attrs,
|
||||
const int8_t* predicate_matches) nogil:
|
||||
"""We need to consider:
|
||||
a) Does the token match the specification? [Yes, No]
|
||||
|
@ -656,6 +677,7 @@ cdef action_t get_action(PatternStateC state, const TokenC * token, const attr_t
|
|||
# is_non_greedy_plus() verifies that the current state's pattern is +?
|
||||
# has_star_tail() verifies the remaining pattern tokens are either * or *?,
|
||||
# so that it is valid for the current match to exist.
|
||||
# TODO if this impacts the performance, "ONE_MINUS" could be created
|
||||
return MATCH_ADVANCE
|
||||
elif is_match and not is_final:
|
||||
# Yes, non-final: 0100
|
||||
|
|
Loading…
Reference in New Issue
Block a user