mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
Update notes on matcher2
This commit is contained in:
parent
b4cc39eb74
commit
0004331895
|
@ -49,54 +49,53 @@ def get_action(state, token):
|
||||||
'''We need to consider:
|
'''We need to consider:
|
||||||
|
|
||||||
a) Does the token match the specification? [Yes, No]
|
a) Does the token match the specification? [Yes, No]
|
||||||
b) What's the quantifier? [1, 1+, 0+]
|
b) What's the quantifier? [1, 0+, ?]
|
||||||
c) Is this the last specification? [final, non-final]
|
c) Is this the last specification? [final, non-final]
|
||||||
|
|
||||||
We therefore have 12 cases to consider. For each case, we need to know
|
We can transition in the following ways:
|
||||||
whether to emit a match, whether to keep the current state in the partials,
|
|
||||||
and whether to add an advanced state to the partials.
|
|
||||||
|
|
||||||
We therefore have eight possible results for these three booleans, which
|
a) Do we emit a match?
|
||||||
we'll code as 000, 001 etc.
|
b) Do we add a state with (next state, next token)?
|
||||||
|
c) Do we add a state with (next state, same token)?
|
||||||
|
d) Do we add a state with (same state, next token)?
|
||||||
|
|
||||||
- No match:
|
We'll code the actions as 4-digit boolean strings, one digit per question a)-d) in that order, so 0000 means no to all 4,
|
||||||
000
|
1000 means match but no states added, etc.
|
||||||
- Match, final:
|
|
||||||
1: 100
|
1:
|
||||||
1+: 110
|
Yes, final:
|
||||||
- Match, non-final:
|
1000
|
||||||
1: 001
|
Yes, non-final:
|
||||||
1+: 011
|
0100
|
||||||
|
No, final:
|
||||||
|
0000
|
||||||
|
No, non-final:
|
||||||
|
0000
|
||||||
|
0+:
|
||||||
|
Yes, final:
|
||||||
|
1001
|
||||||
|
Yes, non-final:
|
||||||
|
0111
|
||||||
|
No, final:
|
||||||
|
1000 (note: Don't include last token!)
|
||||||
|
No, non-final:
|
||||||
|
0010
|
||||||
|
?:
|
||||||
|
Yes, final:
|
||||||
|
1000
|
||||||
|
Yes, non-final:
|
||||||
|
0100
|
||||||
|
No, final:
|
||||||
|
1000 (note: Don't include last token!)
|
||||||
|
No, non-final:
|
||||||
|
0010
|
||||||
|
|
||||||
Problem: If a quantifier is matching, we're adding a lot of open partials
|
Problem: If a quantifier is matching, we're adding a lot of open partials
|
||||||
'''
|
'''
|
||||||
is_match = get_is_match(state, token)
|
is_match = get_is_match(state, token)
|
||||||
operator = get_operator(state, token)
|
operator = get_operator(state, token)
|
||||||
is_final = get_is_final(state, token)
|
is_final = get_is_final(state, token)
|
||||||
if operator == '1':
|
raise NotImplementedError
|
||||||
if not is_match:
|
|
||||||
return '000'
|
|
||||||
elif is_final:
|
|
||||||
return '100'
|
|
||||||
else:
|
|
||||||
return '001'
|
|
||||||
elif operator == '1+':
|
|
||||||
if not is_match:
|
|
||||||
return '000'
|
|
||||||
if is_final:
|
|
||||||
return '110'
|
|
||||||
else:
|
|
||||||
return '011'
|
|
||||||
elif operator == '0+':
|
|
||||||
if is_final:
|
|
||||||
return '100'
|
|
||||||
elif is_match:
|
|
||||||
return '011'
|
|
||||||
else:
|
|
||||||
return '001'
|
|
||||||
else:
|
|
||||||
print(operator, is_match, is_final)
|
|
||||||
raise ValueError
|
|
||||||
|
|
||||||
|
|
||||||
def get_is_match(state, token):
|
def get_is_match(state, token):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user