mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
commit
6a8cb905aa
|
@ -8,9 +8,15 @@ from cymem.cymem cimport Pool
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
from libcpp.pair cimport pair
|
from libcpp.pair cimport pair
|
||||||
|
from cython.operator cimport dereference as deref
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
|
|
||||||
|
# try:
|
||||||
|
# from libcpp.unordered_map cimport unordered_map as umap
|
||||||
|
# except:
|
||||||
|
# from libcpp.map cimport map as umap
|
||||||
|
|
||||||
from .typedefs cimport attr_t
|
from .typedefs cimport attr_t
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
from .structs cimport TokenC
|
from .structs cimport TokenC
|
||||||
|
@ -62,10 +68,11 @@ cdef enum action_t:
|
||||||
REPEAT
|
REPEAT
|
||||||
ACCEPT
|
ACCEPT
|
||||||
ADVANCE_ZERO
|
ADVANCE_ZERO
|
||||||
|
ADVANCE_PLUS
|
||||||
ACCEPT_PREV
|
ACCEPT_PREV
|
||||||
PANIC
|
PANIC
|
||||||
|
|
||||||
# A "match expression" consists of one or more token patterns
|
|
||||||
# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
|
# Each token pattern consists of a quantifier and 0+ (attr, value) pairs.
|
||||||
# A state is an (int, pattern pointer) pair, where the int is the start
|
# A state is an (int, pattern pointer) pair, where the int is the start
|
||||||
# position, and the pattern pointer shows where we're up to
|
# position, and the pattern pointer shows where we're up to
|
||||||
|
@ -83,7 +90,25 @@ cdef struct TokenPatternC:
|
||||||
|
|
||||||
|
|
||||||
ctypedef TokenPatternC* TokenPatternC_ptr
|
ctypedef TokenPatternC* TokenPatternC_ptr
|
||||||
ctypedef pair[int, TokenPatternC_ptr] StateC
|
# ctypedef pair[int, TokenPatternC_ptr] StateC
|
||||||
|
|
||||||
|
# Match Dictionary entry type
|
||||||
|
cdef struct MatchEntryC:
|
||||||
|
int32_t start
|
||||||
|
int32_t end
|
||||||
|
int32_t offset
|
||||||
|
|
||||||
|
# A state instance represents the information that defines a
|
||||||
|
# partial match
|
||||||
|
# start: the index of the first token in the partial match
|
||||||
|
# pattern: a pointer to the current token pattern in the full
|
||||||
|
# pattern
|
||||||
|
# last_match: The entry of the last span matched by the
|
||||||
|
# same pattern
|
||||||
|
cdef struct StateC:
|
||||||
|
int32_t start
|
||||||
|
TokenPatternC_ptr pattern
|
||||||
|
MatchEntryC* last_match
|
||||||
|
|
||||||
|
|
||||||
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
|
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
|
||||||
|
@ -128,7 +153,10 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
|
||||||
if pattern.quantifier == ZERO:
|
if pattern.quantifier == ZERO:
|
||||||
return REJECT
|
return REJECT
|
||||||
elif lookahead.nr_attr == 0:
|
elif lookahead.nr_attr == 0:
|
||||||
return ACCEPT
|
if pattern.quantifier == ZERO_PLUS:
|
||||||
|
return REPEAT
|
||||||
|
else:
|
||||||
|
return ACCEPT
|
||||||
elif pattern.quantifier in (ONE, ZERO_ONE):
|
elif pattern.quantifier in (ONE, ZERO_ONE):
|
||||||
return ADVANCE
|
return ADVANCE
|
||||||
elif pattern.quantifier == ZERO_PLUS:
|
elif pattern.quantifier == ZERO_PLUS:
|
||||||
|
@ -138,7 +166,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
|
||||||
if next_action is REJECT:
|
if next_action is REJECT:
|
||||||
return REPEAT
|
return REPEAT
|
||||||
else:
|
else:
|
||||||
return ADVANCE_ZERO
|
return ADVANCE_PLUS
|
||||||
else:
|
else:
|
||||||
return PANIC
|
return PANIC
|
||||||
|
|
||||||
|
@ -339,77 +367,223 @@ cdef class Matcher:
|
||||||
cdef int i, token_i
|
cdef int i, token_i
|
||||||
cdef const TokenC* token
|
cdef const TokenC* token
|
||||||
cdef StateC state
|
cdef StateC state
|
||||||
|
cdef int j = 0
|
||||||
|
cdef int k
|
||||||
|
cdef bint overlap = False
|
||||||
|
cdef MatchEntryC* state_match
|
||||||
|
cdef MatchEntryC* last_matches = <MatchEntryC*>self.mem.alloc(self.patterns.size(),sizeof(MatchEntryC))
|
||||||
|
|
||||||
|
for i in range(self.patterns.size()):
|
||||||
|
last_matches[i].start = 0
|
||||||
|
last_matches[i].end = 0
|
||||||
|
last_matches[i].offset = 0
|
||||||
|
|
||||||
matches = []
|
matches = []
|
||||||
for token_i in range(doc.length):
|
for token_i in range(doc.length):
|
||||||
token = &doc.c[token_i]
|
token = &doc.c[token_i]
|
||||||
q = 0
|
q = 0
|
||||||
# Go over the open matches, extending or finalizing if able.
|
# Go over the open matches, extending or finalizing if able.
|
||||||
# Otherwise, we over-write them (q doesn't advance)
|
# Otherwise, we over-write them (q doesn't advance)
|
||||||
for state in partials:
|
#for state in partials:
|
||||||
action = get_action(state.second, token)
|
j=0
|
||||||
|
while j < n_partials:
|
||||||
|
state = partials[j]
|
||||||
|
action = get_action(state.pattern, token)
|
||||||
|
j += 1
|
||||||
|
# Skip patterns that would overlap with an existing match
|
||||||
|
# Patterns overlap an existing match if they point to the
|
||||||
|
# same final state and start between the start and end
|
||||||
|
# of said match.
|
||||||
|
# Different patterns with the same label are allowed to
|
||||||
|
# overlap.
|
||||||
|
state_match = state.last_match
|
||||||
|
if (state.start > state_match.start
|
||||||
|
and state.start < state_match.end):
|
||||||
|
continue
|
||||||
if action == PANIC:
|
if action == PANIC:
|
||||||
raise Exception("Error selecting action in matcher")
|
raise Exception("Error selecting action in matcher")
|
||||||
while action == ADVANCE_ZERO:
|
while action == ADVANCE_ZERO:
|
||||||
state.second += 1
|
state.pattern += 1
|
||||||
action = get_action(state.second, token)
|
action = get_action(state.pattern, token)
|
||||||
if action == PANIC:
|
if action == PANIC:
|
||||||
raise Exception("Error selecting action in matcher")
|
raise Exception("Error selecting action in matcher")
|
||||||
|
|
||||||
|
# ADVANCE_PLUS acts like REPEAT, but also pushes a partial that
|
||||||
|
# acts like an ADVANCE_ZERO
|
||||||
|
if action == ADVANCE_PLUS:
|
||||||
|
state.pattern += 1
|
||||||
|
partials.push_back(state)
|
||||||
|
n_partials += 1
|
||||||
|
state.pattern -= 1
|
||||||
|
action = REPEAT
|
||||||
|
|
||||||
|
if action == ADVANCE:
|
||||||
|
state.pattern += 1
|
||||||
|
|
||||||
|
# Check for partial matches that are at the same spec in the same pattern
|
||||||
|
# Keep the longer of the matches
|
||||||
|
# This ensures that there are never more than 2 partials for every spec
|
||||||
|
# in a pattern (one of which gets pruned in this step)
|
||||||
|
|
||||||
|
overlap=False
|
||||||
|
for i in range(q):
|
||||||
|
if state.pattern == partials[i].pattern and state.start < partials[i].start:
|
||||||
|
partials[i] = state
|
||||||
|
j = i
|
||||||
|
overlap = True
|
||||||
|
break
|
||||||
|
if overlap:
|
||||||
|
continue
|
||||||
|
overlap=False
|
||||||
|
for i in range(q):
|
||||||
|
if state.pattern == partials[i].pattern:
|
||||||
|
overlap = True
|
||||||
|
break
|
||||||
|
if overlap:
|
||||||
|
continue
|
||||||
|
|
||||||
|
|
||||||
if action == REPEAT:
|
if action == REPEAT:
|
||||||
# Leave the state in the queue, and advance to next slot
|
# Leave the state in the queue, and advance to next slot
|
||||||
# (i.e. we don't overwrite -- we want to greedily match
|
# (i.e. we don't overwrite -- we want to greedily match
|
||||||
# more pattern).
|
# more pattern).
|
||||||
|
partials[q] = state
|
||||||
q += 1
|
q += 1
|
||||||
elif action == REJECT:
|
elif action == REJECT:
|
||||||
pass
|
pass
|
||||||
elif action == ADVANCE:
|
elif action == ADVANCE:
|
||||||
partials[q] = state
|
partials[q] = state
|
||||||
partials[q].second += 1
|
|
||||||
q += 1
|
q += 1
|
||||||
elif action in (ACCEPT, ACCEPT_PREV):
|
elif action in (ACCEPT, ACCEPT_PREV):
|
||||||
# TODO: What to do about patterns starting with ZERO? Need
|
# TODO: What to do about patterns starting with ZERO? Need
|
||||||
# to adjust the start position.
|
# to adjust the start position.
|
||||||
start = state.first
|
start = state.start
|
||||||
end = token_i+1 if action == ACCEPT else token_i
|
end = token_i+1 if action == ACCEPT else token_i
|
||||||
ent_id = state.second[1].attrs[0].value
|
ent_id = state.pattern[1].attrs[0].value
|
||||||
label = state.second[1].attrs[1].value
|
label = state.pattern[1].attrs[1].value
|
||||||
matches.append((ent_id, start, end))
|
# Check that this match doesn't overlap with an earlier match.
|
||||||
|
# Only overwrite an earlier match if it is a substring of this
|
||||||
|
# match (i.e. it starts after this match starts).
|
||||||
|
state_match = state.last_match
|
||||||
|
|
||||||
|
if start >= state_match.end:
|
||||||
|
state_match.start = start
|
||||||
|
state_match.end = end
|
||||||
|
state_match.offset = len(matches)
|
||||||
|
matches.append((ent_id,start,end))
|
||||||
|
elif start <= state_match.start and end >= state_match.end:
|
||||||
|
if len(matches) == 0:
|
||||||
|
assert state_match.offset==0
|
||||||
|
state_match.offset = 0
|
||||||
|
matches.append((ent_id,start,end))
|
||||||
|
else:
|
||||||
|
i = state_match.offset
|
||||||
|
matches[i] = (ent_id,start,end)
|
||||||
|
state_match.start = start
|
||||||
|
state_match.end = end
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
|
||||||
partials.resize(q)
|
partials.resize(q)
|
||||||
|
n_partials = q
|
||||||
# Check whether we open any new patterns on this token
|
# Check whether we open any new patterns on this token
|
||||||
|
i=0
|
||||||
for pattern in self.patterns:
|
for pattern in self.patterns:
|
||||||
|
# Skip patterns that would overlap with an existing match
|
||||||
|
# state_match = pattern.last_match
|
||||||
|
state_match = &last_matches[i]
|
||||||
|
i+=1
|
||||||
|
if (token_i > state_match.start
|
||||||
|
and token_i < state_match.end):
|
||||||
|
continue
|
||||||
action = get_action(pattern, token)
|
action = get_action(pattern, token)
|
||||||
if action == PANIC:
|
if action == PANIC:
|
||||||
raise Exception("Error selecting action in matcher")
|
raise Exception("Error selecting action in matcher")
|
||||||
while action == ADVANCE_ZERO:
|
while action in (ADVANCE_PLUS,ADVANCE_ZERO):
|
||||||
|
if action == ADVANCE_PLUS:
|
||||||
|
state.start = token_i
|
||||||
|
state.pattern = pattern
|
||||||
|
state.last_match = state_match
|
||||||
|
partials.push_back(state)
|
||||||
|
n_partials += 1
|
||||||
pattern += 1
|
pattern += 1
|
||||||
action = get_action(pattern, token)
|
action = get_action(pattern, token)
|
||||||
|
|
||||||
|
if action == ADVANCE:
|
||||||
|
pattern += 1
|
||||||
|
j=0
|
||||||
|
overlap = False
|
||||||
|
for j in range(q):
|
||||||
|
if pattern == partials[j].pattern:
|
||||||
|
overlap = True
|
||||||
|
break
|
||||||
|
if overlap:
|
||||||
|
continue
|
||||||
|
|
||||||
|
|
||||||
if action == REPEAT:
|
if action == REPEAT:
|
||||||
state.first = token_i
|
state.start = token_i
|
||||||
state.second = pattern
|
state.pattern = pattern
|
||||||
|
state.last_match = state_match
|
||||||
partials.push_back(state)
|
partials.push_back(state)
|
||||||
|
n_partials += 1
|
||||||
elif action == ADVANCE:
|
elif action == ADVANCE:
|
||||||
# TODO: What to do about patterns starting with ZERO? Need
|
# TODO: What to do about patterns starting with ZERO? Need
|
||||||
# to adjust the start position.
|
# to adjust the start position.
|
||||||
state.first = token_i
|
state.start = token_i
|
||||||
state.second = pattern + 1
|
state.pattern = pattern
|
||||||
|
state.last_match = state_match
|
||||||
partials.push_back(state)
|
partials.push_back(state)
|
||||||
|
n_partials += 1
|
||||||
elif action in (ACCEPT, ACCEPT_PREV):
|
elif action in (ACCEPT, ACCEPT_PREV):
|
||||||
start = token_i
|
start = token_i
|
||||||
end = token_i+1 if action == ACCEPT else token_i
|
end = token_i+1 if action == ACCEPT else token_i
|
||||||
ent_id = pattern[1].attrs[0].value
|
ent_id = pattern[1].attrs[0].value
|
||||||
|
|
||||||
label = pattern[1].attrs[1].value
|
label = pattern[1].attrs[1].value
|
||||||
matches.append((ent_id, start, end))
|
if start >= state_match.end:
|
||||||
|
state_match.start = start
|
||||||
|
state_match.end = end
|
||||||
|
state_match.offset = len(matches)
|
||||||
|
matches.append((ent_id,start,end))
|
||||||
|
if start <= state_match.start and end >= state_match.end:
|
||||||
|
if len(matches) == 0:
|
||||||
|
state_match.offset = 0
|
||||||
|
matches.append((ent_id,start,end))
|
||||||
|
else:
|
||||||
|
j = state_match.offset
|
||||||
|
matches[j] = (ent_id,start,end)
|
||||||
|
state_match.start = start
|
||||||
|
state_match.end = end
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
|
||||||
# Look for open patterns that are actually satisfied
|
# Look for open patterns that are actually satisfied
|
||||||
for state in partials:
|
for state in partials:
|
||||||
while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
|
while state.pattern.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
|
||||||
state.second += 1
|
state.pattern += 1
|
||||||
if state.second.nr_attr == 0:
|
if state.pattern.nr_attr == 0:
|
||||||
start = state.first
|
start = state.start
|
||||||
end = len(doc)
|
end = len(doc)
|
||||||
ent_id = state.second.attrs[0].value
|
ent_id = state.pattern.attrs[0].value
|
||||||
label = state.second.attrs[0].value
|
label = state.pattern.attrs[1].value
|
||||||
matches.append((ent_id, start, end))
|
state_match = state.last_match
|
||||||
|
if start >= state_match.end:
|
||||||
|
state_match.start = start
|
||||||
|
state_match.end = end
|
||||||
|
state_match.offset = len(matches)
|
||||||
|
matches.append((ent_id,start,end))
|
||||||
|
if start <= state_match.start and end >= state_match.end:
|
||||||
|
j = state_match.offset
|
||||||
|
if len(matches) == 0:
|
||||||
|
state_match.offset = 0
|
||||||
|
matches.append((ent_id,start,end))
|
||||||
|
else:
|
||||||
|
matches[j] = (ent_id,start,end)
|
||||||
|
state_match.start = start
|
||||||
|
state_match.end = end
|
||||||
|
else:
|
||||||
|
pass
|
||||||
for i, (ent_id, start, end) in enumerate(matches):
|
for i, (ent_id, start, end) in enumerate(matches):
|
||||||
on_match = self._callbacks.get(ent_id)
|
on_match = self._callbacks.get(ent_id)
|
||||||
if on_match is not None:
|
if on_match is not None:
|
||||||
|
|
|
@ -13,8 +13,8 @@ from ...vocab import Vocab
|
||||||
('a b', 0, 2),
|
('a b', 0, 2),
|
||||||
('a c', 0, 1),
|
('a c', 0, 1),
|
||||||
('a b c', 0, 2),
|
('a b c', 0, 2),
|
||||||
('a b b c', 0, 2),
|
('a b b c', 0, 3),
|
||||||
('a b b', 0, 2),
|
('a b b', 0, 3),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
def test_issue1450_matcher_end_zero_plus(string, start, end):
|
def test_issue1450_matcher_end_zero_plus(string, start, end):
|
||||||
|
|
63
spacy/tests/regression/test_issue1855.py
Normal file
63
spacy/tests/regression/test_issue1855.py
Normal file
|
@ -0,0 +1,63 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
import re
|
||||||
|
|
||||||
|
from ...matcher import Matcher
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
pattern1 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}]
|
||||||
|
pattern2 = [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}]
|
||||||
|
pattern3 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}]
|
||||||
|
pattern4 = [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
|
||||||
|
pattern5 = [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
|
||||||
|
|
||||||
|
re_pattern1 = 'AA*'
|
||||||
|
re_pattern2 = 'A*A'
|
||||||
|
re_pattern3 = 'AA'
|
||||||
|
re_pattern4 = 'BA*B'
|
||||||
|
re_pattern5 = 'B*A*B'
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def text():
|
||||||
|
return "(ABBAAAAAB)."
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def doc(en_tokenizer,text):
|
||||||
|
doc = en_tokenizer(' '.join(text))
|
||||||
|
return doc
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('pattern,re_pattern',[
|
||||||
|
(pattern1,re_pattern1),
|
||||||
|
(pattern2,re_pattern2),
|
||||||
|
(pattern3,re_pattern3),
|
||||||
|
(pattern4,re_pattern4),
|
||||||
|
(pattern5,re_pattern5)])
|
||||||
|
def test_greedy_matching(doc,text,pattern,re_pattern):
|
||||||
|
"""
|
||||||
|
Test that the greedy matching behavior of the * op
|
||||||
|
is consistent with other re implementations
|
||||||
|
"""
|
||||||
|
matcher = Matcher(doc.vocab)
|
||||||
|
matcher.add(re_pattern,None,pattern)
|
||||||
|
matches = matcher(doc)
|
||||||
|
re_matches = [m.span() for m in re.finditer(re_pattern,text)]
|
||||||
|
for match,re_match in zip(matches,re_matches):
|
||||||
|
assert match[1:]==re_match
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('pattern,re_pattern',[
|
||||||
|
(pattern1,re_pattern1),
|
||||||
|
(pattern2,re_pattern2),
|
||||||
|
(pattern3,re_pattern3),
|
||||||
|
(pattern4,re_pattern4),
|
||||||
|
(pattern5,re_pattern5)])
|
||||||
|
def test_match_consuming(doc,text,pattern,re_pattern):
|
||||||
|
"""
|
||||||
|
Test that matcher.__call__ consumes tokens on a match
|
||||||
|
similar to re.findall
|
||||||
|
"""
|
||||||
|
matcher = Matcher(doc.vocab)
|
||||||
|
matcher.add(re_pattern,None,pattern)
|
||||||
|
matches = matcher(doc)
|
||||||
|
re_matches = [m.span() for m in re.finditer(re_pattern,text)]
|
||||||
|
assert len(matches)==len(re_matches)
|
63
spacy/tests/test_matcher_greedy.py
Normal file
63
spacy/tests/test_matcher_greedy.py
Normal file
|
@ -0,0 +1,63 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
import re
|
||||||
|
|
||||||
|
from ..matcher import Matcher
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
pattern1 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'*'}]
|
||||||
|
pattern2 = [{'ORTH':'A','OP':'*'},{'ORTH':'A','OP':'1'}]
|
||||||
|
pattern3 = [{'ORTH':'A','OP':'1'},{'ORTH':'A','OP':'1'}]
|
||||||
|
pattern4 = [{'ORTH':'B','OP':'1'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
|
||||||
|
pattern5 = [{'ORTH':'B','OP':'*'},{'ORTH':'A','OP':'*'},{'ORTH':'B','OP':'1'}]
|
||||||
|
|
||||||
|
re_pattern1 = 'AA*'
|
||||||
|
re_pattern2 = 'A*A'
|
||||||
|
re_pattern3 = 'AA'
|
||||||
|
re_pattern4 = 'BA*B'
|
||||||
|
re_pattern5 = 'B*A*B'
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def text():
|
||||||
|
return "(ABBAAAAAB)."
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def doc(en_tokenizer,text):
|
||||||
|
doc = en_tokenizer(' '.join(text))
|
||||||
|
return doc
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('pattern,re_pattern',[
|
||||||
|
(pattern1,re_pattern1),
|
||||||
|
(pattern2,re_pattern2),
|
||||||
|
(pattern3,re_pattern3),
|
||||||
|
(pattern4,re_pattern4),
|
||||||
|
(pattern5,re_pattern5)])
|
||||||
|
def test_greedy_matching(doc,text,pattern,re_pattern):
|
||||||
|
"""
|
||||||
|
Test that the greedy matching behavior of the * op
|
||||||
|
is consistent with other re implementations
|
||||||
|
"""
|
||||||
|
matcher = Matcher(doc.vocab)
|
||||||
|
matcher.add(re_pattern,None,pattern)
|
||||||
|
matches = matcher(doc)
|
||||||
|
re_matches = [m.span() for m in re.finditer(re_pattern,text)]
|
||||||
|
for match,re_match in zip(matches,re_matches):
|
||||||
|
assert match[1:]==re_match
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('pattern,re_pattern',[
|
||||||
|
(pattern1,re_pattern1),
|
||||||
|
(pattern2,re_pattern2),
|
||||||
|
(pattern3,re_pattern3),
|
||||||
|
(pattern4,re_pattern4),
|
||||||
|
(pattern5,re_pattern5)])
|
||||||
|
def test_match_consuming(doc,text,pattern,re_pattern):
|
||||||
|
"""
|
||||||
|
Test that matcher.__call__ consumes tokens on a match
|
||||||
|
similar to re.findall
|
||||||
|
"""
|
||||||
|
matcher = Matcher(doc.vocab)
|
||||||
|
matcher.add(re_pattern,None,pattern)
|
||||||
|
matches = matcher(doc)
|
||||||
|
re_matches = [m.span() for m in re.finditer(re_pattern,text)]
|
||||||
|
assert len(matches)==len(re_matches)
|
|
@ -161,11 +161,7 @@ p
|
||||||
|
|
||||||
p
|
p
|
||||||
| The #[code +] and #[code *] operators are usually interpreted
|
| The #[code +] and #[code *] operators are usually interpreted
|
||||||
| "greedily", i.e. longer matches are returned where possible. However, if
|
| "greedily", i.e. longer matches are returned where possible.
|
||||||
| you specify two #[code +] and #[code *] patterns in a row and their
|
|
||||||
| matches overlap, the first operator will behave non-greedily. This quirk
|
|
||||||
| in the semantics makes the matcher more efficient, by avoiding the need
|
|
||||||
| for back-tracking.
|
|
||||||
|
|
||||||
+h(3, "adding-phrase-patterns") Adding phrase patterns
|
+h(3, "adding-phrase-patterns") Adding phrase patterns
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user